The SpiderBase Class and Setting.py

SpiderBase

The base class for all spiders: a metaclass (BaseClass) auto-registers every crawl_-prefixed method and copies the values from setting.py onto BaseABC, so each spider can read its configuration as plain attributes.

import random
import re

import execjs
import requests
from urllib import parse

from spider import setting

class BaseClass(type):
    """Metaclass that registers every crawl_-prefixed method on the class
    being built and hangs the values from setting.py onto the BaseABC
    root class."""

    def __new__(cls, name, base, attrs):
        # Collect the names of all crawl_* methods defined on this class.
        attrs["cw_func"] = []
        count = 0
        for k, v in attrs.items():
            if k.startswith("crawl_"):
                attrs["cw_func"].append(k)
                count += 1
        attrs["func_count"] = count
        if name == "BaseABC":
            # Attach the setting module plus each of its public names
            # (ua, deny, MYSQL, ...) as class attributes; getattr() does
            # the job without the eval() the original relied on.
            attrs["setting"] = setting
            for key in dir(setting):
                if not key.startswith("__"):
                    attrs[key] = getattr(setting, key)

        return type.__new__(cls, name, base, attrs)



class BaseABC(metaclass=BaseClass):
    """Root class; the metaclass attaches the setting module and all of
    its public values here, so every spider inherits them."""


class SpiderBase(BaseABC):
    def __init__(self):
        # One shared Session so cookies and connections persist across requests.
        self.session = requests.Session()


    def download_page(self, url, **kwargs):
        """Fetch a page with the shared session; cookies may be a dict or
        a raw "k1=v1; k2=v2" cookie-header string."""
        cookies = kwargs.pop("cookies", "")
        method = kwargs.pop("method", "get")
        # Only parse non-empty strings; the original also parsed "",
        # producing the bogus cookie dict {"": ""}.
        if cookies and not isinstance(cookies, dict):
            cookies = self.cookies(cookies)
        if method == "get":
            resp = self.session.get(url, cookies=cookies or None, **kwargs)
        elif method == "post":
            resp = self.session.post(url, cookies=cookies or None, **kwargs)
        else:
            # The original fell through with resp unbound here.
            raise ValueError("unsupported method: {}".format(method))
        return resp


    def _download_page(self,**kwargs):
        print(kwargs)

    def crawler(self,url):
        pass

    def extract_data(self,html):
        pass

    @property
    def headers(self):
        # self.ua is the User-Agent list from setting.py, attached by the metaclass.
        return {
            "User-Agent": random.choice(self.ua)
        }


    def cookies(self, cookies):
        # Parse a "k1=v1; k2=v2" header string into a dict, splitting each
        # pair on the first "=" only (cookie values may themselves contain "=").
        return {i.split("=", 1)[0]: i.split("=", 1)[-1] for i in cookies.split("; ")}

    def url(self, **kwargs):
        """Build a search url: quote the key word and substitute it into
        the "{}" placeholder of the url template."""
        _url = kwargs.pop("url", "")
        search_key = kwargs.pop("search_key", "")
        if search_key:
            search_key = parse.quote(search_key)
            _url = _url.format(search_key)
        return _url

    def schedule(self):
        pass

    def re_extract(self, html_text, pattern, filename=None):
        """findall() a pattern over html_text, or over the contents of
        filename when no text is given."""
        if html_text:
            html = html_text
        elif filename:
            with open(filename, 'r', encoding='utf-8') as f:
                html = f.read()
        else:
            # The original left html unbound when both arguments were empty.
            raise ValueError("either html_text or filename is required")
        return re.compile(pattern).findall(html)

    def save_to_file(self, text, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)

    def save_to_db(self,db_object):
        pass

    def exec_js(self, js_file_path):
        # Compile the JS source with execjs; the returned context exposes
        # .call(func_name, *args) and .eval(expr).
        with open(js_file_path, encoding='utf-8') as f:
            content = f.read()
        return execjs.compile(content)

if __name__ == '__main__':
    sb = SpiderBase()
    # Smoke test: build a search url from a (made-up) template, then read
    # a value that the metaclass copied over from setting.py.
    print(sb.url(url="https://example.com/s?q={}", search_key="test"))
    print(sb.setting.deny)
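
To see what the metaclass buys you, here is a minimal sketch (the MovieSpider subclass, its crawl_ methods, and the cookie string are made up for illustration): every method whose name starts with crawl_ is collected into cw_func, and the values from setting.py are readable straight off the instance.

class MovieSpider(SpiderBase):
    # Both names start with "crawl_", so BaseClass registers them.
    def crawl_list(self, url):
        return self.download_page(url, headers=self.headers)

    def crawl_detail(self, url):
        # A raw cookie-header string is parsed by self.cookies() on the way in.
        return self.download_page(url, cookies="token=abc; uid=42",
                                  headers=self.headers)

spider = MovieSpider()
print(spider.cw_func)      # ['crawl_list', 'crawl_detail']
print(spider.func_count)   # 2
print(spider.MYSQL["db"])  # 'movie', copied from setting.py by the metaclass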

Setting.py

Some configuration values shared by all spiders.

# User-Agent pool; the headers property picks one at random.
ua = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)"
]

# Denylist; empty by default.
deny = []

# MySQL connection parameters.
MYSQL = {
    "host": "localhost",
    "user": "root",
    "password": "root",
    "port": 3306,
    "db": "movie",
    "charset": "utf8"
}
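
The save_to_db method above is only a stub. Here is a minimal sketch of how it might consume the MYSQL settings, assuming the third-party pymysql package and a movie table, neither of which appears in the original code:

import pymysql

from spider import setting

def save_movie(title, year):
    # Hypothetical helper: pymysql.connect() accepts exactly the keys the
    # MYSQL dict defines (host, user, password, port, db, charset).
    conn = pymysql.connect(**setting.MYSQL)
    try:
        with conn.cursor() as cur:
            cur.execute("INSERT INTO movie (title, year) VALUES (%s, %s)",
                        (title, year))
        conn.commit()
    finally:
        conn.close()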
