```python
# Uses the fake_useragent library
from fake_useragent import UserAgent


# Middleware that sets the request headers
class RandomUserAgentMiddleware(object):
    # Randomly switch the User-Agent for each request
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # Read the UA type from the settings file
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # Pick the UA attribute configured in settings
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())


# Uses the Aliyun IP proxy service
from myscrapy.aliproxy import get_proxy_ip


# Middleware that sets the proxy
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = get_proxy_ip()
```
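The `RANDOM_UA_TYPE` value read above selects which attribute of `fake_useragent.UserAgent` is used. A minimal sketch of that setting (the value `'random'` matches the default in the middleware; `'chrome'` or `'firefox'` are other attributes exposed by fake_useragent):

```python
# settings.py
# Which UserAgent attribute to use per request:
# 'random' returns any UA, 'chrome'/'firefox' restrict to one browser family.
RANDOM_UA_TYPE = 'random'
```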
Enable the middlewares in settings.py:
```python
DOWNLOADER_MIDDLEWARES = {
    'myscrapy.middlewares.MyscrapyDownloaderMiddleware': 543,
    'myscrapy.middlewares.RandomUserAgentMiddleware': 0,
    'myscrapy.middlewares.ProxyMiddleware': 1,
}
```
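Lower priority numbers run earlier in `process_request`, so the User-Agent and proxy are set before other middlewares see the request. As an optional addition (not part of the original snippet), Scrapy's built-in UserAgentMiddleware can also be disabled so only the randomized header is ever applied:

```python
# Optional: switch off Scrapy's default User-Agent middleware.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myscrapy.middlewares.MyscrapyDownloaderMiddleware': 543,
    'myscrapy.middlewares.RandomUserAgentMiddleware': 0,
    'myscrapy.middlewares.ProxyMiddleware': 1,
}
```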
Wrapping the Aliyun IP proxy service:
```python
import urllib.request
import json


def get_proxy_ip():
    host = 'http://zip.market.alicloudapi.com'
    path = '/devtoolservice/ipagency'
    method = 'GET'
    appcode = 'xxxxxxxxxxxx'  # your Aliyun AppCode
    querys = 'foreigntype=0&protocol=0'
    bodys = {}
    url = host + path + '?' + querys

    request = urllib.request.Request(url)
    # The Aliyun API gateway authenticates via the AppCode header
    request.add_header('Authorization', 'APPCODE ' + appcode)
    response = urllib.request.urlopen(request)
    content = response.read()
    if content:
        load = json.loads(str(content, encoding='utf8'))
        # Take the first proxy address returned by the service
        address_ = load['result'][0]['address']
        return address_
```
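A quick standalone check of the wrapper, assuming the API returns JSON shaped like the parsing above (`result` is a list of objects with an `address` field) and that the AppCode placeholder has been replaced with a real value; note that Scrapy's `request.meta['proxy']` expects a full `scheme://host:port` URL, so a prefix may be needed if the service returns a bare address:

```python
if __name__ == '__main__':
    proxy = get_proxy_ip()
    # e.g. '1.2.3.4:8080' -> 'http://1.2.3.4:8080'
    if proxy and not proxy.startswith('http'):
        proxy = 'http://' + proxy
    print(proxy)
```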
