# -*- coding: utf-8 -*-
"""
AutoProxyMiddleware: a Scrapy downloader middleware that keeps a pool of free
HTTP proxies, assigns one to each outgoing request, and swaps out proxies that
get banned, time out, or otherwise fail.
"""
import urllib2
import logging
import threading
import math
import re
import time

from bs4 import BeautifulSoup
from twisted.internet import defer
from twisted.internet.error import TimeoutError, ConnectionRefusedError, \
    ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone

logger = logging.getLogger(__name__)


class AutoProxyMiddleware(object):

    EXCEPTIONS_TO_CHANGE = (
        defer.TimeoutError, TimeoutError, ConnectionRefusedError,
        ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone)

    # Defaults; any of these can be overridden through the AUTO_PROXY
    # settings dict (see from_crawler below).
    _settings = [
        ('enable', True),
        # (url, signature) pairs: a proxy is considered valid only if the
        # page fetched through it contains the signature string.
        ('test_urls', [('http://www.w3school.com.cn', '06004630'), ]),
        ('test_proxy_timeout', 5),
        ('download_timeout', 60),
        ('test_threadnums', 20),
        # HTTP status codes that get the current proxy marked as banned.
        ('ban_code', [503, ]),
        # Optional regex; a match in the response body also bans the proxy.
        ('ban_re', r''),
        # Extend the pool when fewer than this many proxies remain valid.
        ('proxy_least', 3),
        # Number of valid proxies required before __init__ returns.
        ('init_valid_proxys', 1),
        # A proxy that has already served more than this many pages is not
        # invalidated outright; it is only rotated out and its counter reduced.
        ('invalid_limit', 200),
    ]

    def __init__(self, proxy_set=None):
        self.proxy_set = proxy_set or {}
        for k, v in self._settings:
            setattr(self, k, self.proxy_set.get(k, v))

        # proxy: ordered list of proxy URLs; proxy_index: pointer to the one
        # currently in use; proxyes: proxy -> validity flag; counter_proxy:
        # number of pages downloaded through each proxy.
        self.proxy = []
        self.proxy_index = 0
        self.proxyes = {}
        self.counter_proxy = {}

        self.fetch_new_proxy()
        self.test_proxyes(self.proxyes, wait=True)
        logger.info('Use proxy : %s', self.proxy)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getdict('AUTO_PROXY'))

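    # A minimal sketch of how this is usually wired up in a project's
    # settings.py; the module path and priority below are placeholders, not
    # taken from this file:
    #
    #   DOWNLOADER_MIDDLEWARES = {
    #       'myproject.middlewares.AutoProxyMiddleware': 543,
    #   }
    #   AUTO_PROXY = {
    #       'test_urls': [('http://www.w3school.com.cn', '06004630')],
    #       'ban_code': [503],
    #       'proxy_least': 3,
    #   }
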
    def process_request(self, request, spider):
        if not self._is_enabled_for_request(request):
            return

        if self.len_valid_proxy() > 0:
            self.set_proxy(request)
            request.meta['download_timeout'] = self.download_timeout
        else:
            # No valid proxy available: fall back to a direct connection.
            if 'proxy' in request.meta:
                del request.meta['proxy']

    def process_response(self, request, response, spider):
        if not self._is_enabled_for_request(request):
            return response

        # Responses fetched over a direct connection are passed through.
        if 'proxy' not in request.meta:
            return response

        if response.status in self.ban_code:
            self.invaild_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] banned because of HTTP status code [%s].", request.meta['proxy'], str(response.status))
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

        if self.ban_re:
            try:
                pattern = re.compile(self.ban_re)
            except (TypeError, re.error):
                logger.error('Invalid "ban_re", please check the settings.')
                return response
            match = re.search(pattern, response.body)
            if match:
                self.invaild_proxy(request.meta['proxy'])
                logger.debug("Proxy[%s] banned because of pattern match [%s].", request.meta['proxy'], str(match))
                new_request = request.copy()
                new_request.dont_filter = True
                return new_request

        p = request.meta['proxy']
        self.counter_proxy[p] = self.counter_proxy.setdefault(p, 1) + 1
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            self.invaild_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] connection exception [%s].", request.meta['proxy'], exception)
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

    def invaild_proxy(self, proxy):
        """
        Mark a proxy as invalid. If the proxy has already downloaded more than
        invalid_limit pages (200 by default), it is not marked yet: we only
        switch to another proxy and reduce its counter.
        """
        if self.counter_proxy.get(proxy, 0) > self.invalid_limit:
            self.counter_proxy[proxy] = self.counter_proxy.get(proxy, 0) - 50
            if self.counter_proxy[proxy] < 0:
                self.counter_proxy[proxy] = 0
            self.change_proxy()
        else:
            self.proxyes[proxy] = False

    def change_proxy(self):
        """
        Switch to the next valid proxy.
        """
        while True:
            self.proxy_index = (self.proxy_index + 1) % len(self.proxy)
            proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
            if proxy_valid:
                break
            if self.len_valid_proxy() == 0:
                logger.info('No valid proxy available. Waiting to fetch new proxies.')
                break
        logger.info('Change proxy to %s', self.proxy[self.proxy_index])
        logger.info('Valid proxies[%s]: %s', self.len_valid_proxy(), self.valid_proxyes())

        # Extend the pool when the number of valid proxies drops below the
        # configured minimum.
        if self.len_valid_proxy() < self.proxy_least:
            self.extend_proxy()

    def set_proxy(self, request):
        """
        Attach the current proxy to the request, switching first if it is no
        longer valid.
        """
        proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
        if not proxy_valid:
            self.change_proxy()

        request.meta['proxy'] = self.proxy[self.proxy_index]

    def len_valid_proxy(self):
        """
        Return the number of valid proxies.
        """
        count = 0
        for p in self.proxy:
            if self.proxyes[p]:
                count += 1
        return count

    def valid_proxyes(self):
        """
        Return the list of valid proxies.
        """
        proxyes = []
        for p in self.proxy:
            if self.proxyes[p]:
                proxyes.append(p)
        return proxyes

    def extend_proxy(self):
        """
        Extend the proxy pool. The newly fetched proxies are tested
        asynchronously.
        """
        self.fetch_new_proxy()
        self.test_proxyes(self.proxyes)

    def append_proxy(self, p):
        """
        Helper: add a proxy that passed the test to the proxy list.
        """
        if p not in self.proxy:
            self.proxy.append(p)

    def fetch_new_proxy(self):
        """
        Fetch new proxies. They are currently scraped from three websites,
        with one thread per site.
        """
        logger.info('Start fetching new proxies.')
        urls = ['xici', 'ip3336', 'kxdaili']
        threads = []
        for url in urls:
            t = ProxyFetch(self.proxyes, url)
            threads.append(t)
            t.start()
        for t in threads:
            t.join()

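    # Each entry in the urls list above names a ProxyFetch.fetch_proxy_from_<name>
    # method. To add another source, define a method following that pattern and
    # list its name here; the name "myfreeproxy" below is only an illustration:
    #
    #   urls = ['xici', 'ip3336', 'kxdaili', 'myfreeproxy']
    #
    #   # in ProxyFetch:
    #   def fetch_proxy_from_myfreeproxy(self):
    #       return {}  # dict mapping 'http://ip:port' -> False (untested)
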
    def test_proxyes(self, proxyes, wait=False):
        """
        Test proxy connectivity. The test URLs, their signature strings and
        the number of test threads are all configurable.
        """
        list_proxy = proxyes.items()
        threads = []
        # Use float division so that a small batch still yields a chunk size
        # of at least one proxy per thread.
        n = int(math.ceil(len(list_proxy) / float(self.test_threadnums)))
        for i in range(self.test_threadnums):
            # Split the proxies to be tested evenly among the test threads.
            list_part = list_proxy[i * n: (i + 1) * n]
            part = {k: v for k, v in list_part}
            t = ProxyValidate(self, part)
            threads.append(t)
            t.start()

        # When the middleware is being initialised, block until enough
        # proxies are available.
        if wait:
            while True:
                for t in threads:
                    t.join(0.2)
                    if self._has_valid_proxy():
                        break
                if self._has_valid_proxy():
                    break

    def _has_valid_proxy(self):
        return self.len_valid_proxy() >= self.init_valid_proxys

    def _is_enabled_for_request(self, request):
        return self.enable and 'dont_proxy' not in request.meta

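    # Individual requests can opt out of the proxy pool via the meta flag
    # checked above -- for example, from a spider:
    #
    #   yield scrapy.Request(url, meta={'dont_proxy': True})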


class ProxyValidate(threading.Thread):
    """
    Worker thread that validates a batch of proxies.
    """

    def __init__(self, autoproxy, part):
        super(ProxyValidate, self).__init__()
        self.autoproxy = autoproxy
        self.part = part

    def run(self):
        self.test_proxyes(self.part)

    def test_proxyes(self, proxyes):
        for proxy in proxyes:
            if self.check_proxy(proxy):
                self.autoproxy.proxyes[proxy] = True
                self.autoproxy.append_proxy(proxy)

    def check_proxy(self, proxy):
        # A proxy is valid only if every test URL fetched through it returns
        # a page containing its signature string.
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
        try:
            for url, code in self.autoproxy.test_urls:
                resbody = opener.open(url, timeout=self.autoproxy.test_proxy_timeout).read()
                if code not in resbody:
                    return False
            return True
        except Exception:
            return False


class ProxyFetch(threading.Thread):
    """
    Worker thread that scrapes one proxy-list website.
    """

    def __init__(self, proxyes, url):
        super(ProxyFetch, self).__init__()
        self.proxyes = proxyes
        self.url = url

    def run(self):
        # Dispatch to fetch_proxy_from_<source>() based on the source name.
        self.proxyes.update(getattr(self, 'fetch_proxy_from_' + self.url)())

    def fetch_proxy_from_xici(self):
        proxyes = {}
        url = "http://www.xicidaili.com/nn/"
        try:
            for page in range(1, 4):
                soup = self.get_soup(url + str(page))
                trs = soup.find("table", attrs={"id": "ip_list"}).find_all("tr")
                # Skip the header row.
                for tr in trs[1:]:
                    tds = tr.find_all('td')
                    ip = tds[1].text
                    port = tds[2].text
                    proxy = ''.join(['http://', ip, ':', port]).encode('utf-8')
                    proxyes[proxy] = False
        except Exception as e:
            logger.error('Failed to fetch_proxy_from_xici. Exception[%s]', e)

        return proxyes

    def fetch_proxy_from_ip3336(self):
        proxyes = {}
        url = 'http://www.ip3366.net/free/?stype=1&page='
        try:
            for page in range(1, 6):
                soup = self.get_soup(url + str(page))
                trs = soup.find("div", attrs={"id": "list"}).table.find_all("tr")
                # Skip the header row.
                for tr in trs[1:]:
                    tds = tr.find_all("td")
                    ip = tds[0].string.strip().encode('utf-8')
                    port = tds[1].string.strip().encode('utf-8')
                    proxy = ''.join(['http://', ip, ':', port])
                    proxyes[proxy] = False
        except Exception as e:
            logger.error('Failed to fetch_proxy_from_ip3336. Exception[%s]', e)

        return proxyes

    def fetch_proxy_from_kxdaili(self):
        proxyes = {}
        url = 'http://www.kxdaili.com/dailiip/1/%d.html'
        try:
            for page in range(1, 11):
                soup = self.get_soup(url % page)
                trs = soup.find("table", attrs={"class": "ui table segment"}).find_all("tr")
                # Skip the header row.
                for tr in trs[1:]:
                    tds = tr.find_all("td")
                    ip = tds[0].string.strip().encode('utf-8')
                    port = tds[1].string.strip().encode('utf-8')
                    proxy = ''.join(['http://', ip, ':', port])
                    proxyes[proxy] = False
        except Exception as e:
            logger.error('Failed to fetch_proxy_from_kxdaili. Exception[%s]', e)

        return proxyes

    def get_soup(self, url):
        request = urllib2.Request(url)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")
        while True:
            try:
                html_doc = urllib2.urlopen(request, timeout=30).read()
                break
            except Exception:
                logger.info("Fetch proxy from %s failed, will retry later.", url)
                time.sleep(120)

        soup = BeautifulSoup(html_doc, 'html.parser')

        return soup


if __name__ == '__main__':
    AutoProxyMiddleware()
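
# The block above is only a smoke test: constructing AutoProxyMiddleware
# immediately fetches and validates proxies, which may take a while. In a
# real crawl, Scrapy creates the middleware through from_crawler() instead.
# To watch the progress logs when running this file directly, a logging setup
# such as the following (an addition, not part of the original flow) could be
# placed before the constructor call:
#
#   logging.basicConfig(level=logging.INFO)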