Browse Source

Delete 'scrapy_yzd/scrapy_yzd/autoproxy.py'

Python工程师 6 years ago
parent
commit
3d6064713c
1 changed file with 0 additions and 349 deletions
  1. +0 −349 scrapy_yzd/scrapy_yzd/autoproxy.py

+ 0 - 349
scrapy_yzd/scrapy_yzd/autoproxy.py

@@ -1,349 +0,0 @@
-# #! -*- coding: utf-8 -*-
-# # import urllib2
-# import logging
-# import threading
-# import math
-# import re
-# import time
-#
-# from bs4 import BeautifulSoup
-# from twisted.internet import defer
-# from twisted.internet.error import TimeoutError, ConnectionRefusedError, \
-#     ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone
-#
-# logger = logging.getLogger(__name__)
-#
-#
-# class AutoProxyMiddleware(object):
-#
-#     EXCEPTIONS_TO_CHANGE = (defer.TimeoutError, TimeoutError, ConnectionRefusedError, ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone)
-#
-#     _settings = [
-#         ('enable', True),
-#         ('test_urls', [('http://www.w3school.com.cn', '06004630'), ]),
-#         ('test_proxy_timeout', 5),
-#         ('download_timeout', 60),
-#         ('test_threadnums', 20),
-#         ('ban_code', [503, ]),
-#         ('ban_re', r''),
-#         ('proxy_least', 3),
-#         ('init_valid_proxys', 1),
-#         ('invalid_limit', 200),
-#     ]
-#
-#     def __init__(self, proxy_set=None):
-#         self.proxy_set = proxy_set or {}
-#         for k, v in self._settings:
-#             setattr(self, k, self.proxy_set.get(k, v))
-#
-#         # Proxy list and current proxy index; counter_proxy tracks how many pages each proxy has downloaded
-#         self.proxy = []
-#         self.proxy_index = 0
-#         self.proxyes = {}
-#         self.counter_proxy = {}
-#
-#         self.fetch_new_proxy()
-#         self.test_proxyes(self.proxyes, wait=True)
-#         logger.info('Use proxy : %s', self.proxy)
-#
-#     @classmethod
-#     def from_crawler(cls, crawler):
-#         return cls(crawler.settings.getdict('AUTO_PROXY'))
-#
-#     def process_request(self, request, spider):
-#         if not self._is_enabled_for_request(request):
-#             return
-#
-#         if self.len_valid_proxy() > 0:
-#             self.set_proxy(request)
-#             # if 'download_timeout' not in request.meta:
-#             request.meta['download_timeout'] = self.download_timeout
-#         else:
-#             # No valid proxy available; connect directly
-#             if 'proxy' in request.meta:
-#                 del request.meta['proxy']
-#
-#     def process_response(self, request, response, spider):
-#         if not self._is_enabled_for_request(request):
-#             return response
-#
-#         if response.status in self.ban_code:
-#             self.invaild_proxy(request.meta['proxy'])
-#             logger.debug("Proxy[%s] banned because of HTTP status code [%s]. ", request.meta['proxy'], str(response.status))
-#             new_request = request.copy()
-#             new_request.dont_filter = True
-#             return new_request
-#
-#         if self.ban_re:
-#             try:
-#                 pattern = re.compile(self.ban_re)
-#             except TypeError:
-#                 logger.error('Wrong "ban_re", please check settings')
-#                 return response
-#             match = re.search(pattern, response.body)
-#             if match:
-#                 self.invaild_proxy(request.meta['proxy'])
-#                 logger.debug("Proxy[%s] banned because of pattern match [%s]. ", request.meta['proxy'], str(match))
-#                 new_request = request.copy()
-#                 new_request.dont_filter = True
-#                 return new_request
-#
-#         p = request.meta['proxy']
-#         self.counter_proxy[p] = self.counter_proxy.setdefault(p, 1) + 1
-#         return response
-#
-#     def process_exception(self, request, exception, spider):
-#         if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
-#                 and request.meta.get('proxy', False):
-#             self.invaild_proxy(request.meta['proxy'])
-#             logger.debug("Proxy[%s] connect exception[%s].", request.meta['proxy'], exception)
-#             new_request = request.copy()
-#             new_request.dont_filter = True
-#             return new_request
-#
-#     def invaild_proxy(self, proxy):
-#         """
-#         Mark the proxy as invalid. If the proxy has already downloaded more than 200 pages (the default), do not mark it yet: just switch proxies and decrease its counter.
-#         """
-#         if self.counter_proxy.get(proxy, 0) > self.invalid_limit:
-#             self.counter_proxy[proxy] = self.counter_proxy.get(proxy, 0) - 50
-#             if self.counter_proxy[proxy] < 0:
-#                 self.counter_proxy[proxy] = 0
-#             self.change_proxy()
-#         else:
-#             self.proxyes[proxy] = False
-#             # logger.info('Set proxy[%s] invaild.', proxy)
-#
-#     def change_proxy(self):
-#         """
-#         Switch to another proxy.
-#         """
-#         while True:
-#             self.proxy_index = (self.proxy_index + 1) % len(self.proxy)
-#             proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
-#             if proxy_valid:
-#                 break
-#             if self.len_valid_proxy() == 0:
-#                 logger.info('No valid proxy available. Waiting to fetch new proxies.')
-#                 break
-#         logger.info('Change proxy to %s', self.proxy[self.proxy_index])
-#         logger.info('Valid proxies[%s]: %s', self.len_valid_proxy(), self.valid_proxyes())
-#
-#         # If the number of valid proxies falls below the threshold, fetch more
-#         if self.len_valid_proxy() < self.proxy_least:
-#             self.extend_proxy()
-#
-#     def set_proxy(self, request):
-#         """
-#         Set the proxy on the request.
-#         """
-#         proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
-#         if not proxy_valid:
-#             self.change_proxy()
-#
-#         request.meta['proxy'] = self.proxy[self.proxy_index]
-#         # logger.info('Set proxy. request.meta: %s', request.meta)
-#
-#     def len_valid_proxy(self):
-#         """
-#         Count the number of valid proxies.
-#         """
-#         count = 0
-#         for p in self.proxy:
-#             if self.proxyes[p]:
-#                 count += 1
-#         return count
-#
-#     def valid_proxyes(self):
-#         """
-#         Return the list of valid proxies.
-#         """
-#         proxyes = []
-#         for p in self.proxy:
-#             if self.proxyes[p]:
-#                 proxyes.append(p)
-#         return proxyes
-#
-#     def extend_proxy(self):
-#         """
-#         Fetch more proxies. Testing them runs asynchronously.
-#         """
-#         self.fetch_new_proxy()
-#         self.test_proxyes(self.proxyes)
-#
-#     def append_proxy(self, p):
-#         """
-#         Helper: append a proxy that passed the test to the proxy list.
-#         """
-#         if p not in self.proxy:
-#             self.proxy.append(p)
-#
-#     def fetch_new_proxy(self):
-#         """
-#         Fetch new proxies. Currently scrapes proxies from three sites, with one thread per site.
-#         """
-#         logger.info('Starting to fetch new proxies.')
-#         urls = ['xici', 'ip3336', 'kxdaili']
-#         threads = []
-#         for url in urls:
-#             t = ProxyFetch(self.proxyes, url)
-#             threads.append(t)
-#             t.start()
-#         for t in threads:
-#             t.join()
-#
-#     def test_proxyes(self, proxyes, wait=False):
-#         """
-#         Test proxy connectivity. The test URLs, signature strings and number of test threads are all configurable.
-#         """
-#         list_proxy = proxyes.items()
-#         threads = []
-#         n = int(math.ceil(len(list_proxy) / self.test_threadnums))
-#         for i in range(self.test_threadnums):
-#             # Split the proxies to be tested evenly among the test threads
-#             list_part = list_proxy[i * n: (i + 1) * n]
-#             part = {k: v for k, v in list_part}
-#             t = ProxyValidate(self, part)
-#             threads.append(t)
-#             t.start()
-#
-#         # When the middleware is initialized, wait until a valid proxy is available
-#         if wait:
-#             while True:
-#                 for t in threads:
-#                     t.join(0.2)
-#                     if self._has_valid_proxy():
-#                         break
-#                 if self._has_valid_proxy():
-#                         break
-#
-#     def _has_valid_proxy(self):
-#         if self.len_valid_proxy() >= self.init_valid_proxys:
-#             return True
-#
-#     def _is_enabled_for_request(self, request):
-#         return self.enable and 'dont_proxy' not in request.meta
-#
-#
-# class ProxyValidate(threading.Thread):
-#     """
-#     Worker thread that tests proxies.
-#     """
-#
-#     def __init__(self, autoproxy, part):
-#         super(ProxyValidate, self).__init__()
-#         self.autoproxy = autoproxy
-#         self.part = part
-#
-#     def run(self):
-#         self.test_proxyes(self.part)
-#
-#     def test_proxyes(self, proxyes):
-#         for proxy, valid in proxyes.iteritems():
-#             if(self.check_proxy(proxy)):
-#                 self.autoproxy.proxyes[proxy] = True
-#                 self.autoproxy.append_proxy(proxy)
-#
-#     def check_proxy(self, proxy):
-#         proxy_handler = urllib2.ProxyHandler({'http': proxy})
-#         opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
-#         try:
-#             for url, code in self.autoproxy.test_urls:
-#                 resbody = opener.open(url, timeout=self.autoproxy.test_proxy_timeout).read()
-#                 if code not in resbody:
-#                     return False
-#             return True
-#         except Exception:
-#             return False
-#
-#
-# class ProxyFetch(threading.Thread):
-#
-#     def __init__(self, proxyes, url):
-#         super(ProxyFetch, self).__init__()
-#         self.proxyes = proxyes
-#         self.url = url
-#
-#     def run(self):
-#         self.proxyes.update(getattr(self, 'fetch_proxy_from_' + self.url)())
-#
-#     def fetch_proxy_from_xici(self):
-#         proxyes = {}
-#         url = "http://www.xicidaili.com/nn/"
-#         try:
-#             for i in range(1, 4):
-#                 soup = self.get_soup(url + str(i))
-#                 trs = soup.find("table", attrs={"id": "ip_list"}).find_all("tr")
-#                 for i, tr in enumerate(trs):
-#                     if(0 == i):
-#                         continue
-#                     tds = tr.find_all('td')
-#                     ip = tds[1].text
-#                     port = tds[2].text
-#                     proxy = ''.join(['http://', ip, ':', port]).encode('utf-8')
-#                     proxyes[proxy] = False
-#         except Exception as e:
-#             logger.error('Failed to fetch_proxy_from_xici. Exception[%s]', e)
-#
-#         return proxyes
-#
-#     def fetch_proxy_from_ip3336(self):
-#         proxyes = {}
-#         url = 'http://www.ip3366.net/free/?stype=1&page='
-#         try:
-#             for i in range(1, 6):
-#                 soup = self.get_soup(url + str(i))
-#                 trs = soup.find("div", attrs={"id": "list"}).table.find_all("tr")
-#                 for i, tr in enumerate(trs):
-#                     if 0 == i:
-#                         continue
-#                     tds = tr.find_all("td")
-#                     ip = tds[0].string.strip().encode('utf-8')
-#                     port = tds[1].string.strip().encode('utf-8')
-#                     proxy = ''.join(['http://', ip, ':', port])
-#                     proxyes[proxy] = False
-#         except Exception as e:
-#             logger.error('Failed to fetch_proxy_from_ip3336. Exception[%s]', e)
-#
-#         return proxyes
-#
-#     def fetch_proxy_from_kxdaili(self):
-#         proxyes = {}
-#         url = 'http://www.kxdaili.com/dailiip/1/%d.html'
-#         try:
-#             for i in range(1, 11):
-#                 soup = self.get_soup(url % i)
-#                 trs = soup.find("table", attrs={"class": "ui table segment"}).find_all("tr")
-#                 for i, tr in enumerate(trs):
-#                     if 0 == i:
-#                         continue
-#                     tds = tr.find_all("td")
-#                     ip = tds[0].string.strip().encode('utf-8')
-#                     port = tds[1].string.strip().encode('utf-8')
-#                     proxy = ''.join(['http://', ip, ':', port])
-#                     proxyes[proxy] = False
-#         except Exception as e:
-#             logger.error('Failed to fetch_proxy_from_kxdaili. Exception[%s]', e)
-#
-#         return proxyes
-#
-#     def get_soup(self, url):
-#         request = urllib2.Request(url)
-#         request.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")
-#         while True:
-#             try:
-#                 html_doc = urllib2.urlopen(request, timeout=30).read()
-#                 break
-#             except:
-#                 logger.info("Failed to fetch proxies from {}, will retry later.".format(url))
-#                 time.sleep(120)
-#
-#
-#         soup = BeautifulSoup(html_doc)
-#
-#         return soup
-#
-#
-# if __name__ == '__main__':
-#
-#     AutoProxyMiddleware()
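For reference, the deleted AutoProxyMiddleware read its options from the crawler settings via crawler.settings.getdict('AUTO_PROXY'). Below is a minimal sketch of how it could have been wired into the project's settings.py; the import path 'scrapy_yzd.autoproxy' and the priority value 543 are assumptions based on the deleted file's location, while the AUTO_PROXY values simply restate the defaults from the _settings list above.

    # settings.py (sketch; module path and priority are assumed, not part of this commit)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_yzd.autoproxy.AutoProxyMiddleware': 543,
    }

    # Defaults taken from the middleware's _settings list
    AUTO_PROXY = {
        'enable': True,
        'test_urls': [('http://www.w3school.com.cn', '06004630')],
        'test_proxy_timeout': 5,
        'download_timeout': 60,
        'test_threadnums': 20,
        'ban_code': [503],
        'ban_re': r'',
        'proxy_least': 3,
        'init_valid_proxys': 1,
        'invalid_limit': 200,
    }

Individual requests could bypass the proxy by setting request.meta['dont_proxy'], which _is_enabled_for_request checks before assigning a proxy to the request.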