# autoproxy.py
# -*- coding: utf-8 -*-
import urllib2
import logging
import threading
import math
import re
import time

from bs4 import BeautifulSoup
from twisted.internet import defer
from twisted.internet.error import TimeoutError, ConnectionRefusedError, \
    ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone

logger = logging.getLogger(__name__)


class AutoProxyMiddleware(object):

    EXCEPTIONS_TO_CHANGE = (defer.TimeoutError, TimeoutError, ConnectionRefusedError,
                            ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone)

    _settings = [
        ('enable', True),
        ('test_urls', [('http://www.w3school.com.cn', '06004630'), ]),
        ('test_proxy_timeout', 5),
        ('download_timeout', 60),
        ('test_threadnums', 20),
        ('ban_code', [503, ]),
        ('ban_re', r''),
        ('proxy_least', 3),
        ('init_valid_proxys', 1),
        ('invalid_limit', 200),
    ]
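
    # What the defaults above control (each key can be overridden through the
    # AUTO_PROXY settings dict):
    #   enable             - switch the middleware on or off
    #   test_urls          - (url, signature) pairs; a proxy is considered valid only
    #                        if the signature string appears in the body fetched via it
    #   test_proxy_timeout - timeout in seconds for each validation request
    #   download_timeout   - download_timeout applied to proxied requests
    #   test_threadnums    - number of validation threads
    #   ban_code           - HTTP status codes that mark the current proxy as banned
    #   ban_re             - regex; a match in the response body marks the proxy as banned
    #   proxy_least        - fetch more proxies when fewer valid ones remain
    #   init_valid_proxys  - how many valid proxies to wait for during __init__
    #   invalid_limit      - a proxy that has already served more pages than this is
    #                        only rotated away instead of being invalidated immediately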

    def __init__(self, proxy_set=None):
        self.proxy_set = proxy_set or {}
        for k, v in self._settings:
            setattr(self, k, self.proxy_set.get(k, v))

        # Proxy list and the index of the current proxy; counter_proxy counts
        # how many pages have been downloaded through each proxy.
        self.proxy = []
        self.proxy_index = 0
        self.proxyes = {}
        self.counter_proxy = {}

        self.fetch_new_proxy()
        self.test_proxyes(self.proxyes, wait=True)
        logger.info('Use proxy: %s', self.proxy)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getdict('AUTO_PROXY'))
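
    # Scrapy instantiates the middleware through from_crawler(), so proxy_set
    # receives the AUTO_PROXY dict from the project settings.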

    def process_request(self, request, spider):
        if not self._is_enabled_for_request(request):
            return

        if self.len_valid_proxy() > 0:
            self.set_proxy(request)
            # if 'download_timeout' not in request.meta:
            request.meta['download_timeout'] = self.download_timeout
        else:
            # No valid proxy available, fall back to a direct connection.
            if 'proxy' in request.meta:
                del request.meta['proxy']
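
    # Note: request.meta['proxy'] and request.meta['download_timeout'] set above are
    # standard Scrapy meta keys, honoured by the downloader for proxying and for the
    # per-request timeout respectively.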

    def process_response(self, request, response, spider):
        if not self._is_enabled_for_request(request):
            return response

        proxy = request.meta.get('proxy')

        if response.status in self.ban_code:
            if proxy:
                self.invaild_proxy(proxy)
            logger.debug("Proxy[%s] banned because the response returned HTTP status [%s].", proxy, response.status)
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

        if self.ban_re:
            try:
                pattern = re.compile(self.ban_re)
            except TypeError:
                logger.error('Wrong "ban_re", please check settings')
                return response
            match = pattern.search(response.body)
            if match:
                if proxy:
                    self.invaild_proxy(proxy)
                logger.debug("Proxy[%s] banned because pattern matched: [%s].", proxy, match)
                new_request = request.copy()
                new_request.dont_filter = True
                return new_request

        if proxy:
            self.counter_proxy[proxy] = self.counter_proxy.setdefault(proxy, 1) + 1
        return response
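
    # Returning a copy of the request with dont_filter=True (as above) makes Scrapy
    # re-schedule the page and bypass the duplicate filter, so a banned page is
    # retried, normally through a different proxy once the current one is invalidated.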

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            self.invaild_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] connection exception[%s].", request.meta['proxy'], exception)
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

    def invaild_proxy(self, proxy):
        """
        Mark a proxy as invalid. If the proxy has already downloaded more than
        invalid_limit pages (200 by default), do not invalidate it yet: just
        switch to another proxy and decrease its counter.
        """
        if self.counter_proxy.get(proxy, 0) > self.invalid_limit:
            self.counter_proxy[proxy] = self.counter_proxy.get(proxy, 0) - 50
            if self.counter_proxy[proxy] < 0:
                self.counter_proxy[proxy] = 0
            self.change_proxy()
        else:
            self.proxyes[proxy] = False
            # logger.info('Set proxy[%s] invalid.', proxy)

    def change_proxy(self):
        """
        Switch to the next valid proxy (round robin).
        """
        while True:
            self.proxy_index = (self.proxy_index + 1) % len(self.proxy)
            proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
            if proxy_valid:
                break
            if self.len_valid_proxy() == 0:
                logger.info('No proxy is available. Waiting for new proxies to be fetched.')
                break
        logger.info('Change proxy to %s', self.proxy[self.proxy_index])
        logger.info('Available proxies[%s]: %s', self.len_valid_proxy(), self.valid_proxyes())

        # If the number of valid proxies drops below the threshold, fetch more.
        if self.len_valid_proxy() < self.proxy_least:
            self.extend_proxy()

    def set_proxy(self, request):
        """
        Set the proxy on the request.
        """
        proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
        if not proxy_valid:
            self.change_proxy()

        request.meta['proxy'] = self.proxy[self.proxy_index]
        # logger.info('Set proxy. request.meta: %s', request.meta)

    def len_valid_proxy(self):
        """
        Count the proxies that are currently valid.
        """
        count = 0
        for p in self.proxy:
            if self.proxyes[p]:
                count += 1
        return count

    def valid_proxyes(self):
        """
        Return the list of currently valid proxies.
        """
        proxyes = []
        for p in self.proxy:
            if self.proxyes[p]:
                proxyes.append(p)
        return proxyes

    def extend_proxy(self):
        """
        Extend the proxy pool. The new proxies are tested asynchronously.
        """
        self.fetch_new_proxy()
        self.test_proxyes(self.proxyes)

    def append_proxy(self, p):
        """
        Helper: add a proxy that passed the test to the proxy list.
        """
        if p not in self.proxy:
            self.proxy.append(p)

    def fetch_new_proxy(self):
        """
        Fetch new proxies. They are currently scraped from three sites, one
        thread per site.
        """
        logger.info('Start fetching new proxies.')
        sites = ['xici', 'ip3336', 'kxdaili']
        threads = []
        for site in sites:
            t = ProxyFetch(self.proxyes, site)
            threads.append(t)
            t.start()
        for t in threads:
            t.join()

    def test_proxyes(self, proxyes, wait=False):
        """
        Test whether the proxies work. The test URLs, their signature strings
        and the number of test threads are all configurable.
        """
        list_proxy = proxyes.items()
        threads = []
        n = int(math.ceil(len(list_proxy) / float(self.test_threadnums)))
        for i in range(self.test_threadnums):
            # Split the proxies to be tested evenly among the test threads.
            list_part = list_proxy[i * n: (i + 1) * n]
            part = {k: v for k, v in list_part}
            t = ProxyValidate(self, part)
            threads.append(t)
            t.start()

        # When the middleware is being initialised, block until enough proxies
        # are usable (see init_valid_proxys).
        if wait:
            while True:
                for t in threads:
                    t.join(0.2)
                    if self._has_valid_proxy():
                        break
                if self._has_valid_proxy():
                    break
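
    # Work-splitting example for the slicing above: with 45 candidate proxies and
    # test_threadnums = 20, n = ceil(45 / 20.0) = 3, so the first 15 threads test
    # 3 proxies each and the remaining threads receive empty slices.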

    def _has_valid_proxy(self):
        if self.len_valid_proxy() >= self.init_valid_proxys:
            return True

    def _is_enabled_for_request(self, request):
        return self.enable and 'dont_proxy' not in request.meta
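
    # A spider can opt a single request out of proxying by setting the meta key,
    # e.g. Request(url, meta={'dont_proxy': True}).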


class ProxyValidate(threading.Thread):
    """
    Worker thread that tests proxies.
    """

    def __init__(self, autoproxy, part):
        super(ProxyValidate, self).__init__()
        self.autoproxy = autoproxy
        self.part = part

    def run(self):
        self.test_proxyes(self.part)

    def test_proxyes(self, proxyes):
        for proxy, valid in proxyes.iteritems():
            if self.check_proxy(proxy):
                self.autoproxy.proxyes[proxy] = True
                self.autoproxy.append_proxy(proxy)

    def check_proxy(self, proxy):
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
        try:
            for url, code in self.autoproxy.test_urls:
                resbody = opener.open(url, timeout=self.autoproxy.test_proxy_timeout).read()
                if code not in resbody:
                    return False
            return True
        except Exception:
            return False


class ProxyFetch(threading.Thread):

    def __init__(self, proxyes, url):
        super(ProxyFetch, self).__init__()
        self.proxyes = proxyes
        self.url = url

    def run(self):
        self.proxyes.update(getattr(self, 'fetch_proxy_from_' + self.url)())

    def fetch_proxy_from_xici(self):
        proxyes = {}
        url = "http://www.xicidaili.com/nn/"
        try:
            for i in range(1, 4):
                soup = self.get_soup(url + str(i))
                trs = soup.find("table", attrs={"id": "ip_list"}).find_all("tr")
                for j, tr in enumerate(trs):
                    if j == 0:
                        # Skip the first row (table header).
                        continue
                    tds = tr.find_all('td')
                    ip = tds[1].text
                    port = tds[2].text
                    proxy = ''.join(['http://', ip, ':', port]).encode('utf-8')
                    proxyes[proxy] = False
        except Exception as e:
            logger.error('Failed to fetch_proxy_from_xici. Exception[%s]', e)

        return proxyes

    def fetch_proxy_from_ip3336(self):
        proxyes = {}
        url = 'http://www.ip3366.net/free/?stype=1&page='
        try:
            for i in range(1, 6):
                soup = self.get_soup(url + str(i))
                trs = soup.find("div", attrs={"id": "list"}).table.find_all("tr")
                for j, tr in enumerate(trs):
                    if j == 0:
                        # Skip the first row (table header).
                        continue
                    tds = tr.find_all("td")
                    ip = tds[0].string.strip().encode('utf-8')
                    port = tds[1].string.strip().encode('utf-8')
                    proxy = ''.join(['http://', ip, ':', port])
                    proxyes[proxy] = False
        except Exception as e:
            logger.error('Failed to fetch_proxy_from_ip3336. Exception[%s]', e)

        return proxyes

    def fetch_proxy_from_kxdaili(self):
        proxyes = {}
        url = 'http://www.kxdaili.com/dailiip/1/%d.html'
        try:
            for i in range(1, 11):
                soup = self.get_soup(url % i)
                trs = soup.find("table", attrs={"class": "ui table segment"}).find_all("tr")
                for j, tr in enumerate(trs):
                    if j == 0:
                        # Skip the first row (table header).
                        continue
                    tds = tr.find_all("td")
                    ip = tds[0].string.strip().encode('utf-8')
                    port = tds[1].string.strip().encode('utf-8')
                    proxy = ''.join(['http://', ip, ':', port])
                    proxyes[proxy] = False
        except Exception as e:
            logger.error('Failed to fetch_proxy_from_kxdaili. Exception[%s]', e)

        return proxyes

    def get_soup(self, url):
        request = urllib2.Request(url)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")
        while True:
            try:
                html_doc = urllib2.urlopen(request, timeout=30).read()
                break
            except Exception:
                logger.info("Failed to fetch proxies from {}, will retry later.".format(url))
                time.sleep(120)

        soup = BeautifulSoup(html_doc, 'html.parser')

        return soup


if __name__ == '__main__':

    AutoProxyMiddleware()
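
# A minimal sketch of how this middleware might be enabled in a Scrapy project's
# settings.py. The module path and the priority value below are assumptions, not
# part of this file; adjust them to wherever AutoProxyMiddleware actually lives.
#
# DOWNLOADER_MIDDLEWARES = {
#     'myproject.middlewares.autoproxy.AutoProxyMiddleware': 543,  # hypothetical path
# }
#
# AUTO_PROXY = {
#     'enable': True,
#     'proxy_least': 5,
#     'ban_code': [403, 503],
# }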