aaaa.py 3.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. import re
  4. import copy
  5. from ..items import LanqiuItem
  6. class LanqiuSpider(scrapy.Spider):
  7. name = 'aaaaa'
  8. allowed_domains = ['m.hgg070.com/']
  9. start_urls = ['http://m.hgg070.com//']
  10. remath=re.compile("篮球")
  11. custom_settings={
  12. "ITEM_PIPELINES": {
  13. "hgg070_spider.pipelines.lanqiu.ZuqiuPipeline": 200,
  14. },
  15. }
  16. def start_requests(self):
  17. #今日,早盘
  18. h_types=[('FT'),('FU')]
  19. headers = {
  20. 'Accept': '*/*',
  21. 'Accept-Encoding': 'gzip, deflate',
  22. 'Accept-Language': 'zh-CN,zh;q=0.9',
  23. 'Connection': 'keep-alive',
  24. 'Content-Length': '130',
  25. 'Content-type': 'application/x-www-form-urlencoded',
  26. 'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
  27. 'Host': 'm.hgg070.com',
  28. 'Origin': 'http://m.hgg070.com',
  29. 'Referer': 'http://m.hgg070.com/',
  30. 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
  31. }
  32. url = "http://m.hgg070.com/app/member/get_league_list.php"
  33. for item in h_types:
  34. showtype = item
  35. data={
  36. 'uid': '257853bc6f4166ca4e84f4d75d1cfc3540c6eab54b34898f4ad405cb2412402f',
  37. 'langx': 'zh-cn',
  38. 'ltype': '3',
  39. 'gtype': 'BK',
  40. 'showtype': showtype,
  41. 'sorttype': '',
  42. 'date': '',
  43. 'isP': ''
  44. }
  45. yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse,headers=headers,
  46. meta={"data":data}, dont_filter=True)
  47. def parse(self, response):
  48. #获取id并判断抓取的球型
  49. data=response.meta["data"]
  50. fromdata=copy.deepcopy(data)
  51. league=response.xpath('//league')
  52. url="http://m.hgg070.com/app/member/get_game_list.php"
  53. for le in league:
  54. name=le.xpath('./league_name/text()').extract_first()
  55. if len(self.remath.findall(name))>0:
  56. lid = le.xpath('./league_id/text()').extract_first()
  57. # 抓取今日
  58. if data["showtype"]=="FT":
  59. data['lid'],data['sorttype'],data['date']=lid,'league',''
  60. # 抓取早盘
  61. elif data["showtype"]=="FU":
  62. data['lid'], data['sorttype'], data['date'] = lid, 'league', 'all'
  63. yield scrapy.FormRequest(url=url,formdata=data,callback=self.detailball,meta={"data":fromdata},dont_filter=True)
  64. def detailball(self,response):
  65. data=response.meta["data"]
  66. url="http://m.hgg070.com/app/member/get_game_more.php"
  67. #获取联赛id gid
  68. game=response.xpath("//game")
  69. for g in game:
  70. gid=g.xpath("./gid/text()").extract_first()
  71. data["gid"]=gid
  72. yield scrapy.FormRequest(url=url,formdata=data,callback=self.getItem,dont_filter=True)
  73. def getItem(self,response):
  74. data=response.xpath("//game")
  75. for item in data:
  76. obj=LanqiuItem()
  77. obj['id']=item.xpath("./gid/text()").extract_first()
  78. obj['league'] = item.xpath("./league/text()").extract_first()
  79. obj['team_h'] = item.xpath("./team_h/text()").extract_first()
  80. obj['team_c'] = item.xpath("./team_c/text()").extract_first()
  81. obj['showtype'] = item.xpath("./gtype/text()").extract_first()
  82. obj['datetime'] = item.xpath("./datetime/text()").extract_first()
  83. yield obj