lanqiu.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. import re
  4. class LanqiuSpider(scrapy.Spider):
  5. name = 'lanqiu'
  6. allowed_domains = ['m.hgg070.com/']
  7. start_urls = ['http://m.hgg070.com//']
  8. remath=re.compile("篮球")
  9. # custom_settings={
  10. # "ITEM_PIPELINES": {
  11. # "collectSports.pipelines.zuqiu.ZuqiuPipeline": 200,
  12. # },
  13. # }
  14. def start_requests(self):
  15. #今日,早盘
  16. h_types=[('FT'),('FU')]
  17. headers = {
  18. 'Accept': '*/*',
  19. 'Accept-Encoding': 'gzip, deflate',
  20. 'Accept-Language': 'zh-CN,zh;q=0.9',
  21. 'Connection': 'keep-alive',
  22. 'Content-Length': '130',
  23. 'Content-type': 'application/x-www-form-urlencoded',
  24. 'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
  25. 'Host': 'm.hgg070.com',
  26. 'Origin': 'http://m.hgg070.com',
  27. 'Referer': 'http://m.hgg070.com/',
  28. 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
  29. }
  30. url = "http://m.hgg070.com/app/member/get_league_list.php"
  31. for item in h_types:
  32. showtype = item
  33. data={
  34. 'uid': '7554a670e92d06105fe567b75e5b80fe65e6e40167f4979c8d74ca5eaa461d4d',
  35. 'langx': 'zh-cn',
  36. 'ltype': '3',
  37. 'gtype': 'BK',
  38. 'showtype': showtype,
  39. 'sorttype': '',
  40. 'date': '',
  41. 'isP': ''
  42. }
  43. yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse,headers=headers,
  44. meta={"data":data}, dont_filter=True)
  45. def parse(self, response):
  46. #获取id并判断抓取的球型
  47. data=response.meta["data"]
  48. league=response.xpath('//league')
  49. url="http://m.hgg070.com/app/member/get_game_list.php"
  50. for le in league:
  51. name=le.xpath('./league_name/text()').extract_first()
  52. if len(self.remath.findall(name))>0:
  53. lid = le.xpath('./league_id/text()').extract_first()
  54. # 抓取今日
  55. if data["showtype"]=="FT":
  56. data['lid'],data['sorttype'],data['date']=lid,'league',''
  57. # 抓取早盘
  58. elif data["showtype"]=="FU":
  59. data['lid'], data['sorttype'], data['date'] = lid, 'league', 'all'
  60. print('77777777777777777777777777777777777777777777L',data)
  61. yield scrapy.FormRequest(url=url,formdata=data,callback=self.detailball,meta={"data":response.meta["data"]},dont_filter=True)
  62. def detailball(self,response):
  63. data=response.meta["data"]
  64. url=""
  65. #获取联赛id gid
  66. game=response.xpath("//game")
  67. for g in game:
  68. gid=g.xpath("./gid/text()").extract_first()
  69. data["gid"]=gid
  70. print('wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww',data)
  71. yield scrapy.FormRequest(url=url,formdata=data,callback=self.getItem,dont_filter=True)
  72. def getItem(self,response):
  73. print('ffffffffffffffffffffffffffffffffffffffffffffffffffffff',response.text)