lq_sports.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. from ..items import LanqiuItem
  4. import copy
  5. import lxml.etree
  6. import re
  7. class LqSportsSpider(scrapy.Spider):
  8. name = 'lq_sports'
  9. allowed_domains = ['m.hgg070.com/']
  10. start_urls = ['http://m.hgg070.com//']
  11. remath = re.compile("篮球")
  12. # custom_settings={
  13. # "ITEM_PIPELINES": {
  14. # "hgg070_spider.pipelines.lanqiu.ZuqiuPipeline": 200,
  15. # },
  16. # }
  17. def start_requests(self):
  18. #今日,早盘
  19. h_types=[('FT'),('FU')]
  20. headers = {
  21. 'Accept': '*/*',
  22. 'Accept-Encoding': 'gzip, deflate',
  23. 'Accept-Language': 'zh-CN,zh;q=0.9',
  24. 'Connection': 'keep-alive',
  25. 'Content-Length': '130',
  26. 'Content-type': 'application/x-www-form-urlencoded',
  27. 'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
  28. 'Host': 'm.hgg070.com',
  29. 'Origin': 'http://m.hgg070.com',
  30. 'Referer': 'http://m.hgg070.com/',
  31. 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
  32. }
  33. url = "http://m.hgg070.com/app/member/get_league_list.php"
  34. for item in h_types:
  35. showtype = item
  36. data={
  37. 'uid': '86797ef15d547a503c926bb658051dd137586e25f0936536d424c09f4fb74d83',
  38. 'langx': 'zh-cn',
  39. 'ltype': '3',
  40. 'gtype': 'BK',
  41. 'showtype': showtype,
  42. 'sorttype': '',
  43. 'date': '',
  44. 'isP': ''
  45. }
  46. yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse,headers=headers,
  47. meta={"data":data}, dont_filter=True)
  48. def parse(self, response):
  49. #获取id并判断抓取的球型
  50. data=response.meta["data"]
  51. fromdata=copy.deepcopy(data)
  52. league=response.xpath('//league')
  53. url="http://m.hgg070.com/app/member/get_game_list.php"
  54. for le in league:
  55. name=le.xpath('./league_name/text()').extract_first()
  56. if len(self.remath.findall(name))>0:
  57. lid = le.xpath('./league_id/text()').extract_first()
  58. # 抓取今日
  59. if data["showtype"]=="FT":
  60. data['lid'],data['sorttype'],data['date']=lid,'league',''
  61. # 抓取早盘
  62. elif data["showtype"]=="FU":
  63. data['lid'], data['sorttype'], data['date'] = lid, 'league', 'all'
  64. yield scrapy.FormRequest(url=url,formdata=data,callback=self.detailball,meta={"data":fromdata},dont_filter=True)
  65. def detailball(self,response):
  66. data=response.meta["data"]
  67. url="http://m.hgg070.com/app/member/get_game_more.php"
  68. #获取联赛id gid
  69. game=response.xpath("//game")
  70. for g in game:
  71. gid=g.xpath("./gid/text()").extract_first()
  72. data["gid"]=gid
  73. yield scrapy.FormRequest(url=url,formdata=data,callback=self.getItem,dont_filter=True)
  74. def getItem(self,response):
  75. game_lists = []
  76. data=response.xpath("//game")
  77. if data:
  78. for game in data:
  79. game_odds = {}
  80. gopen = game.xpath('//game/gopen/text()').extract_first()
  81. if gopen == 'Y':
  82. game = lxml.etree.fromstring(game.extract())
  83. for i in game.getchildren():
  84. if i.text == None:
  85. game_odds[i.tag] = ""
  86. else:
  87. game_odds[i.tag] = i.text.replace(' ', '')
  88. game_lists.append(game_odds)
  89. else:
  90. print('gopen == N, 详细赔率盘口未开启')
  91. # for item in data:
  92. # obj=LanqiuItem()
  93. # obj['id']=item.xpath("./gid/text()").extract_first()
  94. # obj['league'] = item.xpath("./league/text()").extract_first()
  95. # obj['team_h'] = item.xpath("./team_h/text()").extract_first()
  96. # obj['team_c'] = item.xpath("./team_c/text()").extract_first()
  97. # obj['showtype'] = item.xpath("./gtype/text()").extract_first()
  98. # obj['datetime'] = item.xpath("./datetime/text()").extract_first()
  99. # item['match_id'] = item.xpath("./gid/text()").extract_first()
  100. # item['uuid'] = uid_list
  101. # item['source'] = "hg0088"
  102. # item['updata'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  103. # item['content'] = odd_list
  104. # item['gidm'] = league_id
  105. # item['tag'] = tag
  106. # item['league'] = league
  107. # item['match_uid'] = match_uid
  108. # item['datetime'] = datetime
  109. # item['team_h'] = team_h
  110. # item['team_c'] = team_c
  111. yield obj