# lq_sports.py (7.8 KB)
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. from ..items import LanqiuItem
  4. import copy
  5. import lxml.etree
  6. import re, os, json
  7. from ..utils.helper import Helper
  8. import time
  9. from ..items import LanqiuItem
  10. import xmltodict
  11. class LqSportsSpider(scrapy.Spider):
  12. name = 'lq_sports'
  13. allowed_domains = ['m.hgg070.com/']
  14. start_urls = ['http://m.hgg070.com//']
  15. remath = re.compile("篮球")
  16. # custom_settings={
  17. # "ITEM_PIPELINES": {
  18. # "hgg070_spider.pipelines.lq_sports.LqSportsPipeline": 200,
  19. # },
  20. # }
  21. def start_requests(self):
  22. # 今日,早盘
  23. h_types = [('FT'), ('FU')]
  24. headers = {
  25. 'Accept': '*/*',
  26. 'Accept-Encoding': 'gzip, deflate',
  27. 'Accept-Language': 'zh-CN,zh;q=0.9',
  28. 'Connection': 'keep-alive',
  29. 'Content-Length': '130',
  30. 'Content-type': 'application/x-www-form-urlencoded',
  31. 'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
  32. 'Host': 'm.hgg070.com',
  33. 'Origin': 'http://m.hgg070.com',
  34. 'Referer': 'http://m.hgg070.com/',
  35. 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
  36. }
  37. url = "http://m.hgg070.com/app/member/get_league_list.php"
  38. for item in h_types:
  39. showtype = item
  40. data = {
  41. 'uid': '3970335d20df9b8ceca8673ae9b6ea910c912492f595c0ef163623ae0ea883b6',
  42. 'langx': 'zh-cn',
  43. 'ltype': '3',
  44. 'gtype': 'BK',
  45. 'showtype': showtype,
  46. 'sorttype': '',
  47. 'date': '',
  48. 'isP': ''
  49. }
  50. yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse, headers=headers,
  51. meta={"data": data}, dont_filter=True)
  52. def parse(self, response):
  53. # 获取id并判断抓取的球型
  54. data = response.meta["data"]
  55. fromdata = copy.deepcopy(data)
  56. league = response.xpath('//league')
  57. url = "http://m.hgg070.com/app/member/get_game_list.php"
  58. for le in league:
  59. name = le.xpath('./league_name/text()').extract_first()
  60. if len(self.remath.findall(name)) > 0:
  61. lid = le.xpath('./league_id/text()').extract_first()
  62. # 抓取今日
  63. if data["showtype"] == "FT":
  64. data['lid'], data['sorttype'], data['date'] = lid, 'league', ''
  65. # 抓取早盘
  66. elif data["showtype"] == "FU":
  67. data['lid'], data['sorttype'], data['date'] = lid, 'league', 'all'
  68. yield scrapy.FormRequest(url=url, formdata=data, callback=self.detailball, meta={"data": fromdata},
  69. dont_filter=True)
  70. def detailball(self, response):
  71. data = response.meta["data"]
  72. url = "http://m.hgg070.com/app/member/get_game_more.php"
  73. # 获取联赛id gid
  74. game = response.xpath("//game")
  75. for g in game:
  76. gid = g.xpath("./gid/text()").extract_first()
  77. more_count = g.xpath("./more_count/text()").extract_first()
  78. data["gid"] = gid
  79. yield scrapy.FormRequest(url=url, formdata=data, callback=self.getItem,
  80. meta={"more_count": more_count, "isP": data["isP"]}, dont_filter=True)
  81. def getItem(self, response):
  82. more_count = response.meta["more_count"]
  83. isP = response.meta["isP"]
  84. data = xmltodict.parse(response.text)['serverresponse']['game']
  85. game_lists = [i for i in data if i['gopen'] == 'Y']
  86. if game_lists:
  87. for gl in game_lists:
  88. cpath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  89. with open(cpath + "/conf/hgg070.json", encoding='utf8') as hg:
  90. hgg = json.load(hg)['bk']
  91. datetime = gl['datetime'][:-8] + " " + gl['datetime'][-8:]
  92. team_h = gl['team_h']
  93. team_c = gl['team_c']
  94. league_id = gl['gidm']
  95. match_id = gl.get('gid', '')
  96. match_uid = Helper.genearte_uuid(team_h + team_c + datetime)
  97. data = []
  98. for hg in hgg:
  99. items = hg['items']
  100. if gl[hg['prodds']] == 'Y':
  101. for x in items:
  102. odds_code = gl[x['rodds']]
  103. p_code = gl[hg['prodds']]
  104. odds = gl["ior_OUH"]
  105. # 有两个条件,加两条数据
  106. if x['ratio_name']: # 大的
  107. condition_u = gl[x['ratio_name']]
  108. odds_only = hg["plodds"] + x["lodds"] + '0' + condition_u + str(odds) + "hg3535" + str(
  109. match_id)
  110. sole = hg["plodds"] + x["lodds"] + '0' + str(match_id) + "hg3535"
  111. tobj = {"match_id": match_id, "lg_id": league_id, "odds_code": odds_code, "status": 0,
  112. "sort": 0, "p_code": p_code,
  113. "odds": odds, "condition": condition_u, "odds_only": odds_only, "sole": sole,
  114. "source": "hgg070", "type": 0, "team": ""}
  115. data.append(tobj)
  116. if x['latio']: # 小的
  117. condition_s = gl[x['latio']]
  118. odds_only = hg["plodds"] + x["lodds"] + '0' + condition_s + str(odds) + "hg3535" + str(
  119. match_id)
  120. sole = hg["plodds"] + x["lodds"] + '0' + str(match_id) + "hg3535"
  121. tobj = {"match_id": match_id, "lg_id": league_id, "odds_code": odds_code, "status": 0,
  122. "sort": 0, "p_code": p_code,
  123. "odds": odds, "condition": condition_s, "odds_only": odds_only, "sole": sole,
  124. "source": "hgg070", "type": 0, "team": ""}
  125. data.append(tobj)
  126. if not x['latio'] and not x['ratio_name']:
  127. condition_s = ''
  128. odds_only = hg["plodds"] + x["lodds"] + '0' + condition_s + str(odds) + "hg3535" + str(
  129. match_id)
  130. sole = hg["plodds"] + x["lodds"] + '0' + str(match_id) + "hg3535"
  131. tobj = {"match_id": match_id, "lg_id": league_id, "odds_code": odds_code, "status": 0,
  132. "sort": 0, "p_code": p_code,
  133. "odds": odds, "condition": condition_s, "odds_only": odds_only, "sole": sole,
  134. "source": "hgg070", "type": 0, "team": ""}
  135. data.append(tobj)
  136. item = LanqiuItem()
  137. item['match_id'] = match_id
  138. item['source'] = "hg0088"
  139. item['updata'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  140. item['content'] = data
  141. item['league_id'] = league_id
  142. item['more_count'] = more_count
  143. item['league'] = gl["league"]
  144. item['match_identity'] = match_uid
  145. item['datetime'] = datetime
  146. item['team_h'] = team_h
  147. item['team_c'] = team_c
  148. item['isP'] = isP
  149. print('wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww', item)
  150. yield item