# saiguo.py
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import datetime
  4. import re
  5. import scrapy
  6. from lxml import etree
  7. from ..items import Hgsaiguo
  8. class HgjieshuSpider(scrapy.Spider):
  9. name = 'saiguo'
  10. to_day = datetime.datetime.now()
  11. allowed_domains = ['hg3535z.com']
  12. custom_settings = {
  13. "ITEM_PIPELINES": {
  14. 'hg3535.pipeline.saiguo.Jieshuqiupipeline': 300,
  15. },
  16. # 'LOG_LEVEL': 'DEBUG',
  17. # 'LOG_FILE': "../hg3535/log/saiguo{}_{}_{}.log".format(to_day.year, to_day.month, to_day.day)
  18. }
  19. def start_requests(self):
  20. for y in range(1, 5):
  21. for z in range(1, 3):
  22. url = 'https://www.hg3535.cn/zh-cn/info-centre/sportsbook-info/results/{}/normal/{}'.format(y, z)
  23. yield scrapy.Request(url=url, callback=self.parse, dont_filter=True, meta={'pt': y, "page": z})
  24. def parse(self, response):
  25. if response.status == 200:
  26. pt = response.meta['pt']
  27. page = response.meta['page']
  28. if page == 1:
  29. us_datetime = datetime.datetime.now() - datetime.timedelta(hours=12)
  30. else:
  31. us_datetime = datetime.datetime.now() - datetime.timedelta(hours=36)
  32. match_date = us_datetime.strftime("%Y-%m-%d")
  33. # 足球赛果
  34. if pt == 1:
  35. league_ids = response.xpath('//div[@class="rt-l-bar football"]/@id').extract()
  36. league_names = response.xpath('//div[@class="rt-l-bar football"]/span[@class="comp-txt"]/text()').extract()
  37. if league_ids:
  38. for index in range(len(league_ids)):
  39. league_id = league_ids[index]
  40. league_name = league_names[index]
  41. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  42. data = etree.HTML(response_data)
  43. # 球队名
  44. # team_names = data.xpath('//div[@class="rt-event"]/@title')
  45. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  46. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  47. # 全场
  48. f_scores = data.xpath('.//div[contains(@class, "rt-ft ")]')
  49. # 上半场
  50. h_scores = data.xpath('.//div[contains(@class, "rt-ht ")]')
  51. # 时间
  52. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  53. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  54. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody[2]')
  55. for y in range(len(odd_datas)):
  56. match_id = match_ids[y].replace('e-', '')
  57. league_id = league_id.replace('cmp-', '')
  58. # 子集玩法
  59. odd_names = odd_datas[y].xpath('.//tr/td[2]')
  60. # 子集玩法结果
  61. odd_plays = odd_datas[y].xpath('.//tr/td[3]/span')
  62. # 主队
  63. h_name = h_names[y].text
  64. # 客队
  65. a_name = a_names[y].text
  66. # 上半场
  67. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  68. # 全场
  69. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  70. # 正则时间匹配规则
  71. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  72. match_time = pattern.findall(stimes[y])[0]
  73. play_datas = []
  74. if odd_names:
  75. for i in range(len(odd_names)):
  76. name = odd_names[i].text
  77. plays = odd_plays[i].xpath('text()')
  78. if len(plays) == 2:
  79. play = '{}&&{}'.format(plays[0], plays[1])
  80. else:
  81. play = plays[0]
  82. play_datas.append({'play_name': name, 'play_result': play})
  83. item = Hgsaiguo()
  84. item["league_id"] = league_id
  85. item["league_name"] = league_name
  86. item["match_id"] = match_id
  87. item["match_date"] = match_date
  88. item["match_time"] = match_time
  89. item["home_team"] = h_name
  90. item["guest_team"] = a_name
  91. item["score_half"] = h_score
  92. item["score_full"] = f_score
  93. item["play_data"] = play_datas
  94. item["pt"] = pt
  95. item["page"] = page
  96. yield item
  97. else:
  98. return
  99. # 篮球赛果
  100. if pt == 2:
  101. league_ids = response.xpath('//div[@class="rt-l-bar sportHasQuater"]/@id').extract()
  102. league_names = response.xpath('//div[@class="rt-l-bar sportHasQuater"]/span[@class="comp-txt"]/text()').extract()
  103. if league_ids:
  104. for index in range(len(league_ids)):
  105. league_id = league_ids[index]
  106. league_name = league_names[index]
  107. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  108. data = etree.HTML(response_data)
  109. # 球队名
  110. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  111. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  112. # 全场
  113. f_scores = data.xpath('.//div[@class="rt-qft"]')
  114. # 上半场
  115. h_scores = data.xpath('.//div[@class="rt-qt1"]')
  116. # 下半场
  117. x_scores = data.xpath('.//div[@class="rt-qt2"]')
  118. # 时间
  119. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  120. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  121. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody[2]')
  122. for y in range(len(odd_datas)):
  123. match_id = match_ids[y].replace('e-', '')
  124. league_id = league_id.replace('cmp-', '')
  125. # 子集玩法
  126. child_data = odd_datas[y].xpath('./tr')
  127. # 主队
  128. h_name = h_names[y].text
  129. # 客队
  130. a_name = a_names[y].text
  131. # 上半场
  132. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  133. # 全场
  134. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  135. # 下半场
  136. x_score = x_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  137. # 正则时间匹配规则
  138. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  139. match_time = pattern.findall(stimes[y])[0]
  140. play_datas = []
  141. if child_data:
  142. h_dict = {'team_name': h_name}
  143. a_dict = {'team_name': a_name}
  144. for i in range(len(child_data)):
  145. if i == 0:
  146. h_datas = child_data[i].xpath('.//td/table/tbody/tr[3]/td[@class="r-odds"]')
  147. a_datas = child_data[i].xpath('.//td/table/tbody/tr[4]/td[@class="r-odds"]')
  148. rule = {0: "sc_1th", 1: "sc_2th", 2: "sc_3th", 3: "sc_4th", 4: "sc_other"}
  149. if h_datas and a_datas:
  150. for x in range(len(h_datas)):
  151. # 主队节得分
  152. h_data = h_datas[x].text.replace(' ', '').replace('\r\n', '')
  153. h_dict[rule[x]] = h_data
  154. # 客队节得分
  155. a_data = a_datas[x].text.replace(' ', '').replace('\r\n', '')
  156. a_dict[rule[x]] = a_data
  157. else:
  158. # 子玩法名
  159. child_name = child_data[i].xpath('.//td[contains(@class, "r-bt ")]/text()')[0].replace(' ', '').replace('\r\n', '')
  160. # 子玩法结果
  161. child_play = child_data[i].xpath('.//td[@class="r-odds"]/span[@class="prop"]/text()')[0]
  162. play_datas.append({"play_name": child_name, "play_result": child_play})
  163. play_datas.append(h_dict)
  164. play_datas.append(a_dict)
  165. item = Hgsaiguo()
  166. item["league_id"] = league_id
  167. item["league_name"] = league_name
  168. item["match_id"] = match_id
  169. item["match_date"] = match_date
  170. item["match_time"] = match_time
  171. item["home_team"] = h_name
  172. item["guest_team"] = a_name
  173. item["score_half"] = h_score
  174. item["score_result"] = f_score
  175. item["play_data"] = play_datas
  176. item["pt"] = pt
  177. item["page"] = page
  178. item["score_below"] = x_score
  179. yield item
  180. else:
  181. return
  182. #
  183. # 网球赛果
  184. if pt == 3:
  185. league_ids = response.xpath('//div[@class="rt-l-bar tennis"]/@id').extract()
  186. league_names = response.xpath('//div[@class="rt-l-bar tennis"]/span[@class="comp-txt"]/text()').extract()
  187. if league_ids:
  188. for index in range(len(league_ids)):
  189. league_id = league_ids[index]
  190. league_name = league_names[index]
  191. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  192. data = etree.HTML(response_data)
  193. # 球队名
  194. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  195. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  196. # 赛果
  197. f_scores = data.xpath('.//div[@class="rt-set"]')
  198. # 时间
  199. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  200. # 赛事id
  201. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  202. # 详细赛果信息, 比分等
  203. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody')
  204. for y in range(len(odd_datas)):
  205. match_id = match_ids[y].replace('e-', '')
  206. league_id = league_id.replace('cmp-', '')
  207. # 子集玩法
  208. child_data = odd_datas[y].xpath('./tr')
  209. # 主队
  210. h_name = h_names[y].text
  211. # 客队
  212. a_name = a_names[y].text
  213. # 全场
  214. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  215. # 正则时间匹配规则
  216. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  217. match_time = pattern.findall(stimes[y])[0]
  218. play_datas = []
  219. if child_data:
  220. rule = {0: "sc_1th", 1: "sc_2th", 2: "sc_3th", 3: "sc_4th", 4: "sc_5th", 5: "game_num", 6: "disc_num"}
  221. h_dict = {'team_name': h_name}
  222. a_dict = {'team_name': a_name}
  223. for i in range(len(child_data)):
  224. if i == 0:
  225. h_datas = child_data[i].xpath('.//tbody/tr[3]/td[contains(@class, "r-odds")]')
  226. a_datas = child_data[i].xpath('.//tbody/tr[4]/td[contains(@class, "r-odds")]')
  227. if h_datas and a_datas:
  228. for x in range(len(h_datas)):
  229. # 主队节得分
  230. h_data = h_datas[x].text.replace(' ', '').replace('\r\n', '')
  231. h_dict[rule[x]] = h_data
  232. # 客队节得分
  233. a_data = a_datas[x].text.replace(' ', '').replace('\r\n', '')
  234. a_dict[rule[x]] = a_data
  235. else:
  236. # 子玩法名
  237. child_name = child_data[i].xpath('.//td[contains(@class, "r-bt ")]/text()')[0].replace(' ', '').replace('\r\n', '')
  238. # 子玩法结果
  239. child_play = child_data[i].xpath('.//td[@class="r-odds"]/span[@class="prop"]')[0]
  240. play = child_play.xpath('string(.)')
  241. play_datas.append({"play_name": child_name, "play_result": play})
  242. play_datas.append(h_dict)
  243. play_datas.append(a_dict)
  244. item = Hgsaiguo()
  245. item["league_id"] = league_id
  246. item["league_name"] = league_name
  247. item["match_id"] = match_id
  248. item["match_date"] = match_date
  249. item["match_time"] = match_time
  250. item["home_team"] = h_name
  251. item["guest_team"] = a_name
  252. item["score_result"] = f_score
  253. item["play_data"] = play_datas
  254. item["pt"] = pt
  255. item["page"] = page
  256. yield item
  257. else:
  258. return
  259. # 棒球赛果
  260. if pt == 4:
  261. league_ids = response.xpath('//div[@class="rt-l-bar baseball"]/@id').extract()
  262. league_names = response.xpath('//div[@class="rt-l-bar baseball"]/span[@class="comp-txt"]/text()').extract()
  263. if league_ids:
  264. for index in range(len(league_ids)):
  265. league_id = league_ids[index]
  266. league_name = league_names[index]
  267. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  268. data = etree.HTML(response_data)
  269. # 球队名
  270. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  271. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  272. # 全场
  273. f_scores = data.xpath('.//div[@class="rt-ft"]')
  274. # 上半场
  275. h_scores = data.xpath('.//div[@class="rt-ht"]')
  276. # 时间
  277. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  278. # 子集玩法
  279. # odd_names = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[2]')
  280. # 子集玩法结果
  281. # odd_plays = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[3]/span')
  282. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  283. odd_datas = data.xpath('//div[contains(@class, "rt-sub")]/table/tbody')
  284. for y in range(len(odd_datas)):
  285. match_id = match_ids[y].replace('e-', '')
  286. league_id = league_id.replace('cmp-', '')
  287. # 子集玩法
  288. odd_names = odd_datas[y].xpath('.//tr/td[2]')
  289. # 子集玩法结果
  290. odd_plays = odd_datas[y].xpath('.//tr/td[3]/span')
  291. # 主队
  292. h_name = h_names[y].text
  293. # 客队
  294. a_name = a_names[y].text
  295. # 上半场
  296. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  297. # 全场
  298. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  299. # 正则时间匹配规则
  300. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  301. match_time = pattern.findall(stimes[y])[0]
  302. play_datas = []
  303. if odd_names:
  304. for i in range(len(odd_names)):
  305. # 子玩法名
  306. name = odd_names[i].text.replace(' ', '').replace('\r\n', '')
  307. # 子玩法赛果
  308. play = odd_plays[i].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  309. play_datas.append({"play_name": name, "play_result": play})
  310. item = Hgsaiguo()
  311. item["league_id"] = league_id
  312. item["league_name"] = league_name
  313. item["match_id"] = match_id
  314. item["match_date"] = match_date
  315. item["match_time"] = match_time
  316. item["home_team"] = h_name
  317. item["guest_team"] = a_name
  318. item["score_half"] = h_score
  319. item["score_full"] = f_score
  320. item["play_data"] = play_datas
  321. item["pt"] = pt
  322. item["page"] = page
  323. yield item
  324. else:
  325. return