# saiguo.py — HG3535 finished-match ("saiguo") results spider
  1. import datetime
  2. import re
  3. import scrapy
  4. from lxml import etree
  5. from ..items import Hgsaiguo
  6. class HgjieshuSpider(scrapy.Spider):
  7. name = 'saiguo'
  8. to_day = datetime.datetime.now()
  9. allowed_domains = ['hg3535z.com']
  10. custom_settings = {
  11. "ITEM_PIPELINES": {
  12. 'hg3535.pipeline.saiguo.Jieshuqiupipeline': 300,
  13. },
  14. # 'LOG_LEVEL': 'DEBUG',
  15. # 'LOG_FILE': "../hg3535/log/saiguo{}_{}_{}.log".format(to_day.year, to_day.month, to_day.day)
  16. }
  17. def start_requests(self):
  18. for y in range(1, 5):
  19. for z in range(1, 3):
  20. url = 'https://www.hg3535.cn/zh-cn/info-centre/sportsbook-info/results/{}/normal/{}'.format(y, z)
  21. yield scrapy.Request(url=url, callback=self.parse, dont_filter=True, meta={'pt': y, "page": z})
  22. def parse(self, response):
  23. if response.status == 200:
  24. pt = response.meta['pt']
  25. page = response.meta['page']
  26. if page == 1:
  27. us_datetime = datetime.datetime.now() - datetime.timedelta(hours=12)
  28. else:
  29. us_datetime = datetime.datetime.now() - datetime.timedelta(hours=36)
  30. match_date = us_datetime.strftime("%Y-%m-%d")
  31. # 足球赛果
  32. if pt == 1:
  33. league_ids = response.xpath('//div[@class="rt-l-bar football"]/@id').extract()
  34. league_names = response.xpath('//div[@class="rt-l-bar football"]/span[@class="comp-txt"]/text()').extract()
  35. if league_ids:
  36. for index in range(len(league_ids)):
  37. league_id = league_ids[index]
  38. league_name = league_names[index]
  39. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  40. data = etree.HTML(response_data)
  41. # 球队名
  42. # team_names = data.xpath('//div[@class="rt-event"]/@title')
  43. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  44. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  45. # 全场
  46. f_scores = data.xpath('.//div[contains(@class, "rt-ft ")]')
  47. # 上半场
  48. h_scores = data.xpath('.//div[contains(@class, "rt-ht ")]')
  49. # 时间
  50. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  51. # 子集玩法
  52. # odd_names = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[2]')
  53. # 子集玩法结果
  54. # odd_plays = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[3]/span')
  55. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  56. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody[2]')
  57. for y in range(len(odd_datas)):
  58. match_id = match_ids[y].replace('e-', '')
  59. league_id = league_id.replace('cmp-', '')
  60. # 子集玩法
  61. odd_names = odd_datas[y].xpath('.//tr/td[2]')
  62. # 子集玩法结果
  63. odd_plays = odd_datas[y].xpath('.//tr/td[3]/span')
  64. # 主队
  65. h_name = h_names[y].text
  66. # 客队
  67. a_name = a_names[y].text
  68. # 上半场
  69. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  70. # 全场
  71. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  72. # 正则时间匹配规则
  73. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  74. match_time = pattern.findall(stimes[y])[0]
  75. play_datas = []
  76. if odd_names:
  77. for i in range(len(odd_names)):
  78. name = odd_names[i].text
  79. plays = odd_plays[i].xpath('text()')
  80. if len(plays) == 2:
  81. play = '{}&&{}'.format(plays[0], plays[1])
  82. else:
  83. play = plays[0]
  84. play_datas.append({'play_name': name, 'play_result': play})
  85. item = Hgsaiguo()
  86. item["league_id"] = league_id
  87. item["league_name"] = league_name
  88. item["match_id"] = match_id
  89. item["match_date"] = match_date
  90. item["match_time"] = match_time
  91. item["home_team"] = h_name
  92. item["guest_team"] = a_name
  93. item["score_half"] = h_score
  94. item["score_full"] = f_score
  95. item["play_data"] = play_datas
  96. item["pt"] = pt
  97. item["page"] = page
  98. yield item
  99. else:
  100. return
  101. # 篮球赛果
  102. if pt == 2:
  103. league_ids = response.xpath('//div[@class="rt-l-bar sportHasQuater"]/@id').extract()
  104. league_names = response.xpath('//div[@class="rt-l-bar sportHasQuater"]/span[@class="comp-txt"]/text()').extract()
  105. if league_ids:
  106. for index in range(len(league_ids)):
  107. league_id = league_ids[index]
  108. league_name = league_names[index]
  109. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  110. data = etree.HTML(response_data)
  111. # 球队名
  112. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  113. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  114. # 全场
  115. f_scores = data.xpath('.//div[@class="rt-qft"]')
  116. # 上半场
  117. h_scores = data.xpath('.//div[@class="rt-qt1"]')
  118. # 下半场
  119. x_scores = data.xpath('.//div[@class="rt-qt2"]')
  120. # 时间
  121. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  122. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  123. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody[2]')
  124. for y in range(len(odd_datas)):
  125. match_id = match_ids[y].replace('e-', '')
  126. league_id = league_id.replace('cmp-', '')
  127. # 子集玩法
  128. child_data = odd_datas[y].xpath('./tr')
  129. # 主队
  130. h_name = h_names[y].text
  131. # 客队
  132. a_name = a_names[y].text
  133. # 上半场
  134. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  135. # 全场
  136. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  137. # 下半场
  138. x_score = x_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  139. # 正则时间匹配规则
  140. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  141. match_time = pattern.findall(stimes[y])[0]
  142. play_datas = []
  143. if child_data:
  144. h_dict = {'team_name': h_name}
  145. a_dict = {'team_name': a_name}
  146. for i in range(len(child_data)):
  147. if i == 0:
  148. h_datas = child_data[i].xpath('.//td/table/tbody/tr[3]/td[@class="r-odds"]')
  149. a_datas = child_data[i].xpath('.//td/table/tbody/tr[4]/td[@class="r-odds"]')
  150. rule = {0: "sc_1th", 1: "sc_2th", 2: "sc_3th", 3: "sc_4th", 4: "sc_other"}
  151. if h_datas and a_datas:
  152. for x in range(len(h_datas)):
  153. # 主队节得分
  154. h_data = h_datas[x].text.replace(' ', '').replace('\r\n', '')
  155. h_dict[rule[x]] = h_data
  156. # 客队节得分
  157. a_data = a_datas[x].text.replace(' ', '').replace('\r\n', '')
  158. a_dict[rule[x]] = a_data
  159. else:
  160. # 子玩法名
  161. child_name = child_data[i].xpath('.//td[contains(@class, "r-bt ")]/text()')[0].replace(' ', '').replace('\r\n', '')
  162. # 子玩法结果
  163. child_play = child_data[i].xpath('.//td[@class="r-odds"]/span[@class="prop"]/text()')[0]
  164. play_datas.append({"play_name": child_name, "play_result": child_play})
  165. play_datas.append(h_dict)
  166. play_datas.append(a_dict)
  167. item = Hgsaiguo()
  168. item["league_id"] = league_id
  169. item["league_name"] = league_name
  170. item["match_id"] = match_id
  171. item["match_date"] = match_date
  172. item["match_time"] = match_time
  173. item["home_team"] = h_name
  174. item["guest_team"] = a_name
  175. item["score_half"] = h_score
  176. item["score_result"] = f_score
  177. item["play_data"] = play_datas
  178. item["pt"] = pt
  179. item["page"] = page
  180. item["score_below"] = x_score
  181. yield item
  182. else:
  183. return
  184. #
  185. # 网球赛果
  186. if pt == 3:
  187. league_ids = response.xpath('//div[@class="rt-l-bar tennis"]/@id').extract()
  188. league_names = response.xpath('//div[@class="rt-l-bar tennis"]/span[@class="comp-txt"]/text()').extract()
  189. if league_ids:
  190. for index in range(len(league_ids)):
  191. league_id = league_ids[index]
  192. league_name = league_names[index]
  193. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  194. data = etree.HTML(response_data)
  195. # 球队名
  196. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  197. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  198. # 赛果
  199. f_scores = data.xpath('.//div[@class="rt-set"]')
  200. # 时间
  201. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  202. # 赛事id
  203. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  204. # 详细赛果信息, 比分等
  205. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody')
  206. for y in range(len(odd_datas)):
  207. match_id = match_ids[y].replace('e-', '')
  208. league_id = league_id.replace('cmp-', '')
  209. # 子集玩法
  210. child_data = odd_datas[y].xpath('./tr')
  211. # 主队
  212. h_name = h_names[y].text
  213. # 客队
  214. a_name = a_names[y].text
  215. # 全场
  216. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  217. # 正则时间匹配规则
  218. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  219. match_time = pattern.findall(stimes[y])[0]
  220. play_datas = []
  221. if child_data:
  222. rule = {0: "sc_1th", 1: "sc_2th", 2: "sc_3th", 3: "sc_4th", 4: "sc_5th", 5: "game_num", 6: "disc_num"}
  223. h_dict = {'team_name': h_name}
  224. a_dict = {'team_name': a_name}
  225. for i in range(len(child_data)):
  226. if i == 0:
  227. h_datas = child_data[i].xpath('.//tbody/tr[3]/td[contains(@class, "r-odds")]')
  228. a_datas = child_data[i].xpath('.//tbody/tr[4]/td[contains(@class, "r-odds")]')
  229. if h_datas and a_datas:
  230. for x in range(len(h_datas)):
  231. # 主队节得分
  232. h_data = h_datas[x].text.replace(' ', '').replace('\r\n', '')
  233. h_dict[rule[x]] = h_data
  234. # 客队节得分
  235. a_data = a_datas[x].text.replace(' ', '').replace('\r\n', '')
  236. a_dict[rule[x]] = a_data
  237. else:
  238. # 子玩法名
  239. child_name = child_data[i].xpath('.//td[contains(@class, "r-bt ")]/text()')[0].replace(' ', '').replace('\r\n', '')
  240. # 子玩法结果
  241. child_play = child_data[i].xpath('.//td[@class="r-odds"]/span[@class="prop"]')[0]
  242. play = child_play.xpath('string(.)')
  243. play_datas.append({"play_name": child_name, "play_result": play})
  244. play_datas.append(h_dict)
  245. play_datas.append(a_dict)
  246. item = Hgsaiguo()
  247. item["league_id"] = league_id
  248. item["league_name"] = league_name
  249. item["match_id"] = match_id
  250. item["match_date"] = match_date
  251. item["match_time"] = match_time
  252. item["home_team"] = h_name
  253. item["guest_team"] = a_name
  254. item["score_result"] = f_score
  255. item["play_data"] = play_datas
  256. item["pt"] = pt
  257. item["page"] = page
  258. yield item
  259. else:
  260. return
  261. # 棒球赛果
  262. if pt == 4:
  263. league_ids = response.xpath('//div[@class="rt-l-bar baseball"]/@id').extract()
  264. league_names = response.xpath('//div[@class="rt-l-bar baseball"]/span[@class="comp-txt"]/text()').extract()
  265. if league_ids:
  266. for index in range(len(league_ids)):
  267. league_id = league_ids[index]
  268. league_name = league_names[index]
  269. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  270. data = etree.HTML(response_data)
  271. # 球队名
  272. h_names = data.xpath('//div[@class="rt-event"]/span[1]')
  273. a_names = data.xpath('//div[@class="rt-event"]/span[3]')
  274. # 全场
  275. f_scores = data.xpath('.//div[@class="rt-ft"]')
  276. # 上半场
  277. h_scores = data.xpath('.//div[@class="rt-ht"]')
  278. # 时间
  279. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  280. # 子集玩法
  281. # odd_names = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[2]')
  282. # 子集玩法结果
  283. # odd_plays = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[3]/span')
  284. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  285. odd_datas = data.xpath('//div[contains(@class, "rt-sub")]/table/tbody')
  286. for y in range(len(odd_datas)):
  287. match_id = match_ids[y].replace('e-', '')
  288. league_id = league_id.replace('cmp-', '')
  289. # 子集玩法
  290. odd_names = odd_datas[y].xpath('.//tr/td[2]')
  291. # 子集玩法结果
  292. odd_plays = odd_datas[y].xpath('.//tr/td[3]/span')
  293. # 主队
  294. h_name = h_names[y].text
  295. # 客队
  296. a_name = a_names[y].text
  297. # 上半场
  298. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  299. # 全场
  300. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  301. # 正则时间匹配规则
  302. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  303. match_time = pattern.findall(stimes[y])[0]
  304. play_datas = []
  305. if odd_names:
  306. for i in range(len(odd_names)):
  307. # 子玩法名
  308. name = odd_names[i].text.replace(' ', '').replace('\r\n', '')
  309. # 子玩法赛果
  310. play = odd_plays[i].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  311. play_datas.append({"play_name": name, "play_result": play})
  312. item = Hgsaiguo()
  313. item["league_id"] = league_id
  314. item["league_name"] = league_name
  315. item["match_id"] = match_id
  316. item["match_date"] = match_date
  317. item["match_time"] = match_time
  318. item["home_team"] = h_name
  319. item["guest_team"] = a_name
  320. item["score_half"] = h_score
  321. item["score_full"] = f_score
  322. item["play_data"] = play_datas
  323. item["pt"] = pt
  324. item["page"] = page
  325. yield item
  326. else:
  327. return