saiguo.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. import datetime
  2. import re
  3. import scrapy
  4. from lxml import etree
  5. # from ..items import Hgsaiguo
  6. class HgjieshuSpider(scrapy.Spider):
  7. name = 'saiguo'
  8. to_day = datetime.datetime.now()
  9. allowed_domains = ['hg3535z.com']
  10. custom_settings = {
  11. "ITEM_PIPELINES": {
  12. 'hg3535.pipeline.saiguo.Jieshuqiupipeline': 300,
  13. },
  14. # 'LOG_LEVEL': 'DEBUG',
  15. # 'LOG_FILE': "../hg3535/log/saiguo{}_{}_{}.log".format(to_day.year, to_day.month, to_day.day)
  16. }
  17. def start_requests(self):
  18. for y in range(1, 5):
  19. url = 'https://www.hg3535.cn/zh-cn/info-centre/sportsbook-info/results/{}/normal/1'.format(y)
  20. yield scrapy.Request(url=url, callback=self.parse, dont_filter=True, meta={'pt': y})
  21. def parse(self, response):
  22. if response.status == 200:
  23. pt = response.meta['pt']
  24. # 足球赛果
  25. if pt == 1:
  26. league_ids = response.xpath('//div[@class="rt-l-bar football"]/@id').extract()
  27. league_names = response.xpath('//div[@class="rt-l-bar football"]/span[@class="comp-txt"]/text()').extract()
  28. for index in range(len(league_ids)):
  29. league_id = league_ids[index]
  30. league_name = league_names[index]
  31. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  32. # response_data = response.xpath('//div[@id="dt-{}"]'.format('cmp-36254')).extract_first()
  33. data = etree.HTML(response_data)
  34. # 球队名
  35. team_names = data.xpath('//div[@class="rt-event"]/@title')
  36. # 全场
  37. f_scores = data.xpath('.//div[contains(@class, "rt-ft ")]')
  38. # 上半场
  39. h_scores = data.xpath('.//div[contains(@class, "rt-ht ")]')
  40. # 时间
  41. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  42. # 子集玩法
  43. # odd_names = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[2]')
  44. # 子集玩法结果
  45. # odd_plays = data.xpath('//div[@class="rt-sub rt-data-hide"]/table/tbody[2]/tr/td[3]/span')
  46. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  47. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody[2]')
  48. for y in range(len(odd_datas)):
  49. match_id = match_ids[y].replace('e-', '')
  50. league_id = league_id.replace('cmp-', '')
  51. team_name = team_names[y].replace(' ', '').split('-')
  52. # 子集玩法
  53. odd_names = odd_datas[y].xpath('.//tr/td[2]')
  54. # 子集玩法结果
  55. odd_plays = odd_datas[y].xpath('.//tr/td[3]/span')
  56. # 主队
  57. h_name = team_name[0]
  58. # 客队
  59. a_name = team_name[1]
  60. print(h_name, a_name)
  61. # 上半场
  62. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  63. # 全场
  64. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  65. # 正则时间匹配规则
  66. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  67. stime = pattern.findall(stimes[y])[0]
  68. if odd_names:
  69. for i in range(len(odd_names)):
  70. name = odd_names[i].text
  71. plays = odd_plays[i].xpath('text()')
  72. if len(plays) == 2:
  73. play = '{}&&{}'.format(plays[0], plays[1])
  74. else:
  75. play = plays[0]
  76. else:
  77. pass
  78. # 篮球赛果
  79. if pt == 2:
  80. league_ids = response.xpath('//div[@class="rt-l-bar sportHasQuater"]/@id').extract()
  81. league_names = response.xpath('//div[@class="rt-l-bar sportHasQuater"]/span[@class="comp-txt"]/text()').extract()
  82. for index in range(len(league_ids)):
  83. league_id = league_ids[index]
  84. league_name = league_names[index]
  85. response_data = response.xpath('//div[@id="dt-{}"]'.format(league_id)).extract_first()
  86. # response_data = response.xpath('//div[@id="dt-{}"]'.format('cmp-26405')).extract_first()
  87. data = etree.HTML(response_data)
  88. # 球队名
  89. team_names = data.xpath('//div[@class="rt-event"]/@title')
  90. # 全场
  91. f_scores = data.xpath('.//div[@class="rt-qft"]')
  92. # 上半场
  93. h_scores = data.xpath('.//div[@class="rt-qt1"]')
  94. # 下半场
  95. x_scores = data.xpath('.//div[@class="rt-qt2"]')
  96. # 时间
  97. stimes = data.xpath('//div[@class="rt-event"]/../div[1]/span/text()')
  98. match_ids = data.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
  99. odd_datas = data.xpath('//div[contains(@class, "rt-sub ")]/table/tbody[2]')
  100. for y in range(len(odd_datas)):
  101. match_id = match_ids[y].replace('e-', '')
  102. league_id = league_id.replace('cmp-', '')
  103. team_name = team_names[y].replace(' ', '').split('-')
  104. # 子集玩法
  105. child_data = odd_datas[y].xpath('./tr')
  106. # 主队
  107. h_name = team_name[0]
  108. # 客队
  109. a_name = team_name[1]
  110. print(h_name, a_name)
  111. # 上半场
  112. h_score = h_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  113. # 全场
  114. f_score = f_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  115. # 下半场
  116. x_score = x_scores[y].xpath('string(.)').replace(' ', '').replace('\r\n', '')
  117. # 正则时间匹配规则
  118. pattern = re.compile(r"\d{1,3}:\d{1,3}")
  119. stime = pattern.findall(stimes[y])[0]
  120. if child_data:
  121. for i in range(len(child_data)):
  122. if i == 0:
  123. h_datas = child_data[i].xpath('.//td/table/tbody/tr[3]/td[@class="r-odds"]')
  124. a_datas = child_data[i].xpath('.//td/table/tbody/tr[4]/td[@class="r-odds"]')
  125. if h_datas and a_datas:
  126. for x in range(len(h_datas)):
  127. # 主队节得分
  128. h_data = h_datas[x].text.replace(' ', '').replace('\r\n', '')
  129. # 客队节得分
  130. a_data = a_datas[x].text.replace(' ', '').replace('\r\n', '')
  131. else:
  132. # 子玩法名
  133. child_name = child_data[i].xpath('.//td[contains(@class, "r-bt ")]/text()')[0].replace(' ', '').replace('\r\n', '')
  134. # 子玩法结果
  135. child_play = child_data[i].xpath('.//td[@class="r-odds"]/span[@class="prop"]/text()')[0]
  136. else:
  137. pass
  138. # 网球赛果
  139. if pt == 3:
  140. pass
  141. # 棒球赛果
  142. if pt == 4:
  143. pass