| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import datetime
- import re
- import scrapy
- from lxml import etree
- from ..items import Hgsaiguo
class HgjieshuSpider(scrapy.Spider):
    """Scrape finished-match results for four sport types from the results pages.

    ``start_requests`` asks for pages 1-2 of sport types 1-4 (1 football,
    2 basketball, 3 tennis, 4 baseball) and ``parse`` turns every match listed
    on a page into one ``Hgsaiguo`` item.
    """

    name = 'saiguo'
    to_day = datetime.datetime.now()
    # NOTE(review): the requests below target www.hg3535.cn, which is not this
    # domain; they appear to get through only because they are sent with
    # dont_filter=True -- confirm which domain is intended.
    allowed_domains = ['hg3535z.com']
    custom_settings = {
        "ITEM_PIPELINES": {
            'hg3535.pipeline.saiguo.Jieshuqiupipeline': 300,
        },
        # 'LOG_LEVEL': 'DEBUG',
        # 'LOG_FILE': "../hg3535/log/saiguo{}_{}_{}.log".format(to_day.year, to_day.month, to_day.day)
    }

    # Kick-off time inside the text of a match's time cell; compiled once
    # instead of once per match.
    _TIME_RE = re.compile(r"\d{1,3}:\d{1,3}")
    # Keys used for the per-period score columns, in column order.
    _BASKETBALL_PERIODS = ("sc_1th", "sc_2th", "sc_3th", "sc_4th", "sc_other")
    _TENNIS_PERIODS = ("sc_1th", "sc_2th", "sc_3th", "sc_4th", "sc_5th", "game_num", "disc_num")

    def start_requests(self):
        """Yield one request per (sport type, page) pair: types 1-4, pages 1-2."""
        url_template = 'https://www.hg3535.cn/zh-cn/info-centre/sportsbook-info/results/{}/normal/{}'
        for pt in range(1, 5):
            for page in range(1, 3):
                yield scrapy.Request(
                    url=url_template.format(pt, page),
                    callback=self.parse,
                    dont_filter=True,
                    meta={'pt': pt, "page": page},
                )

    def parse(self, response):
        """Turn one results page into ``Hgsaiguo`` items.

        Reads the sport type ``pt`` and the page number ``page`` from
        ``response.meta``. Nothing is yielded for a non-200 response or for a
        sport type other than 1-4.
        """
        if response.status != 200:
            return
        pt = response.meta['pt']
        page = response.meta['page']
        # Page 1 is dated 12 hours behind local time, every other page 36 hours
        # behind (presumably local time shifted to a US timezone, with page 2
        # holding the previous day -- confirm).
        hours_back = 12 if page == 1 else 36
        us_datetime = datetime.datetime.now() - datetime.timedelta(hours=hours_back)
        match_date = us_datetime.strftime("%Y-%m-%d")

        builders = {
            1: self._football_items,
            2: self._basketball_items,
            3: self._tennis_items,
            4: self._baseball_items,
        }
        builder = builders.get(pt)
        if builder is None:
            return
        for item in builder(response, match_date, pt, page):
            yield item

    # ------------------------------------------------------------------
    # Per-sport item builders
    # ------------------------------------------------------------------
    def _football_items(self, response, match_date, pt, page):
        """Yield football items: half-time score, full-time score and sub-plays."""
        matches = self._iter_matches(
            response, match_date, 'rt-l-bar football',
            '//div[contains(@class, "rt-sub ")]/table/tbody[2]',
            ('.//div[contains(@class, "rt-ht ")]', './/div[contains(@class, "rt-ft ")]'),
        )
        for item, scores, odds_node in matches:
            h_score, f_score = scores
            item["score_half"] = h_score
            item["score_full"] = f_score
            item["play_data"] = self._football_plays(odds_node)
            item["pt"] = pt
            item["page"] = page
            yield item

    def _basketball_items(self, response, match_date, pt, page):
        """Yield basketball items: first-half, full and second-half scores plus per-period scores."""
        matches = self._iter_matches(
            response, match_date, 'rt-l-bar sportHasQuater',
            '//div[contains(@class, "rt-sub ")]/table/tbody[2]',
            ('.//div[@class="rt-qt1"]', './/div[@class="rt-qft"]', './/div[@class="rt-qt2"]'),
        )
        for item, scores, odds_node in matches:
            h_score, f_score, x_score = scores
            play_datas = self._period_plays(
                odds_node, item["home_team"], item["guest_team"],
                './/td/table/tbody/tr[{}]/td[@class="r-odds"]',
                self._BASKETBALL_PERIODS, self._prop_text,
            )
            item["score_half"] = h_score
            item["score_result"] = f_score
            item["play_data"] = play_datas
            item["pt"] = pt
            item["page"] = page
            item["score_below"] = x_score
            yield item

    def _tennis_items(self, response, match_date, pt, page):
        """Yield tennis items: match result plus per-set scores and sub-plays."""
        matches = self._iter_matches(
            response, match_date, 'rt-l-bar tennis',
            '//div[contains(@class, "rt-sub ")]/table/tbody',
            ('.//div[@class="rt-set"]',),
        )
        for item, scores, odds_node in matches:
            play_datas = self._period_plays(
                odds_node, item["home_team"], item["guest_team"],
                './/tbody/tr[{}]/td[contains(@class, "r-odds")]',
                self._TENNIS_PERIODS, self._prop_string,
            )
            item["score_result"] = scores[0]
            item["play_data"] = play_datas
            item["pt"] = pt
            item["page"] = page
            yield item

    def _baseball_items(self, response, match_date, pt, page):
        """Yield baseball items: half-time score, full-time score and sub-plays."""
        matches = self._iter_matches(
            response, match_date, 'rt-l-bar baseball',
            '//div[contains(@class, "rt-sub")]/table/tbody',
            ('.//div[@class="rt-ht"]', './/div[@class="rt-ft"]'),
        )
        for item, scores, odds_node in matches:
            h_score, f_score = scores
            item["score_half"] = h_score
            item["score_full"] = f_score
            item["play_data"] = self._baseball_plays(odds_node)
            item["pt"] = pt
            item["page"] = page
            yield item

    # ------------------------------------------------------------------
    # Shared page walking
    # ------------------------------------------------------------------
    def _iter_matches(self, response, match_date, bar_class, odds_xpath, score_xpaths):
        """Walk every league on the page and yield one tuple per match.

        :param response: the results page.
        :param match_date: date string stored on every item.
        :param bar_class: exact ``class`` value of the league header ``div``.
        :param odds_xpath: XPath selecting one detail node per match.
        :param score_xpaths: XPaths selecting one score node per match each.
        :return: generator of ``(item, scores, odds_node)`` where ``item``
            already carries league, match, date, time and team fields and
            ``scores`` holds the cleaned score texts in ``score_xpaths`` order.
        """
        bar = '//div[@class="{}"]'.format(bar_class)
        league_ids = response.xpath(bar + '/@id').extract()
        league_names = response.xpath(bar + '/span[@class="comp-txt"]/text()').extract()
        for raw_league_id, league_name in zip(league_ids, league_names):
            fragment = response.xpath('//div[@id="dt-{}"]'.format(raw_league_id)).extract_first()
            if fragment is None:
                # League header without a detail container: nothing to parse.
                continue
            data = etree.HTML(fragment)
            league_id = raw_league_id.replace('cmp-', '')

            # Parallel per-match columns; row N of each belongs to match N.
            columns = [
                data.xpath(odds_xpath),
                data.xpath('//div[@class="flex-wrap"]/../div[1]/@id'),
                data.xpath('//div[@class="rt-event"]/span[1]'),
                data.xpath('//div[@class="rt-event"]/span[3]'),
                data.xpath('//div[@class="rt-event"]/../div[1]/span/text()'),
            ]
            columns.extend(data.xpath(score_xpath) for score_xpath in score_xpaths)
            expected = len(columns[0])
            if any(len(column) < expected for column in columns[1:]):
                # Only the rows present in every column are emitted instead of
                # failing the whole page with an IndexError.
                self.logger.warning(
                    "league %s: %d matches listed but some columns are shorter; "
                    "trailing matches skipped", league_id, expected)

            for row in zip(*columns):
                odds_node, raw_match_id, home_node, guest_node, time_text = row[:5]
                times = self._TIME_RE.findall(time_text)
                if not times:
                    self.logger.warning(
                        "league %s match %s: no kick-off time in %r; match skipped",
                        league_id, raw_match_id, time_text)
                    continue
                item = Hgsaiguo()
                item["league_id"] = league_id
                item["league_name"] = league_name
                item["match_id"] = raw_match_id.replace('e-', '')
                item["match_date"] = match_date
                item["match_time"] = times[0]
                item["home_team"] = home_node.text
                item["guest_team"] = guest_node.text
                scores = [self._squash(node.xpath('string(.)')) for node in row[5:]]
                yield item, scores, odds_node

    # ------------------------------------------------------------------
    # Sub-play extraction
    # ------------------------------------------------------------------
    @staticmethod
    def _squash(text):
        """Return *text* without spaces and CRLF line breaks; ``None`` becomes ''."""
        if text is None:
            return ''
        return text.replace(' ', '').replace('\r\n', '')

    @staticmethod
    def _football_plays(odds_node):
        """Return the football sub-plays as ``{'play_name', 'play_result'}`` dicts.

        A result made of exactly two text nodes is joined with ``&&``;
        otherwise the first text node is used ('' when there is none).
        """
        name_cells = odds_node.xpath('.//tr/td[2]')
        result_spans = odds_node.xpath('.//tr/td[3]/span')
        play_datas = []
        for name_cell, result_span in zip(name_cells, result_spans):
            parts = result_span.xpath('text()')
            if len(parts) == 2:
                play = '{}&&{}'.format(parts[0], parts[1])
            elif parts:
                play = parts[0]
            else:
                play = ''
            play_datas.append({'play_name': name_cell.text, 'play_result': play})
        return play_datas

    def _baseball_plays(self, odds_node):
        """Return the baseball sub-plays with name and result cleaned of whitespace."""
        name_cells = odds_node.xpath('.//tr/td[2]')
        result_spans = odds_node.xpath('.//tr/td[3]/span')
        return [
            {
                "play_name": self._squash(name_cell.text),
                "play_result": self._squash(result_span.xpath('string(.)')),
            }
            for name_cell, result_span in zip(name_cells, result_spans)
        ]

    @staticmethod
    def _prop_text(row):
        """Return the first text node of the row's result span, or ``None``."""
        found = row.xpath('.//td[@class="r-odds"]/span[@class="prop"]/text()')
        return found[0] if found else None

    @staticmethod
    def _prop_string(row):
        """Return the full string value of the row's result span, or ``None``."""
        found = row.xpath('.//td[@class="r-odds"]/span[@class="prop"]')
        return found[0].xpath('string(.)') if found else None

    def _period_plays(self, odds_node, home_team, guest_team, period_xpath, period_keys, read_result):
        """Return sub-plays followed by the two per-period score dicts.

        The first ``tr`` of *odds_node* holds the per-period score table
        (table row 3 for the home team, row 4 for the guest team); every later
        ``tr`` is one sub-play.

        :param period_xpath: XPath with one ``{}`` placeholder for the table row.
        :param period_keys: dict keys for the score columns, in column order;
            columns beyond the known keys are ignored.
        :param read_result: callable extracting a sub-play result from a row,
            returning ``None`` when the row has none.
        :return: list of sub-play dicts, then the home and guest score dicts;
            empty when *odds_node* has no rows.
        """
        rows = odds_node.xpath('./tr')
        play_datas = []
        if not rows:
            return play_datas

        h_dict = {'team_name': home_team}
        a_dict = {'team_name': guest_team}
        home_cells = rows[0].xpath(period_xpath.format(3))
        guest_cells = rows[0].xpath(period_xpath.format(4))
        for key, home_cell, guest_cell in zip(period_keys, home_cells, guest_cells):
            h_dict[key] = self._squash(home_cell.text)
            a_dict[key] = self._squash(guest_cell.text)

        for row in rows[1:]:
            names = row.xpath('.//td[contains(@class, "r-bt ")]/text()')
            result = read_result(row)
            if not names or result is None:
                # Row without a sub-play name or result: not a sub-play row.
                continue
            play_datas.append({"play_name": self._squash(names[0]), "play_result": result})

        play_datas.append(h_dict)
        play_datas.append(a_dict)
        return play_datas
|