"""Scrapy spider that scrapes finished-match results from the sportsbook results pages."""
import datetime
import re

import scrapy
from lxml import etree

from ..items import Hgsaiguo

# Kick-off time embedded in a longer text node, e.g. "12:30".
_TIME_PATTERN = re.compile(r"\d{1,3}:\d{1,3}")

# XPath expressions shared by every sport, evaluated on one league fragment.
_HOME_XPATH = '//div[@class="rt-event"]/span[1]'
_AWAY_XPATH = '//div[@class="rt-event"]/span[3]'
_TIME_XPATH = '//div[@class="rt-event"]/../div[1]/span/text()'
_MATCH_ID_XPATH = '//div[@class="flex-wrap"]/../div[1]/@id'


def _squeeze(text):
    """Return *text* without spaces and CRLF line breaks; ``None`` becomes ``''``."""
    return (text or '').replace(' ', '').replace('\r\n', '')


def _first(nodes):
    """Return the first XPath result, or ``None`` when the result list is empty."""
    return nodes[0] if nodes else None


def _football_plays(odds_node, home_name, away_name):
    """Collect football sub-market results from one match's odds ``tbody``.

    A result span holding exactly two text nodes is joined as ``"a&&b"``;
    otherwise the first text node is used (``''`` when the span has none).
    The team names are accepted only to share a signature with the other
    sport readers.
    """
    name_cells = odds_node.xpath('.//tr/td[2]')
    result_spans = odds_node.xpath('.//tr/td[3]/span')
    plays = []
    for name_cell, result_span in zip(name_cells, result_spans):
        parts = result_span.xpath('text()')
        if len(parts) == 2:
            result = '{}&&{}'.format(parts[0], parts[1])
        else:
            result = parts[0] if parts else ''
        plays.append({'play_name': name_cell.text, 'play_result': result})
    return plays


def _baseball_plays(odds_node, home_name, away_name):
    """Collect baseball sub-market results; names and results are whitespace-stripped."""
    name_cells = odds_node.xpath('.//tr/td[2]')
    result_spans = odds_node.xpath('.//tr/td[3]/span')
    return [
        {
            "play_name": _squeeze(name_cell.text),
            "play_result": _squeeze(result_span.xpath('string(.)')),
        }
        for name_cell, result_span in zip(name_cells, result_spans)
    ]


def _period_plays(odds_node, home_name, away_name,
                  home_xpath, away_xpath, period_keys, read_result):
    """Collect sub-markets plus per-period team scores (basketball / tennis).

    The first ``tr`` of *odds_node* holds the per-period score table; every
    following ``tr`` is one sub-market.  The returned list contains the
    sub-market dicts followed by the home and away score dicts.  Rows without
    a label or a result are skipped, and score cells beyond *period_keys*
    are ignored.
    """
    rows = odds_node.xpath('./tr')
    if not rows:
        return []

    home_line = {'team_name': home_name}
    away_line = {'team_name': away_name}
    home_cells = rows[0].xpath(home_xpath)
    away_cells = rows[0].xpath(away_xpath)
    for key, home_cell, away_cell in zip(period_keys, home_cells, away_cells):
        home_line[key] = _squeeze(home_cell.text)
        away_line[key] = _squeeze(away_cell.text)

    plays = []
    for row in rows[1:]:
        label = _first(row.xpath('.//td[contains(@class, "r-bt ")]/text()'))
        result = read_result(row)
        if label is None or result is None:
            continue
        plays.append({"play_name": _squeeze(label), "play_result": result})
    plays.append(home_line)
    plays.append(away_line)
    return plays


def _basketball_result(row):
    """Return the first text node of the row's result span, or ``None``."""
    return _first(row.xpath('.//td[@class="r-odds"]/span[@class="prop"]/text()'))


def _tennis_result(row):
    """Return the full string value of the row's result span, or ``None``."""
    span = _first(row.xpath('.//td[@class="r-odds"]/span[@class="prop"]'))
    if span is None:
        return None
    return span.xpath('string(.)')


def _basketball_plays(odds_node, home_name, away_name):
    """Basketball: four quarter scores plus one extra column per team."""
    return _period_plays(
        odds_node, home_name, away_name,
        './/td/table/tbody/tr[3]/td[@class="r-odds"]',
        './/td/table/tbody/tr[4]/td[@class="r-odds"]',
        ("sc_1th", "sc_2th", "sc_3th", "sc_4th", "sc_other"),
        _basketball_result,
    )


def _tennis_plays(odds_node, home_name, away_name):
    """Tennis: five set scores plus the ``game_num`` and ``disc_num`` columns."""
    return _period_plays(
        odds_node, home_name, away_name,
        './/tbody/tr[3]/td[contains(@class, "r-odds")]',
        './/tbody/tr[4]/td[contains(@class, "r-odds")]',
        ("sc_1th", "sc_2th", "sc_3th", "sc_4th", "sc_5th", "game_num", "disc_num"),
        _tennis_result,
    )


# Per-sport scraping rules keyed by the ``pt`` value carried in request meta.
#   bar_class   - class attribute of the league header bar
#   odds_xpath  - one node per match holding its sub-market table
#   scores      - (item field, xpath) pairs stored right after ``guest_team``
#   late_scores - (item field, xpath) pairs stored after ``page``
#   plays       - callable building the ``play_data`` list for one match
_SPORTS = {
    1: {  # football
        'bar_class': 'rt-l-bar football',
        'odds_xpath': '//div[contains(@class, "rt-sub ")]/table/tbody[2]',
        'scores': (
            ('score_half', './/div[contains(@class, "rt-ht ")]'),
            ('score_full', './/div[contains(@class, "rt-ft ")]'),
        ),
        'late_scores': (),
        'plays': _football_plays,
    },
    2: {  # basketball
        'bar_class': 'rt-l-bar sportHasQuater',
        'odds_xpath': '//div[contains(@class, "rt-sub ")]/table/tbody[2]',
        'scores': (
            ('score_half', './/div[@class="rt-qt1"]'),
            ('score_result', './/div[@class="rt-qft"]'),
        ),
        'late_scores': (
            ('score_below', './/div[@class="rt-qt2"]'),
        ),
        'plays': _basketball_plays,
    },
    3: {  # tennis
        'bar_class': 'rt-l-bar tennis',
        'odds_xpath': '//div[contains(@class, "rt-sub ")]/table/tbody',
        'scores': (
            ('score_result', './/div[@class="rt-set"]'),
        ),
        'late_scores': (),
        'plays': _tennis_plays,
    },
    4: {  # baseball
        'bar_class': 'rt-l-bar baseball',
        'odds_xpath': '//div[contains(@class, "rt-sub")]/table/tbody',
        'scores': (
            ('score_half', './/div[@class="rt-ht"]'),
            ('score_full', './/div[@class="rt-ft"]'),
        ),
        'late_scores': (),
        'plays': _baseball_plays,
    },
}


class HgjieshuSpider(scrapy.Spider):
    """Scrape match results for four sports (``pt`` 1-4), two pages each.

    ``pt`` 1 is football, 2 basketball, 3 tennis and 4 baseball.  Every match
    found is emitted as one ``Hgsaiguo`` item.
    """

    name = 'saiguo'
    to_day = datetime.datetime.now()
    # NOTE(review): the requested host is www.hg3535.cn, which is not covered
    # by this list; the requests are sent with dont_filter=True - confirm
    # whether this domain entry is intentional.
    allowed_domains = ['hg3535z.com']
    custom_settings = {
        "ITEM_PIPELINES": {
            'hg3535.pipeline.saiguo.Jieshuqiupipeline': 300,
        },
        # 'LOG_LEVEL': 'DEBUG',
        # 'LOG_FILE': "../hg3535/log/saiguo{}_{}_{}.log".format(to_day.year, to_day.month, to_day.day)
    }

    def start_requests(self):
        """Yield one request per (sport, page) pair: sports 1-4, pages 1-2."""
        for pt in range(1, 5):
            for page in range(1, 3):
                url = 'https://www.hg3535.cn/zh-cn/info-centre/sportsbook-info/results/{}/normal/{}'.format(pt, page)
                yield scrapy.Request(url=url, callback=self.parse, dont_filter=True,
                                     meta={'pt': pt, "page": page})

    def parse(self, response):
        """Yield one ``Hgsaiguo`` item per match on a results page.

        Reads ``pt`` (sport) and ``page`` from ``response.meta``.  Responses
        whose status is not 200, or whose ``pt`` is not 1-4, yield nothing.
        """
        if response.status != 200:
            return
        pt = response.meta['pt']
        page = response.meta['page']
        sport = _SPORTS.get(pt)
        if sport is None:
            return

        # Page 1 is dated 12 hours before the local clock, any other page
        # 36 hours before it (presumably the site's "today" and "yesterday"
        # in its own time zone - TODO confirm).
        hours_back = 12 if page == 1 else 36
        stamp = datetime.datetime.now() - datetime.timedelta(hours=hours_back)
        match_date = stamp.strftime("%Y-%m-%d")

        for league_id, league_name, tree in self._iter_leagues(response, sport['bar_class']):
            for match in self._iter_matches(tree, sport):
                item = Hgsaiguo()
                item["league_id"] = league_id
                item["league_name"] = league_name
                item["match_id"] = match['match_id']
                item["match_date"] = match_date
                item["match_time"] = match['match_time']
                item["home_team"] = match['home_team']
                item["guest_team"] = match['guest_team']
                for field, _ in sport['scores']:
                    item[field] = match['scores'][field]
                item["play_data"] = match['play_data']
                item["pt"] = pt
                item["page"] = page
                for field, _ in sport['late_scores']:
                    item[field] = match['scores'][field]
                yield item

    def _iter_leagues(self, response, bar_class):
        """Yield ``(league_id, league_name, tree)`` for each league on the page.

        ``league_id`` has its ``cmp-`` marker removed and ``tree`` is the
        league's ``dt-<raw id>`` container re-parsed with lxml.  Leagues
        whose container is missing are skipped instead of aborting the page.
        """
        bar = '//div[@class="{}"]'.format(bar_class)
        raw_ids = response.xpath(bar + '/@id').extract()
        names = response.xpath(bar + '/span[@class="comp-txt"]/text()').extract()
        if len(names) < len(raw_ids):
            self.logger.warning("%s: %d league ids but only %d league names",
                                response.url, len(raw_ids), len(names))
        for raw_id, league_name in zip(raw_ids, names):
            fragment = response.xpath('//div[@id="dt-{}"]'.format(raw_id)).extract_first()
            if fragment is None:
                self.logger.warning("%s: no result container for league %s",
                                    response.url, raw_id)
                continue
            yield raw_id.replace('cmp-', ''), league_name, etree.HTML(fragment)

    def _iter_matches(self, tree, sport):
        """Yield one dict per match found in a league fragment.

        Each dict carries ``match_id``, ``match_time``, ``home_team``,
        ``guest_team``, ``scores`` (item field -> stripped score text) and
        ``play_data``.  The per-match columns are walked in lockstep, so a
        short column truncates the output rather than raising, and a match
        whose time text holds no ``HH:MM`` value is skipped.
        """
        match_ids = tree.xpath(_MATCH_ID_XPATH)
        home_nodes = tree.xpath(_HOME_XPATH)
        away_nodes = tree.xpath(_AWAY_XPATH)
        time_texts = tree.xpath(_TIME_XPATH)
        odds_nodes = tree.xpath(sport['odds_xpath'])
        score_fields = sport['scores'] + sport['late_scores']
        score_columns = [tree.xpath(xpath) for _, xpath in score_fields]

        columns = [match_ids, home_nodes, away_nodes, time_texts, odds_nodes]
        columns.extend(score_columns)
        if any(len(column) < len(odds_nodes) for column in columns):
            self.logger.warning(
                "result columns are shorter than the %d odds tables; "
                "trailing matches are dropped", len(odds_nodes))

        for row in zip(*columns):
            raw_match_id, home_node, away_node, time_text, odds_node = row[:5]
            found = _TIME_PATTERN.search(time_text)
            if found is None:
                self.logger.warning("no kick-off time in %r; match %s skipped",
                                    time_text, raw_match_id)
                continue
            home_name = home_node.text
            away_name = away_node.text
            scores = {}
            for (field, _), score_node in zip(score_fields, row[5:]):
                scores[field] = _squeeze(score_node.xpath('string(.)'))
            yield {
                'match_id': raw_match_id.replace('e-', ''),
                'match_time': found.group(),
                'home_team': home_name,
                'guest_team': away_name,
                'scores': scores,
                'play_data': sport['plays'](odds_node, home_name, away_name),
            }