import datetime
import re

import scrapy

from ..items import Hgsaiguo

# Matches a numeric result like "2-1" or "103-99" inside a row's HTML.
# Compiled once at module level instead of per row in the parse loop.
_SCORE_RE = re.compile(r"\d{1,3}-\d{1,3}")


class HgjieshuSpider(scrapy.Spider):
    """Scrape finished-match results for four sports (pt = 1..4) from
    hg3535's results pages and yield one ``Hgsaiguo`` item per finished
    match as an ``(event_id, score)`` pair.

    NOTE(review): ``allowed_domains`` says 'hg3535z.com' while requests go
    to 'hg3535.cn'; requests use ``dont_filter=True`` so this currently
    works — confirm the intended domain before tightening filtering.
    """

    name = 'saiguo'
    to_day = datetime.datetime.now()
    allowed_domains = ['hg3535z.com']
    custom_settings = {
        "ITEM_PIPELINES": {
            'hg3535.pipeline.saiguo.Jieshuqiupipeline': 300,
        },
        # 'LOG_LEVEL': 'DEBUG',
        # 'LOG_FILE': "../hg3535/log/saiguo{}_{}_{}.log".format(to_day.year, to_day.month, to_day.day)
    }

    def start_requests(self):
        """Issue one request per sport page (pt 1 through 4)."""
        for y in range(1, 5):
            url = 'https://www.hg3535.cn/zh-cn/info-centre/sportsbook-info/results/{}/normal/1'.format(y)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True, meta={'pt': y})

    def parse(self, response):
        """Extract (event_id, score) pairs from a results page.

        Yields an ``Hgsaiguo`` item for each match whose score/winner
        field is non-empty (i.e. the match has finished).
        """
        # Guard clause replaces the original nested `if status == 200:` body.
        if response.status != 200:
            return
        pt = response.meta['pt']

        # All event ids for the page; the id attribute is "e-<id>".
        event_ids = [
            sel.extract().replace('e-', "")
            for sel in response.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
        ]

        if pt == 3:
            # Sport 3 publishes the winner in column 4's @title attribute
            # rather than a numeric score; a present title marks a
            # finished match.
            scores = [
                sel.extract()
                for sel in response.xpath('//div[@class="flex-wrap"]/../div[4]/@title')
            ]
        else:
            # Sports 1, 2 and 4 (previously three duplicated branches):
            # regex-match a "n-n" score out of column 5's HTML; empty
            # string means the match has not finished.
            scores = []
            for row in response.xpath('//div[@class="flex-wrap"]/../div[5]'):
                match = _SCORE_RE.search(row.extract())
                scores.append(match.group() if match else "")

        # De-duplicate (id, score) pairs with a set, as the original did;
        # zip pairs the parallel lists safely (truncates on length
        # mismatch instead of raising IndexError).
        for pair in set(zip(event_ids, scores)):
            if pair[1]:  # only finished matches carry a score/winner
                item = Hgsaiguo()
                item['id_score'] = pair
                item['pt'] = pt
                yield item