# -*- coding: utf-8 -*-
import datetime
import re

import scrapy

from ..items import Zujieguo


class HgjieshuSpider(scrapy.Spider):
    """Scrape finished-match results (match id + final score) from
    hg3535z.com result pages and yield one Zujieguo item per scored match.
    """

    name = 'zq_jieshu'
    # Timestamp taken once at class creation; used to name the log file.
    to_day = datetime.datetime.now()
    allowed_domains = ['hg3535z.com']
    custom_settings = {
        "ITEM_PIPELINES": {
            'hg3535.pipelines.Zujieshuqiupipeline': 300,
        },
        'LOG_LEVEL': 'DEBUG',
        'LOG_FILE': "../hg3535/log/zq_jieshu_{}_{}_{}.log".format(
            to_day.year, to_day.month, to_day.day),
    }

    # Final-score pattern such as "2-1" or "10-0"; compiled once instead of
    # per row inside the parse loop.
    SCORE_RE = re.compile(r"\d{1,3}-\d{1,3}")

    def start_requests(self):
        """Request the first two result pages.

        dont_filter=True so repeated crawls of the same URLs are not
        deduplicated away by Scrapy's request filter.
        """
        urls = [
            'https://hg3535z.com/zh-cn/info-centre/sportsbook-info/results/1/normal/1',
            'https://hg3535z.com/zh-cn/info-centre/sportsbook-info/results/1/normal/2',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Extract (match id, final score) pairs from a result page.

        Yields a Zujieguo item with ``id_score = (match_id, score)`` for
        every match that has a score; rows without a score are skipped.
        """
        if response.status != 200:
            return

        # Nodes holding the full-time score text, one per match row.
        score_nodes = response.xpath('//div[@class="flex-wrap"]/../div[5]')
        # Match ids arrive as "e-<id>"; strip the prefix.
        id_nodes = response.xpath('//div[@class="flex-wrap"]/../div[1]/@id')
        match_ids = [node.extract().replace('e-', "") for node in id_nodes]

        scores = []
        for node in score_nodes:
            found = self.SCORE_RE.findall(node.extract())
            # Rows with no final score (e.g. postponed matches) become "".
            scores.append(found[0] if found else "")

        # Deduplicated set of (id, score) pairs; zip keeps the two parallel
        # lists aligned without manual indexing.
        id_score_pairs = set(zip(match_ids, scores))
        self.logger.debug("extracted id/score pairs: %s", id_score_pairs)

        for pair in id_score_pairs:
            if pair[1]:  # skip matches that have no score yet
                item = Zujieguo()
                item['id_score'] = pair
                yield item