# -*- coding: utf-8 -*-
import datetime
import re

import scrapy

from ..items import Bangjieguo


class HgjieshuSpider(scrapy.Spider):
    """Scrape finished-match results from hg3535z.com.

    For each results page, extracts every match id and its final score,
    then yields a single :class:`Bangjieguo` item whose ``id_score`` field
    is the list of match ids that actually have a final score.
    """

    name = 'bq_jieshu'
    to_day = datetime.datetime.now()
    allowed_domains = ['hg3535z.com']
    custom_settings = {
        "ITEM_PIPELINES": {
            'hg3535.pipeline.Bangjieshuqiupipeline': 300,
        },
        'LOG_LEVEL': 'DEBUG',
        # One log file per calendar day.
        'LOG_FILE': "../hg3535/log/bq_jieshu_{}_{}_{}.log".format(
            to_day.year, to_day.month, to_day.day),
    }
    start_urls = [
        'https://hg3535z.com/zh-cn/info-centre/sportsbook-info/results/4/normal/1',
        'https://hg3535z.com/zh-cn/info-centre/sportsbook-info/results/4/normal/2',
    ]

    # Final scores look like "1-0" or "12-3"; compiled once instead of
    # per-iteration inside the parse loop.
    _SCORE_RE = re.compile(r"\d{1,3}-\d{1,3}")

    def parse(self, response):
        """Extract (match id, score) pairs and yield the ids with a score.

        :param response: results-page response; only processed on HTTP 200.
        :yields: one ``Bangjieguo`` item with ``id_score`` = list of match
            ids whose score text matched ``_SCORE_RE``.
        """
        if response.status != 200:
            return

        # Score container: the 5th sibling <div> of each "flex-wrap" row.
        score_nodes = response.xpath('//div[@class="flex-wrap"]/../div[5]')
        # Match-id attribute on the 1st sibling <div>; ids look like "e-12345".
        id_nodes = response.xpath('//div[@class="flex-wrap"]/../div[1]/@id')

        match_ids = [node.extract().replace('e-', "") for node in id_nodes]

        # First regex hit per node, or "" when the match has no score yet.
        # (Replaces a bare `except:` that silently swallowed everything.)
        scores = []
        for node in score_nodes:
            found = self._SCORE_RE.findall(node.extract())
            scores.append(found[0] if found else "")

        # zip() pairs the two lists safely even if their lengths ever differ
        # (the old index-based pairing could raise IndexError); the set
        # deduplicates repeated rows, as the original set-comprehension did.
        id_score_pairs = set(zip(match_ids, scores))
        self.logger.debug("parsed %d unique (id, score) pairs", len(id_score_pairs))

        item = Bangjieguo()
        item['id_score'] = [mid for mid, score in id_score_pairs if score]
        yield item