# -*- coding: utf-8 -*-
import scrapy
import re
import copy

from ..items import LanqiuItem


class LanqiuSpider(scrapy.Spider):
    """Crawl basketball fixtures from the m.hgg070.com member API.

    The crawl is a chain of three POST requests:

    1. ``get_league_list.php`` -- one request per show type; handled by
       :meth:`parse`, which keeps leagues whose name matches ``remath``.
    2. ``get_game_list.php``   -- one request per kept league; handled by
       :meth:`detailball`, which reads each game's ``gid``.
    3. ``get_game_more.php``   -- one request per game; handled by
       :meth:`getItem`, which yields ``LanqiuItem`` objects.
    """

    name = 'lanqiu'
    # Bare host name only: an entry with a trailing slash never matches a
    # request's hostname in Scrapy's offsite filtering.
    allowed_domains = ['m.hgg070.com']
    start_urls = ['http://m.hgg070.com//']
    # Matches league names containing the word "basketball" (in Chinese).
    remath = re.compile("篮球")
    # Session token sent as the ``uid`` form field.  Being a class attribute
    # it can be overridden per run with ``scrapy crawl lanqiu -a uid=...``.
    uid = '257853bc6f4166ca4e84f4d75d1cfc3540c6eab54b34898f4ad405cb2412402f'
    custom_settings = {
        "ITEM_PIPELINES": {
            "hgg070_spider.pipelines.lanqiu.ZuqiuPipeline": 200,
        },
    }

    def start_requests(self):
        """Yield one league-list request per show type.

        ``FT`` is the "today" listing and ``FU`` the "early" listing.  The
        submitted form is also stored in ``meta["data"]`` so that
        :meth:`parse` can build the follow-up requests from it.
        """
        # Today, early market.
        h_types = ('FT', 'FU')
        # No explicit Content-Length: the HTTP client derives it from the
        # encoded form body, so a hard-coded value can only go stale.
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-type': 'application/x-www-form-urlencoded',
            'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
            'Host': 'm.hgg070.com',
            'Origin': 'http://m.hgg070.com',
            'Referer': 'http://m.hgg070.com/',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
        }
        url = "http://m.hgg070.com/app/member/get_league_list.php"
        for showtype in h_types:
            data = {
                'uid': self.uid,
                'langx': 'zh-cn',
                'ltype': '3',
                'gtype': 'BK',
                'showtype': showtype,
                'sorttype': '',
                'date': '',
                'isP': ''
            }
            yield scrapy.FormRequest(
                url=url,
                formdata=data,
                callback=self.parse,
                headers=headers,
                meta={"data": data},
                dont_filter=True,
            )

    def parse(self, response):
        """Request the game list of every league whose name matches ``remath``.

        For show type ``FT`` the form is sent with ``date=''``; for ``FU``
        with ``date='all'``.  In both cases ``lid`` is the league id and
        ``sorttype`` is ``'league'``.  Any other show type sends the base
        form unchanged.

        The base form (without the league-specific fields) is what gets
        forwarded to :meth:`detailball` through ``meta["data"]``.
        """
        base = response.meta["data"]
        url = "http://m.hgg070.com/app/member/get_game_list.php"
        # Per-show-type value of the ``date`` form field.
        date_by_showtype = {"FT": "", "FU": "all"}
        showtype = base["showtype"]

        for le in response.xpath('//league'):
            league_name = le.xpath('./league_name/text()').extract_first()
            # A league without a name cannot be matched; skip it instead of
            # passing None to the regex.
            if not league_name or not self.remath.search(league_name):
                continue

            lid = le.xpath('./league_id/text()').extract_first()
            if lid is None:
                # FormRequest cannot encode a None value.
                self.logger.warning(
                    "Skipping league %r: no league_id in response", league_name
                )
                continue

            # Build a fresh form per request rather than mutating one dict
            # that is shared between requests and meta.
            form = dict(base)
            if showtype in date_by_showtype:
                form.update(
                    lid=lid,
                    sorttype='league',
                    date=date_by_showtype[showtype],
                )

            yield scrapy.FormRequest(
                url=url,
                formdata=form,
                callback=self.detailball,
                meta={"data": copy.deepcopy(base)},
                dont_filter=True,
            )

    def detailball(self, response):
        """Request the detail page of every game listed in the response.

        Each request posts the form received in ``meta["data"]`` plus the
        game's ``gid``; responses are handled by :meth:`getItem`.
        """
        base = response.meta["data"]
        url = "http://m.hgg070.com/app/member/get_game_more.php"
        # Collect the gid of each game in the league.
        for g in response.xpath("//game"):
            gid = g.xpath("./gid/text()").extract_first()
            if gid is None:
                # FormRequest cannot encode a None value.
                self.logger.warning("Skipping game without gid in %s", response.url)
                continue
            yield scrapy.FormRequest(
                url=url,
                formdata=dict(base, gid=gid),
                callback=self.getItem,
                dont_filter=True,
            )

    def getItem(self, response):
        """Yield one ``LanqiuItem`` per ``game`` element in the response.

        Fields that are absent from the response are stored as ``None``.
        """
        for game in response.xpath("//game"):
            obj = LanqiuItem()
            obj['id'] = game.xpath("./gid/text()").extract_first()
            obj['league'] = game.xpath("./league/text()").extract_first()
            obj['team_h'] = game.xpath("./team_h/text()").extract_first()
            obj['team_c'] = game.xpath("./team_c/text()").extract_first()
            # The item's ``showtype`` field is filled from the ``gtype`` element.
            obj['showtype'] = game.xpath("./gtype/text()").extract_first()
            obj['datetime'] = game.xpath("./datetime/text()").extract_first()
            yield obj