# -*- coding: utf-8 -*-
import re

import scrapy


class LanqiuSpider(scrapy.Spider):
    """Crawl basketball leagues and their games from m.hgg070.com.

    Request chain:

    1. ``start_requests`` posts to the league-list endpoint once per
       ``showtype`` ("FT" and "FU").
    2. ``parse`` keeps the leagues whose name matches ``remath`` and posts to
       the game-list endpoint for each of them.
    3. ``detailball`` reads every ``<game>`` element of that response and
       posts one request per game id to ``game_detail_url``.
    4. ``getItem`` currently only prints the raw response text.

    Every request gets its own copy of the form dict, so a callback always
    sees the values that were actually sent with its request.
    """

    name = 'lanqiu'
    # Bare host name only: with a trailing slash the entry never matches the
    # request host, so the offsite filter cannot recognise the site.
    allowed_domains = ['m.hgg070.com']
    start_urls = ['http://m.hgg070.com//']
    # League names containing "篮球" (basketball) are the ones to follow.
    remath = re.compile("篮球")

    # Session token sent as the ``uid`` form field.  Kept as a class attribute
    # so it can be overridden from the command line (``-a uid=...``).
    uid = '7554a670e92d06105fe567b75e5b80fe65e6e40167f4979c8d74ca5eaa461d4d'

    league_list_url = "http://m.hgg070.com/app/member/get_league_list.php"
    game_list_url = "http://m.hgg070.com/app/member/get_game_list.php"
    # NOTE(review): the original code used an empty URL for the per-game
    # request, which cannot be fetched.  TODO: fill in the real endpoint.
    game_detail_url = ""

    # custom_settings={
    #     "ITEM_PIPELINES": {
    #         "collectSports.pipelines.zuqiu.ZuqiuPipeline": 200,
    #     },
    # }

    def start_requests(self):
        """Yield one league-list form request per ``showtype``."""
        # "FT" is used for today's games, "FU" for the early market.
        h_types = ('FT', 'FU')
        # No Content-Length here: the body length differs per request and the
        # downloader computes the correct value itself.
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-type': 'application/x-www-form-urlencoded',
            'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
            'Host': 'm.hgg070.com',
            'Origin': 'http://m.hgg070.com',
            'Referer': 'http://m.hgg070.com/',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
        }
        for showtype in h_types:
            data = {
                'uid': self.uid,
                'langx': 'zh-cn',
                'ltype': '3',
                'gtype': 'BK',
                'showtype': showtype,
                'sorttype': '',
                'date': '',
                'isP': ''
            }
            yield scrapy.FormRequest(
                url=self.league_list_url,
                formdata=data,
                callback=self.parse,
                headers=headers,
                meta={"data": data},
                dont_filter=True,
            )

    def parse(self, response):
        """Yield a game-list request for every matching league.

        Reads the form dict of the originating request from
        ``response.meta["data"]`` and extends a copy of it with the league id.
        """
        base_data = response.meta["data"]
        showtype = base_data["showtype"]
        for league in response.xpath('//league'):
            # extract_first() returns None when the element has no text.
            league_name = league.xpath('./league_name/text()').extract_first() or ''
            if not self.remath.search(league_name):
                continue
            lid = league.xpath('./league_id/text()').extract_first()
            if lid is None:
                # A None value cannot be encoded into the form body.
                self.logger.warning("League %r has no league_id; skipped", league_name)
                continue
            # Copy per request: the dict is also handed to the callback via
            # meta, so sharing one dict would leak later leagues' ids into it.
            data = dict(base_data)
            if showtype == "FT":
                # Today's games.
                data['lid'], data['sorttype'], data['date'] = lid, 'league', ''
            elif showtype == "FU":
                # Early-market games.
                data['lid'], data['sorttype'], data['date'] = lid, 'league', 'all'
            # Debug trace of the outgoing form data.
            print('77777777777777777777777777777777777777777777L', data)
            yield scrapy.FormRequest(
                url=self.game_list_url,
                formdata=data,
                callback=self.detailball,
                meta={"data": data},
                dont_filter=True,
            )

    def detailball(self, response):
        """Yield one request per ``<game>`` element, keyed by its ``gid``.

        Nothing is yielded while ``game_detail_url`` is empty, because a
        request without a URL cannot be constructed.
        """
        base_data = response.meta["data"]
        url = self.game_detail_url
        if not url:
            self.logger.error(
                "game_detail_url is not configured; skipping games of %s",
                response.url,
            )
            return
        for game in response.xpath("//game"):
            gid = game.xpath("./gid/text()").extract_first()
            if gid is None:
                # A None value cannot be encoded into the form body.
                self.logger.warning("Game without gid in %s; skipped", response.url)
                continue
            # Fresh dict per game so requests do not share mutable state.
            data = dict(base_data, gid=gid)
            # Debug trace of the outgoing form data.
            print('wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww', data)
            yield scrapy.FormRequest(
                url=url,
                formdata=data,
                callback=self.getItem,
                dont_filter=True,
            )

    def getItem(self, response):
        """Print the raw body of a per-game response (debug output only)."""
        print('ffffffffffffffffffffffffffffffffffffffffffffffffffffff', response.text)