# -*- coding: utf-8 -*-
import copy
import re

import lxml.etree
import scrapy

from ..items import LanqiuItem


class LqSportsSpider(scrapy.Spider):
    """Scrape basketball leagues, games and per-game odds from m.hgg070.com.

    Request chain:
      1. ``start_requests`` posts to ``get_league_list.php`` once per show
         type ('FT' = today, 'FU' = early market).
      2. ``parse`` keeps the leagues whose name matches ``remath`` and posts
         to ``get_game_list.php`` for each of them.
      3. ``detailball`` posts to ``get_game_more.php`` for every game id.
      4. ``getItem`` yields one dict of odds fields per open game.
    """

    name = 'lq_sports'
    # Scrapy expects bare domain names here (no scheme, path or trailing slash).
    allowed_domains = ['m.hgg070.com']
    start_urls = ['http://m.hgg070.com//']
    # Matches league names containing the Chinese word for "basketball".
    remath = re.compile("篮球")
    # Session token sent with every form. Being a class attribute it can be
    # overridden from the command line: ``scrapy crawl lq_sports -a uid=...``.
    uid = '86797ef15d547a503c926bb658051dd137586e25f0936536d424c09f4fb74d83'

    # custom_settings = {
    #     "ITEM_PIPELINES": {
    #         "hgg070_spider.pipelines.lanqiu.ZuqiuPipeline": 200,
    #     },
    # }

    def start_requests(self):
        """Yield one league-list request per show type.

        The submitted form is also stored in ``meta["data"]`` so the
        callbacks can reuse it for follow-up requests.
        """
        # No explicit Content-Length: the previous fixed value ('130') only
        # matched this exact form; the length is derived from the real body
        # when the request is sent.
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-type': 'application/x-www-form-urlencoded',
            'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
            'Host': 'm.hgg070.com',
            'Origin': 'http://m.hgg070.com',
            'Referer': 'http://m.hgg070.com/',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
        }
        url = "http://m.hgg070.com/app/member/get_league_list.php"
        # 'FT' = today, 'FU' = early market.
        for showtype in ('FT', 'FU'):
            data = {
                'uid': self.uid,
                'langx': 'zh-cn',
                'ltype': '3',
                'gtype': 'BK',
                'showtype': showtype,
                'sorttype': '',
                'date': '',
                'isP': ''
            }
            yield scrapy.FormRequest(
                url=url,
                formdata=data,
                callback=self.parse,
                headers=headers,
                meta={"data": data},
                dont_filter=True,
            )

    def parse(self, response):
        """Select basketball leagues and request the game list of each.

        Reads the form from ``response.meta["data"]``. For every ``<league>``
        whose ``<league_name>`` matches ``remath``, posts the form extended
        with ``lid``, ``sorttype='league'`` and a ``date`` chosen by show
        type. The unmodified form is forwarded in ``meta["data"]``.
        """
        data = response.meta["data"]
        url = "http://m.hgg070.com/app/member/get_game_list.php"

        # 'FT' (today) sends an empty date, 'FU' (early market) sends 'all'.
        date_by_showtype = {"FT": "", "FU": "all"}
        showtype = data["showtype"]
        if showtype not in date_by_showtype:
            self.logger.warning("Unexpected showtype %r, skipping", showtype)
            return
        date = date_by_showtype[showtype]

        for league in response.xpath('//league'):
            league_name = league.xpath('./league_name/text()').extract_first()
            # A missing name cannot be matched; skip instead of crashing.
            if not league_name or not self.remath.search(league_name):
                continue
            lid = league.xpath('./league_id/text()').extract_first()
            if not lid:
                continue
            # Build a new dict per request so the form held in meta is never
            # mutated and requests do not share state.
            formdata = dict(data, lid=lid, sorttype='league', date=date)
            # NOTE(review): as before, the form forwarded to detailball is the
            # original one (no lid, empty sorttype/date) -- confirm the
            # get_game_more.php endpoint does not need those fields.
            yield scrapy.FormRequest(
                url=url,
                formdata=formdata,
                callback=self.detailball,
                meta={"data": copy.deepcopy(data)},
                dont_filter=True,
            )

    def detailball(self, response):
        """Request the detail page of every ``<game>`` in the response.

        Posts the form from ``response.meta["data"]`` plus the game's ``gid``
        to ``get_game_more.php``; games without a ``<gid>`` are skipped.
        """
        data = response.meta["data"]
        url = "http://m.hgg070.com/app/member/get_game_more.php"
        for game in response.xpath("//game"):
            gid = game.xpath("./gid/text()").extract_first()
            if not gid:
                continue
            # Copy rather than mutate: the meta dict must stay untouched.
            yield scrapy.FormRequest(
                url=url,
                formdata=dict(data, gid=gid),
                callback=self.getItem,
                dont_filter=True,
            )

    def getItem(self, response):
        """Yield one dict per open ``<game>``, mapping child tag to its text.

        A game is open when its own ``<gopen>`` is ``'Y'``. Spaces are
        removed from every value and a child without text maps to ``""``.
        Games that are not open are logged and skipped.
        """
        for game in response.xpath("//game"):
            # Relative path: '//game/...' would search the whole document and
            # always return the first game's flag.
            gopen = game.xpath('./gopen/text()').extract_first()
            if gopen != 'Y':
                self.logger.info('gopen == N, 详细赔率盘口未开启')
                continue
            element = lxml.etree.fromstring(game.extract())
            # TODO(review): the previous code ended in `yield obj` with `obj`
            # undefined. A commented-out sketch filled a LanqiuItem with
            # id/league/team_h/team_c/showtype/datetime (plus match_id, uuid,
            # source, updata, content, gidm, tag, match_uid). Map these fields
            # onto LanqiuItem once its schema is confirmed; until then the raw
            # field dict is yielded.
            yield {
                child.tag: (child.text or "").replace(' ', '')
                for child in element
            }