# -*- coding: utf-8 -*-
import scrapy
from ..items import LanqiuItem
import copy
import lxml.etree
import re, os, json
from ..utils.helper import Helper
import time
from ..items import LanqiuItem
import xmltodict


class LqSportsSpider(scrapy.Spider):
    """Spider that walks league list -> game list -> game detail on m.hgg070.com.

    Request chain (each step is a form POST):

    1. ``start_requests`` posts to ``get_league_list.php`` once per show type.
    2. ``parse`` keeps the leagues whose name matches ``remath`` and posts to
       ``get_game_list.php`` for each of them.
    3. ``detailball`` posts to ``get_game_more.php`` for every game id.
    4. ``getItem`` turns the detail XML into ``LanqiuItem`` objects.
    """

    name = 'lanqiu'
    # Domain only: a trailing slash is not a valid allowed_domains entry.
    allowed_domains = ['m.hgg070.com']
    start_urls = ['http://m.hgg070.com//']
    # Leagues are kept only when their name contains this text.
    remath = re.compile("篮球")
    custom_settings = {
        "ITEM_PIPELINES": {
            "hgg070_spider.pipelines.lanqiu.LanqiuPipeline": 200,
        },
    }

    LEAGUE_LIST_URL = "http://m.hgg070.com/app/member/get_league_list.php"
    GAME_LIST_URL = "http://m.hgg070.com/app/member/get_game_list.php"
    GAME_MORE_URL = "http://m.hgg070.com/app/member/get_game_more.php"

    def start_requests(self):
        """Yield one league-list request per show type.

        The original comment labels the two show types as "today" (``FT``)
        and "early market" (``FU``).
        """
        h_types = ('FT', 'FU')
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # NOTE(review): hard-coded length; it only stays correct while
            # the encoded form body below keeps the same size.
            'Content-Length': '130',
            'Content-type': 'application/x-www-form-urlencoded',
            'Cookie': '_ga=GA1.2.471918301.1572059707; _gid=GA1.2.2109447865.1572059707; _gat=1',
            'Host': 'm.hgg070.com',
            'Origin': 'http://m.hgg070.com',
            'Referer': 'http://m.hgg070.com/',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36'
        }
        for showtype in h_types:
            # NOTE(review): the uid is a hard-coded token; presumably a
            # session id that will expire - confirm how it is refreshed.
            data = {
                'uid': '75384e6a840f9d5801ed61a0ee22b7586b70d4d015b144e1ae83625ec46ca392',
                'langx': 'zh-cn',
                'ltype': '3',
                'gtype': 'BK',
                'showtype': showtype,
                'sorttype': '',
                'date': '',
                'isP': ''
            }
            yield scrapy.FormRequest(url=self.LEAGUE_LIST_URL, formdata=data,
                                     callback=self.parse, headers=headers,
                                     meta={"data": data}, dont_filter=True)

    def parse(self, response):
        """Pick the matching leagues and request each league's game list.

        The form data forwarded through ``meta`` is a copy of the incoming
        form data, i.e. without the ``lid``/``sorttype``/``date`` values
        added for the game-list request.
        """
        data = response.meta["data"]
        fromdata = copy.deepcopy(data)
        # Value of the 'date' form field per show type.
        date_by_showtype = {"FT": '', "FU": 'all'}
        showtype = data["showtype"]

        for le in response.xpath('//league'):
            league_name = le.xpath('./league_name/text()').extract_first()
            # extract_first() returns None when the node has no text.
            if not league_name or not self.remath.search(league_name):
                continue
            lid = le.xpath('./league_id/text()').extract_first()
            if lid is None:
                # A None value cannot be form-encoded; skip the league.
                continue

            # Build a fresh form per request instead of mutating the dict
            # shared through response.meta.
            form = dict(data)
            if showtype in date_by_showtype:
                form['lid'] = lid
                form['sorttype'] = 'league'
                form['date'] = date_by_showtype[showtype]
            yield scrapy.FormRequest(url=self.GAME_LIST_URL, formdata=form,
                                     callback=self.detailball,
                                     meta={"data": fromdata}, dont_filter=True)

    def detailball(self, response):
        """Request the detail page of every game listed in the response."""
        data = response.meta["data"]
        for g in response.xpath("//game"):
            gid = g.xpath("./gid/text()").extract_first()
            if gid is None:
                # A None value cannot be form-encoded; skip the game.
                continue
            more_count = g.xpath("./more_count/text()").extract_first()
            # Copy so the dict shared between sibling requests is not mutated.
            form = dict(data)
            form["gid"] = gid
            yield scrapy.FormRequest(url=self.GAME_MORE_URL, formdata=form,
                                     callback=self.getItem,
                                     meta={"more_count": more_count, "isP": data["isP"]},
                                     dont_filter=True)

    def getItem(self, response):
        """Convert the game-detail XML into one ``LanqiuItem`` per open game.

        Only games whose ``gopen`` field equals ``'Y'`` produce an item.
        """
        more_count = response.meta["more_count"]
        isP = response.meta["isP"]
        # NOTE(review): this stores the SelectorList itself, not the element
        # text. Kept as-is because the pipeline's expectation is not visible
        # here - confirm whether .extract_first() was intended.
        showtype = response.xpath('//serverresponse/showtype')

        open_games = [g for g in self._extract_games(response.text)
                      if g.get('gopen') == 'Y']
        if not open_games:
            return

        # Loaded once per response rather than once per game.
        markets = self._load_market_config()

        for gl in open_games:
            # Insert a space before the last 8 characters of the raw value.
            match_time = gl['datetime'][:-8] + " " + gl['datetime'][-8:]
            team_h = gl['team_h']
            team_c = gl['team_c']
            league_id = gl['gidm']
            match_id = gl.get('gid', '')
            match_uid = Helper.genearte_uuid(team_h + team_c + match_time)

            item = LanqiuItem()
            item['match_id'] = match_id
            # NOTE(review): item source is "hg0088" while the odds entries
            # use "hgg070" - confirm which one is intended.
            item['source'] = "hg0088"
            item['updata'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['content'] = self._build_odds_entries(gl, markets, match_id, league_id)
            item['league_id'] = league_id
            item['more_count'] = more_count
            item['league'] = gl["league"]
            item['match_identity'] = match_uid
            item['datetime'] = match_time
            item['team_h'] = team_h
            item['team_c'] = team_c
            item['isP'] = isP
            item['showtype'] = showtype
            yield item

    @staticmethod
    def _extract_games(xml_text):
        """Return the ``<game>`` entries of the response as a list.

        xmltodict yields a single mapping (not a list) when an element
        occurs exactly once, and omits the key when it does not occur, so
        both cases are normalised here.
        """
        root = xmltodict.parse(xml_text).get('serverresponse')
        games = root.get('game') if isinstance(root, dict) else None
        if games is None:
            return []
        if not isinstance(games, list):
            return [games]
        return games

    @staticmethod
    def _load_market_config():
        """Load the ``'bk'`` section of ``conf/hgg070.json``.

        The file is resolved relative to the directory above this module's
        directory.
        """
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        with open(os.path.join(base_dir, "conf", "hgg070.json"), encoding='utf8') as fh:
            return json.load(fh)['bk']

    @staticmethod
    def _build_odds_entries(gl, markets, match_id, league_id):
        """Build the list of odds dicts for one game.

        For every market whose ``prodds`` flag in ``gl`` is ``'Y'`` and every
        configured item, one entry is produced per non-empty condition field
        (``ratio_name`` first, then ``latio``); when both are empty a single
        entry with an empty condition is produced.
        """
        entries = []
        for market in markets:
            if gl[market['prodds']] != 'Y':
                continue
            for x in market['items']:
                odds_code = gl[x['rodds']]
                p_code = gl[market['prodds']]
                # NOTE(review): odds is always read from "ior_OUH" regardless
                # of the configured item, and p_code is always 'Y' here.
                # Kept as in the original - confirm this is intended.
                odds = gl["ior_OUH"]

                conditions = []
                if x['ratio_name']:
                    conditions.append(gl[x['ratio_name']])
                if x['latio']:
                    conditions.append(gl[x['latio']])
                if not conditions:
                    conditions.append('')

                prefix = market["plodds"] + x["lodds"] + '0'
                odds_only = prefix + str(odds) + "hg3535" + str(match_id)
                sole = prefix + str(match_id) + "hg3535"
                for condition in conditions:
                    entries.append({
                        "match_id": match_id,
                        "lg_id": league_id,
                        "odds_code": odds_code,
                        "status": 0,
                        "sort": 0,
                        "p_code": p_code,
                        "odds": odds,
                        "condition": condition,
                        "odds_only": odds_only,
                        "sole": sole,
                        "source": "hgg070",
                        "type": 0,
                        "team": "",
                    })
        return entries