浏览代码

新增足球滚球爬虫

Your Name 6 年之前
父节点
当前提交
5153a45b33
共有 2 个文件被更改,包括 48 次插入和 50 次删除
  1. 1 6
      hgg070_spider/pipelines/aaaaa.py
  2. 47 44
      hgg070_spider/spiders/lanqiu.py

+ 1 - 6
hgg070_spider/pipelines/aaaaa.py

@@ -7,14 +7,9 @@ import time
 
 class LqSportsPipeline(object):
     @defer.inlineCallbacks
-<<<<<<< HEAD:hgg070_spider/pipelines/aaaaa.py
+
     def process_item(self,item,spider):
-        logger=logging.getLogger(__name__)
-=======
-    def process_item(self, item, spider):
-        print('555555555555555555555555555555555555555555555555555555555555555555555')
         logger = logging.getLogger(__name__)
->>>>>>> d24c6bfe93390b44800468998733ecf116d4bdf0:hgg070_spider/pipelines/lq_sports.py
         logger.info("进入管道")
         out = defer.Deferred()
         reactor.callInThread(self._do_calculation, item, out)

+ 47 - 44
hgg070_spider/spiders/lanqiu.py

@@ -3,25 +3,27 @@ import scrapy
 from ..items import LanqiuItem
 import copy
 import lxml.etree
-import re,os,json
+import re, os, json
 from ..utils.helper import Helper
 import time
 from ..items import LanqiuItem
 import xmltodict
 
+
 class LqSportsSpider(scrapy.Spider):
     name = 'lanqiu'
     allowed_domains = ['m.hgg070.com/']
     start_urls = ['http://m.hgg070.com//']
     remath = re.compile("篮球")
-    custom_settings={
+    custom_settings = {
         "ITEM_PIPELINES": {
             "hgg070_spider.pipelines.lanqiu.LanqiuPipeline": 200,
         },
     }
+
     def start_requests(self):
-        #今日,早盘
-        h_types=[('FT'),('FU')]
+        # 今日,早盘
+        h_types = [('FT'), ('FU')]
         headers = {
             'Accept': '*/*',
             'Accept-Encoding': 'gzip, deflate',
@@ -38,7 +40,7 @@ class LqSportsSpider(scrapy.Spider):
         url = "http://m.hgg070.com/app/member/get_league_list.php"
         for item in h_types:
             showtype = item
-            data={
+            data = {
                 'uid': 'a8b9b2facd6b19ab7023a2b8686207d4ea98c3ab68e455abe8fe49a4861ff68f',
                 'langx': 'zh-cn',
                 'ltype': '3',
@@ -48,51 +50,52 @@ class LqSportsSpider(scrapy.Spider):
                 'date': '',
                 'isP': ''
             }
-            yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse,headers=headers,
-                                      meta={"data":data}, dont_filter=True)
+            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse, headers=headers,
+                                     meta={"data": data}, dont_filter=True)
 
     def parse(self, response):
-        #获取id并判断抓取的球型
-        data=response.meta["data"]
-        fromdata=copy.deepcopy(data)
-        league=response.xpath('//league')
-        url="http://m.hgg070.com/app/member/get_game_list.php"
+        # 获取id并判断抓取的球型
+        data = response.meta["data"]
+        fromdata = copy.deepcopy(data)
+        league = response.xpath('//league')
+        url = "http://m.hgg070.com/app/member/get_game_list.php"
         for le in league:
-            name=le.xpath('./league_name/text()').extract_first()
-            if len(self.remath.findall(name))>0:
+            name = le.xpath('./league_name/text()').extract_first()
+            if len(self.remath.findall(name)) > 0:
                 lid = le.xpath('./league_id/text()').extract_first()
                 # 抓取今日
-                if data["showtype"]=="FT":
-                    data['lid'],data['sorttype'],data['date']=lid,'league',''
+                if data["showtype"] == "FT":
+                    data['lid'], data['sorttype'], data['date'] = lid, 'league', ''
                 # 抓取早盘
-                elif data["showtype"]=="FU":
+                elif data["showtype"] == "FU":
                     data['lid'], data['sorttype'], data['date'] = lid, 'league', 'all'
-                yield scrapy.FormRequest(url=url,formdata=data,callback=self.detailball,meta={"data":fromdata},dont_filter=True)
+                yield scrapy.FormRequest(url=url, formdata=data, callback=self.detailball, meta={"data": fromdata},
+                                         dont_filter=True)
 
-    def detailball(self,response):
-        data=response.meta["data"]
-        url="http://m.hgg070.com/app/member/get_game_more.php"
-        #获取联赛id gid
-        game=response.xpath("//game")
+    def detailball(self, response):
+        data = response.meta["data"]
+        url = "http://m.hgg070.com/app/member/get_game_more.php"
+        # 获取联赛id gid
+        game = response.xpath("//game")
         for g in game:
-            gid=g.xpath("./gid/text()").extract_first()
+            gid = g.xpath("./gid/text()").extract_first()
             more_count = g.xpath("./more_count/text()").extract_first()
-            data["gid"]=gid
-            yield scrapy.FormRequest(url=url,formdata=data,callback=self.getItem,meta={"more_count":more_count,"isP":data["isP"]},dont_filter=True)
+            data["gid"] = gid
+            yield scrapy.FormRequest(url=url, formdata=data, callback=self.getItem,
+                                     meta={"more_count": more_count, "isP": data["isP"]}, dont_filter=True)
 
-
-    def getItem(self,response):
+    def getItem(self, response):
         more_count = response.meta["more_count"]
         isP = response.meta["isP"]
-        showtype=response.xpath('//serverresponse/showtype')
-        data= xmltodict.parse(response.text)['serverresponse']['game']
-        game_lists=[i for i in data if i['gopen']=='Y']
+        showtype = response.xpath('//serverresponse/showtype')
+        data = xmltodict.parse(response.text)['serverresponse']['game']
+        game_lists = [i for i in data if i['gopen'] == 'Y']
 
         if game_lists:
             for gl in game_lists:
-                cpath=os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-                with open(cpath+"/conf/hgg070.json",encoding='utf8') as hg:
-                    hgg=json.load(hg)['bk']
+                cpath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+                with open(cpath + "/conf/hgg070.json", encoding='utf8') as hg:
+                    hgg = json.load(hg)['bk']
                 datetime = gl['datetime'][:-8] + " " + gl['datetime'][-8:]
                 team_h = gl['team_h']
                 team_c = gl['team_c']
@@ -101,15 +104,15 @@ class LqSportsSpider(scrapy.Spider):
                 match_uid = Helper.genearte_uuid(team_h + team_c + datetime)
                 data = []
                 for hg in hgg:
-                    items=hg['items']
-                    if gl[hg['prodds']]=='Y':
+                    items = hg['items']
+                    if gl[hg['prodds']] == 'Y':
                         for x in items:
                             odds_code = gl[x['rodds']]
                             p_code = gl[hg['prodds']]
-                            odds=gl["ior_OUH"]
-                            #有两个条件,加两条数据
-                            if x['ratio_name']:      #大的
-                                condition_u=gl[x['ratio_name']]
+                            odds = gl["ior_OUH"]
+                            # 有两个条件,加两条数据
+                            if x['ratio_name']:  # 大的
+                                condition_u = gl[x['ratio_name']]
                                 odds_only = hg["plodds"] + x["lodds"] + '0' + str(odds) + "hg3535" + str(match_id)
                                 sole = hg["plodds"] + x["lodds"] + '0' + str(match_id) + "hg3535"
                                 tobj = {"match_id": match_id, "lg_id": league_id, "odds_code": odds_code, "status": 0,
@@ -118,13 +121,13 @@ class LqSportsSpider(scrapy.Spider):
                                         "source": "hgg070", "type": 0, "team": ""}
                                 data.append(tobj)
 
-                            if x['latio']:   #小的
+                            if x['latio']:  # 小的
                                 condition_s = gl[x['latio']]
-                                odds_only =hg["plodds"] + x["lodds"] + '0' + str(odds) + "hg3535" + str(match_id)
+                                odds_only = hg["plodds"] + x["lodds"] + '0' + str(odds) + "hg3535" + str(match_id)
                                 sole = hg["plodds"] + x["lodds"] + '0' + str(match_id) + "hg3535"
                                 tobj = {"match_id": match_id, "lg_id": league_id, "odds_code": odds_code, "status": 0,
                                         "sort": 0, "p_code": p_code,
-                                        "odds": odds,"condition": condition_s, "odds_only": odds_only, "sole": sole,
+                                        "odds": odds, "condition": condition_s, "odds_only": odds_only, "sole": sole,
                                         "source": "hgg070", "type": 0, "team": ""}
                                 data.append(tobj)
 
@@ -134,7 +137,7 @@ class LqSportsSpider(scrapy.Spider):
                                 sole = hg["plodds"] + x["lodds"] + '0' + str(match_id) + "hg3535"
                                 tobj = {"match_id": match_id, "lg_id": league_id, "odds_code": odds_code, "status": 0,
                                         "sort": 0, "p_code": p_code,
-                                        "odds": odds,"condition": condition_s, "odds_only": odds_only, "sole": sole,
+                                        "odds": odds, "condition": condition_s, "odds_only": odds_only, "sole": sole,
                                         "source": "hgg070", "type": 0, "team": ""}
                                 data.append(tobj)