小小的粉丝 发表于 2019-4-30 16:53

scrapy框架实现爬取球探网相关数据



项目简介:

球探中的英超比赛
1. 球队的信息
   (球队ID,名字,创建时间,城市,训练场,风格特点,胜率相关)
http://zq.win007.com/cn/team/Summary/19.html
2. 从2013年到2019年所有的比赛
   (比赛id, host_id, guest_id, 比赛的信息)
http://zq.win007.com/cn/League/2018-2019/36.html
3. 需要找到每个球员在上面的比赛中的数据
   (球员的名字, 比赛id, 球队id, 这个球员在这场比赛中的数据)
点击比分->球员统计
http://bf.win007.com/Count/1552443cn.htm


存到MySQL中


spider页面
# -*- coding: utf-8 -*-
import scrapy
import re,time
from qiutan.items import SaichengItem
from qiutan.items import Team_DataItem
from qiutan.items import Member_Data_New_Item
from qiutan.items import Member_Data_Old_Item

class EcSpider(scrapy.Spider):

    name = 'Ec'
    allowed_domains = ['zq.win007.com','bf.win007.com']

    #将不同年份url交给Scheduler
    def start_requests(self):
      re = time.strftime('%Y%m%d%H', time.localtime())# 2019042509
      base_url = 'http://zq.win007.com/jsData/matchResult/{}/s36.js?version={}'
      date_lis = ['{}-{}'.format(i,i+1) for i in range(2013,2019)]
      for date in date_lis:
            req_base = scrapy.Request(base_url.format(date,re), callback = self.parse)
            req_base.meta['date'] = date
            req_base.meta['re'] = re
            yield req_base


    def team_data_id(self,response):
      # 获取每个队伍的id和队名
      pat = re.compile("\[(\d+),'(.*?)'")
      ballteam = pat.findall(response.text)
      lis_all_team = []
      for item in ballteam:
            lis_all_team.append(item)
            lis_all_team.append(item[-1])
      return lis_all_team


    #表2 全部轮次的数据表
    def parse(self, response):
      #获取球队id_队名列表
      lis_all_team = self.team_data_id(response)
      #获取每年所有队伍数据 38轮
      ball_lunci_team = re.findall('\[(\[\d{3,}.*?\])\];',response.text)
      num = 0
      #根据38轮遍历每一小轮
      for eve_turn in ball_lunci_team:
            #每小页数据
            item = SaichengItem()
            num += 1
            # 每轮次的10条数据
            eve_turn_team = re.findall('\[\d{6}.*?\]',eve_turn)
            for eve_turn_team_data in eve_turn_team:
                #将每行数据转化为list类型 索引取值
                #[851543,36,-1,'2013-08-17 19:45',25,58,'1-0','1-0','7',
                # '13',1.25,0.5,'2.5/3','1',1,1,1,1,0,0,'']
                lis = eve_turn_team_data.strip('[|]').replace('\'','').split(',')
                #根据获取的战队id去之前的列表找索引位置
                index_num_h = lis_all_team.index(lis)
                index_num_g = lis_all_team.index(lis)
                item['lunci'] = num
                bs_num_id = lis
                item['bs_time'] = lis      #2014-05-04 23:00 <class 'str'>
                item['bs_num_id'] = bs_num_id
                item['host_team'] = lis_all_team
                item['h_team_id'] = lis
                item['res_score'] = lis
                item['guest_team'] = lis_all_team
                item['g_team_id'] = lis
                item['all_rang'] = self.rangqiu(lis)
                item['half_rang'] = self.rangqiu(lis)
                item['sizes_balls_a'] = lis
                item['sizes_balls_h'] = lis
                item['half_score'] = lis
                yield item
                # 拼接每个比赛详细的url http://bf.win007.com/detail/ 1130517 cn.htm
                # 2013-08-17 ,2014-5-12 老版页面判断年份 保存版本
                if item['bs_time'] < '2014-05-12 0:00':
                  url = 'http://bf.win007.com/detail/{}cn.htm'.format(bs_num_id)
                  req = scrapy.Request(url, callback=self.bs_data_old)
                  req.meta['bs_num_id'] = bs_num_id
                  req.meta['l_team_id'] = lis
                  req.meta['r_team_id'] = lis
                  yield req

                else:
                  url = 'http://bf.win007.com/Count/{}cn.htm'.format(bs_num_id)
                  req = scrapy.Request(url, callback=self.bs_data_new)
                  req.meta['bs_num_id'] = bs_num_id
                  req.meta['l_team_id'] = lis
                  req.meta['r_team_id'] = lis
                  yield req

      team_url = 'http://zq.win007.com/jsData/teamInfo/teamDetail/tdl{}.js?version={}'
      #根据 偶数索引 取 球队id
      for i in range(len(lis_all_team)):
            if i%2 == 0:
                url = team_url.format(lis_all_team,response.meta['re'])
                req = scrapy.Request(url,callback=self.team_data)
                #加上防盗链获取接口
                req.meta['Referer'] = 'http://zq.win007.com/cn/team/Summary/{}.html'.format(lis_all_team)
                yield req

    #每场比赛队员数据: 新版
    def bs_data_new(self,response):
      #实例化Item
      item = Member_Data_New_Item()
      #分别 取上下两个队伍的信息
      member_lis_tr_s = response.xpath('//div[@id="content"]/div/table//tr')
      member_lis_tr_x = response.xpath('//div[@id="content"]/div/table//tr')
      for member_lis in member_lis_tr_s:
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['l_team_id']
            item['member_id'] = member_lis.xpath('./td/text()').extract_first()
            item['member_name'] = member_lis.xpath('./td/a//text()').extract_first().strip()
            item['position'] = member_lis.xpath('./td/text()').extract_first().strip()
            item['shoot_d'] = member_lis.xpath('./td/text()').extract_first()
            item['shoot_z'] = member_lis.xpath('./td/text()').extract_first()
            item['key_ball'] = member_lis.xpath('./td/text()').extract_first()
            item['guoren'] = member_lis.xpath('./td/text()').extract_first()
            item['chuanq_count'] = member_lis.xpath('./td/text()').extract_first()
            item['chuanq_succ'] = member_lis.xpath('./td/text()').extract_first()
            item['passing'] = member_lis.xpath('./td/text()').extract_first()
            item['hengchuanc'] = member_lis.xpath('./td/text()').extract_first()
            item['success_zd'] = member_lis.xpath('./td/text()').extract_first()
            item['body_jc'] = member_lis.xpath('./td/text()').extract_first()
            item['score'] = member_lis.xpath('./td/text()').extract_first()
            item['key_event'] = member_lis.xpath('./td/a/img/@title').extract_first()
            yield item

      for member_lis in member_lis_tr_x:
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['r_team_id']
            item['member_id'] = member_lis.xpath('./td/text()').extract_first()
            item['member_name'] = member_lis.xpath('./td/a/text()').extract_first().strip()
            item['position'] = member_lis.xpath('./td/text()').extract_first().strip()
            item['shoot_d'] = member_lis.xpath('./td/text()').extract_first()
            item['shoot_z'] = member_lis.xpath('./td/text()').extract_first()
            item['key_ball'] = member_lis.xpath('./td/text()').extract_first()
            item['guoren'] = member_lis.xpath('./td/text()').extract_first()
            item['chuanq_count'] = member_lis.xpath('./td/text()').extract_first()
            item['chuanq_succ'] = member_lis.xpath('./td/text()').extract_first()
            item['passing'] = member_lis.xpath('./td/text()').extract_first()
            item['hengchuanc'] = member_lis.xpath('./td/text()').extract_first()
            item['success_zd'] = member_lis.xpath('./td/text()').extract_first()
            item['body_jc'] = member_lis.xpath('./td/text()').extract_first()
            item['score'] = member_lis.xpath('./td/text()').extract_first()
            item['key_event'] = member_lis.xpath('./td/a/img/@title').extract_first()

            yield item

    def bs_data_old(self,response):
      #获取13年左边的阵容数据和后备数据,返回列表[含字符串,]
      member_lis_l1 = response.xpath("/html/body/table/tr/td/table/tr/td/a//text()").extract()
      member_lis_l2 = response.xpath("/html/body/table/tr/td/table/tr/td/a/text()").extract()
      # 获取13年右边的阵容数据和后备数据
      member_lis_r1 = response.xpath("/html/body/table/tr/td/table/tr/td/a/text()").extract()
      member_lis_r2 = response.xpath("/html/body/table/tr/td/table/tr/td/a/text()").extract()
      item = Member_Data_Old_Item()

      #将阵容和后备列表合并
      member_lis_l = member_lis_l1 + member_lis_l2
      member_lis_r = member_lis_r1 + member_lis_r2
      # 遍历每个元组(球员号,球员名字)
      for member in member_lis_l:
            res = member.strip()
            member_list = re.findall('(\d+)\s?(.*)', res)    #('22', '雅斯科莱宁') ('11', '麦加')
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['l_team_id']
            item['member_id'] = member_list
            item['member_name'] = member_list

            yield item

      for member in member_lis_r:

            res = member.strip()      # 1切赫
            member_list = re.findall('(\d+)\s+(.*)', res)    # ('17', '奥布莱恩')
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['r_team_id']
            item['member_id'] = member_list
            item['member_name'] = member_list
            yield item

    #球队信息
    def team_data(self,response):
      """Build and yield one ``Team_DataItem`` from a team-detail response.

      Pulls the ``teamDetail`` array, the coach entry, the three groups of
      team traits and the ``countSum`` statistics out of ``response.text``
      with regular expressions and formats the statistics into one string.

      NOTE(review): this method cannot run as written.  ``re.findall``
      returns a list and ``eval`` rejects a list, and every field below is
      assigned the same object.  It looks as if numeric subscripts
      (``teamDetail[0]``, ``teamDetail_lis[n]``, ``s[n]``) were lost; the
      intended positions cannot be recovered from this file and must be
      confirmed against the live data.
      """
      # First data row: the contents of the ``var teamDetail = [...]`` array.
      teamDetail = re.findall('var teamDetail = \[(\d+.*)\]',response.text)
      # SECURITY: eval() would execute text downloaded from the site; a
      # literal-only parser such as ast.literal_eval is the safer choice.
      teamDetail_lis = eval(teamDetail)
      # Coach name, captured from the ``var coach = [[...]]`` entry.
      var_coach =re.findall("var coach = \[\['\d+','','(.*?)','.*','.*',\d\]\];",response.text)
      item = Team_DataItem()
      # NOTE(review): each field presumably took a different element of
      # teamDetail_lis - TODO confirm the positions.
      item['team_id'] = teamDetail_lis
      item['team_name'] = teamDetail_lis
      item['Eng_name'] = teamDetail_lis
      item['team_city'] = teamDetail_lis
      item['team_home'] = teamDetail_lis
      item['build_team_time'] = teamDetail_lis
      # NOTE(review): the guard presumably protected an index lookup into an
      # empty match list; as written the whole findall list is stored.
      try:
            item['var_coach'] = var_coach
      except:
            item['var_coach'] = 'NULL'

      # Team traits: entries tagged 1, 2 and 3, each stored as the string
      # form of the list of captured texts.
      item['team_youshi'] = str(re.findall('\[1,\d,"(.*?)\^',response.text))
      item['team_ruodian'] = str(re.findall('\[2,\d,"(.*?)\^',response.text))
      item['team_style'] = str(re.findall('\[3,\d,"(.*?)\^',response.text))
      # First row of the ``var countSum = [[...]]`` statistics array.
      team_stats_lis = re.findall('var countSum = \[\[(\'.*?)\]',response.text)
      # SECURITY: same eval() concern as above.
      stats_tuple = eval(team_stats_lis)

      s = stats_tuple

      # Win rate = wins / (wins + draws + losses), going by the labels in
      # str_stats.  NOTE(review): the subscripts into ``s`` are missing here
      # and in ``data`` - TODO confirm the column order.
      winrate = int(s)/(int(s)+int(s)+int(s))
      data = (s,s,s,winrate,s,s,s,s,s,(s),s,(s),s,s,s)
      str_stats = '全部:胜:%s,平:%s,负:%s,胜率:%.3f,犯规:%s,黄牌:%s,红牌:%s,' \
                  '控球率:%s,射门(射正):%s(%s),传球(成功):%s(%s),传球成功率:%s,过人次数:%s,评分:%s'
      item['team_stats'] = str_stats % (data)
      yield item


    def rangqiu(self,num_rang):
      if num_rang == '0':
            return '平手'
      elif num_rang == '0.25':
            return '平/半'
      elif num_rang == '0.5':
            return '半球'
      elif num_rang == '0.75':
            return '半/一'
      elif num_rang == '1':
            return '一球'
      elif num_rang == '1.25':
            return '一/球半'
      elif num_rang == '1.5':
            return '球半'
      elif num_rang == '1.75':
            return '半/二'
      elif num_rang == '2':
            return '二球'
      elif num_rang == '2.25':
            return '二/半'
      elif num_rang == '-0.25':
            return '*平/半'
      elif num_rang == '-0.5':
            return '*半球'
      elif num_rang == '-0.75':
            return '*半/一'
      elif num_rang == '-1':
            return '*一球'
      elif num_rang == '-1.25':
            return '*一/球半'
      elif num_rang == '-1.5':
            return '*球半'
      else:
            return '暂未收录'

items页面

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QiutanItem(scrapy.Item):
    """Empty item scaffold; it declares no fields."""

class SaichengItem(scrapy.Item):
    """One match row of the schedule, written to the ``all_bs_data`` table."""

    lunci = scrapy.Field()
    bs_num_id = scrapy.Field()
    bs_time = scrapy.Field()
    host_team = scrapy.Field()
    h_team_id = scrapy.Field()
    res_score = scrapy.Field()
    guest_team = scrapy.Field()
    g_team_id = scrapy.Field()
    all_rang = scrapy.Field()
    half_rang = scrapy.Field()
    sizes_balls_a = scrapy.Field()
    sizes_balls_h = scrapy.Field()
    half_score = scrapy.Field()

    def get_insert_data(self):
        """Return ``(insert_sql, data)`` for a parameterised INSERT.

        ``data`` holds this item's values in the order of the placeholders;
        the statement's leading ``null`` fills the first table column.
        Raises ``KeyError`` if any listed field has not been set.
        """
        value_order = (
            'lunci', 'bs_num_id', 'bs_time', 'host_team', 'h_team_id',
            'res_score', 'guest_team', 'g_team_id', 'all_rang', 'half_rang',
            'sizes_balls_a', 'sizes_balls_h', 'half_score',
        )
        values = tuple(self[field] for field in value_order)
        return (
            'INSERT INTO all_bs_data values (null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            values,
        )


#all_bs_data 建表语句
# CREATE TABLE all_bs_data(id INT PRIMARY KEY AUTO_INCREMENT,
# lunci TINYINT,
# bs_time VARCHAR(20),
# host_team VARCHAR(20),
# h_team_id VARCHAR(6),
# res_score VARCHAR(10),
# guest_team VARCHAR(20),
# g_team_id VARCHAR(6),
# all_rang VARCHAR(6),
# half_rang VARCHAR(6),
# sizes_balls_a VARCHAR(6),
# sizes_balls_h VARCHAR(6),
# half_score VARCHAR(6)
# )DEFAULT CHARSET=utf8mb4;
# alter table all_bs_data add bs_num_id int after lunci;



class Team_DataItem(scrapy.Item):
    """Profile and statistics of one team, written to ``all_team_data``."""

    team_id = scrapy.Field()
    team_name = scrapy.Field()
    Eng_name = scrapy.Field()
    team_city = scrapy.Field()
    team_home = scrapy.Field()
    build_team_time = scrapy.Field()
    var_coach = scrapy.Field()
    team_youshi = scrapy.Field()
    team_style = scrapy.Field()
    team_ruodian = scrapy.Field()
    team_stats = scrapy.Field()

    def get_insert_data(self):
        """Return ``(insert_sql, data)`` for a parameterised INSERT.

        ``data`` holds this item's values in the order of the column list
        named in the statement.  Raises ``KeyError`` if any listed field
        has not been set.
        """
        value_order = (
            'team_id', 'team_name', 'Eng_name', 'team_city', 'team_home',
            'build_team_time', 'var_coach', 'team_youshi', 'team_style',
            'team_ruodian', 'team_stats',
        )
        values = tuple(self[field] for field in value_order)
        return (
            'INSERT INTO all_team_data(team_id,team_name,Eng_name,team_city,team_home,build_team_time,var_coach,team_youshi,team_style,team_ruodian,team_stats)values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            values,
        )

# CREATE TABLE all_team_data(id INT PRIMARY KEY AUTO_INCREMENT,
# team_id INT,
# team_name VARCHAR(20),
# Eng_name VARCHAR(30),
# team_city VARCHAR(30),
# team_home VARCHAR(30),
# build_team_time VARCHAR(20),
# var_coach VARCHAR(20),
# team_youshi VARCHAR(200),
# team_style VARCHAR(200),
# team_ruodian VARCHAR(200),
# team_stats VARCHAR(300)
# )DEFAULT CHARSET=utf8mb4;

class Member_Data_New_Item(scrapy.Item):
    """Per-match statistics of one player, written to ``all_member_data``."""

    bs_num_id = scrapy.Field()
    team_id = scrapy.Field()
    member_id = scrapy.Field()
    member_name = scrapy.Field()
    position = scrapy.Field()
    shoot_d = scrapy.Field()
    shoot_z = scrapy.Field()
    key_ball = scrapy.Field()
    guoren = scrapy.Field()
    chuanq_count = scrapy.Field()
    chuanq_succ = scrapy.Field()
    passing = scrapy.Field()
    hengchuanc = scrapy.Field()
    success_zd = scrapy.Field()
    body_jc = scrapy.Field()
    score = scrapy.Field()
    key_event = scrapy.Field()

    def get_insert_data(self):
        """Return ``(insert_sql, data)`` for a parameterised INSERT.

        ``data`` holds this item's values in the order of the placeholders;
        the statement's leading ``null`` fills the first table column.
        Raises ``KeyError`` if any listed field has not been set.
        """
        value_order = (
            'bs_num_id', 'team_id', 'member_id', 'member_name', 'position',
            'shoot_d', 'shoot_z', 'key_ball', 'guoren', 'chuanq_count',
            'chuanq_succ', 'passing', 'hengchuanc', 'success_zd', 'body_jc',
            'score', 'key_event',
        )
        values = tuple(self[field] for field in value_order)
        return (
            'INSERT INTO all_member_data values (null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            values,
        )


# CREATE TABLE all_member_data(id INT PRIMARY KEY AUTO_INCREMENT,
# bs_num_id INT,
# team_id INT,
# member_id INT,
# member_name VARCHAR(30),
# position VARCHAR(10),
# shoot_d INT,
# shoot_z INT,
# key_ball INT,
# guoren INT,
# chuanq_count INT,
# chuanq_succ INT,
# passing VARCHAR(200),
# hengchuanc INT,
# success_zd INT,
# body_jc INT,
# score FLOAT,
# key_event VARCHAR(20)
# )DEFAULT CHARSET=utf8mb4;

class Member_Data_Old_Item(scrapy.Item):
    """Line-up entry of one player; fills four ``all_member_data`` columns."""

    bs_num_id = scrapy.Field()
    team_id = scrapy.Field()
    member_id = scrapy.Field()
    member_name = scrapy.Field()

    def get_insert_data(self):
        """Return ``(insert_sql, data)`` for a parameterised INSERT.

        ``data`` holds this item's values in the order of the column list
        named in the statement.  Raises ``KeyError`` if any listed field
        has not been set.
        """
        value_order = ('bs_num_id', 'team_id', 'member_id', 'member_name')
        values = tuple(self[field] for field in value_order)
        return (
            'INSERT INTO all_member_data(bs_num_id,team_id,member_id,member_name) values (%s,%s,%s,%s)',
            values,
        )

settings页面
# -*- coding: utf-8 -*-

# Scrapy settings for qiutan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#   https://doc.scrapy.org/en/latest/topics/settings.html
#   https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Project name used by Scrapy.
BOT_NAME = 'qiutan'

# Package searched for spiders, and the package new spiders are created in.
SPIDER_MODULES = ['qiutan.spiders']
NEWSPIDER_MODULE = 'qiutan.spiders'

# Log level: only messages of level WARNING and above are logged.
LOG_LEVEL='WARNING'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# (left disabled here; a User-Agent is supplied through
# DEFAULT_REQUEST_HEADERS instead)
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'

# Obey robots.txt rules
# (False: robots.txt is not consulted before requests are sent)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# The browser User-Agent and the fixed Referer below are the defaults for
# every request that does not set these headers itself.
DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    'Referer':'http://zq.win007.com/cn/TeamHeadPage/2013-2014/36.html'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'qiutan.middlewares.QiutanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'qiutan.middlewares.QiutanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Only the MySQL pipeline is enabled; the value 300 is its order number
# (pipelines with lower numbers run first).
ITEM_PIPELINES = {
   # 'qiutan.pipelines.QiutanPipeline': 300,
   'qiutan.pipelines.MySql_data_Pipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


pipelines页面
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from qiutan.db_sql import MySql

class QiutanPipeline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item

class MySql_data_Pipeline(object):
    """Pipeline that stores every item exposing ``get_insert_data`` in MySQL.

    The connection parameters default to the values this project has always
    used, so ``MySql_data_Pipeline()`` behaves as before.  When Scrapy builds
    the pipeline through ``from_crawler`` the optional settings
    ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD``, ``MYSQL_DATABASE``
    and ``MYSQL_PORT`` override those defaults.
    """

    def __init__(self, host='localhost', user='root', password='123456',
                 database='second', port=3306):
        """Open the database connection through the ``MySql`` helper."""
        self.db = MySql(host, user, password, database, port)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings."""
        settings = crawler.settings
        return cls(
            host=settings.get('MYSQL_HOST', 'localhost'),
            user=settings.get('MYSQL_USER', 'root'),
            password=settings.get('MYSQL_PASSWORD', '123456'),
            database=settings.get('MYSQL_DATABASE', 'second'),
            port=settings.getint('MYSQL_PORT', 3306),
        )

    def process_item(self, item, spider):
        """Insert ``item`` when it can describe its own INSERT; return it.

        Items without a ``get_insert_data`` method pass through untouched.
        """
        if hasattr(item, 'get_insert_data'):
            insert_sql, data = item.get_insert_data()
            self.db.update(insert_sql, data)
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        close = getattr(self.db, 'close', None)
        if close is not None:
            close()
        else:
            # Helper without a close() method: close what it exposes.
            self.db.cursor.close()
            self.db.db.close()

数据库模块页面
import pymysql

class MySql:
    """Thin wrapper around one pymysql connection and one dict cursor."""

    def __init__(self, host, user, password, database, port):
        """Connect to the server and open a ``DictCursor`` on the connection."""
        self.db = pymysql.connect(host=host, user=user, password=password, database=database,
                                  port=port, cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.db.cursor()

    def update(self, sql, data):
        """Execute a data-changing statement and commit it.

        Best effort: on failure the transaction is rolled back and the
        statement, its parameters and the error are printed; nothing is
        raised.  ``KeyboardInterrupt`` and ``SystemExit`` are not
        intercepted, so the crawl can still be stopped.
        """
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception as exc:
            self.db.rollback()
            print('数据修改失败,请检查sql语句~')
            print(sql, data)
            print(exc)

    def query(self, sql, data):
        """Execute a statement and return what ``cursor.execute`` returns.

        The rows themselves are not fetched here; read them from
        ``self.cursor`` afterwards.  On failure a message and the error are
        printed and ``None`` is returned.
        """
        try:
            return self.cursor.execute(sql, data)
        except Exception as exc:
            print('数据查询失败,请查看sql语句~')
            print(exc)
            return None

    def close(self):
        """Close the cursor and then the connection."""
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    # Manual connectivity check: running this module directly only opens a
    # connection with the hard-coded localhost parameters.
    db = MySql('localhost','root','123456','second',3306)






℡小疯、 发表于 2019-4-30 17:17

膜拜大神

mrleochan 发表于 2019-4-30 23:02

挺好玩的,python爬虫不少啊

bosseing 发表于 2019-5-1 10:39

很厉害了啊。

sunxiaolu 发表于 2019-5-27 12:21

楼主,能不能加一下微信。看了你的球探抓取数据那个帖子而来,深有感触。微信:四七八三一五七七七

小小的粉丝 发表于 2019-5-27 20:39

sunxiaolu 发表于 2019-5-27 12:21
楼主,能不能加一下微信。看了你的球探抓取数据那个帖子而来,深有感触。微信:四七八三一五七七七

emmm 有问题就在这里问吧

leoncheng25 发表于 2020-2-16 11:29

能爬到球探网里的V推荐购买后的推荐吗?

szp0305 发表于 2020-7-24 16:38

写的不错,python我只会皮毛,之前都是用c#爬取的
页: [1]
查看完整版本: scrapy框架实现爬取球探网相关数据