Python爬虫 requests和create_engine

艾英范
2023-12-01

CSDN项目之爬虫小试

项目背景

说来也巧,闲来无事,在微信中水群正好看到某群友说想爬点LOL(英雄联盟)的比赛数据,索性就要来了网址链接。网址链接:PentaQ官网
作为LPL粉丝,当然是先拿LPL模块做模板,闲话不多说,直接进入主题。
项目整体思路:
1、通过观察发现网站数据是按照赛区/赛季/赛事活动和游戏版本划分模块,换句话说,爬到某个版本中某个赛区某个赛季就可同理抓取其他版本/赛区/赛季的数据
2、以2019 LPL Summer All Patches为例,进行Overview/Team Stats/Player Stats三个模块数据爬取

爬虫语言:Python,这里对于没有Python环境的读者,建议使用Anaconda(Anaconda下载地址
IDE:采用Anaconda自带的Spyder,读者可根据自己使用习惯选择;接下来就是安装本次需要的python库和包,由于作者采用spyder,因此使用Anaconda Prompt进行所需模块的下载
下载模块一般采用:pip install module_name

#python模块导入
import json
import pandas as pd
import time
import requests
import datetime
from sqlalchemy import create_engine
import pymysql
import random
from numpy import *
import re,urllib.request
# from celery_app import app #定时器

Player Stats模块:作者采用的是阿里云MySQL数据库存储,读者可根据自己的存储方式,调整相应代码

 class LPL():
    #选手数据
    def lpl_player_stats(self):
        tour = 59,
        url = 'https://data.pentaq.com/business_api/2018may/tournament_player_duty_data?tour=59&patch='
        patch = '9.6.1', #可细分版本获取
        headers = {
            'Cookie':'抓取你的cookie',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'Host': 'data.pentaq.com',
            'Referer': 'https://data.pentaq.com/PlayerStats?tour=59',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            }
        res = requests.get(url, headers)
        # print(res)
        js = json.loads(res.text)['data']['players_data']
        #记得注释字段
        player_name=[];team_name=[];team_full_name=[];psr=[];player_id=[];team_id=[];appear=[];win=[];win_rate=[];kill=[];dead=[];assist=[];kda=[];battle_rate_per_game=[];
        solo_kill=[];solo_dead=[];ten_minutes_gold_offset_per_game=[];ten_minutes_creeps_per_game=[];
        exp_diff_10m_per_game=[];damage_per_minute=[];damage_percent=[];damage_efficiency=[];tank_per_minute=[];tank_percent=[];
        tank_efficiency=[];put_eye_per_minute=[];destroy_eye_per_minute=[];buy_true_eye_per_minute=[];duty_id=[];lose=[];insert_time=[]
        for data in js:
            # print(data['player_name'])
            player_name.append(data['player_name'])
            team_name.append(data['team_name'])
            team_full_name.append(data['team_full_name'])
            psr.append(data['psr'])
            player_id.append(data['player_id'])
            team_id.append(data['team_id'])
            appear.append(data['appear'])
            win.append(data['win'])
            win_rate.append(data['win_rate'])
            kill.append(data['kill'])
            dead.append(data['dead'])
            assist.append(data['assist'])
            kda.append(data['kda'])
            battle_rate_per_game.append(data['battle_rate_per_game'])
            solo_kill.append(data['solo_kill'])
            solo_dead.append(data['solo_dead'])
            ten_minutes_gold_offset_per_game.append(data['ten_minutes_gold_offset_per_game'])
            ten_minutes_creeps_per_game.append(data['ten_minutes_creeps_per_game'])
            exp_diff_10m_per_game.append(data['exp_diff_10m_per_game'])
            damage_per_minute.append(data['damage_per_minute'])
            damage_percent.append(data['damage_percent'])
            damage_efficiency.append(data['damage_efficiency'])
            tank_per_minute.append(data['tank_per_minute'])
            tank_percent.append(data['tank_percent'])
            tank_efficiency.append(data['tank_efficiency'])
            put_eye_per_minute.append(data['put_eye_per_minute'])
            buy_true_eye_per_minute.append(data['buy_true_eye_per_minute'])
            duty_id.append(data['duty_id'])
            lose.append(data['lose'])
            insert_time.append(datetime.datetime.now().strftime("%Y--%m--%d %H:%M:%S")) #由于工作要求,增加一个数据入库时间字段
        # print(player_name,'\n')
        dic={}
        dic['player_name']=player_name
        dic['team_name']=team_name
        dic['team_full_name']=team_full_name
        dic['psr']=psr
        dic['player_id']=player_id
        dic['team_id']=team_id
        dic['appear']=appear
        dic['win']=win
        dic['win_rate']=win_rate
        dic['kill']=kill
        dic['dead']=dead
        dic['assist']=assist
        dic['kda']=kda
        dic['battle_rate_per_game']=battle_rate_per_game
        dic['solo_kill']=solo_kill
        dic['solo_dead']=solo_dead
        dic['ten_minutes_gold_offset_per_game']=ten_minutes_gold_offset_per_game
        dic['ten_minutes_creeps_per_game']=ten_minutes_creeps_per_game
        dic['exp_diff_10m_per_game']=exp_diff_10m_per_game
        dic['damage_per_minute']=damage_per_minute
        dic['damage_percent']=damage_percent
        dic['damage_efficiency']=damage_efficiency
        dic['tank_per_minute']=tank_per_minute
        dic['tank_percent']=tank_percent
        dic['tank_efficiency']=tank_efficiency
        dic['put_eye_per_minute']=put_eye_per_minute
        dic['buy_true_eye_per_minute']=buy_true_eye_per_minute
        dic['duty_id']=duty_id
        dic['lose']=lose
        dic['insert_time']=insert_time
        df=pd.DataFrame(dic)
        pd.io.sql.to_sql(df, 'lpl_player_stats',create_engine('mysql+pymysql://'user_name':'DB_name'@'password'@'IP地址'/'DB_name'?charset=user_nameutf8mb4'),if_exists='append' ) #读者可使用自己的数据库链接信息

Overview 对于比赛记录中队伍的logo图片信息,MySQL数据库只能存储图片的链接信息,作者通过解析requests.get(url).content存储在本地

#战队ranking数据  
    def lpl_overview(self):
        tour = 69,
        url = 'https://data.pentaq.com/business_api/2018mar/tournament_situation?tour=69'
        headers = {
            'Cookie':'抓取你的cookie',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'Host': 'data.pentaq.com',
            'Referer': 'https://data.pentaq.com/?tour=69',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            }
        res = requests.get(url, headers)
        # print(res.text)
        js = json.loads(res.text)['situations'][0]['teams']
        # print(js)
        team_id=[];name=[];small_logo_url=[];match_win=[];match_lose=[];match_draw=[];match_count=[];game_win=[];game_lose=[];game_count=[];match_win_rate=[];
        game_win_rate=[];rank=[];insert_time=[]
        for i in js:
            # print(i['team_id'],'\n')
            team_id.append(i['team_id'])
            name.append(i['name'])
            small_logo_url.append(i['small_logo_url'])
            match_win.append(i['match_win'])
            match_lose.append(i['match_lose'])
            match_draw.append(i['match_draw'])
            match_count.append(i['match_count'])
            game_win.append(i['game_win'])
            game_lose.append(i['game_lose'])
            game_count.append(i['game_count'])
            match_win_rate.append(i['match_win_rate'])
            game_win_rate.append(i['game_win_rate'])
            rank.append(i['rank'])
            insert_time.append(datetime.datetime.now().strftime("%Y--%m--%d %H:%M:%S"))
        dic={}
        dic['team_id']=team_id
        dic['name']=name
        dic['small_logo_url']=small_logo_url
        dic['match_win']=match_win
        dic['match_lose']=match_lose
        dic['match_draw']=match_draw
        dic['match_count']=match_count
        dic['game_win']=game_win
        dic['game_lose']=game_lose
        dic['game_count']=game_count
        dic['match_win_rate']=match_win_rate
        dic['game_win_rate']=game_win_rate
        dic['rank']=rank
        dic['insert_time']=insert_time
        df=pd.DataFrame(dic)
        # print(df['team_id'][:5])
        pd.io.sql.to_sql(df, 'lpl_overview', create_engine('mysql+pymysql://'user_name':'DB_name'@'password'@'IP地址'/'DB_name'?charset=user_nameutf8mb4'),if_exists='append')
        
    #Pick数据
    def lpl_most_pick(self):
        tour = 69,
        url = 'https://data.pentaq.com/business_api/2018mar/tournament_most_pick?tour=69&team=&patch=&begin=&end=&duty=&born_color='
        headers = {
            'Cookie':'抓取你的cookie',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'Host': 'data.pentaq.com',
            'Referer': 'https://data.pentaq.com/?tour=69',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            }
        res = requests.get(url, headers)
        # print(res.text)
        js = json.loads(res.text)['heros']
        # print(js)
        title=[];logo=[];pick_time=[];win_time=[];duty_ids=[];win_rate=[]
        for i in js:
            # print(i['duty_ids'][0],'\n')
            title.append(i['title'])
            logo.append(i['logo'])
            pick_time.append(i['pick_time'])
            win_time.append(i['win_time'])
            duty_ids.append(i['duty_ids'][0])
            if i['pick_time']!=0:
                win_rate.append(round(i['win_time']/i['pick_time'],3))
            else:
                win_rate.append(0)
        dic_most_pick={}
        dic_most_pick['title']=title
        dic_most_pick['logo']=logo
        dic_most_pick['pick_time']=pick_time
        dic_most_pick['win_time']=win_time
        dic_most_pick['duty_ids']=duty_ids
        dic_most_pick['win_rate']=win_rate
        # print(dic_most_pick)
        df=pd.DataFrame(dic_most_pick)
        # print(df['duty_ids'])
        pd.io.sql.to_sql(df, 'lpl_most_pick', create_engine('mysql+pymysql://'user_name':'DB_name'@'password'@'IP地址'/'DB_name'?charset=user_nameutf8mb4'),if_exists='append')

    
    #获取图片,并存储
    def load_photos(self):
        conn = pymysql.connect(
        host='ip',
        # port='3306',
        user='user_name',
        password='password',
        db='db_name',
        charset='utf8mb4')
        cur = conn.cursor()
        #测试连接
        try:
            insert_sqli = 'select distinct team_a_logo from lpl_matches_new'
            cur.execute(insert_sqli)
            result=cur.fetchall()
            # print(result)
        except Exception as e:
            print("插入数据失败:", e)
        else:
            # 如果是插入数据, 一定要提交数据, 不然数据库中找不到要插入的数据;
            conn.commit()
            print("插入数据成功;")
        # print(type(result))
        for i in result:
            # print(i,'\n')
            # print(i[0],'\n')
            url=i[0]
            text=i[0][51:53]
            text_new=text.replace('/','',)
            f=open('Team id is'+str(text_new)+'.jpg','wb')
            # print(text_new)
            res=requests.get(url)
            # print(res.content)
            buf=res.content
            f.write(buf)
        cur.close()
        conn.close()
        
    #比赛数据(分页加载)
    def lpl_matches_new(self):
        tour = 69,
        url1='https://data.pentaq.com/business_api/2018mar/match_list?tour=69&team=&patch=&begin=&end=&page=1'
        url = 'https://data.pentaq.com/business_api/2018mar/match_list?tour=69&team=&patch=&begin=&end=&page={}'
        headers = {
            'Cookie':'抓取你的cookie',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'Host': 'data.pentaq.com',
            'Referer': 'https://data.pentaq.com/?tour=69',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            }
        res=requests.get(url1,headers)
        # print(res.text)
        count_matches=json.loads(res.text)['count']
        js=json.loads(res.text)['matches']
        length=len(js)
        print(len(js))
        if count_matches%length == 0:
            page_size = int(count_matches/length)
        else:
            page_size = int(count_matches/length) + 1
        print(page_size)
        id=[];team_a_id=[];team_a_name=[];team_a_logo=[];team_a_win=[];team_b_id=[];team_b_name=[];team_b_logo=[];team_b_win=[];start_at=[];bo=[];lol_version=[];
        for page in range(1,page_size+1):
            resp = requests.get(url.format(page), headers)
            # print(resp.text,'\n')
            js=json.loads(resp.text)['matches']
            # print(js)
            for i in js:
                # print(i,'\n')
                id.append(i['id'])
                team_a_id.append(i['team_a_id'])
                team_a_name.append(i['team_a_name'])
                team_a_logo.append(i['team_a_logo'])
                team_a_win.append(i['team_a_win'])
                team_b_id.append(i['team_b_id'])
                team_b_name.append(i['team_b_name'])
                team_b_logo.append(i['team_b_logo'])
                team_b_win.append(i['team_b_win'])
                start_at.append(i['start_at'])
                bo.append(i['bo'])
                lol_version.append(i['lol_version'])
        # print(id,'\n')
        dic_most_pick={}
        dic_most_pick['id']=id
        dic_most_pick['team_a_id']=team_a_id
        dic_most_pick['team_a_name']=team_a_name
        dic_most_pick['team_a_logo']=team_a_logo
        dic_most_pick['team_a_win']=team_a_win
        dic_most_pick['team_b_id']=team_b_id
        dic_most_pick['team_b_name']=team_a_id
        dic_most_pick['team_b_logo']=team_a_name
        dic_most_pick['team_b_win']=team_a_logo
        dic_most_pick['start_at']=team_a_win
        dic_most_pick['bo']=team_b_id 
        dic_most_pick['lol_version']=lol_version 
        df=pd.DataFrame(dic_most_pick)
        print(df)
        pd.io.sql.to_sql(df,'lpl_matches_new',create_engine('mysql+pymysql://'user_name':'DB_name'@'password'@'IP地址'/'DB_name'?charset=user_nameutf8mb4'),if_exists='append')```
       

Team Stats 该模块和Player Stats模块类似

 #战队数据
    def lpl_team_stats(self):
        tour = 69,
        url = 'https://data.pentaq.com/business_api/2018may/tournament_team_data?tour=69&patch='
        headers = {
            'Cookie':'抓取你的cookie',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'Host': 'data.pentaq.com',
            'Referer': 'https://data.pentaq.com/?tour=69',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            }    
        res = requests.get(url, headers)
        # print(res.text)
        js = json.loads(res.text)['data']['teams_data']
        # print(js)
        team_name=[];team_full_name=[];appear=[];win=[];win_rate=[];duration_per_game=[];duration_per_win=[];duration_per_lose=[];kill=[];dead=[];bloodiness=[];minions_percent_per_game=[];
        first_blood_percent=[];first_tower_percent=[];first_dragon_percent=[];first_baron_nashor_percent=[];dragon_percent=[];baron_nashor_percent=[];rift_hearld_percent=[];ten_minutes_gold_offset_per_game=[];
        damage_per_minute=[];tank_per_minute=[];put_eye_per_minute=[];destroy_eye_per_minute=[];buy_true_eye_per_minute=[];lose=[]
        for i in js:
            # print(i,'\n')
            team_name.append(i['team_name'])
            team_full_name.append(i['team_full_name'])
            appear.append(i['appear'])
            win.append(i['win'])
            win_rate.append(i['win_rate'])
            duration_per_game.append(i['duration_per_game'])
            duration_per_win.append(i['duration_per_win'])
            duration_per_lose.append(i['duration_per_lose'])
            kill.append(i['kill'])
            dead.append(i['dead'])
            bloodiness.append(i['bloodiness'])
            minions_percent_per_game.append(i['minions_percent_per_game'])
            first_blood_percent.append(i['first_blood_percent'])
            first_tower_percent.append(i['first_tower_percent'])
            first_dragon_percent.append(i['first_dragon_percent'])
            first_baron_nashor_percent.append(i['first_baron_nashor_percent'])
            dragon_percent.append(i['dragon_percent'])
            baron_nashor_percent.append(i['baron_nashor_percent'])
            rift_hearld_percent.append(i['rift_hearld_percent'])
            ten_minutes_gold_offset_per_game.append(i['ten_minutes_gold_offset_per_game'])
            damage_per_minute.append(i['damage_per_minute'])
            tank_per_minute.append(i['tank_per_minute'])
            put_eye_per_minute.append(i['put_eye_per_minute'])
            destroy_eye_per_minute.append(i['destroy_eye_per_minute'])
            buy_true_eye_per_minute.append(i['buy_true_eye_per_minute'])
            lose.append(i['lose'])
        dic_most_pick={}
        dic_most_pick['team_name']=team_name
        dic_most_pick['team_full_name']=team_full_name
        dic_most_pick['appear']=appear
        dic_most_pick['win']=win
        dic_most_pick['win_rate']=win_rate
        dic_most_pick['duration_per_game']=duration_per_game
        dic_most_pick['duration_per_win']=duration_per_win
        dic_most_pick['duration_per_lose']=duration_per_lose
        dic_most_pick['kill']=kill
        dic_most_pick['dead']=dead
        dic_most_pick['bloodiness']=bloodiness 
        dic_most_pick['minions_percent_per_game']=minions_percent_per_game 
        dic_most_pick['first_blood_percent']=first_blood_percent
        dic_most_pick['first_tower_percent']=first_tower_percent
        dic_most_pick['first_dragon_percent']=first_dragon_percent
        dic_most_pick['first_baron_nashor_percent']=first_baron_nashor_percent
        dic_most_pick['dragon_percent']=dragon_percent
        dic_most_pick['baron_nashor_percent']=baron_nashor_percent
        dic_most_pick['rift_hearld_percent']=rift_hearld_percent
        dic_most_pick['ten_minutes_gold_offset_per_game']=ten_minutes_gold_offset_per_game
        dic_most_pick['damage_per_minute']=damage_per_minute
        dic_most_pick['tank_per_minute']=tank_per_minute
        dic_most_pick['put_eye_per_minute']=put_eye_per_minute 
        dic_most_pick['destroy_eye_per_minute']=destroy_eye_per_minute 
        dic_most_pick['buy_true_eye_per_minute']=buy_true_eye_per_minute 
        dic_most_pick['lose']=lose
        df=pd.DataFrame(dic_most_pick)
        # print(df[0:])
        pd.io.sql.to_sql(df, 'lpl_team_stats', create_engine('mysql+pymysql://'user_name':'DB_name'@'password'@'IP地址'/'DB_name'?charset=user_nameutf8mb4'),if_exists='replace')     

调用模块:

if __name__ =='__main__':
    # LPL().lpl_player_stats()
    # LPL().lpl_overview()
    # LPL().lpl_most_pick()
    # LPL().lpl_matches()
    # LPL().load_photos()
    # LPL().lpl_matches_new()
    #LPL().lpl_team_stats()

#定时器
# @app.task
# def get_test():
#     LPL().lpl_team_stats()
#     return '执行成功'

在调用模块注释了定时器,采用的是celery模块,有兴趣的读者可以去官网阅读文档,这里不做解释说明,第一次写csdn文档,代码写的不好,各位看官,请轻喷!

 类似资料: