A Python script for scraping information on JRA (Japan Racing Association) races targeted for prediction
# -*- coding: utf-8 -*-
import urllib.request
import codecs
import re
import pandas
from bs4 import BeautifulSoup
# Racecourse code (see Sports Navi)
race_course_num="07"
race_no="12"
# Meeting code (1st Tokyo meeting, day 1 -> "0101", etc.)
race_date="0206"
# Map the racecourse code to its name
course_names={"01":"札幌","02":"函館","03":"福島","04":"新潟","05":"東京",
              "06":"中山","07":"中京","08":"京都","09":"阪神","10":"小倉"}
race_course=course_names[race_course_num]
out=codecs.open("./this_race_info.csv","w","utf-8")
out.write("競馬場,レース番号,レース名,コース,周回,距離,馬場状態,枠番,馬番,馬名,性別,馬齢,毛色,調教師,馬体重,増減,斤量,騎手,人気,オッズ,調教コメント,調教評価\n")
# Fetch the Yahoo! keiba entry-table (denma) page; the race ID is "16" (year 2016)
# followed by the course code, the meeting code and the race number
race_html=urllib.request.urlopen("http://keiba.yahoo.co.jp/race/denma/16"+race_course_num+race_date+race_no+"/")
race_soup=BeautifulSoup(race_html,"lxml")
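# Note: a minimal hardening sketch, not in the original. Some hosts reject
# urllib's default User-Agent, and a failed fetch should not die with a bare
# traceback, so a hedged version would look like:
#   import urllib.error
#   url="http://keiba.yahoo.co.jp/race/denma/16"+race_course_num+race_date+race_no+"/"
#   req=urllib.request.Request(url,headers={"User-Agent":"Mozilla/5.0"})
#   try:
#       race_html=urllib.request.urlopen(req)
#   except urllib.error.URLError as e:
#       raise SystemExit("fetch failed: "+str(e))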
# Get the race name
race_name=race_soup.find_all("h1",class_="fntB")
race_name=re.sub(r"<[^>]*?>","",str(race_name))
race_name=re.sub(r"[ \n\[\]]","",str(race_name))
# Workaround for 大阪-ハンブルグカップ (full-width dash in the name)
race_name=re.sub(r"—","-",str(race_name))
# Workaround for 19XX〜19XXsダービーメモリーズ (wave dash in the name)
race_name=re.sub(r"〜","-",str(race_name))
# Strip the running number (第◯回) from graded-race names
race_name=re.sub(r"第.*?回","",str(race_name))
# Get the course type and distance
race_info=race_soup.find_all("p",class_="fntSS gryB",attrs={'id':'raceTitMeta'})
track=re.sub(r"\n","",str(race_info))
track=re.sub(r" \[.*","",str(track))
track=re.sub(r"[\[m]","",str(track))
track=re.sub("・外","",str(track))
track=re.sub("・内","",str(track))
track=re.sub(r"・"," ",str(track))
track=re.sub(r" ",",",str(track))
track=re.sub(r"<[^>]*?>","",str(track))
# Get the track condition (taken from the alt text of a small icon)
cond=race_soup.find_all("img",attrs={'width':'25'})
cond=re.sub(r"\[<img alt=\"","",str(cond))
cond=re.sub(r"\" border.*$","",str(cond))
# Horse name, sex, age, coat colour and trainer from the entry table
horse_info=race_soup.find_all("td",class_="fntN")
horse_info=re.sub(r"<[^>]*?>","",str(horse_info))
horse_info=re.sub(r",","",str(horse_info))
horse_info=re.sub(r"毛 ","毛,",str(horse_info))
# Workaround for trainer 石毛: the surname also ends in 毛 (like the coat colours), so undo the split there
horse_info=re.sub(r"石毛,","石毛 ",str(horse_info))
horse_info=re.sub(r"牡","牡,",str(horse_info))
horse_info=re.sub(r"牝","牝,",str(horse_info))
horse_info=re.sub(r"せん","せん,",str(horse_info))
horse_info=re.sub(r"/",",",str(horse_info))
horse_info=re.sub(r"\n",",",str(horse_info))
horse_info=re.sub(r"\) ,",")\n",str(horse_info))
horse_info=re.sub(r", ,","\n",str(horse_info))
horse_info=re.sub(r"[\[\]]","",str(horse_info))
horse_info=re.sub(r"^,","",str(horse_info))
horse_info=horse_info.split("\n")
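# Note: the regex pipeline above is tightly coupled to the 2016 markup. A
# structured sketch (same assumption about the denma table) would keep each
# cell's text separate instead of splitting one big string:
#   horse_cells=[td.get_text(" ",strip=True) for td in race_soup.find_all("td",class_="fntN")]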
# Carried weight (斤量): values such as 57.0
weight_tax=race_soup.find_all("td",class_="txC")
weight_tax=re.sub(r"<[^>]*?>","",str(weight_tax))
weight_tax=re.findall(r"[456][0-9]\.[05]",str(weight_tax))
# Body weight: three digits followed by "("
weight=race_soup.find_all("td",class_="txC")
weight=re.sub(r"<[^>]*?>","",str(weight))
weight=re.findall(r"[0-9][0-9][0-9]\(",str(weight))
weight=re.sub(r"\(","",str(weight))
weight=re.sub(r"[\[\'\] ]","",str(weight))
weight=weight.split(",")
# For a horse with no recorded weight, insert a placeholder by hand, e.g. weight.insert(12,"NA")
# Weight change since the previous start: the value in parentheses
weight_change=race_soup.find_all("td",class_="txC")
weight_change=re.sub(r"<[^>]*?>","",str(weight_change))
weight_change=re.findall(r"\(.*\)",str(weight_change))
weight_change=re.sub(r"[\(\)\+]","",str(weight_change))
weight_change=re.sub(r"[\[\'\] ]","",str(weight_change))
weight_change=weight_change.split(",")
# The same placeholder trick applies here, e.g. weight_change.insert(12,"NA")
# Jockey names (Yahoo's directory path is spelled "jocky")
jockey=race_soup.find_all("a",href=re.compile("/directory/jocky"))
jockey=re.sub(r"<[^>]*?>","",str(jockey))
jockey=re.sub(r"[\[\]]","",str(jockey))
jockey=jockey.split(", ")
# Get the odds and popularity from netkeiba.com
odds_url="http://race.netkeiba.com/?pid=race_old&id=c2016"+race_course_num+race_date+race_no
odds_df=pandas.read_html(odds_url)
# In the first table, column 0 is the gate, 1 the horse number, 9 the odds and
# 10 the popularity; the data rows start at index 3
horse_gate=odds_df[0][0][3:]
horse_no=odds_df[0][1][3:]
odds=odds_df[0][9][3:]
popularity=odds_df[0][10][3:]
# Re-order everything by horse number
horse_gate_list=[0]*len(horse_gate)
horse_no_list=[0]*len(horse_no)
odds_list=[0]*len(horse_no)
popularity_list=[0]*len(horse_no)
j=3
for i in horse_no:
    horse_gate_list[int(i)-1]=horse_gate[j]
    horse_no_list[int(i)-1]=horse_no[j]
    odds_list[int(i)-1]=odds[j]
    popularity_list[int(i)-1]=popularity[j]
    j+=1
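# Note: a sketch of the same re-ordering done inside pandas (assuming the
# integer column labels used above): drop the header rows and sort by the
# horse-number column instead of filling pre-allocated lists by hand:
#   table=odds_df[0].iloc[3:].copy()
#   table[1]=table[1].astype(int)   # horse number as int, not string
#   table=table.sort_values(by=1)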
# Get the training (oikiri) comments and marks from netkeiba.com
train_url="http://race.netkeiba.com/?pid=race_old&id=c2016"+race_course_num+race_date+race_no+"&mode=oikiri"
train_df=pandas.read_html(train_url)
# Column 3 is the training comment and column 4 the mark; data starts at index 1
train_comment=train_df[0][3][1:]
train_mark=train_df[0][4][1:]
train_comment_list=[0]*len(train_comment)
train_mark_list=[0]*len(train_mark)
j=1
for i in range(1,len(train_mark_list)+1):
    train_comment_list[int(i)-1]=train_comment[j]
    train_mark_list[int(i)-1]=train_mark[j]
    j+=1
# Print each row and append it to the CSV
for i in range(0,len(horse_info)):
    print(race_course+","+race_no+","+race_name+","+track+","+cond+","+str(horse_gate_list[i])+","+str(horse_no_list[i])+","+str(horse_info[i])+","+str(weight[i])+","+str(weight_change[i])+","+str(weight_tax[i])+","+str(jockey[i])+","+str(popularity_list[i])+","+str(odds_list[i])+","+str(train_comment_list[i])+","+str(train_mark_list[i]))
    out.write(race_course+","+race_no+","+race_name+","+track+","+cond+","+str(horse_gate_list[i])+","+str(horse_no_list[i])+","+str(horse_info[i])+","+str(weight[i])+","+str(weight_change[i])+","+str(weight_tax[i])+","+str(jockey[i])+","+str(popularity_list[i])+","+str(odds_list[i])+","+str(train_comment_list[i])+","+str(train_mark_list[i])+"\n")
out.close()
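One fragility worth noting: the rows above are assembled by joining fields with "," by hand, so any field that ever contains a comma silently shifts the columns. A minimal sketch of the same output step using the standard csv module (same variables as the script; track and horse_info are themselves comma-joined multi-field strings and are split back into individual columns):

import csv

header="競馬場,レース番号,レース名,コース,周回,距離,馬場状態,枠番,馬番,馬名,性別,馬齢,毛色,調教師,馬体重,増減,斤量,騎手,人気,オッズ,調教コメント,調教評価"
with open("./this_race_info.csv","w",encoding="utf-8",newline="") as f:
    writer=csv.writer(f)
    writer.writerow(header.split(","))
    for i in range(len(horse_info)):
        # csv.writer quotes any field that needs it, so commas inside a
        # comment or name can no longer break the column layout
        writer.writerow([race_course,race_no,race_name]+track.split(",")
                        +[cond,horse_gate_list[i],horse_no_list[i]]
                        +str(horse_info[i]).split(",")
                        +[weight[i],weight_change[i],weight_tax[i],jockey[i],
                          popularity_list[i],odds_list[i],
                          train_comment_list[i],train_mark_list[i]])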