Spaces:
Runtime error
Runtime error
File size: 10,708 Bytes
8920568 b3c9d45 763a979 4f59094 7e798dd b3c9d45 7e798dd b3c9d45 4f59094 b3c9d45 7e798dd b3c9d45 7e798dd b3c9d45 7e798dd b3c9d45 359e36e 8ad803e 76b43fa 8ad803e 76b43fa 1cbad75 83738aa 5c45673 b63dcce b3c9d45 359e36e b3c9d45 a23b013 1cbad75 4fe001a b3c9d45 a23b013 b3c9d45 c8ca045 b3c9d45 1cbad75 6a2c3c8 4f59094 1cbad75 359e36e 4f59094 1cbad75 a23b013 7e493b4 1cbad75 5c45673 4f59094 359e36e 1cbad75 359e36e 4fe001a 230f893 359e36e 86fd1b3 359e36e 6a2c3c8 359e36e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 |
import altair
import gradio as gr
from math import sqrt
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
import requests
from bs4 import BeautifulSoup as bs
from requests_html import AsyncHTMLSession
import codecs
import io
import random
import requests
import time
from datetime import date, timedelta
from tqdm import tqdm
from typing import Generator, Tuple
import numpy as np
import pandas as pd
def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Yield successive dates from ``start`` up to, but not including, ``stop``.

    Each yielded date is ``step`` later than the previous one (one day by
    default). Nothing is yielded when ``start`` is not earlier than ``stop``.
    """
    cursor = start
    while True:
        if not cursor < stop:
            return
        yield cursor
        cursor = cursor + step
def get_url(download_date: date) -> Tuple[str, str]:
    """Build the market-report CSV URL for one day.

    Returns a ``(url, day)`` pair, where ``day`` is ``download_date``
    formatted as ``YYYYMMDD``. The URL path contains the ``YYYYMM`` month
    followed by that same day string.
    """
    day = download_date.strftime("%Y%m%d")
    month = download_date.strftime("%Y%m")
    url = f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv"
    return url, day
def content_wrap(content):
    """Drop every line that precedes the CSV header row.

    ``content`` is raw bytes; each line is decoded as Shift-JIS. The first
    line containing "品名" is taken as the header, and it plus everything
    after it is returned as an ``io.StringIO``. If no such line exists the
    returned buffer is empty.
    """
    kept = []
    for raw_line in io.BytesIO(content):
        text = codecs.decode(raw_line, "shift-jis")
        # Once the header has been stored, every later line is kept as well.
        if kept or "品名" in text:
            kept.append(text)
    return io.StringIO("".join(kept))
def insert_data(data, day, low_price, center_price, high_price, quantity):
    """Append one day's record to ``data``, a dict of per-column lists.

    The lists under "date", "low_price", "center_price", "high_price" and
    "quantity" each receive one new element; ``data`` is mutated in place.
    """
    record = {
        "date": day,
        "low_price": low_price,
        "center_price": center_price,
        "high_price": high_price,
        "quantity": quantity,
    }
    for column, value in record.items():
        data[column].append(value)
def to_numeric(x):
    """Return ``float(x)`` when ``x`` is a string; otherwise return ``x`` unchanged."""
    return float(x) if isinstance(x, str) else x
def _is_nan(value) -> bool:
    """Return True only for a float NaN; strings, ints and None are never NaN here."""
    return isinstance(value, float) and np.isnan(value)


def _int_or_nan(values, reducer):
    """Apply ``reducer`` to the non-NaN entries of ``values`` and truncate to int.

    Returns NaN when no usable value remains, so one incomplete row can neither
    crash ``int()`` nor make ``min``/``max`` depend on row order.
    """
    valid = [v for v in values if not _is_nan(v)]
    return int(reducer(valid)) if valid else np.nan


def _summarize_aji_rows(df_aji, price_cols, quantity_col):
    """Collapse one day's "あじ" rows into ``(low, center, high, quantity)``."""
    low_prices = []
    center_prices = []
    high_prices = []
    quantities = []
    for _, row in df_aji.iterrows():
        lp, cp, hp, q = (to_numeric(v) for v in row[price_cols + [quantity_col]])
        if _is_nan(lp) and _is_nan(hp) and not _is_nan(cp):
            # Only the middle price is recorded: there is a single price band,
            # so use it for the low and high prices as well.
            lp = hp = cp
        elif not _is_nan(lp) and not _is_nan(hp) and _is_nan(cp):
            # Low and high exist but no middle price: use their average.
            cp = (lp + hp) / 2
        low_prices.append(lp)
        center_prices.append(cp)
        high_prices.append(hp)
        # A missing quantity counts as zero.
        quantities.append(0 if _is_nan(q) else q)

    return (
        _int_or_nan(low_prices, min),
        _int_or_nan(center_prices, lambda xs: sum(xs) / len(xs)),
        _int_or_nan(high_prices, max),
        int(float(sum(quantities))),
    )


def get_fish_price_data(start_date: date, end_date: date) -> pd.core.frame.DataFrame:
    """
    Download daily horse-mackerel ("あじ") prices from the Tokyo wholesale market report.

    One CSV is requested per day from ``start_date`` up to, but not including,
    ``end_date``. A day whose CSV is missing (HTTP 404), or that has no "あじ"
    row, is recorded with NaN prices and a quantity of 0. A price that is
    missing on every "あじ" row of a day is recorded as NaN for that day.

    :param start_date: first day to download (inclusive)
    :param end_date: day to stop at (exclusive)
    :return: DataFrame with columns date, low_price, center_price, high_price
        and quantity, one row per day
    :raises AssertionError: if a response status is neither 200 nor 404
    """
    data = {
        "date": [],
        "low_price": [],
        "center_price": [],
        "high_price": [],
        "quantity": [],
    }
    price_cols = ["安値(円)", "中値(円)", "高値(円)"]
    quantity_col = "卸売数量"
    # Both an ASCII hyphen and a minus sign are used as "no value" markers.
    missing_marks = ["-", "−"]

    iterator = tqdm(
        date_range(start_date, end_date), total=(end_date - start_date).days
    )
    for download_date in iterator:
        url, day = get_url(download_date)
        iterator.set_description(day)
        # NOTE(review): no timeout is passed, so a stalled connection blocks here.
        response = requests.get(url)
        # The URL does not exist for this day.
        if response.status_code == 404:
            insert_data(data, day, np.nan, np.nan, np.nan, 0)
            continue
        # Raised explicitly (rather than with `assert`) so the check survives
        # `python -O`; the exception type is kept for existing callers.
        if response.status_code != 200:
            raise AssertionError(
                f"Unexpected HTTP response. Please check the website {url}."
            )
        df = pd.read_csv(content_wrap(response.content))
        # Replace the "no value" markers with NaN. Assigning the result back,
        # instead of an in-place mask on a column selection, is what actually
        # updates `df` under pandas Copy-on-Write.
        for col in price_cols + [quantity_col]:
            df[col] = df[col].mask(df[col].isin(missing_marks), np.nan)
        # Keep only the rows whose item name is "あじ".
        df_aji = df.loc[df["品名"] == "あじ", [quantity_col] + price_cols]
        # No "あじ" was sold that day: record it as missing.
        if len(df_aji) == 0:
            insert_data(data, day, np.nan, np.nan, np.nan, 0)
            continue
        low_price, center_price, high_price, quantity = _summarize_aji_rows(
            df_aji, price_cols, quantity_col
        )
        insert_data(data, day, low_price, center_price, high_price, quantity)
        # Randomised cool-down so requests are not concentrated in a short period.
        time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))
    return pd.DataFrame(data)
# Fetch the web page and parse it.
# NOTE(review): `html` and `soup` are not referenced again in the visible part of
# this file — confirm whether this request is still needed.
load_url = "https://www.football-lab.jp/kyot/match/"
html = requests.get(load_url)
soup = bs(html.content, "html.parser")
# Fit a linear regression on the local training CSV: 'audience' is the target and
# every other column of df_train.csv is used as a feature.
df_train = pd.read_csv('df_train.csv')
X = df_train.drop('audience', axis=1)
y = df_train['audience']
linear_regression = LinearRegression()
model = linear_regression.fit(X,y)
# Today, tomorrow and yesterday, evaluated once when the module is loaded.
d_today = datetime.date.today()
d_tom = datetime.date.today() + datetime.timedelta(days = 1)
d_y = datetime.date.today() + datetime.timedelta(days = -1)
# Extract the previous day's horse-mackerel ("あじ") data.
# NOTE(review): the indentation of this section was lost in the source; the guard
# body is assumed to be the four statements below — confirm against the original.
# Later module-level code reads `df_aji_pre`, which is only defined when this
# guard is true (i.e. when the file is run as a script).
if __name__ == "__main__":
    start_date = d_y
    end_date = d_today
    # The range is [yesterday, today), i.e. a single day.
    df_aji_pre = get_fish_price_data(start_date=start_date, end_date=end_date)
    df_aji_pre['date'] = df_aji_pre['date'].astype(int)
# Read every HTML table found on the Football Lab match page for 'ka-f'.
url23 = 'https://www.football-lab.jp/ka-f/match/'
dfs23 = pd.read_html(url23)
# Classify by season: prepend a constant 'S2023' column (column label 0).
res23 = pd.DataFrame([['S2023']]*len(dfs23[0])).join(dfs23)
df = res23
# Rename to English column names, then keep only the rows whose stadium is
# "等々力" and whose audience value is present.
df = df.rename(columns={'会場': 'stadium', 0: 'year', '開催日': 'date', '観客数': 'audience'})
df = df.query('stadium=="等々力"').reset_index()
df = df.query('audience.notna()', engine='python').reset_index()
df = df[['audience', 'year', 'date']]
# Extract the year from the season label (characters 1-4, e.g. 'S2023' -> '2023').
df["year"] = df["year"].apply(lambda x: str(x)[1:5])
# Split the match date on '.' into month and day.
df['month'] = df['date'].str.split(pat='.', expand=True)[0]
df['day'] = df['date'].str.split(pat='.', expand=True)[1]
# Convert the year/month/day parts into a datetime column.
df['date'] = pd.to_datetime({'year': df['year'], 'month': df['month'], 'day': df['day']})
# Sort by date in ascending order.
df = df.sort_values('date', ascending=True)
df['date_ymd'] = pd.to_datetime(df['date']).dt.strftime('%Y%m%d')
df['date_ym'] = pd.to_datetime(df['date']).dt.strftime('%Y%m')
df["date_ymd"] = df["date_ymd"].astype(int)
# NOTE(review): subtracting 1 from a YYYYMMDD integer is not a valid date on the
# first day of a month; 'date_before' is not among the columns selected for the
# prediction frame below.
df['date_before'] = df['date_ymd'] - 1
df["date_before"] = df["date_before"]
df = df[['audience', 'date_ymd', 'date_before']]
df['last_audience'] = df['audience'].shift(1)
# Keep only the most recent match row.
df_pre = df.tail(1).reset_index()
df_pre = df_pre.drop('index', axis=1)
# Put the latest match row side by side with the fish-price row, then keep the
# feature columns. The latest match's 'audience' is renamed to 'last_audience'.
df_aji_ft_pre = pd.concat([df_pre, df_aji_pre], axis=1)
df_aji_ft_pre = df_aji_ft_pre[['date_ymd', 'audience', 'low_price', 'center_price', 'high_price', 'quantity']]
df_aji_ft_pre = df_aji_ft_pre.rename(columns={'audience': 'last_audience', 0: 'year', '開催日': 'date_ymd', '観客数': 'audience'})
df_aji_ft_pre ['last_audience'] = df_aji_ft_pre ['last_audience'].astype(int)
# NOTE(review): presumably these six columns match the feature columns of
# df_train.csv (everything except 'audience') — verify names and order.
pred = linear_regression.predict(df_aji_ft_pre)
df_aji_ft_pre['audience_pred'] = pred
df_aji_ft_pre['date_ymd'] = df_aji_ft_pre['date_ymd'].astype(int)
def outbreak(date):
    """Plot the training audience series together with the predicted value.

    Reads the module-level ``df_train``, ``df_aji_ft_pre`` and ``pred``.

    :param date: truthy to draw the plot; nothing is drawn for a falsy value.
        The parameter name shadows ``datetime.date`` inside this function only.
    :return: the matplotlib figure, or None when ``date`` is falsy
    """
    # Return early so that `fig` can never be referenced before assignment.
    if not date:
        return None
    fig = plt.figure()
    # Draw on this figure's own Axes instead of pyplot's implicit "current
    # figure", so the result does not depend on global pyplot state.
    ax = fig.add_subplot(111)
    ax.plot(df_train['date_ymd'], df_train['audience'], label='original')
    ax.plot(df_aji_ft_pre['date_ymd'], df_aji_ft_pre['audience_pred'], '*', label='predict')
    ax.set_title(f"today prediction value : {pred}")
    ax.set_ylabel("audience")
    ax.set_xlabel("Days")
    ax.legend()
    return fig
# Gradio UI: a description panel, a checkbox plus a "predict" button on the left,
# and the resulting plot on the right.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# 川崎フロンターレの観客動員数の予測
川崎フロンターレの等々力陸上競技場での試合の観客数を「あじ」の価格をもとに予測する。
## 使用データ
* 東京卸売市場日報
* Football Lab
## 予測ロジック
観客動員数は雨天か否かで左右されると考えられる。そこで雨天の可能性をあじの価格を利用し表した。
一般的に雨天の場合、低気圧の影響で海面が上昇し漁に出ることが難しくなる。
そのため漁獲量が減少し、あじの価格が上昇すると考えられる。
## モデルについて
モデル名:sklearn
特徴量:予測日前日のあじの高値、予測日前日のあじの中値、予測日前日のあじの安値、
予測日前日のあじの卸売数量、等々力競技場での川崎フロンターレの前回試合の観客数
## 注意点
予測日前日のあじのデータがない場合はErrorとなります。
"""
    )
    with gr.Row():
        with gr.Column():
            date_input = gr.Checkbox(label='Do you want to predict audiences?')
            prediction_btn = gr.Button(value="predict")
        with gr.Column():
            prediction = gr.Plot(label = "時系列プロット")
    # Clicking the button passes the checkbox state to outbreak() and shows
    # whatever it returns in the plot component.
    prediction_btn.click(outbreak, inputs=date_input, outputs=prediction)
# The stray trailing "|" after this call was removed; it was a syntax error.
demo.launch()