"""Gradio app predicting Kawasaki Frontale home attendance from "aji" prices.

Data sources used by this script:

* Tokyo wholesale market daily report CSVs (horse-mackerel / "あじ" prices).
* The Football Lab match table for Kawasaki Frontale.

A linear regression fitted on ``df_train.csv`` is applied to the latest
home-match attendance combined with the previous day's fish prices.
"""
import datetime
import functools
import io
import random
import time
from datetime import date, timedelta
from math import sqrt
from typing import Generator, Tuple

import altair
import gradio as gr
import matplotlib

matplotlib.use("Agg")  # must be selected before pyplot is imported
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from requests_html import AsyncHTMLSession
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; run without it
    tqdm = None


PRICE_COLUMNS = ["安値(円)", "中値(円)", "高値(円)"]
QUANTITY_COLUMN = "卸売数量"
# Both an ASCII hyphen and a minus sign are treated as "no value" markers.
MISSING_MARKS = ["-", "−"]
FISH_FEATURE_COLUMNS = ["date", "low_price", "center_price", "high_price", "quantity"]

TRAIN_CSV = "df_train.csv"
MATCH_URL = "https://www.football-lab.jp/ka-f/match/"
SEASON_YEAR = 2023
HOME_STADIUM = "等々力"
REQUEST_TIMEOUT_SECONDS = 30

DESCRIPTION_MARKDOWN = """
# 川崎フロンターレの観客動員数の予測
川崎フロンターレの等々力陸上競技場での試合の観客数を「あじ」の価格をもとに予測する。
## 使用データ
* 東京卸売市場日報
* Football Lab
## 予測ロジック
観客動員数は雨天か否かで左右されると考えられる。そこで雨天の可能性をあじの価格を利用し表した。
一般的に雨天の場合、低気圧の影響で海面が上昇し漁に出ることが難しくなる。
そのため漁獲量が減少し、あじの価格が上昇すると考えられる。
"""


def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Yield dates from ``start`` (inclusive) to ``stop`` (exclusive).

    :param start: first date to yield
    :param stop: exclusive upper bound
    :param step: increment between dates; must be positive
    :raises ValueError: if ``step`` is zero or negative (would never terminate)
    """
    if step <= timedelta(0):
        raise ValueError("step must be a positive timedelta")
    current = start
    while current < stop:
        yield current
        current += step


def get_url(download_date: date) -> Tuple[str, str]:
    """Return the daily-report CSV URL and the ``YYYYMMDD`` string for a date."""
    month = download_date.strftime("%Y%m")
    day = download_date.strftime("%Y%m%d")
    return (
        f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv",
        day,
    )


def content_wrap(content):
    """Decode a Shift-JIS CSV payload and drop everything before the header row.

    The header row is the first line containing "品名".

    :param content: raw response bytes
    :return: ``io.StringIO`` starting at the header row; empty if no header
        row is present
    """
    lines = [raw.decode("shift-jis") for raw in io.BytesIO(content)]
    for index, line in enumerate(lines):
        if "品名" in line:
            return io.StringIO("".join(lines[index:]))
    return io.StringIO("")


def insert_data(data, day, low_price, center_price, high_price, quantity):
    """Append one day's values to the column lists held in ``data``."""
    data["date"].append(day)
    data["low_price"].append(low_price)
    data["center_price"].append(center_price)
    data["high_price"].append(high_price)
    data["quantity"].append(quantity)


def to_numeric(x):
    """Convert a string to ``float``; return any other value unchanged.

    Thousands separators are removed first so that "1,234" parses as 1234.0.
    """
    if isinstance(x, str):
        return float(x.replace(",", ""))
    return x


def _is_nan(value):
    """Return True only for float NaN (strings and ints are never NaN)."""
    return isinstance(value, float) and np.isnan(value)


def _without_nan(values):
    """Return the entries of ``values`` that are not NaN."""
    return [value for value in values if not _is_nan(value)]


def _summarize_aji(report):
    """Reduce one daily report to ``(low, center, high, quantity)`` for "あじ".

    Prices that are missing in every row come back as NaN. Quantity is 0 when
    there are no "あじ" rows at all.
    """
    columns = PRICE_COLUMNS + [QUANTITY_COLUMN]
    aji = report.loc[report["品名"] == "あじ", columns]
    if aji.empty:
        return np.nan, np.nan, np.nan, 0
    # Replace the "no value" markers with NaN on a fresh frame instead of
    # mutating column selections in place.
    aji = aji.mask(aji.isin(MISSING_MARKS))

    lows, centers, highs, quantities = [], [], [], []
    for raw_row in aji.itertuples(index=False, name=None):
        low, center, high, quantity = (to_numeric(value) for value in raw_row)
        if _is_nan(low) and _is_nan(high) and not _is_nan(center):
            # Only the middle price is recorded: a single price band, so use
            # it for the low and high prices as well.
            low = high = center
        elif not _is_nan(low) and not _is_nan(high) and _is_nan(center):
            # Low and high exist without a middle price: use their average.
            center = (low + high) / 2
        lows.append(low)
        centers.append(center)
        highs.append(high)
        quantities.append(0 if _is_nan(quantity) else quantity)

    # Ignore rows whose price is still missing; int(NaN) would raise and
    # min()/max() over NaN depend on element order.
    lows, centers, highs = _without_nan(lows), _without_nan(centers), _without_nan(highs)
    low_price = int(min(lows)) if lows else np.nan
    center_price = int(sum(centers) / len(centers)) if centers else np.nan
    high_price = int(max(highs)) if highs else np.nan
    return low_price, center_price, high_price, int(float(sum(quantities)))


def _fetch_daily_report(url):
    """Download one daily report; return ``None`` when it does not exist (404).

    :raises RuntimeError: for any status other than 200 or 404
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code == 404:
        return None
    if response.status_code != 200:
        raise RuntimeError(f"Unexpected HTTP response. Please check the website {url}.")
    return pd.read_csv(content_wrap(response.content))


def get_fish_price_data(start_date: date, end_date: date) -> pd.DataFrame:
    """Fetch daily "あじ" prices from the Tokyo wholesale market reports.

    :param start_date: first day to download (inclusive)
    :param end_date: last day (exclusive)
    :return: frame with columns ``date`` (``YYYYMMDD`` string), ``low_price``,
        ``center_price``, ``high_price`` and ``quantity``; days without a
        report or without "あじ" rows have NaN prices and quantity 0
    :raises RuntimeError: if the server answers with an unexpected status
    """
    data = {
        "date": [],
        "low_price": [],
        "center_price": [],
        "high_price": [],
        "quantity": [],
    }
    days = date_range(start_date, end_date)
    progress = None
    if tqdm is not None:
        progress = tqdm(days, total=(end_date - start_date).days)
        days = progress

    for index, download_date in enumerate(days):
        if index:
            # Cool-down between consecutive requests so the server is not hit
            # in a burst; applies to every request, including missing days.
            time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))
        url, day = get_url(download_date)
        if progress is not None:
            progress.set_description(day)
        report = _fetch_daily_report(url)
        if report is None:
            insert_data(data, day, np.nan, np.nan, np.nan, 0)
            continue
        insert_data(data, day, *_summarize_aji(report))

    return pd.DataFrame(data)


def load_match_history(url=MATCH_URL, season_year=SEASON_YEAR, stadium=HOME_STADIUM):
    """Load played home matches, sorted by date ascending.

    :param url: page whose first HTML table lists the matches
    :param season_year: year combined with the table's "month.day" values
    :param stadium: value of the "会場" column to keep
    :return: frame with columns ``audience`` and ``date`` (datetime)
    :raises ValueError: if no match with a recorded audience is found
    """
    table = pd.read_html(url)[0].rename(
        columns={"会場": "stadium", "開催日": "date", "観客数": "audience"}
    )
    played = table[(table["stadium"] == stadium) & table["audience"].notna()]
    matches = played[["audience", "date"]].copy()
    if matches.empty:
        raise ValueError(f"No matches with a recorded audience found at {url}")

    # The date column holds "month.day"; combine it with the season year.
    month_day = matches["date"].str.split(pat=".", expand=True)
    matches["date"] = pd.to_datetime(
        pd.DataFrame({"year": season_year, "month": month_day[0], "day": month_day[1]})
    )
    return matches.sort_values("date", ascending=True).reset_index(drop=True)


def build_prediction_features(matches, fish_prices):
    """Combine the latest match attendance with fish prices into model inputs.

    :param matches: output of :func:`load_match_history`
    :param fish_prices: output of :func:`get_fish_price_data` with an integer
        ``date`` column
    :return: frame with columns ``last_audience``, ``date``, ``low_price``,
        ``center_price``, ``high_price``, ``quantity``
    """
    features = fish_prices[FISH_FEATURE_COLUMNS].copy()
    features.insert(0, "last_audience", matches["audience"].iloc[-1])
    return features


@functools.lru_cache(maxsize=1)
def _load_model():
    """Read the training CSV and fit the regression once per process."""
    df_train = pd.read_csv(TRAIN_CSV)
    model = LinearRegression().fit(
        df_train.drop("audience", axis=1), df_train["audience"]
    )
    return df_train, model


@functools.lru_cache(maxsize=2)
def _load_prediction_features(today):
    """Build the model inputs for ``today``; cached per calendar day.

    Keying the cache on the date keeps a long-running server from predicting
    with the price that was current when the process started.
    """
    fish_prices = get_fish_price_data(
        start_date=today - timedelta(days=1), end_date=today
    )
    fish_prices["date"] = fish_prices["date"].astype(int)
    return build_prediction_features(load_match_history(), fish_prices)


def outbreak(date):
    """Gradio callback: plot the training series and the predicted attendance.

    :param date: checkbox state; nothing is predicted when it is falsy
    :return: a matplotlib ``Figure``, or ``None`` when the checkbox is unchecked
    """
    if not date:
        return None

    df_train, model = _load_model()
    # ``date`` is the checkbox value here, so reach the class via the module.
    features = _load_prediction_features(datetime.date.today())
    # NOTE(review): when yesterday's report is missing the price features are
    # NaN and predict() raises; confirm the desired fallback.
    predicted = model.predict(features)

    # A Figure created outside pyplot is not kept in pyplot's global registry,
    # so repeated clicks do not accumulate open figures.
    fig = Figure()
    ax = fig.subplots()
    ax.plot(df_train["date"], df_train["audience"], label="original")
    ax.plot(features["date"], predicted, "*", label="predict")
    ax.set_title("prediction of audince")
    ax.set_ylabel("audience")
    ax.set_xlabel("Days since Day 0")
    ax.legend()
    return fig


def build_demo():
    """Assemble the Gradio UI and wire the predict button to :func:`outbreak`."""
    with gr.Blocks() as blocks:
        gr.Markdown(DESCRIPTION_MARKDOWN)
        with gr.Row():
            with gr.Column():
                date_input = gr.Checkbox(label='please input date')
                prediction_btn = gr.Button(value="predict")
            with gr.Column():
                prediction = gr.Plot(label="時系列プロット")
        prediction_btn.click(outbreak, inputs=date_input, outputs=prediction)
    return blocks


demo = build_demo()


def main():
    """Load the model and today's inputs up front, then start the server."""
    # Warm both caches so a missing CSV or unreachable site fails at startup
    # rather than on the first click.
    _load_model()
    _load_prediction_features(datetime.date.today())
    demo.launch()


if __name__ == "__main__":
    main()