# Spaces: masa729406/chEstyleU
# Status shown on the Space page: Runtime error
# File size: 12,090 Bytes

import altair

import gradio as gr
from math import sqrt
import matplotlib

matplotlib.use("Agg")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import datetime

from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

import requests
from bs4 import BeautifulSoup as bs
from requests_html import AsyncHTMLSession

import codecs
import io
import random
import requests
import time
from datetime import date, timedelta
from tqdm import tqdm
from typing import Generator, Tuple

import numpy as np
import pandas as pd

def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Yield dates from ``start`` up to, but not including, ``stop``.

    Each yielded date is ``step`` later than the previous one
    (one day by default).
    """
    cursor = start
    while True:
        if not (cursor < stop):
            return
        yield cursor
        cursor = cursor + step


def get_url(download_date: date) -> Tuple[str, str]:
    """Return the CSV download URL for ``download_date`` and its ``YYYYMMDD`` string."""
    day = f"{download_date:%Y%m%d}"
    month = f"{download_date:%Y%m}"
    url = f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv"
    return url, day


def content_wrap(content):
    """Decode Shift-JIS bytes and drop every line before the header row.

    The header row is the first line containing "品名". The returned
    ``io.StringIO`` holds that line and everything after it; it is empty
    when no such line exists.
    """
    kept = []
    for raw_line in io.BytesIO(content):
        text = codecs.decode(raw_line, "shift-jis")
        # Once the header has been stored, every later line is kept as well.
        if kept or "品名" in text:
            kept.append(text)
    return io.StringIO("".join(kept))


def insert_data(data, day, low_price, center_price, high_price, quantity):
    """Append one day's record to the per-column lists stored in ``data``."""
    record = (
        ("date", day),
        ("low_price", low_price),
        ("center_price", center_price),
        ("high_price", high_price),
        ("quantity", quantity),
    )
    for column, value in record:
        data[column].append(value)


def to_numeric(x):
    """Convert a string to ``float``; return any other value unchanged."""
    return float(x) if isinstance(x, str) else x


def _is_missing(value) -> bool:
    """Return True when ``value`` is a float NaN, the marker used for empty cells."""
    return isinstance(value, float) and np.isnan(value)


def _summarize_aji(df):
    """Reduce one day's market table to ``(low, center, high, quantity)`` for "あじ".

    :param df: table parsed from the daily CSV
    :return: lowest low price, mean center price, highest high price and
        total wholesale quantity over all "あじ" rows. Prices are ``int``,
        or NaN when no row carries a usable value. With no "あじ" rows the
        result is ``(nan, nan, nan, 0)``.
    """
    price_cols = ["安値(円)", "中値(円)", "高値(円)"]
    value_cols = price_cols + ["卸売数量"]

    # Keep only the rows whose item name is "あじ".
    rows = df.loc[df["品名"] == "あじ", value_cols]
    if rows.empty:
        return (np.nan, np.nan, np.nan, 0)

    # Turn the "-" / "−" placeholders into NaN. This builds a new frame
    # instead of mutating a column selection in place, which has no effect
    # on the parent frame under pandas Copy-on-Write.
    rows = rows.mask(rows.isin(["-", "−"]))

    lows, centers, highs, quantities = [], [], [], []
    for low, center, high, qty in rows.itertuples(index=False, name=None):
        low, center, high, qty = (
            to_numeric(low),
            to_numeric(center),
            to_numeric(high),
            to_numeric(qty),
        )

        if _is_missing(low) and _is_missing(high) and not _is_missing(center):
            # Only the center price is recorded: a single price band, so the
            # low and high prices are taken to be the same value.
            low = high = center
        elif _is_missing(center) and not _is_missing(low) and not _is_missing(high):
            # Low and high are recorded but not the center: use their mean.
            center = (low + high) / 2

        # Skip missing prices so one NaN cannot make min()/max() depend on
        # row order or make int() raise on NaN.
        if not _is_missing(low):
            lows.append(low)
        if not _is_missing(center):
            centers.append(center)
        if not _is_missing(high):
            highs.append(high)
        quantities.append(0 if _is_missing(qty) else qty)

    low_price = int(min(lows)) if lows else np.nan
    center_price = int(sum(centers) / len(centers)) if centers else np.nan
    high_price = int(max(highs)) if highs else np.nan
    quantity = int(float(sum(quantities)))
    return (low_price, center_price, high_price, quantity)


def get_fish_price_data(start_date: date, end_date: date) -> pd.DataFrame:
    """
    Download daily "あじ" prices from the Tokyo wholesale market daily report.

    One CSV is requested per day in ``[start_date, end_date)``. Days whose
    CSV does not exist (HTTP 404) or contains no "あじ" rows are recorded
    with NaN prices and a quantity of 0.

    :param start_date: first day to download (inclusive)
    :param end_date: day to stop at (exclusive)
    :return: DataFrame with one row per day and the columns ``date``
        (``YYYYMMDD`` string), ``low_price``, ``center_price``,
        ``high_price`` and ``quantity``
    :raises AssertionError: if the server answers with a status other than
        200 or 404
    """
    request_timeout = 30  # seconds; avoids hanging forever on a stalled server
    data = {
        "date": [],
        "low_price": [],
        "center_price": [],
        "high_price": [],
        "quantity": [],
    }
    iterator = tqdm(
        date_range(start_date, end_date), total=(end_date - start_date).days
    )

    for download_date in iterator:
        url, day = get_url(download_date)
        iterator.set_description(day)
        response = requests.get(url, timeout=request_timeout)

        if response.status_code == 404:
            # No report was published for this day.
            summary = (np.nan, np.nan, np.nan, 0)
        elif response.status_code != 200:
            # Raised explicitly (same exception type as the former `assert`)
            # so the check still runs under `python -O`.
            raise AssertionError(
                f"Unexpected HTTP response. Please check the website {url}."
            )
        else:
            summary = _summarize_aji(pd.read_csv(content_wrap(response.content)))

        insert_data(data, day, *summary)
        # Cool-down after every request, including days without data, so
        # accesses are not concentrated in a short period.
        time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))

    return pd.DataFrame(data)

# Fetch the web page and parse it.
load_url = "https://www.football-lab.jp/kyot/match/"
html = requests.get(load_url)
soup = bs(html.content, "html.parser")

# Fit a linear regression on the stored training table: 'audience' is the
# target and every other column is used as a feature.
df_train = pd.read_csv('df_train.csv')
y = df_train['audience']
X = df_train.drop(columns='audience')
linear_regression = LinearRegression()
linear_regression.fit(X, y)
model = linear_regression  # fit() returns the estimator itself

# Reference dates, all derived from a single today() call so they stay
# consistent even if the script starts right around midnight.
d_today = datetime.date.today()
d_tom = d_today + datetime.timedelta(days=1)

# Fetch the fish data for the day before the prediction date.
# This runs unconditionally: `df_aji_pre` is used further down at module
# level and inside `outbreak`, so guarding it with
# `if __name__ == "__main__"` made the module fail with NameError on import.
d_y = d_today + datetime.timedelta(days=-1)
start_date = d_y
end_date = d_today
df_aji_pre = get_fish_price_data(start_date=start_date, end_date=end_date)
df_aji_pre['date'] = df_aji_pre['date'].astype(int)

# if __name__ == "__main__":
#     start_date = d_today
#     end_date = d_tom
#     df_aji_pre = get_fish_price_data(start_date=start_date, end_date=end_date)
#     df_aji_pre['date'] = df_aji_pre['date'].astype(int)

# Read the match tables from the Football Lab page.
url23 = 'https://www.football-lab.jp/ka-f/match/'
dfs23 = pd.read_html(url23)

# Tag every row with its season label.
# NOTE(review): `join` receives the whole list returned by read_html, not
# just dfs23[0]; confirm the page only ever yields one table.
res23 = pd.DataFrame([['S2023']] * len(dfs23[0])).join(dfs23)

df = res23.rename(columns={'会場': 'stadium', 0: 'year', '開催日': 'date', '観客数': 'audience'})
# Keep only the rows whose stadium is "等々力" and whose audience is present.
df = df.query('stadium=="等々力"')
df = df.query('audience.notna()', engine='python')
# reset_index returns a new frame, so the column assignments below do not
# write into a slice of `res23`.
df = df[['audience', 'year', 'date']].reset_index(drop=True)

# Extract the year from the season label ('S2023' -> '2023').
df['year'] = df['year'].apply(lambda x: str(x)[1:5])
# Split the match date on '.'; the first part is the month, the second the day.
month_day = df['date'].str.split(pat='.', expand=True)
# Assemble a real datetime column from year / month / day.
df['date'] = pd.to_datetime({'year': df['year'], 'month': month_day[0], 'day': month_day[1]})
# Sort by date, oldest first.
df = df.sort_values('date', ascending=True)
df['date_ymd'] = df['date'].dt.strftime('%Y%m%d').astype(int)
# Previous calendar day as a YYYYMMDD integer. Computed on the datetime, not
# as `date_ymd - 1`, which yields an invalid date on the first of a month
# (e.g. 20230501 - 1 == 20230500).
df['date_before'] = (df['date'] - pd.Timedelta(days=1)).dt.strftime('%Y%m%d').astype(int)
df = df[['audience', 'date_ymd', 'date_before']].copy()
# Audience of the previous match in date order.
df['last_audience'] = df['audience'].shift(1)

# df_pre = pd.merge(df, df_aji_pre, left_on='date_before', right_on='date', how='left')

# df_pre = df_pre.drop(['date_before', 'date_ymd'], axis=1)
# df_pre["audience"] = df_pre["audience"].str.replace(",", "").astype(int)
# df_pre["last_audience"] = df_pre["last_audience"].str.replace(",", "").astype(int)

# start_date = int(start_date)
# df_pre = df.query('date <= start_date')


# Take the last row of the match table and re-index it from 0 so that it
# lines up with the fish-price row when both are concatenated side by side.
df_pre = df.tail(1).reset_index(drop=True)
df_aji_ft_pre = pd.concat([df_pre, df_aji_pre], axis=1)
df_aji_ft_pre = df_aji_ft_pre.loc[
    :, ['audience', 'date', 'low_price', 'center_price', 'high_price', 'quantity']
]
# The audience of that last match becomes the 'last_audience' feature.
df_aji_ft_pre = df_aji_ft_pre.rename(
    columns={
        'audience': 'last_audience',
        0: 'year',
        '開催日': 'date',
        '観客数': 'audience',
    }
)

def outbreak(date):
    """Predict the audience of the next match and plot it against the training data.

    The feature row is built from the last row of the module-level match
    table ``df`` (its audience becomes ``last_audience``) and the
    module-level fish-price table ``df_aji_pre``.

    :param date: value of the confirmation checkbox; nothing is computed
        when it is falsy
    :return: a matplotlib ``Figure`` showing the training series and the
        predicted point, or ``None`` when ``date`` is falsy
    """
    # Imported locally so the block does not depend on a new file-level import.
    from matplotlib.figure import Figure

    if not date:
        return None

    latest_match = df.tail(1).reset_index(drop=True)
    features = pd.concat([latest_match, df_aji_pre], axis=1)
    features = features[
        ['audience', 'date', 'low_price', 'center_price', 'high_price', 'quantity']
    ].rename(columns={'audience': 'last_audience'})

    features['audience_pred'] = linear_regression.predict(features)
    features['date'] = features['date'].astype(int)

    # Build the figure directly instead of through pyplot: plt.figure()
    # registers each figure in pyplot's global state, where it is never
    # closed, so every button click would leak one figure.
    fig = Figure()
    ax = fig.subplots()
    ax.plot(df_train['date'], df_train['audience'], label='original')
    ax.plot(features['date'], features['audience_pred'], '*', label='predict')
    ax.set_title("prediction of audience")
    ax.set_ylabel("audience")
    ax.set_xlabel("date")
    ax.legend()
    return fig

# Gradio UI: a description panel, a confirmation checkbox with a "predict"
# button, and a plot that shows the result of `outbreak`.
with gr.Blocks() as demo:
    # The last heading is written "## 注意点" (with a space after the hashes)
    # so that it renders as a heading like the other sections.
    gr.Markdown(
                            """
                            # 川崎フロンターレの観客動員数予測
                             等々力陸上競技場で行われる川崎フロンターレの直近の試合の観客数を「あじ」の価格と漁獲高を使用し予測する。
                            ## 使用データ
                             * 東京卸売市場日報
                             * Football Lab 
                            ## 予測ロジック
                            観客動員数は雨天か否かで左右されると考えられる。そこで雨天の可能性をあじの価格を利用し表した。
                            一般的に雨天の場合、低気圧の影響で海面が上昇し漁に出ることが難しくなる。
                            そのため漁獲量が減少し、あじの価格が上昇すると考えられる。
                            ## 特徴量
                            予測日前日のあじの高値、予測日前日のあじの中値、予測日前日のあじの安値、予測日前日のあじの卸売数量、等々力競技場で行われた前回の試合の観客動員数
                            ## 注意点
                            予測日前日の東京卸売市場のデータがない場合はErrorとなります。
                             """
    )
    with gr.Row():
        with gr.Column():
            date_input = gr.Checkbox(label='予測したい気持ちがありますか？ある場合はチェックしてから、下のpredictボタンを押してください')
            prediction_btn = gr.Button(value="predict")
        with gr.Column():
            prediction = gr.Plot(label="時系列プロット")
    # Clicking the button passes the checkbox state to `outbreak` and shows
    # the figure it returns.
    prediction_btn.click(outbreak, inputs=date_input, outputs=prediction)

demo.launch()