masa729406 committed on
Commit 83738aa · 1 Parent(s): e683c9b

Update app.py

Files changed (1): app.py +154 -0
app.py CHANGED
@@ -20,6 +20,160 @@ import requests
  from bs4 import BeautifulSoup as bs
  from requests_html import AsyncHTMLSession
 
+ def date_range(
+     start: date, stop: date, step: timedelta = timedelta(1)
+ ) -> Generator[date, None, None]:
+     """Generator that yields dates from start (inclusive) to stop (exclusive), step days at a time."""
+     current = start
+     while current < stop:
+         yield current
+         current += step
+
+
+ def get_url(download_date: date) -> Tuple[str, str]:
+     """Return the URL to download and the date string."""
+     month = download_date.strftime("%Y%m")
+     day = download_date.strftime("%Y%m%d")
+     return (
+         f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv",
+         day,
+     )
+
+
+ def content_wrap(content):
+     """Skip leading lines until the header row comes first."""
+     buffer = ""
+     first = True
+     for line in io.BytesIO(content):
+         line_str = codecs.decode(line, "shift-jis")
+         if first:
+             if "品名" in line_str:
+                 first = False
+                 buffer = line_str
+             else:
+                 continue
+         else:
+             buffer += line_str
+     return io.StringIO(buffer)
+
+
+ def insert_data(data, day, low_price, center_price, high_price, quantity):
+     """Append one day's values to the data lists."""
+     data["date"].append(day)
+     data["low_price"].append(low_price)
+     data["center_price"].append(center_price)
+     data["high_price"].append(high_price)
+     data["quantity"].append(quantity)
+
+
+ def to_numeric(x):
+     """Convert a string to a number."""
+     if isinstance(x, str):
+         return float(x)
+     else:
+         return x
+
+
+ def get_fish_price_data(start_date: date, end_date: date) -> pd.DataFrame:
+     """
+     Pull data from the Tokyo wholesale market.
+
+     :param start_date: start date
+     :param end_date: end date
+     :return: combined data of aji (horse mackerel) prices
+     """
+     data = {
+         "date": [],
+         "low_price": [],
+         "center_price": [],
+         "high_price": [],
+         "quantity": [],
+     }
+     iterator = tqdm(
+         date_range(start_date, end_date), total=(end_date - start_date).days
+     )
+
+     for download_date in iterator:
+         url, day = get_url(download_date)
+         iterator.set_description(day)
+         response = requests.get(url)
+
+         # When the URL does not exist
+         if response.status_code == 404:
+             insert_data(data, day, np.nan, np.nan, np.nan, 0)
+             continue
+         assert (
+             response.status_code == 200
+         ), f"Unexpected HTTP response. Please check the website {url}."
+
+         df = pd.read_csv(content_wrap(response.content))
+
+         # Replace missing-value markers with NaN
+         price_cols = ["安値(円)", "中値(円)", "高値(円)"]
+         for c in price_cols:
+             df[c].mask(df[c] == "-", np.nan, inplace=True)
+             df[c].mask(df[c] == "−", np.nan, inplace=True)
+         df["卸売数量"].mask(df["卸売数量"] == "-", np.nan, inplace=True)
+         df["卸売数量"].mask(df["卸売数量"] == "−", np.nan, inplace=True)
+
+         # Middle price and wholesale quantity of aji caught in Nagasaki
+         # Extract only the rows where 品名 == あじ
+         df_aji = df.loc[df["品名"] == "あじ", ["卸売数量"] + price_cols]
+
+         # If no aji was sold that day, treat it as missing
+         if len(df_aji) == 0:
+             insert_data(data, day, np.nan, np.nan, np.nan, 0)
+             continue
+
+         isnan = lambda x: isinstance(x, float) and np.isnan(x)
+         # Inspect the aji sales records (per production area?)
+         low_prices = []
+         center_prices = []
+         high_prices = []
+         quantities = []
+         for _, row in df_aji.iterrows():
+             lp, cp, hp, q = row[price_cols + ["卸売数量"]]
+             lp, cp, hp, q = (
+                 to_numeric(lp),
+                 to_numeric(cp),
+                 to_numeric(hp),
+                 to_numeric(q),
+             )
+
+             # Only the middle price is recorded -> a single price band, so use it for the low and high prices too
+             if isnan(lp) and isnan(hp) and (not isnan(cp)):
+                 low_prices.append(cp)
+                 center_prices.append(cp)
+                 high_prices.append(cp)
+
+             # Low and high prices exist but no middle price -> two price bands; use their average as the middle price for now
+             elif (not isnan(lp)) and (not isnan(hp)) and isnan(cp):
+                 low_prices.append(lp)
+                 center_prices.append((lp + hp) / 2)
+                 high_prices.append(hp)
+             else:
+                 low_prices.append(lp)
+                 center_prices.append(cp)
+                 high_prices.append(hp)
+
+             if isnan(row["卸売数量"]):
+                 quantities.append(0)
+             else:
+                 quantities.append(q)
+
+         low_price = int(min(low_prices))
+         center_price = int(sum(center_prices) / len(center_prices))
+         high_price = int(max(high_prices))
+         quantity = int(float(sum(quantities)))
+
+         # Save the day's values
+         insert_data(data, day, low_price, center_price, high_price, quantity)
+         # Wait a short cool-down so requests are not sent in rapid bursts
+         time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))
+     # Build the DataFrame
+     df = pd.DataFrame(data)
+     return df
+
  # Fetch and parse the web page
  load_url = "https://www.football-lab.jp/kyot/match/"
  html = requests.get(load_url)
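
For reference, below is a minimal, hypothetical sketch of how the newly added get_fish_price_data helper might be exercised from inside app.py once the diff is applied. The import list and the example date range are assumptions, not part of this commit; the hunk context only shows that `import requests` already exists above line 20, so the remaining modules are presumed to be imported near the top of the file.

# Hypothetical usage sketch (not part of the commit); assumes these imports
# already exist near the top of app.py, alongside the existing `import requests`.
import io
import codecs
import time
import random
from datetime import date, timedelta
from typing import Generator, Tuple

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Example only: fetch one week of aji prices and inspect the first rows.
df_prices = get_fish_price_data(date(2021, 1, 4), date(2021, 1, 11))
print(df_prices.head())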