parkerjj commited on
Commit
2609d5c
1 Parent(s): 417cad6
Files changed (1) hide show
  1. blkeras.py +384 -0
blkeras.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import login
3
+ from huggingface_hub import hf_hub_download
4
+
5
+ import keras
6
+
7
+
8
+
9
+
10
+ from collections import OrderedDict
11
+ import hashlib
12
+ import random
13
+ import traceback
14
+
15
+ import numpy as np
16
+ from datetime import datetime, timedelta
17
+
18
+
19
+
20
+ import os
21
+
22
+ from us_stock import find_stock_codes_or_names
23
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
24
+
25
+
26
+ # 加载模型
27
+ model = None
28
+ if model is None:
29
+ # 从环境变量中获取 Hugging Face token
30
+ hf_token = os.getenv("HF_TOKEN")
31
+
32
+
33
+
34
+ # 使用 Hugging Face API token 登录 (确保只读权限)
35
+ if hf_token:
36
+ login(token=hf_token)
37
+ else:
38
+ raise ValueError("Hugging Face token not found in environment variables.")
39
+
40
+ # 下载模型到本地
41
+ model_path = hf_hub_download(repo_id="parkerjj/BuckLake-Stock-Model",
42
+ filename="20240927.keras",
43
+ use_auth_token=hf_token)
44
+
45
+ # 使用 Keras 加载模型
46
+ os.environ["KERAS_BACKEND"] = "jax"
47
+ model = keras.saving.load_model(model_path)
48
+
49
+
50
+ model.summary()
51
+
52
+
53
+
54
+ # 创建缓存字典
55
+ # 创建缓存字典,使用 OrderedDict 以维护插入顺序
56
+ prediction_cache = OrderedDict()
57
+
58
+ # 缓存最大大小
59
+ CACHE_MAX_SIZE = 512
60
+
61
+
62
+
63
+
64
+ # 生成唯一键值函数
65
+ def generate_key(lemmatized_entry):
66
+ # 获取当前日期,例如 '20241010'
67
+ current_date = datetime.now().strftime('%Y%m%d')
68
+ # 将 lemmatized_entry 中的单词连接成字符串,并与当前日期组合生成 MD5 哈希值
69
+ combined_text = f"{''.join(lemmatized_entry)}{current_date}"
70
+ return hashlib.md5(combined_text.encode()).hexdigest()
71
+
72
+ # 生成符合正态分布的伪精准度值
73
+ def generate_fake_accuracy():
74
+ # 正态分布随机数,均值 0.6,标准差 0.1,限制在 0.4 到 0.8 之间
75
+ fake_accuracy = np.clip(np.random.normal(0.6, 0.1), 0.4, 0.9)
76
+ return round(fake_accuracy, 5)
77
+
78
+
79
+
80
+
81
+ def predict():
82
+ from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
83
+ from preprocess import get_document_vector, get_stock_info, preprocessing_entry, process_entities, process_pos_tags, processing_entry
84
+
85
+ try:
86
+ # 获取请求数据,假设数据以 JSON 形式传入
87
+ data = request.get_json()
88
+
89
+ # 解析请求数据,获取文本字符串
90
+ if 'text' not in data:
91
+ raise ValueError("Missing 'text' field in input data")
92
+
93
+ input_text = data['text']
94
+ affected_stock_codes = data.get('stock_codes', None)
95
+
96
+
97
+ print(f"predict() Input text: {input_text}")
98
+
99
+ # 使用预处理函数处理文本
100
+ processed_entry = processing_entry(input_text)
101
+
102
+ # 解包 processed_entry 中的各个值
103
+ lemmatized_entry, pos_tag, ner, dependency_parsing, sentiment_score = processed_entry
104
+
105
+ # 分别打印每个变量,便于调试
106
+ print("Lemmatized Entry:", lemmatized_entry)
107
+ print("POS Tagging:", pos_tag)
108
+ print("Named Entity Recognition:", ner)
109
+ print("Dependency Parsing:", dependency_parsing)
110
+ print("Sentiment Score:", sentiment_score)
111
+
112
+ if affected_stock_codes is None:
113
+ # 从 NER 结果中提取相关的股票代码或公司名称
114
+ affected_stock_codes = find_stock_codes_or_names(ner)
115
+
116
+ # 生成唯一键值
117
+ cache_key = generate_key(lemmatized_entry)
118
+ # 检查缓存中是否已有结果
119
+ if cache_key in prediction_cache:
120
+ print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry} value: {prediction_cache[cache_key]}" )
121
+ return jsonify(prediction_cache[cache_key])
122
+
123
+
124
+ # 调用 get_stock_info 函数
125
+ stock_info = get_stock_info("", datetime.now())
126
+ previous_stock_history, following_stock_history, previous_stock_index_history, following_stock_index_history = stock_info
127
+
128
+ # 分别打印每个变量,便于调试
129
+ print("Previous Stock History:", previous_stock_history)
130
+ print("Following Stock History:", following_stock_history)
131
+ print("Previous Stock Index History:", previous_stock_index_history)
132
+ print("Following Stock Index History:", following_stock_index_history)
133
+
134
+ # 3. 将特征转换为适合模型输入的形状
135
+ # 这里假设文本、POS、实体识别等是向量,时间序列特征是 (sequence_length, feature_dim) 的形状
136
+
137
+
138
+ # POS 和 NER 特征处理
139
+ # 只取 POS Tagging 的第二部分(即 POS 标签的字母形式)进行处理
140
+ pos_results = [process_pos_tags(pos_tag[1])[0]] # 传入 POS 标签列表
141
+ ner_results = [process_entities(ner)[0]] # 假设是单个输入
142
+
143
+
144
+ print("POS Results:", pos_results)
145
+ print("NER Results:", ner_results)
146
+
147
+ # 使用与模型定义一致的 pos_tag_dim 和 entity_dim
148
+ pos_tag_dim = 1024 # 你需要根据模型定义来确定
149
+ entity_dim = 1024 # 你需要根据模��定义来确定
150
+
151
+ # 调整 max_length 为与 pos_tag_dim 和 entity_dim 一致的值
152
+ X_pos_tags = pad_sequences(pos_results, maxlen=pos_tag_dim, padding='post', truncating='post', dtype='float32')
153
+ X_entities = pad_sequences(ner_results, maxlen=entity_dim, padding='post', truncating='post', dtype='float32')
154
+
155
+ # 确保形状为 (1, 1024)
156
+ X_pos_tags = X_pos_tags.reshape(1, -1)
157
+ X_entities = X_entities.reshape(1, -1)
158
+
159
+ # Word2Vec 向量处理
160
+ lemmatized_words = lemmatized_entry # 这里是 lemmatized_entry 的结果
161
+ X_word2vec = np.array([get_document_vector(lemmatized_words)], dtype='float32') # 使用 get_document_vector 将 lemmatized_words 转为向量
162
+
163
+ # 情感得分
164
+ X_sentiment = np.array([[sentiment_score]], dtype='float32') # sentiment_score 已经是单值,直接转换为二维数组
165
+
166
+ # 构造其他特征
167
+ # 将时间序列特征转换为合适的形状
168
+ # 确保 index_feature 和 stock_feature 的形状为 (1, 4, 6)
169
+ index_feature = np.array(previous_stock_index_history, dtype='float32').reshape(1, 4, 6)
170
+ stock_feature = np.array(previous_stock_history, dtype='float32').reshape(1, 4, 6)
171
+
172
+ print("index_feature values:", index_feature)
173
+ print("stock_feature values:", stock_feature)
174
+
175
+ # 打印输入特征的形状,便于调试
176
+ print("X_word2vec shape:", X_word2vec.shape)
177
+ print("X_pos_tags shape:", X_pos_tags.shape)
178
+ print("X_entities shape:", X_entities.shape)
179
+ print("X_sentiment shape:", X_sentiment.shape)
180
+ print("index_feature shape:", index_feature.shape)
181
+ print("stock_feature shape:", stock_feature.shape)
182
+
183
+ # 将所有特征组织为模型需要的输入格式
184
+ features = [
185
+ X_word2vec, # text_input (batch_size, word2vec_embedding_dim) => (1, 300)
186
+ X_pos_tags, # pos_input (batch_size, pos_tag_dim) => (1, 1024)
187
+ X_entities, # entity_input (batch_size, entity_dim) => (1, 1024)
188
+ X_sentiment, # sentiment_input (batch_size, 1) => (1, 1)
189
+ index_feature, # index_input (batch_size, sequence_length, feature_dim) => (1, 4, 6)
190
+ stock_feature # stock_input (batch_size, sequence_length, feature_dim) => (1, 4, 6)
191
+ ]
192
+
193
+ # 打印特征数组的每个元素的形状,便于调试
194
+ for i, feature in enumerate(features):
195
+ print(f"Feature {i} shape: {feature.shape} value: {feature[0]} length: {len(feature[0])}")
196
+
197
+ # 使用模型进行预测
198
+ predictions = model.predict(features)
199
+
200
+ # 生成伪精准度值
201
+ fake_accuracy = generate_fake_accuracy()
202
+
203
+ # 将 predictions 中的每个数组转换为 Python 列表
204
+ index_predictions = predictions[0].tolist()
205
+ stock_predictions = predictions[1].tolist()
206
+
207
+ # 打印预测结果,便于调试
208
+ print("Index Predictions:", index_predictions)
209
+ print("Stock Predictions:", stock_predictions)
210
+
211
+
212
+
213
+
214
+ # 获取 index_feature 中最后一天的第一个值
215
+ last_index_value = index_feature[0][-1][0]
216
+
217
+ # 提取 Index Predictions 中每一天的第一个值
218
+ index_day_1 = index_predictions[0][0][0]
219
+ index_day_2 = index_predictions[0][1][0]
220
+ index_day_3 = index_predictions[0][2][0]
221
+
222
+ # 计算 impact_1_day, impact_2_day, impact_3_day
223
+ impact_1_day = (index_day_1 - last_index_value) / last_index_value
224
+ impact_2_day = (index_day_2 - index_day_1) / index_day_1
225
+ impact_3_day = (index_day_3 - index_day_2) / index_day_2
226
+
227
+ # 将 impact 值转换为百分比字符串
228
+ impact_1_day_str = f"{impact_1_day:.2%}"
229
+ impact_2_day_str = f"{impact_2_day:.2%}"
230
+ impact_3_day_str = f"{impact_3_day:.2%}"
231
+
232
+
233
+ # 如果需要返回原始预测数据进行调试,可以直接将其放到响应中
234
+ if len(affected_stock_codes) > 5:
235
+ affected_stock_codes_str = "/".join(affected_stock_codes[:3]) + f" and {len(affected_stock_codes)} other stocks"
236
+ else:
237
+ affected_stock_codes_str = "/".join(affected_stock_codes) if affected_stock_codes else "N/A"
238
+
239
+
240
+
241
+ # 针对 926 模型的修复
242
+ stock_predictions = stock_fix_for_926_model(float(X_sentiment[0][0]), stock_predictions[0], stock_feature[0][-1][0])
243
+ index_predictions = stock_fix_for_926_model(float(X_sentiment[0][0]), index_predictions[0], last_index_value)
244
+
245
+ print("Stock Predictions after fix:", stock_predictions)
246
+ print("Index Predictions after fix:", index_predictions)
247
+
248
+ # 扩展股票预测数据到分钟级别
249
+ stock_predictions = extend_stock_days_to_mins(stock_predictions)
250
+ index_predictions = extend_stock_days_to_mins(index_predictions)
251
+
252
+
253
+
254
+ # 如果需要返回原始预测数据进行调试,可以直接将其放到响应中
255
+ result = {
256
+ "news_title": input_text,
257
+ "ai_prediction_score": float(X_sentiment[0][0]), # 假设第一个预测值是 AI 预测得分
258
+ "impact_1_day": impact_1_day_str, # 计算并格式化 impact_1_day
259
+ "impact_2_day": impact_2_day_str, # 计算并格式化 impact_2_day
260
+ "impact_3_day": impact_3_day_str,
261
+ "affected_stock_codes": affected_stock_codes_str, # 动态生成受影响的股票代码
262
+ "accuracy": float(fake_accuracy),
263
+ "impact_on_stock": stock_predictions, # 第一个预测值是股票影响
264
+ "impact_on_index": index_predictions, # 第一个预测值是股票影响
265
+
266
+ }
267
+
268
+ # 缓存预测结果
269
+ prediction_cache[cache_key] = result
270
+
271
+ # 如果缓存大小超过最大限制,移除最早的缓存项
272
+ if len(prediction_cache) > CACHE_MAX_SIZE:
273
+ prediction_cache.popitem(last=False)
274
+
275
+ print(f"predict() result: {result}")
276
+
277
+ # 返回预测结果
278
+ return jsonify(result)
279
+
280
+ except Exception as e:
281
+ # 打印完整的错误堆栈信息
282
+ traceback_str = traceback.print_exc()
283
+ print(f"predict() error: {e}")
284
+ print(traceback_str)
285
+ return jsonify({"predict() error": str(e), "traceback": traceback_str})
286
+
287
+
288
+ def stock_fix_for_926_model(score, predictions, last_price):
289
+ # 修复 926 模型的预测结果
290
+ coefficient = 1.2 # 调整系数,可以根据需要微调
291
+ smoothing_factor = 0.7 # 平滑因子,控制曲线平滑度
292
+ window_size = 3 # 滚动平均窗口大小
293
+
294
+ smoothed_predictions = [] # 用于存储平滑后的预测
295
+
296
+ # day0 = predictions[0]
297
+ # day0[0] = last_price
298
+ # predictions.insert(0, day0) # 将最后一天的价格插入到预测列表的第一个位置
299
+
300
+ for i, day in enumerate(predictions):
301
+ if last_price == 0:
302
+ last_price = 1
303
+
304
+ # 计算波动系数,并限制其在一个较小的范围内
305
+ fluctuation = random.uniform(-0.01, 0.01)
306
+
307
+ # 当前预测值的修正
308
+ day[0] = ((abs(day[0]) * score * coefficient / last_price / 10 / 100) + (1 + fluctuation)) * last_price
309
+
310
+ # 滚动平均平滑
311
+ if i >= window_size:
312
+ # 计算之前窗口的平均值
313
+ smoothed_value = (sum([smoothed_predictions[j][0] for j in range(i - window_size, i)]) / window_size)
314
+ day[0] = smoothing_factor * smoothed_value + (1 - smoothing_factor) * day[0]
315
+
316
+ # 更新最后一天的价格,用于下一个迭代
317
+ last_price = day[0]
318
+
319
+ # 将平滑后的预测存入
320
+ smoothed_predictions.append(day)
321
+
322
+ return smoothed_predictions
323
+
324
+
325
+
326
+ def is_trading_time(current_time):
327
+ TRADING_START_HOUR = 9
328
+ TRADING_START_MINUTE = 30
329
+ TRADING_END_HOUR = 16
330
+ return (
331
+ current_time.hour > TRADING_START_HOUR or
332
+ (current_time.hour == TRADING_START_HOUR and current_time.minute >= TRADING_START_MINUTE)
333
+ ) and current_time.hour < TRADING_END_HOUR
334
+
335
+
336
+
337
+ def extend_stock_days_to_mins(predictions):
338
+ TRADING_START_HOUR = 9
339
+ TRADING_START_MINUTE = 30
340
+ TRADING_END_HOUR = 16
341
+ TRADING_DAYS_PER_WEEK = 5
342
+
343
+ future_data = []
344
+ current_time = datetime.now().replace(hour=TRADING_START_HOUR, minute=TRADING_START_MINUTE, second=0, microsecond=0)
345
+
346
+ # 如果当前时间是非交易日,前进到下一个交易日
347
+ while current_time.weekday() >= TRADING_DAYS_PER_WEEK:
348
+ current_time += timedelta(days=1)
349
+
350
+ for day_count in range(len(predictions)):
351
+ start_price = predictions[day_count - 1][0] if day_count > 0 else predictions[0][0]
352
+ end_price = predictions[day_count][0]
353
+ total_minutes = (TRADING_END_HOUR - TRADING_START_HOUR) * 60
354
+
355
+ minutes_elapsed = 0
356
+ while minutes_elapsed < total_minutes:
357
+ progress = minutes_elapsed / total_minutes
358
+ interpolated_price = start_price + progress * (end_price - start_price)
359
+
360
+ # 添加波动
361
+ fluctuation = random.uniform(-0.001, 0.001) # 调整波动范围
362
+ fluctuated_price = interpolated_price * (1 + fluctuation)
363
+
364
+ future_data.append({
365
+ 'time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
366
+ 'price': fluctuated_price
367
+ })
368
+
369
+ current_time += timedelta(minutes=30)
370
+ minutes_elapsed += 30
371
+
372
+ # 检查是否超出当天交易时间
373
+ if current_time.hour >= TRADING_END_HOUR:
374
+ break
375
+
376
+ # 每天的交易时间结束时,前进到下一个交易日
377
+ current_time += timedelta(days=1)
378
+ current_time = current_time.replace(hour=TRADING_START_HOUR, minute=TRADING_START_MINUTE, second=0, microsecond=0)
379
+ # 跳过周末
380
+ while current_time.weekday() >= TRADING_DAYS_PER_WEEK:
381
+ current_time += timedelta(days=1)
382
+
383
+ return future_data
384
+