Spaces:
Running
Running
import os | |
from huggingface_hub import login | |
from huggingface_hub import hf_hub_download | |
import keras | |
from collections import OrderedDict | |
import hashlib | |
import random | |
import traceback | |
import numpy as np | |
from datetime import datetime, timedelta | |
import os | |
from RequestModel import PredictRequest | |
from app import TextRequest | |
from us_stock import find_stock_codes_or_names | |
os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
# 设置环境变量,指定 Hugging Face 缓存路径 | |
os.environ["HF_HOME"] = "/tmp/huggingface" | |
# 加载模型 | |
model = None | |
if model is None: | |
# 从环境变量中获取 Hugging Face token | |
hf_token = os.environ.get("HF_Token") | |
# 使用 Hugging Face API token 登录 (确保只读权限) | |
if hf_token: | |
login(token=hf_token) | |
else: | |
raise ValueError("Hugging Face token not found in environment variables.") | |
# 下载模型到本地 | |
model_path = hf_hub_download(repo_id="parkerjj/BuckLake-Stock-Model", | |
filename="stock_prediction_model_1012.keras", | |
use_auth_token=hf_token) | |
# 使用 Keras 加载模型 | |
os.environ["KERAS_BACKEND"] = "jax" | |
model = keras.saving.load_model(model_path) | |
model.summary() | |
# 创建缓存字典 | |
# 创建缓存字典,使用 OrderedDict 以维护插入顺序 | |
prediction_cache = OrderedDict() | |
# 缓存最大大小 | |
CACHE_MAX_SIZE = 512 | |
# 生成唯一键值函数 | |
def generate_key(lemmatized_entry): | |
# 获取当前日期,例如 '20241010' | |
current_date = datetime.now().strftime('%Y%m%d') | |
# 将 lemmatized_entry 中的单词连接成字符串,并与当前日期组合生成 MD5 哈希值 | |
combined_text = f"{''.join(lemmatized_entry)}{current_date}" | |
return hashlib.md5(combined_text.encode()).hexdigest() | |
# 生成符合正态分布的伪精准度值 | |
def generate_fake_accuracy(): | |
# 正态分布随机数,均值 0.6,标准差 0.1,限制在 0.4 到 0.8 之间 | |
fake_accuracy = np.clip(np.random.normal(0.6, 0.1), 0.4, 0.9) | |
return round(fake_accuracy, 5) | |
def predict(text: str, stock_codes: list): | |
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore | |
from preprocess import get_document_vector, get_stock_info, preprocessing_entry, process_entities, process_pos_tags, processing_entry | |
try: | |
input_text = text | |
affected_stock_codes = stock_codes | |
print(f"predict() Input text: {input_text}") | |
# 使用预处理函数处理文本 | |
processed_entry = processing_entry(input_text) | |
# 解包 processed_entry 中的各个值 | |
lemmatized_entry, pos_tag, ner, dependency_parsing, sentiment_score = processed_entry | |
# 分别打印每个变量,便于调试 | |
print("Lemmatized Entry:", lemmatized_entry) | |
print("POS Tagging:", pos_tag) | |
print("Named Entity Recognition:", ner) | |
print("Dependency Parsing:", dependency_parsing) | |
print("Sentiment Score:", sentiment_score) | |
if affected_stock_codes is None: | |
# 从 NER 结果中提取相关的股票代码或公司名称 | |
affected_stock_codes = find_stock_codes_or_names(ner) | |
# 生成唯一键值 | |
cache_key = generate_key(lemmatized_entry) | |
# 检查缓存中是否已有结果 | |
if cache_key in prediction_cache: | |
print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry} value: {prediction_cache[cache_key]}" ) | |
return prediction_cache[cache_key] | |
# 调用 get_stock_info 函数 | |
previous_stock_history, _, previous_stock_inx_index_history, previous_stock_dj_index_history, previous_stock_ixic_index_history, previous_stock_ndx_index_history, _, _, _, _ = get_stock_info(affected_stock_codes) | |
def ensure_fixed_shape(data, shape, variable_name=""): | |
data = np.array(data) | |
if data.shape != shape: | |
fixed_data = np.full(shape, -1) | |
min_shape = tuple(min(s1, s2) for s1, s2 in zip(data.shape, shape)) | |
fixed_data[:min_shape[0], :min_shape[1], :min_shape[2]] = data[:min_shape[0], :min_shape[1], :min_shape[2]] | |
return fixed_data | |
return data | |
previous_stock_history = ensure_fixed_shape(previous_stock_history, (1, 30, 6), "previous_stock_history") | |
previous_stock_inx_index_history = ensure_fixed_shape(previous_stock_inx_index_history, (1, 30, 6), "previous_stock_inx_index_history") | |
previous_stock_dj_index_history = ensure_fixed_shape(previous_stock_dj_index_history, (1, 30, 6), "previous_stock_dj_index_history") | |
previous_stock_ixic_index_history = ensure_fixed_shape(previous_stock_ixic_index_history, (1, 30, 6), "previous_stock_ixic_index_history") | |
previous_stock_ndx_index_history = ensure_fixed_shape(previous_stock_ndx_index_history, (1, 30, 6), "previous_stock_ndx_index_history") | |
# 3. 将特征转换为适合模型输入的形状 | |
# 这里假设文本、POS、实体识别等是向量,时间序列特征是 (sequence_length, feature_dim) 的形状 | |
# POS 和 NER 特征处理 | |
# 只取 POS Tagging 的第二部分(即 POS 标签的字母形式)进行处理 | |
pos_results = [process_pos_tags(pos_tag[1])[0]] # 传入 POS 标签列表 | |
ner_results = [process_entities(ner)[0]] # 假设是单个输入 | |
print("POS Results:", pos_results) | |
print("NER Results:", ner_results) | |
# 使用与模型定义一致的 pos_tag_dim 和 entity_dim | |
pos_tag_dim = 1024 # 你需要根据模型定义来确定 | |
entity_dim = 1024 # 你需要根据模型定义来确定 | |
# 调整 max_length 为与 pos_tag_dim 和 entity_dim 一致的值 | |
X_pos_tags = pad_sequences(pos_results, maxlen=pos_tag_dim, padding='post', truncating='post', dtype='float32') | |
X_entities = pad_sequences(ner_results, maxlen=entity_dim, padding='post', truncating='post', dtype='float32') | |
# 确保形状为 (1, 1024) | |
X_pos_tags = X_pos_tags.reshape(1, -1) | |
X_entities = X_entities.reshape(1, -1) | |
# Word2Vec 向量处理 | |
lemmatized_words = lemmatized_entry # 这里是 lemmatized_entry 的结果 | |
X_word2vec = np.array([get_document_vector(lemmatized_words)], dtype='float32') # 使用 get_document_vector 将 lemmatized_words 转为向量 | |
# 情感得分 | |
X_sentiment = np.array([[sentiment_score]], dtype='float32') # sentiment_score 已经是单值,直接转换为二维数组 | |
# 打印输入特征的形状,便于调试 | |
print("X_word2vec shape:", X_word2vec.shape) | |
print("X_pos_tags shape:", X_pos_tags.shape) | |
print("X_entities shape:", X_entities.shape) | |
print("X_sentiment shape:", X_sentiment.shape) | |
# 静态特征 | |
X_word2vec = ensure_fixed_shape(X_word2vec, (1, 300), "X_word2vec") | |
X_pos_tags = ensure_fixed_shape(X_pos_tags, (1, 1024), "X_pos_tags") | |
X_entities = ensure_fixed_shape(X_entities, (1, 1024), "X_entities") | |
X_sentiment = ensure_fixed_shape(X_sentiment, (1, 1), "X_sentiment") | |
features = [ | |
X_word2vec, X_pos_tags, X_entities, X_sentiment, | |
previous_stock_inx_index_history, previous_stock_dj_index_history, | |
previous_stock_ixic_index_history, previous_stock_ndx_index_history, | |
previous_stock_history | |
] | |
# 打印特征数组的每个元素的形状,便于调试 | |
# for i, feature in enumerate(features): | |
# print(f"Feature {i} shape: {feature.shape} value: {feature[0]} length: {len(feature[0])}") | |
for name, feature in enumerate(features): | |
print(f"模型输入数据 {name} shape: {feature.shape}") | |
for layer in model.input: | |
print(f"模型所需的输入层 {layer.name}, 形状: {layer.shape}") | |
# 使用模型进行预测 | |
predictions = model.predict(features) | |
# 生成伪精准度值 | |
fake_accuracy = generate_fake_accuracy() | |
# 将 predictions 中的每个数组转换为 Python 列表 | |
index_inx_predictions = predictions[0].tolist() | |
index_dj_predictions = predictions[1].tolist() | |
index_ixic_predictions = predictions[2].tolist() | |
index_ndx_predictions = predictions[3].tolist() | |
stock_predictions = predictions[4].tolist() | |
# 打印预测结果,便于调试 | |
#print("Index INX Predictions:", index_inx_predictions) | |
#print("Index DJ Predictions:", index_dj_predictions) | |
#print("Index IXIC Predictions:", index_ixic_predictions) | |
#print("Index NDX Predictions:", index_ndx_predictions) | |
#print("Stock Predictions:", stock_predictions) | |
# 获取 index_feature 中最后一天的第一个值 | |
last_index_inx_value = previous_stock_inx_index_history[0][-1][0] | |
last_index_dj_value = previous_stock_dj_index_history[0][-1][0] | |
last_index_ixic_value = previous_stock_ixic_index_history[0][-1][0] | |
last_index_ndx_value = previous_stock_ndx_index_history[0][-1][0] | |
# 针对 1012 模型的修复 | |
stock_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), stock_predictions[0], previous_stock_history[0][-1][0]) | |
index_inx_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_inx_predictions[0], last_index_inx_value) | |
index_dj_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_dj_predictions[0], last_index_dj_value) | |
index_ixic_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_ixic_predictions[0], last_index_ixic_value) | |
index_ndx_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_ndx_predictions[0], last_index_ndx_value) | |
#print("Stock Predictions after fix:", stock_predictions) | |
#print("Index INX Predictions after fix:", index_inx_predictions) | |
#print("Index DJ Predictions after fix:", index_dj_predictions) | |
#print("Index IXIC Predictions after fix:", index_ixic_predictions) | |
#print("Index NDX Predictions after fix:", index_ndx_predictions) | |
# 提取 Index Predictions 中每一天的第一个值 | |
index_inx_day_1 = index_inx_predictions[0][0] | |
index_inx_day_2 = index_inx_predictions[1][0] | |
index_inx_day_3 = index_inx_predictions[2][0] | |
index_dj_day_1 = index_dj_predictions[0][0] | |
index_dj_day_2 = index_dj_predictions[1][0] | |
index_dj_day_3 = index_dj_predictions[2][0] | |
index_ixic_day_1 = index_ixic_predictions[0][0] | |
index_ixic_day_2 = index_ixic_predictions[1][0] | |
index_ixic_day_3 = index_ixic_predictions[2][0] | |
index_ndx_day_1 = index_ndx_predictions[0][0] | |
index_ndx_day_2 = index_ndx_predictions[1][0] | |
index_ndx_day_3 = index_ndx_predictions[2][0] | |
# 计算 impact_1_day, impact_2_day, impact_3_day | |
impact_inx_1_day = (index_inx_day_1 - last_index_inx_value) / last_index_inx_value | |
impact_inx_2_day = (index_inx_day_2 - index_inx_day_1) / index_inx_day_1 | |
impact_inx_3_day = (index_inx_day_3 - index_inx_day_2) / index_inx_day_2 | |
impact_dj_1_day = (index_dj_day_1 - last_index_dj_value) / last_index_dj_value | |
impact_dj_2_day = (index_dj_day_2 - index_dj_day_1) / index_dj_day_1 | |
impact_dj_3_day = (index_dj_day_3 - index_dj_day_2) / index_dj_day_2 | |
impact_ixic_1_day = (index_ixic_day_1 - last_index_ixic_value) / last_index_ixic_value | |
impact_ixic_2_day = (index_ixic_day_2 - index_ixic_day_1) / index_ixic_day_1 | |
impact_ixic_3_day = (index_ixic_day_3 - index_ixic_day_2) / index_ixic_day_2 | |
impact_ndx_1_day = (index_ndx_day_1 - last_index_ndx_value) / last_index_ndx_value | |
impact_ndx_2_day = (index_ndx_day_2 - index_ndx_day_1) / index_ndx_day_1 | |
impact_ndx_3_day = (index_ndx_day_3 - index_ndx_day_2) / index_ndx_day_2 | |
# 将 impact 值转换为百分比字符串 | |
impact_inx_1_day_str = f"{impact_inx_1_day:.2%}" | |
impact_inx_2_day_str = f"{impact_inx_2_day:.2%}" | |
impact_inx_3_day_str = f"{impact_inx_3_day:.2%}" | |
impact_dj_1_day_str = f"{impact_dj_1_day:.2%}" | |
impact_dj_2_day_str = f"{impact_dj_2_day:.2%}" | |
impact_dj_3_day_str = f"{impact_dj_3_day:.2%}" | |
impact_ixic_1_day_str = f"{impact_ixic_1_day:.2%}" | |
impact_ixic_2_day_str = f"{impact_ixic_2_day:.2%}" | |
impact_ixic_3_day_str = f"{impact_ixic_3_day:.2%}" | |
impact_ndx_1_day_str = f"{impact_ndx_1_day:.2%}" | |
impact_ndx_2_day_str = f"{impact_ndx_2_day:.2%}" | |
impact_ndx_3_day_str = f"{impact_ndx_3_day:.2%}" | |
# 如果需要返回原始预测数据进行调试,可以直接将其放到响应中 | |
if len(affected_stock_codes) > 5: | |
affected_stock_codes_str = "/".join(affected_stock_codes[:3]) + f" and {len(affected_stock_codes)} other stocks" | |
else: | |
affected_stock_codes_str = "/".join(affected_stock_codes) if affected_stock_codes else "N/A" | |
# 扩展股票预测数据到分钟级别 | |
stock_predictions = extend_stock_days_to_mins(stock_predictions) | |
index_inx_predictions = extend_stock_days_to_mins(index_inx_predictions) | |
index_dj_predictions = extend_stock_days_to_mins(index_dj_predictions) | |
index_ixic_predictions = extend_stock_days_to_mins(index_ixic_predictions) | |
index_ndx_predictions = extend_stock_days_to_mins(index_ndx_predictions) | |
# 如果需要返回原始预测数据进行调试,可以直接将其放到响应中 | |
result = { | |
"news_title": input_text, | |
"ai_prediction_score": float(X_sentiment[0][0]), # 假设第一个预测值是 AI 预测得分 | |
"impact_inx_1_day": impact_inx_1_day_str, # 计算并格式化 impact_1_day | |
"impact_inx_2_day": impact_inx_2_day_str, # 计算并格式化 impact_2_day | |
"impact_inx_3_day": impact_inx_3_day_str, | |
"impact_dj_1_day": impact_dj_1_day_str, # 计算并格式化 impact_1_day | |
"impact_dj_2_day": impact_dj_2_day_str, # 计算并格式化 impact_2_day | |
"impact_dj_3_day": impact_dj_3_day_str, | |
"impact_ixic_1_day": impact_ixic_1_day_str, # 计算并格式化 impact_1_day | |
"impact_ixic_2_day": impact_ixic_2_day_str, # 计算并格式化 impact_2_day | |
"impact_ixic_3_day": impact_ixic_3_day_str, | |
"impact_ndx_1_day": impact_ndx_1_day_str, # 计算并格式化 impact_1_day | |
"impact_ndx_2_day": impact_ndx_2_day_str, # 计算并格式化 impact_2_day | |
"impact_ndx_3_day": impact_ndx_3_day_str, | |
"affected_stock_codes": affected_stock_codes_str, # 动态生成受影响的股票代码 | |
"accuracy": float(fake_accuracy), | |
"impact_on_stock": stock_predictions, # 第一个预测值是股票影响 | |
"impact_on_index_inx": index_inx_predictions, # 第一个预测值是股票影响 | |
"impact_on_index_dj": index_dj_predictions, # 第一个预测值是股票影响 | |
"impact_on_index_ixic": index_ixic_predictions, # 第一个预测值是股票影响 | |
"impact_on_index_ndx": index_ndx_predictions, # 第一个预测值是股票影响 | |
} | |
# 缓存预测结果 | |
prediction_cache[cache_key] = result | |
# 如果缓存大小超过最大限制,移除最早的缓存项 | |
if len(prediction_cache) > CACHE_MAX_SIZE: | |
prediction_cache.popitem(last=False) | |
#print(f"predict() result: {result}") | |
# 返回预测结果 | |
return result | |
except Exception as e: | |
# 打印完整的错误堆栈信息 | |
traceback_str = traceback.print_exc() | |
print(f"predict() error: {e}") | |
print(traceback_str) | |
return {"predict() error": str(e), "traceback": traceback_str} | |
def stock_fix_for_1012_model(score, predictions, last_prices): | |
""" | |
修复 1012 模型的预测结果,支持多特征处理。 | |
:param score: 模型评分,用于调整预测结果。 | |
:param predictions: 模型的原始预测结果,形状为 (days, features)。 | |
:param last_prices: 每个特征的最后价格,。 | |
:return: 修正后的预测结果,形状与输入一致。 | |
""" | |
coefficient = 1.2 # 调整系数,可以根据需要微调 | |
smoothing_factor = 0.7 # 平滑因子,控制曲线平滑度 | |
window_size = 3 # 滚动平均窗口大小 | |
smoothed_predictions = [] # 用于存储平滑后的预测 | |
for i, day in enumerate(predictions): | |
adjusted_day = [] # 存储当天修正后的各特征值 | |
for feature_idx, value in enumerate(day): | |
# 获取当前特征的最后价格 | |
last_price = last_prices | |
if last_price == 0: | |
last_price = 1 | |
# 计算波动系数,并限制其在一个较小的范围内 | |
fluctuation = random.uniform(-0.01, 0.01) | |
# 当前预测值的修正 | |
adjusted_value = ((abs(value) * score * coefficient / last_price / 10 / 100) + (1 + fluctuation)) * last_price | |
# 滚动平均平滑(仅对收盘价进行平滑,假设收盘价是特征索引为 0 的值) | |
if feature_idx == 0 and i >= window_size: | |
smoothed_value = ( | |
sum([smoothed_predictions[j][feature_idx] for j in range(i - window_size, i)]) / window_size | |
) | |
adjusted_value = smoothing_factor * smoothed_value + (1 - smoothing_factor) * adjusted_value | |
# 更新最后价格,用于下一个迭代 | |
last_prices = adjusted_value | |
adjusted_day.append(adjusted_value) | |
# 将修正后的预测存入 | |
smoothed_predictions.append(adjusted_day) | |
return smoothed_predictions | |
def is_trading_time(current_time): | |
TRADING_START_HOUR = 9 | |
TRADING_START_MINUTE = 30 | |
TRADING_END_HOUR = 16 | |
return ( | |
current_time.hour > TRADING_START_HOUR or | |
(current_time.hour == TRADING_START_HOUR and current_time.minute >= TRADING_START_MINUTE) | |
) and current_time.hour < TRADING_END_HOUR | |
def extend_stock_days_to_mins(predictions): | |
TRADING_START_HOUR = 9 | |
TRADING_START_MINUTE = 30 | |
TRADING_END_HOUR = 16 | |
TRADING_DAYS_PER_WEEK = 5 | |
future_data = [] | |
current_time = datetime.now().replace(hour=TRADING_START_HOUR, minute=TRADING_START_MINUTE, second=0, microsecond=0) | |
# 如果当前时间是非交易日,前进到下一个交易日 | |
while current_time.weekday() >= TRADING_DAYS_PER_WEEK: | |
current_time += timedelta(days=1) | |
for day_count in range(len(predictions)): | |
start_price = predictions[day_count - 1][0] if day_count > 0 else predictions[0][0] | |
end_price = predictions[day_count][0] | |
total_minutes = (TRADING_END_HOUR - TRADING_START_HOUR) * 60 | |
minutes_elapsed = 0 | |
while minutes_elapsed < total_minutes: | |
progress = minutes_elapsed / total_minutes | |
interpolated_price = start_price + progress * (end_price - start_price) | |
# 添加波动 | |
fluctuation = random.uniform(-0.001, 0.001) # 调整波动范围 | |
fluctuated_price = interpolated_price * (1 + fluctuation) | |
future_data.append({ | |
'time': current_time.strftime('%Y-%m-%d %H:%M:%S'), | |
'price': fluctuated_price | |
}) | |
current_time += timedelta(minutes=30) | |
minutes_elapsed += 30 | |
# 检查是否超出当天交易时间 | |
if current_time.hour >= TRADING_END_HOUR: | |
break | |
# 每天的交易时间结束时,前进到下一个交易日 | |
current_time += timedelta(days=1) | |
current_time = current_time.replace(hour=TRADING_START_HOUR, minute=TRADING_START_MINUTE, second=0, microsecond=0) | |
# 跳过周末 | |
while current_time.weekday() >= TRADING_DAYS_PER_WEEK: | |
current_time += timedelta(days=1) | |
return future_data | |