Update app.py
Browse files
app.py
CHANGED
@@ -9,9 +9,8 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
|
9 |
# Set up Streamlit app
|
10 |
st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
|
11 |
st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
|
12 |
-
st.write("
|
13 |
-
st.write("
|
14 |
-
|
15 |
# Hugging Face model directories
|
16 |
model_directories = {
|
17 |
'finance': 'mgmtprofessor/finance_risk_factors',
|
@@ -62,6 +61,14 @@ def score_document(model, tokenizer, text_data):
|
|
62 |
|
63 |
return probability_class_1
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
# Dropdown to select file type
|
66 |
file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])
|
67 |
|
@@ -75,51 +82,57 @@ if file_type == "CSV":
|
|
75 |
if csv_file is not None:
|
76 |
# Read the CSV file
|
77 |
df = pd.read_csv(csv_file)
|
78 |
-
|
79 |
-
# Ensure the "Item 1A" column is treated as text data
|
80 |
-
text_data = df['Item 1A'].dropna().tolist() # Extract text from "Item 1A"
|
81 |
|
82 |
-
#
|
83 |
-
|
84 |
-
|
85 |
-
# Progress bar
|
86 |
-
progress_bar = st.progress(0)
|
87 |
-
total_categories = len(model_directories)
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
#
|
94 |
-
|
95 |
-
|
96 |
-
category_scores = []
|
97 |
-
for text in text_data:
|
98 |
-
probability = score_document(model, tokenizer, text)
|
99 |
-
category_scores.append(probability[0]) # Extract the first (and only) value
|
100 |
-
|
101 |
-
# Add the results to the DataFrame
|
102 |
-
result_df[category.capitalize()] = category_scores
|
103 |
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
-
#
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
mime="text/csv",
|
119 |
-
)
|
120 |
-
|
121 |
-
# Display completion message
|
122 |
-
st.success("Document scoring complete!")
|
123 |
|
124 |
elif file_type == "TXT":
|
125 |
doc_file = st.file_uploader("Upload a TXT file", type=["txt"])
|
|
|
9 |
# Set up Streamlit app
|
10 |
st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
|
11 |
st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
|
12 |
+
st.write("If uploading a CSV file, ensure that it contains the following columns: cik, fyear, Item 1A. Item 1A should contain the respective risk factors section for each firm-year observation.")
|
13 |
+
st.write("If uploading a txt file, ensure it contains the respective risk factors section for each firm-year observation.")
|
|
|
14 |
# Hugging Face model directories
|
15 |
model_directories = {
|
16 |
'finance': 'mgmtprofessor/finance_risk_factors',
|
|
|
61 |
|
62 |
return probability_class_1
|
63 |
|
64 |
+
# Function to find the relevant text column
|
65 |
+
def get_text_column(df):
    """Return the name of the DataFrame column that holds the Item 1A text.

    The accepted headers are 'Item 1A', 'Item 1A.' and
    'Item 1A. Risk Factors', checked in that priority order.

    Args:
        df: A pandas DataFrame (typically loaded from the uploaded CSV).

    Returns:
        The matching column label exactly as it appears in ``df.columns``,
        or ``None`` when no accepted header is present.
    """
    possible_columns = ('Item 1A', 'Item 1A.', 'Item 1A. Risk Factors')

    # Exact matches win, in priority order, so well-formed files behave
    # exactly as before.
    for col in possible_columns:
        if col in df.columns:
            return col

    # Fallback: hand-edited or exported CSVs often carry stray whitespace or
    # different casing in the header row; compare on a normalized form but
    # return the original label so it can be used to index the DataFrame.
    normalized = {}
    for label in df.columns:
        key = " ".join(str(label).split()).casefold()
        normalized.setdefault(key, label)  # keep the first (left-most) column

    for col in possible_columns:
        key = col.casefold()
        if key in normalized:
            return normalized[key]

    return None  # Return None if no matching column is found
|
71 |
+
|
72 |
# Dropdown to select file type
|
73 |
file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])
|
74 |
|
|
|
82 |
if csv_file is not None:
|
83 |
# Read the CSV file
|
84 |
df = pd.read_csv(csv_file)
|
|
|
|
|
|
|
85 |
|
86 |
+
# Find the relevant text column
|
87 |
+
text_column = get_text_column(df)
|
|
|
|
|
|
|
|
|
88 |
|
89 |
+
if text_column is None:
|
90 |
+
st.error("No valid text column found. Please ensure your CSV contains 'Item 1A', 'Item 1A.', or 'Item 1A. Risk Factors'.")
|
91 |
+
else:
|
92 |
+
# Extract text data from the identified column
|
93 |
+
text_data = df[text_column].dropna().tolist() # Extracts all non-empty rows
|
94 |
+
|
95 |
+
# Initialize an empty DataFrame for results
|
96 |
+
result_df = df.copy()
|
97 |
|
98 |
+
# Progress bar
|
99 |
+
progress_bar = st.progress(0)
|
100 |
+
total_categories = len(model_directories)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
+
for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
|
103 |
+
# Load the pre-trained model for the current category
|
104 |
+
model, tokenizer = load_model(category)
|
105 |
+
|
106 |
+
# Skip the category if model loading fails
|
107 |
+
if model is not None:
|
108 |
+
# Score the document for each row in the text data
|
109 |
+
category_scores = []
|
110 |
+
for text in text_data:
|
111 |
+
probability = score_document(model, tokenizer, text)
|
112 |
+
category_scores.append(probability[0]) # Extract the first (and only) value
|
113 |
+
|
114 |
+
# Add the results to the DataFrame
|
115 |
+
result_df[category.capitalize()] = category_scores
|
116 |
+
|
117 |
+
# Update the progress bar
|
118 |
+
progress_bar.progress((i + 1) / total_categories)
|
119 |
+
|
120 |
+
# Estimate remaining time
|
121 |
+
elapsed_time = time.time() - start_time
|
122 |
+
estimated_total_time = (elapsed_time / (i + 1)) * total_categories
|
123 |
+
st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
|
124 |
|
125 |
+
# Save results to CSV
|
126 |
+
csv = result_df.to_csv(index=False).encode('utf-8')
|
127 |
+
st.download_button(
|
128 |
+
label="Download results as CSV",
|
129 |
+
data=csv,
|
130 |
+
file_name="document_scoring_results.csv",
|
131 |
+
mime="text/csv",
|
132 |
+
)
|
133 |
+
|
134 |
+
# Display completion message
|
135 |
+
st.success("Document scoring complete!")
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
elif file_type == "TXT":
|
138 |
doc_file = st.file_uploader("Upload a TXT file", type=["txt"])
|