mgmtprofessor committed on
Commit
d428815
·
verified ·
1 Parent(s): 0193789

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -43
app.py CHANGED
@@ -9,9 +9,8 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
9
  # Set up Streamlit app
10
  st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
11
  st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
12
- st.write("Note: If uploading a CSV file, ensure that it contains the following columns: cik, fyear, Item 1A. Item 1A should contain the respective risk factors section for each firm-year observation.")
13
- st.write("Note: If uploading a txt file, ensure it contains the respective risk factors section for each firm-year observation.")
14
-
15
  # Hugging Face model directories
16
  model_directories = {
17
  'finance': 'mgmtprofessor/finance_risk_factors',
@@ -62,6 +61,14 @@ def score_document(model, tokenizer, text_data):
62
 
63
  return probability_class_1
64
 
 
 
 
 
 
 
 
 
65
  # Dropdown to select file type
66
  file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])
67
 
@@ -75,51 +82,57 @@ if file_type == "CSV":
75
  if csv_file is not None:
76
  # Read the CSV file
77
  df = pd.read_csv(csv_file)
78
-
79
- # Ensure the "Item 1A" column is treated as text data
80
- text_data = df['Item 1A'].dropna().tolist() # Extract text from "Item 1A"
81
 
82
- # Initialize an empty DataFrame for results
83
- result_df = df.copy()
84
-
85
- # Progress bar
86
- progress_bar = st.progress(0)
87
- total_categories = len(model_directories)
88
 
89
- for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
90
- # Load the pre-trained model for the current category
91
- model, tokenizer = load_model(category)
 
 
 
 
 
92
 
93
- # Skip the category if model loading fails
94
- if model is not None:
95
- # Score the document for each row in the text data
96
- category_scores = []
97
- for text in text_data:
98
- probability = score_document(model, tokenizer, text)
99
- category_scores.append(probability[0]) # Extract the first (and only) value
100
-
101
- # Add the results to the DataFrame
102
- result_df[category.capitalize()] = category_scores
103
 
104
- # Update the progress bar
105
- progress_bar.progress((i + 1) / total_categories)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- # Estimate remaining time
108
- elapsed_time = time.time() - start_time
109
- estimated_total_time = (elapsed_time / (i + 1)) * total_categories
110
- st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
111
-
112
- # Save results to CSV
113
- csv = result_df.to_csv(index=False).encode('utf-8')
114
- st.download_button(
115
- label="Download results as CSV",
116
- data=csv,
117
- file_name="document_scoring_results.csv",
118
- mime="text/csv",
119
- )
120
-
121
- # Display completion message
122
- st.success("Document scoring complete!")
123
 
124
  elif file_type == "TXT":
125
  doc_file = st.file_uploader("Upload a TXT file", type=["txt"])
 
9
  # Set up Streamlit app
10
  st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
11
  st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
12
+ st.write("If uploading a CSV file, ensure that it contains the following columns: cik, fyear, Item 1A. Item 1A should contain the respective risk factors section for each firm-year observation.")
13
+ st.write("If uploading a txt file, ensure it contains the respective risk factors section for each firm-year observation.")
 
14
  # Hugging Face model directories
15
  model_directories = {
16
  'finance': 'mgmtprofessor/finance_risk_factors',
 
61
 
62
  return probability_class_1
63
 
64
+ # Function to find the relevant text column
65
+ def get_text_column(df):
66
+ possible_columns = ['Item 1A', 'Item 1A.', 'Item 1A. Risk Factors']
67
+ for col in possible_columns:
68
+ if col in df.columns:
69
+ return col
70
+ return None # Return None if no matching column is found
71
+
72
  # Dropdown to select file type
73
  file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])
74
 
 
82
  if csv_file is not None:
83
  # Read the CSV file
84
  df = pd.read_csv(csv_file)
 
 
 
85
 
86
+ # Find the relevant text column
87
+ text_column = get_text_column(df)
 
 
 
 
88
 
89
+ if text_column is None:
90
+ st.error("No valid text column found. Please ensure your CSV contains 'Item 1A', 'Item 1A.', or 'Item 1A. Risk Factors'.")
91
+ else:
92
+ # Extract text data from the identified column
93
+ text_data = df[text_column].dropna().tolist() # Extracts all non-empty rows
94
+
95
+ # Initialize an empty DataFrame for results
96
+ result_df = df.copy()
97
 
98
+ # Progress bar
99
+ progress_bar = st.progress(0)
100
+ total_categories = len(model_directories)
 
 
 
 
 
 
 
101
 
102
+ for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
103
+ # Load the pre-trained model for the current category
104
+ model, tokenizer = load_model(category)
105
+
106
+ # Skip the category if model loading fails
107
+ if model is not None:
108
+ # Score the document for each row in the text data
109
+ category_scores = []
110
+ for text in text_data:
111
+ probability = score_document(model, tokenizer, text)
112
+ category_scores.append(probability[0]) # Extract the first (and only) value
113
+
114
+ # Add the results to the DataFrame
115
+ result_df[category.capitalize()] = category_scores
116
+
117
+ # Update the progress bar
118
+ progress_bar.progress((i + 1) / total_categories)
119
+
120
+ # Estimate remaining time
121
+ elapsed_time = time.time() - start_time
122
+ estimated_total_time = (elapsed_time / (i + 1)) * total_categories
123
+ st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
124
 
125
+ # Save results to CSV
126
+ csv = result_df.to_csv(index=False).encode('utf-8')
127
+ st.download_button(
128
+ label="Download results as CSV",
129
+ data=csv,
130
+ file_name="document_scoring_results.csv",
131
+ mime="text/csv",
132
+ )
133
+
134
+ # Display completion message
135
+ st.success("Document scoring complete!")
 
 
 
 
 
136
 
137
  elif file_type == "TXT":
138
  doc_file = st.file_uploader("Upload a TXT file", type=["txt"])