Spaces:

mgmtprofessor
/

risk_factors_scoring

Sleeping

App Files Files Community

mgmtprofessor committed on Oct 17, 2024

Commit

d428815

verified ·

1 Parent(s): 0193789

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -43

app.py CHANGED Viewed

@@ -9,9 +9,8 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # Set up Streamlit app
 st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
 st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
-st.write("Note: If uploading a CSV file, ensure that it contains the following columns: cik, fyear, Item 1A. Item 1A should contain the respective risk factors section for each firm-year observation.")
-st.write("Note: If uploading a txt file, ensure it contains the respective risk factors section for each firm-year observation.")
 # Hugging Face model directories
 model_directories = {
     'finance': 'mgmtprofessor/finance_risk_factors',
@@ -62,6 +61,14 @@ def score_document(model, tokenizer, text_data):
     return probability_class_1
 # Dropdown to select file type
 file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])
@@ -75,51 +82,57 @@ if file_type == "CSV":
     if csv_file is not None:
         # Read the CSV file
         df = pd.read_csv(csv_file)
-        # Ensure the "Item 1A" column is treated as text data
-        text_data = df['Item 1A'].dropna().tolist()  # Extract text from "Item 1A"
-        # Initialize an empty DataFrame for results
-        result_df = df.copy()
-        # Progress bar
-        progress_bar = st.progress(0)
-        total_categories = len(model_directories)
-        for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
-            # Load the pre-trained model for the current category
-            model, tokenizer = load_model(category)
-            # Skip the category if model loading fails
-            if model is not None:
-                # Score the document for each row in the text data
-                category_scores = []
-                for text in text_data:
-                    probability = score_document(model, tokenizer, text)
-                    category_scores.append(probability[0])  # Extract the first (and only) value
-                # Add the results to the DataFrame
-                result_df[category.capitalize()] = category_scores
-            # Update the progress bar
-            progress_bar.progress((i + 1) / total_categories)
-            # Estimate remaining time
-            elapsed_time = time.time() - start_time
-            estimated_total_time = (elapsed_time / (i + 1)) * total_categories
-            st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
-        # Save results to CSV
-        csv = result_df.to_csv(index=False).encode('utf-8')
-        st.download_button(
-            label="Download results as CSV",
-            data=csv,
-            file_name="document_scoring_results.csv",
-            mime="text/csv",
-        )
-        # Display completion message
-        st.success("Document scoring complete!")
 elif file_type == "TXT":
     doc_file = st.file_uploader("Upload a TXT file", type=["txt"])

 # Set up Streamlit app
 st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
 st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
+st.write("If uploading a CSV file, ensure that it contains the following columns: cik, fyear, Item 1A. Item 1A should contain the respective risk factors section for each firm-year observation.")
+st.write("If uploading a txt file, ensure it contains the respective risk factors section for each firm-year observation.")
 # Hugging Face model directories
 model_directories = {
     'finance': 'mgmtprofessor/finance_risk_factors',
     return probability_class_1
+# Function to find the relevant text column
+def get_text_column(df):
+    """Return the name of the first recognized risk-factor text column in df.
+
+    The candidate names are checked in the order listed below, and the first
+    one present in ``df.columns`` is returned. The comparison is a plain
+    membership test on the column labels, so a header that differs in case or
+    surrounding whitespace is not recognized.
+
+    Returns None when none of the candidate names is present.
+    """
+    possible_columns = ['Item 1A', 'Item 1A.', 'Item 1A. Risk Factors']
+    for col in possible_columns:
+        if col in df.columns:
+            return col
+    return None  # Return None if no matching column is found
 # Dropdown to select file type
 file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])
     if csv_file is not None:
         # Read the CSV file
         df = pd.read_csv(csv_file)
+        # Find the relevant text column
+        text_column = get_text_column(df)
+        if text_column is None:
+            st.error("No valid text column found. Please ensure your CSV contains 'Item 1A', 'Item 1A.', or 'Item 1A. Risk Factors'.")
+        else:
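+            # Extract text data from the identified column. NOTE(review): dropna() can leave fewer texts than df has rows, in which case assigning the per-category scores to result_df below fails with a length mismatch.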
+            text_data = df[text_column].dropna().tolist()  # Extracts all non-empty rows
+            # Start the results DataFrame as a copy of the input so the original columns are kept
+            result_df = df.copy()
+            # Progress bar
+            progress_bar = st.progress(0)
+            total_categories = len(model_directories)
+            for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
+                # Load the pre-trained model for the current category
+                model, tokenizer = load_model(category)
+                # Skip the category if model loading fails
+                if model is not None:
+                    # Score the document for each row in the text data
+                    category_scores = []
+                    for text in text_data:
+                        probability = score_document(model, tokenizer, text)
+                        category_scores.append(probability[0])  # Extract the first (and only) value
+                    # Add the results to the DataFrame
+                    result_df[category.capitalize()] = category_scores
+                # Update the progress bar
+                progress_bar.progress((i + 1) / total_categories)
+                # Estimate remaining time
+                elapsed_time = time.time() - start_time
+                estimated_total_time = (elapsed_time / (i + 1)) * total_categories
+                st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
+            # Save results to CSV
+            csv = result_df.to_csv(index=False).encode('utf-8')
+            st.download_button(
+                label="Download results as CSV",
+                data=csv,
+                file_name="document_scoring_results.csv",
+                mime="text/csv",
+            )
+            # Display completion message
+            st.success("Document scoring complete!")
 elif file_type == "TXT":
     doc_file = st.file_uploader("Upload a TXT file", type=["txt"])