File size: 7,855 Bytes
442a9e0
 
 
 
 
 
 
 
 
 
9baf7c3
fa4a747
d428815
442a9e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9962ace
442a9e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9962ace
442a9e0
299a1b8
442a9e0
d428815
 
fa4a747
d428815
 
 
 
 
9baf7c3
 
442a9e0
 
 
 
9baf7c3
 
 
442a9e0
9baf7c3
 
 
299a1b8
d428815
 
442a9e0
d428815
fa4a747
d428815
 
 
 
 
 
9baf7c3
d428815
 
 
9baf7c3
d428815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9baf7c3
d428815
 
 
 
 
 
 
 
9baf7c3
d428815
 
9baf7c3
 
 
299a1b8
9baf7c3
 
 
 
 
9962ace
9baf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9962ace
9baf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import os
import streamlit as st
import torch
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set up Streamlit app
# Page header: a title followed by three usage notes describing the two
# accepted upload formats (CSV with a text column, or a single TXT file).
# These calls run at module level, i.e. every time the script is executed.
st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
st.write("If uploading a CSV file, ensure that it contains the following columns: cik, fyear, Item 1A (or Text). Item 1A should contain the respective risk factors section for each firm-year observation.")
st.write("If uploading a txt file, ensure it contains the respective risk factors section for each firm-year observation.")
# Hugging Face model directories
# Maps each risk-factor category to the model identifier that `load_model`
# hands to `from_pretrained`. The scoring loops iterate over these keys in
# insertion order; in the CSV flow the capitalized key is also used as the
# name of the result column.
model_directories = {
    'finance': 'mgmtprofessor/finance_risk_factors',
    'accounting': 'mgmtprofessor/accounting_risk_factors',
    'technology': 'mgmtprofessor/technology_risk_factors',
    'international': 'mgmtprofessor/international_risk_factors',
    'operations': 'mgmtprofessor/operations_risk_factors',
    'marketing': 'mgmtprofessor/marketing_risk_factors',
    'management': 'mgmtprofessor/management_risk_factors',
    'legal': 'mgmtprofessor/legal_risk_factors'
}

# Check if CUDA is available
# NOTE(review): nothing in the visible part of this file reads `use_cuda`, and
# neither the models nor the tokenized inputs are moved to a device here, so
# the flag currently has no effect on where inference runs.
use_cuda = torch.cuda.is_available()

# Function to load a model from Hugging Face
def load_model(category):
    """Load the sequence-classification model and tokenizer for a category.

    The category is looked up in ``model_directories``; the resulting model
    identifier is passed to ``from_pretrained`` for both the tokenizer and the
    model.

    Args:
        category: Key into ``model_directories`` (e.g. ``'finance'``).

    Returns:
        A ``(model, tokenizer)`` tuple on success. ``(None, None)`` when the
        category has no entry or when loading raises; in both cases an error
        message is shown in the Streamlit page instead of propagating.
    """
    try:
        repo_id = model_directories.get(category)

        # Guard clause: unknown category, nothing to download.
        if not repo_id:
            st.error(f"No Hugging Face model found for {category}")
            return None, None

        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    except Exception as exc:
        # Any failure is reported in the UI so the caller can skip this
        # category and continue with the remaining ones.
        st.error(f"Failed to load model for {category}: {exc}")
        return None, None

    return model, tokenizer

# Function to score a document and return the probability for class '1'
def score_document(model, tokenizer, text_data):
    """Return the class-1 probability for each input text.

    Args:
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose a ``logits`` tensor.
        tokenizer: Callable that turns a list of strings into model inputs.
            It is called with ``return_tensors="pt"``, ``padding=True`` and
            ``truncation=True``.
        text_data: A single string or a list of strings.

    Returns:
        A list of floats, one per input text, holding the softmax probability
        at index 1 of the last logits dimension.
    """
    # A lone string is scored as a batch of one.
    batch = [text_data] if isinstance(text_data, str) else text_data

    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        result = model(**encoded)

    class_probs = torch.softmax(result.logits, dim=-1)

    # Column 1 holds the probability associated with class '1'.
    return class_probs[:, 1].tolist()

# Function to find the relevant text column
def get_text_column(df):
    """Return the name of the column in ``df`` that holds the text to score.

    Candidate headers are tried in priority order: ``'Item 1A'``,
    ``'Item 1A.'``, ``'Item 1A. Risk Factors'``, ``'text'``, ``'Text'``.
    An exact match always wins. If there is none, the headers are compared
    again ignoring case and surrounding whitespace, so a header such as
    ``' item 1a '`` or ``'TEXT'`` is still recognised.

    Args:
        df: DataFrame whose ``columns`` are searched.

    Returns:
        The matching column label exactly as it appears in ``df.columns``
        (so it can be used to index ``df`` directly), or ``None`` if no
        candidate matches.
    """
    possible_columns = ['Item 1A', 'Item 1A.', 'Item 1A. Risk Factors', 'text', 'Text']

    # Pass 1: exact header match, preserving the original priority order.
    for col in possible_columns:
        if col in df.columns:
            return col

    # Pass 2: forgiving match. Map each normalized header to the first real
    # column carrying it; str() keeps non-string labels from breaking this.
    normalized = {}
    for actual in df.columns:
        normalized.setdefault(str(actual).strip().casefold(), actual)

    for col in possible_columns:
        match = normalized.get(col.casefold())
        if match is not None:
            return match

    return None  # No candidate header is present in any form

# Private helpers shared by the CSV and TXT flows below.
def _report_progress(progress_bar, completed, total):
    """Advance the progress bar and print elapsed / estimated remaining time.

    Args:
        progress_bar: Object returned by ``st.progress``.
        completed: Number of categories processed so far (1-based).
        total: Total number of categories to process.
    """
    progress_bar.progress(completed / total)

    # Linear extrapolation from the average time per completed category.
    elapsed_time = time.time() - start_time
    estimated_total_time = (elapsed_time / completed) * total
    st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")


def _offer_results_download(result_df):
    """Show a CSV download button for ``result_df`` and a completion message."""
    csv = result_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download results as CSV",
        data=csv,
        file_name="document_scoring_results.csv",
        mime="text/csv",
    )

    # Display completion message
    st.success("Document scoring complete!")


# Dropdown to select file type
file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])

# Track the start time
start_time = time.time()

# Handle CSV or TXT upload
if file_type == "CSV":
    csv_file = st.file_uploader("Upload a CSV file with text data", type=["csv"])

    if csv_file is not None:
        # Read the CSV file
        df = pd.read_csv(csv_file)

        # Find the relevant text column
        text_column = get_text_column(df)

        if text_column is None:
            st.error("No valid text column found. Please ensure your CSV contains 'Item 1A', 'Item 1A.', 'Item 1A. Risk Factors', 'Text', or 'text'.")
        else:
            # Only rows that actually contain text are scored. The mask is
            # kept so that scores can be written back to the right rows;
            # assigning a shorter plain list to the full-length frame would
            # raise a length-mismatch error whenever a text cell is empty.
            has_text = df[text_column].notna()
            scored_index = df.index[has_text]
            # Cast to str so numeric or otherwise non-string cells can be
            # tokenized instead of failing inside the tokenizer.
            text_data = df.loc[has_text, text_column].astype(str).tolist()

            # Results start as a copy of the input; one column per category
            # is appended below.
            result_df = df.copy()

            # Progress bar
            progress_bar = st.progress(0)
            total_categories = len(model_directories)

            for i, category in enumerate(tqdm(model_directories, desc="Scoring documents")):
                # Load the pre-trained model for the current category
                model, tokenizer = load_model(category)

                # Skip the category if model loading fails
                if model is not None:
                    # One text at a time; score_document returns a one-item
                    # list for a single string.
                    category_scores = [
                        score_document(model, tokenizer, text)[0]
                        for text in text_data
                    ]

                    # Index-aligned assignment: rows without text get NaN.
                    result_df[category.capitalize()] = pd.Series(
                        category_scores, index=scored_index, dtype="float64"
                    )

                _report_progress(progress_bar, i + 1, total_categories)

            _offer_results_download(result_df)

elif file_type == "TXT":
    doc_file = st.file_uploader("Upload a TXT file", type=["txt"])

    if doc_file is not None:
        # Read the uploaded .txt file. Undecodable bytes are replaced rather
        # than aborting the run on files that are not strictly UTF-8.
        text_data = doc_file.read().decode("utf-8", errors="replace")

        # Collect one row per successfully scored category; the DataFrame is
        # built once at the end instead of concatenating inside the loop.
        result_rows = []

        # Progress bar
        progress_bar = st.progress(0)
        total_categories = len(model_directories)

        for i, category in enumerate(tqdm(model_directories, desc="Scoring documents")):
            # Load the pre-trained model for the current category
            model, tokenizer = load_model(category)

            # Skip the category if model loading fails
            if model is not None:
                probability = score_document(model, tokenizer, text_data)
                result_rows.append({
                    "Category": category,
                    "Probability": probability[0],  # single document -> first value
                })

            _report_progress(progress_bar, i + 1, total_categories)

        # Explicit columns keep the header even if no category could be scored.
        result_df = pd.DataFrame(result_rows, columns=["Category", "Probability"])

        _offer_results_download(result_df)

st.write("Note: Ensure the uploaded document is formatted correctly. The models are limited to 512 tokens and will be upgraded in a future version.")