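"""Streamlit app that scores firm-generated text on eight risk-factor categories
(finance, accounting, technology, international, operations, marketing, management,
and legal). For each category, a fine-tuned sequence-classification model hosted on
the Hugging Face Hub returns the probability that the text reflects that risk factor.
Input is either a CSV of firm-year observations (with an 'Item 1A' or 'Text' column)
or a single TXT file containing one risk factors section."""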
import os
import streamlit as st
import torch
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Set up Streamlit app
st.title("An App to Score Firm-Generated Text on Eight Risk Factors")
st.write("Note: You can either upload a CSV file or a single TXT file for scoring.")
st.write("If uploading a CSV file, ensure that it contains the following columns: cik, fyear, Item 1A (or Text). Item 1A should contain the respective risk factors section for each firm-year observation.")
st.write("If uploading a txt file, ensure it contains the respective risk factors section for each firm-year observation.")
# Hugging Face model directories
model_directories = {
    'finance': 'mgmtprofessor/finance_risk_factors',
    'accounting': 'mgmtprofessor/accounting_risk_factors',
    'technology': 'mgmtprofessor/technology_risk_factors',
    'international': 'mgmtprofessor/international_risk_factors',
    'operations': 'mgmtprofessor/operations_risk_factors',
    'marketing': 'mgmtprofessor/marketing_risk_factors',
    'management': 'mgmtprofessor/management_risk_factors',
    'legal': 'mgmtprofessor/legal_risk_factors',
}
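# Each entry maps a risk-factor category to the Hugging Face Hub repository of the
# fine-tuned sequence-classification model used to score that category.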
# Check if CUDA is available and pick the device the models will run on
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# Function to load a model from Hugging Face
def load_model(category):
    try:
        model_name = model_directories.get(category)
        if model_name:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name)
            model.to(device)  # Run on GPU when available, otherwise CPU
            model.eval()      # Inference only; disables dropout
            return model, tokenizer
        else:
            st.error(f"No Hugging Face model found for {category}")
            return None, None
    except Exception as e:
        st.error(f"Failed to load model for {category}: {e}")
        return None, None
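
# A minimal caching wrapper (a sketch, not part of the original app): Streamlit reruns
# this script on every interaction, so each model would otherwise be downloaded and
# re-instantiated on every rerun. st.cache_resource keeps the loaded model and tokenizer
# in memory across reruns; to use it, call load_model_cached(category) in place of
# load_model(category) below.
@st.cache_resource
def load_model_cached(category):
    return load_model(category)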
# Function to score a document and return the probability for class '1'
def score_document(model, tokenizer, text_data):
    if isinstance(text_data, str):
        text_data = [text_data]
    # Tokenize the input (the models accept at most 512 tokens; longer text is truncated)
    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = inputs.to(device)
    # Perform the prediction
    with torch.no_grad():
        outputs = model(**inputs)
    # Get probabilities (softmax)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Get the probability associated with class '1'
    probability_class_1 = probabilities[:, 1].tolist()  # Return as a list
    return probability_class_1
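
# Illustrative usage (values are hypothetical): score_document(model, tokenizer,
# "We face substantial liquidity risk ...") returns a one-element list such as [0.87];
# passing a list of N strings returns N probabilities, one per input text.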
# Function to find the relevant text column
def get_text_column(df):
    possible_columns = ['Item 1A', 'Item 1A.', 'Item 1A. Risk Factors', 'text', 'Text']
    for col in possible_columns:
        if col in df.columns:
            return col
    return None  # Return None if no matching column is found
# Dropdown to select file type
file_type = st.selectbox("Select the file type to upload:", ["CSV", "TXT"])
# Track the start time
start_time = time.time()
# Handle CSV or TXT upload
if file_type == "CSV":
csv_file = st.file_uploader("Upload a CSV file with text data", type=["csv"])
if csv_file is not None:
# Read the CSV file
df = pd.read_csv(csv_file)
# Find the relevant text column
text_column = get_text_column(df)
if text_column is None:
st.error("No valid text column found. Please ensure your CSV contains 'Item 1A', 'Item 1A.', 'Item 1A. Risk Factors', 'Text', or 'text'.")
else:
# Extract text data from the identified column
text_data = df[text_column].dropna().tolist() # Extracts all non-empty rows
# Initialize an empty DataFrame for results
result_df = df.copy()
# Progress bar
progress_bar = st.progress(0)
total_categories = len(model_directories)
for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
# Load the pre-trained model for the current category
model, tokenizer = load_model(category)
# Skip the category if model loading fails
if model is not None:
# Score the document for each row in the text data
category_scores = []
for text in text_data:
probability = score_document(model, tokenizer, text)
category_scores.append(probability[0]) # Extract the first (and only) value
# Add the results to the DataFrame
result_df[category.capitalize()] = category_scores
# Update the progress bar
progress_bar.progress((i + 1) / total_categories)
# Estimate remaining time
elapsed_time = time.time() - start_time
estimated_total_time = (elapsed_time / (i + 1)) * total_categories
st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
# Save results to CSV
csv = result_df.to_csv(index=False).encode('utf-8')
st.download_button(
label="Download results as CSV",
data=csv,
file_name="document_scoring_results.csv",
mime="text/csv",
)
# Display completion message
st.success("Document scoring complete!")
elif file_type == "TXT":
doc_file = st.file_uploader("Upload a TXT file", type=["txt"])
if doc_file is not None:
# Read the content of the uploaded .txt file
text_data = doc_file.read().decode("utf-8")
# Initialize an empty DataFrame for results
result_df = pd.DataFrame(columns=["Category", "Probability"])
# Progress bar
progress_bar = st.progress(0)
total_categories = len(model_directories)
for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
# Load the pre-trained model for the current category
model, tokenizer = load_model(category)
# Skip the category if model loading fails
if model is not None:
# Score the document
probability = score_document(model, tokenizer, text_data)
# Create a DataFrame for the current result
new_row = pd.DataFrame({
"Category": [category],
"Probability": [probability[0]] # Extract the first value
})
# Use pd.concat to append the new row to the DataFrame
result_df = pd.concat([result_df, new_row], ignore_index=True)
# Update the progress bar
progress_bar.progress((i + 1) / total_categories)
# Estimate remaining time
elapsed_time = time.time() - start_time
estimated_total_time = (elapsed_time / (i + 1)) * total_categories
st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
# Save results to CSV
csv = result_df.to_csv(index=False).encode('utf-8')
st.download_button(
label="Download results as CSV",
data=csv,
file_name="document_scoring_results.csv",
mime="text/csv",
)
# Display completion message
st.success("Document scoring complete!")
st.write("Note: Ensure the uploaded document is formatted correctly. The models are limited to 512 tokens and will be upgraded in a future version.")