Spaces:

mgmtprofessor
/

risk_factors_scoring

Sleeping

App Files Files Community

mgmtprofessor commited on Oct 16, 2024

Commit

38d73ce

verified ·

1 Parent(s): 7f19cee

Upload 2 files

Browse files

Files changed (2) hide show

app.py +177 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import os
+import dropbox
+import streamlit as st
+import torch
+import pandas as pd
+import time
+from tqdm import tqdm
+from simpletransformers.classification import ClassificationModel
+# Set up Streamlit app
+st.title("Document Scoring App for Various Categories")
+# Model directories and corresponding Dropbox paths
+model_directories = {
+    'finance': 'models/finance_model',
+    'accounting': 'models/accounting_model',
+    'technology': 'models/technology_model',
+    'international': 'models/international_model',
+    'operations': 'models/operations_model',
+    'marketing': 'models/marketing_model',
+    'management': 'models/management_model',
+    'legal': 'models/legal_model'
+}
+# Dropbox paths to main model directories
+dropbox_model_paths = {
+    'international': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/international_model',
+    'finance': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/finance_model',
+    'accounting': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/accounting_model',
+    'technology': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/technology_model',
+    'operations': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/operations_model',
+    'marketing': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/marketing_model',
+    'management': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/management_model',
+    'legal': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/legal_model'
+}
+# Dropbox paths to model checkpoints (all 8 models)
+dropbox_checkpoint_paths = {
+    'international': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/international_model/checkpoint-174-epoch-3',
+    'finance': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/finance_model/checkpoint-174-epoch-3',
+    'accounting': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/accounting_model/checkpoint-174-epoch-3',
+    'technology': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/technology_model/checkpoint-174-epoch-3',
+    'operations': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/operations_model/checkpoint-174-epoch-3',
+    'marketing': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/marketing_model/checkpoint-174-epoch-3',
+    'management': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/management_model/checkpoint-174-epoch-3',
+    'legal': '/3) Conferences and Publications/2) Current_Projects/VIVEK/COMPLETE_REWORK/legal_model/checkpoint-174-epoch-3'
+}
+# Check if CUDA is available
+use_cuda = torch.cuda.is_available()
+# Function to download files from Dropbox recursively, including checkpoint directories
+def download_files_from_dropbox(dbx, dropbox_path, local_dir):
+    # List all files and subfolders in the Dropbox path
+    try:
+        for entry in dbx.files_list_folder(dropbox_path).entries:
+            local_path = os.path.join(local_dir, entry.name)
+            if isinstance(entry, dropbox.files.FileMetadata):
+                # It's a file, download it
+                with open(local_path, "wb") as f:
+                    metadata, res = dbx.files_download(path=entry.path_lower)
+                    f.write(res.content)
+            elif isinstance(entry, dropbox.files.FolderMetadata):
+                # It's a folder, create it locally and download its contents
+                os.makedirs(local_path, exist_ok=True)
+                download_files_from_dropbox(dbx, entry.path_lower, local_path)
+    except dropbox.exceptions.ApiError as err:
+        st.error(f"Dropbox API error: {err}")
+# Function to download models and checkpoints from Dropbox
+def download_model(category):
+    model_path = model_directories[category]
+    if not os.path.exists(model_path):
+        os.makedirs(model_path, exist_ok=True)
+        dbx = dropbox.Dropbox(st.secrets["dropbox_api_key"])
+        # Download the main model files
+        st.write(f"Downloading {category} model...")
+        download_files_from_dropbox(dbx, dropbox_model_paths[category], model_path)
+        # Download the checkpoint files if available
+        if category in dropbox_checkpoint_paths:
+            checkpoint_path = os.path.join(model_path, "checkpoint-174-epoch-3")
+            os.makedirs(checkpoint_path, exist_ok=True)
+            st.write(f"Downloading checkpoint for {category} model...")
+            download_files_from_dropbox(dbx, dropbox_checkpoint_paths[category], checkpoint_path)
+        st.success(f"{category} model and checkpoints downloaded successfully.")
+# Function to load a model, skipping if it can't be loaded
+def load_model(category):
+    model_path = model_directories[category]
+    # Ensure the model is downloaded
+    download_model(category)
+    try:
+        model = ClassificationModel(
+            "bert",
+            model_path,
+            use_cuda=use_cuda,
+            args={"silent": True}  # Suppress output
+        )
+        return model
+    except Exception as e:
+        st.error(f"Failed to load model for {category}: {e}")
+        return None
+# Function to score a document and return the prediction and probability for class '1'
+def score_document(model, text_data):
+    if isinstance(text_data, str):
+        text_data = [text_data]
+    predictions, raw_outputs = model.predict(text_data)
+    # Get the probability associated with class '1'
+    probability_class_1 = torch.nn.functional.softmax(torch.tensor(raw_outputs[0]), dim=0)[1].item()
+    return predictions[0], probability_class_1
+# Let the user upload a file
+doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])
+# Track the start time
+start_time = time.time()
+# Make predictions when a file is uploaded
+if doc_file is not None:
+    # Read the content of the uploaded .txt file
+    text_data = doc_file.read().decode("utf-8")
+    # Initialize an empty DataFrame for results
+    result_df = pd.DataFrame(columns=["Category", "Prediction", "Probability"])
+    # Progress bar
+    progress_bar = st.progress(0)
+    total_categories = len(model_directories)
+    for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
+        # Load the pre-trained model for the current category
+        model = load_model(category)
+        # Skip the category if model loading fails
+        if model is not None:
+            # Score the document
+            prediction, probability = score_document(model, text_data)
+            # Create a DataFrame for the current result
+            new_row = pd.DataFrame({
+                "Category": [category],
+                "Prediction": [prediction],
+                "Probability": [probability]
+            })
+            # Use pd.concat to append the new row to the DataFrame
+            result_df = pd.concat([result_df, new_row], ignore_index=True)
+        # Update the progress bar
+        progress_bar.progress((i + 1) / total_categories)
+        # Estimate remaining time
+        elapsed_time = time.time() - start_time
+        estimated_total_time = (elapsed_time / (i + 1)) * total_categories
+        st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
+    # Save results to CSV
+    csv = result_df.to_csv(index=False).encode('utf-8')
+    st.download_button(
+        label="Download results as CSV",
+        data=csv,
+        file_name="document_scoring_results.csv",
+        mime="text/csv",
+    )
+    # Display completion message
+    st.success("Document scoring complete!")
+st.write("Note: Ensure the uploaded document is in .txt format containing text data.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit
+torch
+pandas
+tqdm
+simpletransformers
+dropbox==11.34.0