"""Gradio app comparing macro-economic adverse scenarios across financial reports.

The UI has four tabs:

* a methodology page,
* a text tab (paragraph extraction from two PDFs, keyword filtering,
  closest-paragraph lookup, summarization and FinBERT tone classification),
* a table tab (difference of cumulative adverse growth between two Excel
  files, keyword-driven sentence sentiment, and a generated description of
  one country's trend),
* a Fed-minutes tab plotting the share of positive paragraphs over time.

State shared between callbacks lives in module-level globals
(``stored_paragraphs_*``, ``filter_paragraphs_*``, ``stored_df*``); this
matches the original design and is not safe for concurrent users.
"""

import io
import os
import re
import shutil

import gradio as gr
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import pandas as pd
import spacy
from gradio_calendar import Calendar
from transformers import pipeline

import lib.comparison
import lib.read_pdf

# ---------------------------------------------------------------------------
# Configuration and shared state
# ---------------------------------------------------------------------------

# Folder holding the uploaded PDFs, the Excel tables and the Fed minutes CSV.
PDF_FOLDER = "data"
# Folder the comparison plot is written to (created on demand).
OUTPUT_FOLDER = "output"

# Paragraphs extracted from the two selected PDFs (unfiltered).
stored_paragraphs_1 = []
stored_paragraphs_2 = []
# Paragraphs currently offered in the dropdowns; dropdown labels index into
# these lists, so they must always mirror what the dropdowns display.
filter_paragraphs_1 = []
filter_paragraphs_2 = []
# Adverse-scenario tables of the two Excel files; replaced by DataFrames once
# "Compare Data" has run.
stored_df1 = []
stored_df2 = []

current_theme = {"dark": False}

# Client-side snippet handed to gr.Blocks: reloads the page with
# ``__theme=dark`` in the query string when it is not already set.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Highlight colours for yiyanghkust/finbert-tone labels (capitalised).
color_map = {
    "Positive": "green",
    "Neutral": "blue",
    "Negative": "red",
}
# Highlight colours for ProsusAI/finbert labels (lower case).
color_map1 = {
    "positive": "green",
    "neutral": "blue",
    "negative": "red",
}

# Kept at column 0 so Markdown never interprets source indentation as a
# code block.
METHODOLOGY_MD = """
## Macro-economy Adverse Scenario Comparison from EBA Reports

This application allows the user to compare two reports from text contents or from tables. It's divided into two tabs.

**First Tab: Text Comparisons**

- It handles EBA and Federal Open Market Committee files report. Don't modify federal file name.
- Select two PDFs. Each PDF's text content will be extracted into paragraphs.
- You can choose a keyword to filter paragraphs.
- Select a paragraph from one PDF, and find the most similar paragraph from the other PDF using a specific method.
- For a selected paragraph, compute summarization using the **FinPEGASUS model**.
- For a selected paragraph, compute sentiment analysis of the paragraph, and for each sentence, classify into three classes (Positive, Negative, Neutral) using two different fine-tuned **FinBERT models**:
    - [ProsusAI/finbert](https://huggingface.co/ProsusAI/finbert)
    - [yiyanghkust/finbert-tone](https://huggingface.co/yiyanghkust/finbert-tone)

**Second Tab: Table Comparisons**

- Select two Excel files and a sheet name.
- For the two selected tables, compute the difference of the cumulative adverse growth rate over their respective three years for the selected sheet name (topic).
- For the selected topic (sheet name), find related sentences in the associated PDF text that mention the topic, and classify them by sentiment.
- For a selected country and topic, describe the adverse growth rate trend over three years using the [**google/flan-t5-xl**](https://huggingface.co/google/flan-t5-xl).
"""

# ---------------------------------------------------------------------------
# NLP models
# ---------------------------------------------------------------------------

# spaCy is only used for sentence splitting.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')

summarizer = pipeline("summarization", model="human-centered-summarization/financial-summarization-pegasus")
fin_model = pipeline("sentiment-analysis", model='yiyanghkust/finbert-tone', tokenizer='yiyanghkust/finbert-tone')
fin_model_bis = pipeline("sentiment-analysis", model='ProsusAI/finbert', tokenizer='ProsusAI/finbert')
table_to_text = pipeline('text2text-generation', model='google/flan-t5-xl')


def split_in_sentences(text):
    """Split *text* into sentences with spaCy and return them stripped."""
    doc = nlp(text)
    return [str(sent).strip() for sent in doc.sents]


def make_spans(text, results):
    """Pair each sentence of *text* with the ``label`` of the matching result.

    *results* is expected to hold one classification dict per sentence, in
    sentence order; the pairs are what ``gr.HighlightedText`` consumes.
    """
    results_list = [res['label'] for res in results]
    facts_spans = list(zip(split_in_sentences(text), results_list))
    return facts_spans


def summarize_text(text):
    """Return the summary the financial PEGASUS pipeline produces for *text*."""
    resp = summarizer(text)
    return resp[0]['summary_text']


def text_to_sentiment(text, all_score=False, label = True):
    """Classify *text* with finbert-tone.

    With ``label=True`` only the top label string is returned; otherwise the
    raw pipeline output is returned (all class scores when ``all_score`` is
    true).
    """
    if label:
        return fin_model(text, return_all_scores=all_score)[0]["label"]
    return fin_model(text, return_all_scores=all_score)


def _classify_sentences(model, text):
    """Return ``(sentence, label)`` pairs for every sentence of *text*.

    The text is split once and an empty text short-circuits to ``[]`` so the
    pipeline is never invoked with an empty batch.
    """
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    results = model(sentences)
    return list(zip(sentences, (res['label'] for res in results)))


def fin_ext(text):
    """Sentence-level tone of *text* according to yiyanghkust/finbert-tone."""
    return _classify_sentences(fin_model, text)


def fin_ext_bis(text):
    """Sentence-level tone of *text* according to ProsusAI/finbert."""
    return _classify_sentences(fin_model_bis, text)


# ---------------------------------------------------------------------------
# File helpers
# ---------------------------------------------------------------------------

def _list_files(folder, extension):
    """Return the sorted names in *folder* ending with *extension* (case-insensitive)."""
    return sorted(f for f in os.listdir(folder) if f.lower().endswith(extension))


def get_pdf_files(folder):
    """Return the PDF file names found in *folder*."""
    return _list_files(folder, '.pdf')


def get_excel_files(folder):
    """Return the ``.xlsx`` file names found in *folder*."""
    return _list_files(folder, '.xlsx')


def get_sheet_names(file):
    """Return a dropdown update listing the sheets of Excel *file* in PDF_FOLDER."""
    # The context manager closes the workbook handle once the names are read.
    with pd.ExcelFile(os.path.join(PDF_FOLDER, file)) as xls:
        sheet_names = xls.sheet_names
    return gr.update(choices=sheet_names)


def update_sheets(file):
    """Thin alias of :func:`get_sheet_names` (currently not wired to any event)."""
    return get_sheet_names(file)


def upload_file_and_update_dropdown(files):
    """Copy uploaded files into PDF_FOLDER and refresh both PDF dropdowns.

    *files* may be ``None``, and each item may be either a path string or an
    object exposing the temporary path as ``.name`` (both forms are produced
    by different Gradio versions).
    """
    for file in files or []:
        if file is None:
            continue
        source_path = getattr(file, "name", file)
        target_path = os.path.join(PDF_FOLDER, os.path.basename(source_path))
        shutil.copyfile(source_path, target_path)

    # Only PDFs are offered, consistent with the initial dropdown choices.
    files_list = get_pdf_files(PDF_FOLDER)
    return gr.update(choices=files_list), gr.update(choices=files_list)


# ---------------------------------------------------------------------------
# Paragraph extraction, filtering and selection
# ---------------------------------------------------------------------------

def extract_and_paragraph(pdf1, pdf2, paragraph):
    """Extract the relevant section of two PDFs stored in PDF_FOLDER.

    The section is delimited by the start/end keywords below and located by
    the ``lib.read_pdf`` helpers. When *paragraph* is true the section is
    additionally split by ``lib.read_pdf.split_text_into_paragraphs`` with a
    size argument of 200.

    Returns ``([], [])`` when either file name is missing.
    """
    if not pdf1 or not pdf2:
        return [], []

    start_keyword = ["Main risks to", "Developments in Financial Markets"]
    end_keywords = [
        "4. Appendix",
        "Annex:",
        "4. Annex",
        "Detailed tables",
        "ACKNOWLEDGEMENTS",
        "STATISTICAL ANNEX",
        "PROSPECTS BY MEMBER STATES",
        "At the conclusion of the discussion",
    ]

    extracted = []
    for pdf_name in (pdf1, pdf2):
        pdf_path = os.path.join(PDF_FOLDER, pdf_name)
        content = lib.read_pdf.extract_and_format_paragraphs(pdf_path)
        start_index, end_index = lib.read_pdf.find_text_range(content, start_keyword, end_keywords)
        content = lib.read_pdf.extract_relevant_text(content, start_index, end_index)
        if paragraph:
            content = lib.read_pdf.split_text_into_paragraphs(content, 200)
        extracted.append(content)

    return extracted[0], extracted[1]


def _paragraph_labels(paragraphs):
    """Build the dropdown labels (``Paragraph N: <first 100 chars>...``)."""
    return [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(paragraphs)]


def _paragraph_from_label(label, paragraphs):
    """Resolve a dropdown label to its paragraph, or ``None`` if impossible.

    ``None`` is returned when nothing is selected, when the label does not
    carry a paragraph number, or when the number is out of range.
    """
    if not label:
        return None
    try:
        index = int(label.split(':')[0].replace('Paragraph ', '')) - 1
    except ValueError:
        return None
    if not 0 <= index < len(paragraphs):
        return None
    return paragraphs[index]


def update_paragraphs(pdf1, pdf2):
    """Extract paragraphs from both PDFs and publish them to the dropdowns."""
    global stored_paragraphs_1, stored_paragraphs_2
    global filter_paragraphs_1, filter_paragraphs_2

    stored_paragraphs_1, stored_paragraphs_2 = extract_and_paragraph(pdf1, pdf2, True)
    filter_paragraphs_1, filter_paragraphs_2 = stored_paragraphs_1, stored_paragraphs_2
    return (
        gr.update(choices=_paragraph_labels(stored_paragraphs_1)),
        gr.update(choices=_paragraph_labels(stored_paragraphs_2)),
    )


def filter_paragraphs(keyword):
    """Restrict both dropdowns to paragraphs containing *keyword* (case-insensitive).

    An empty keyword restores the full lists.
    """
    global filter_paragraphs_1, filter_paragraphs_2

    if not keyword:
        filter_paragraphs_1 = stored_paragraphs_1
        filter_paragraphs_2 = stored_paragraphs_2
    else:
        needle = keyword.lower()
        filter_paragraphs_1 = [p for p in stored_paragraphs_1 if needle in p.lower()]
        filter_paragraphs_2 = [p for p in stored_paragraphs_2 if needle in p.lower()]

    return (
        gr.update(choices=_paragraph_labels(filter_paragraphs_1), value=None),
        gr.update(choices=_paragraph_labels(filter_paragraphs_2), value=None),
    )


def clear_paragraphs():
    """Drop any keyword filter and show every extracted paragraph again."""
    global filter_paragraphs_1, filter_paragraphs_2

    # The selection callbacks index into filter_paragraphs_*, so these must
    # be reset together with the dropdown choices; otherwise a label would
    # resolve against the stale filtered list.
    filter_paragraphs_1 = stored_paragraphs_1
    filter_paragraphs_2 = stored_paragraphs_2
    return (
        gr.update(choices=_paragraph_labels(stored_paragraphs_1), value=None),
        gr.update(choices=_paragraph_labels(stored_paragraphs_2), value=None),
    )


def filtered_close_paragraph(p, keyword, pdf):
    """Find the paragraph of PDF *pdf* ("1" or anything else for 2) closest to *p*.

    Candidates are the filtered paragraphs when a keyword is present and the
    full list otherwise; the comparison itself is delegated to
    ``lib.comparison.compare_selected_paragraph``.
    """
    if keyword:
        candidates = filter_paragraphs_1 if pdf == "1" else filter_paragraphs_2
    else:
        candidates = stored_paragraphs_1 if pdf == "1" else stored_paragraphs_2
    return lib.comparison.compare_selected_paragraph(p, candidates)


def _summary_for(label, paragraphs):
    """Summarize the paragraph *label* points to, or return ``"Error"``."""
    selected = _paragraph_from_label(label, paragraphs)
    if selected is None:
        return "Error"
    return summarize_text(selected)


def _sentiment_scores_for(label, paragraphs):
    """Return ``{label: score}`` for the selected paragraph, or an error dict."""
    error = {"Error": "Unexpected output format"}
    selected = _paragraph_from_label(label, paragraphs)
    if selected is None:
        return error
    results = text_to_sentiment(selected, True, False)
    # With all scores requested, the pipeline is expected to answer with one
    # list of {label, score} dicts per input text.
    if isinstance(results, list) and results and isinstance(results[0], list):
        return {result['label']: result['score'] for result in results[0]}
    return error


def _tone_spans_for(label, paragraphs, extractor):
    """Return sentence-level tone spans for the selected paragraph, or ``[]``."""
    selected = _paragraph_from_label(label, paragraphs)
    if selected is None:
        return []
    return extractor(selected)


def _show(label, paragraphs):
    """Return the full text of the selected paragraph, or ``"Error"``."""
    selected = _paragraph_from_label(label, paragraphs)
    return "Error" if selected is None else selected


# The public callbacks below read the filter_paragraphs_* globals at call
# time, because those names are rebound whenever paragraphs are re-extracted
# or filtered.

def process_paragraph_1_sum(paragraph):
    """Summarize the paragraph selected for PDF 1."""
    return _summary_for(paragraph, filter_paragraphs_1)


def process_paragraph_1_sent(paragraph):
    """Return class scores (finbert-tone) for the paragraph selected for PDF 1."""
    return _sentiment_scores_for(paragraph, filter_paragraphs_1)


def process_paragraph_1_sent_tone(paragraph):
    """Sentence-level finbert-tone spans for the paragraph selected for PDF 1."""
    return _tone_spans_for(paragraph, filter_paragraphs_1, fin_ext)


def process_paragraph_1_sent_tone_bis(paragraph):
    """Sentence-level ProsusAI/finbert spans for the paragraph selected for PDF 1."""
    return _tone_spans_for(paragraph, filter_paragraphs_1, fin_ext_bis)


def process_paragraph_2_sum(paragraph):
    """Summarize the paragraph selected for PDF 2."""
    return _summary_for(paragraph, filter_paragraphs_2)


def process_paragraph_2_sent(paragraph):
    """Return class scores (finbert-tone) for the paragraph selected for PDF 2."""
    return _sentiment_scores_for(paragraph, filter_paragraphs_2)


def process_paragraph_2_sent_tone(paragraph):
    """Sentence-level finbert-tone spans for the paragraph selected for PDF 2."""
    return _tone_spans_for(paragraph, filter_paragraphs_2, fin_ext)


def process_paragraph_2_sent_tone_bis(paragraph):
    """Sentence-level ProsusAI/finbert spans for the paragraph selected for PDF 2."""
    return _tone_spans_for(paragraph, filter_paragraphs_2, fin_ext_bis)


def show1(paragraph):
    """Return the full text of the paragraph selected for PDF 1."""
    return _show(paragraph, filter_paragraphs_1)


def show2(paragraph):
    """Return the full text of the paragraph selected for PDF 2."""
    return _show(paragraph, filter_paragraphs_2)


# ---------------------------------------------------------------------------
# Excel table comparison
# ---------------------------------------------------------------------------

def _year_from_name(name):
    """Return the first four-digit number found in *name* as an int.

    Raises:
        ValueError: if *name* contains no four-digit number.
    """
    match = re.search(r'(\d{4})', str(name))
    if match is None:
        raise ValueError(f"No four-digit year found in {name!r}.")
    return int(match.group(1))


def _load_adverse_table(file_name, sheet_name):
    """Load one scenario sheet and return ``(full_table, adverse_table, year)``.

    The year is taken from the file name and drives the column names. The
    first four data rows are dropped before the columns are renamed
    (presumably header/title rows of the workbook -- confirm against the
    source files).

    Raises:
        ValueError: if the file name has no year or the sheet does not have
            the expected number of columns.
    """
    year = _year_from_name(file_name)
    df = pd.read_excel(os.path.join(PDF_FOLDER, file_name), sheet_name=sheet_name, index_col=0)
    df = df.iloc[4:].reset_index(drop=True)

    baseline_cols = [f'Baseline {year}', f'Baseline {year + 1}', f'Baseline {year + 2}']
    adverse_cols = [f'Adverse {year}', f'Adverse {year + 1}', f'Adverse {year + 2}']
    new_columns = (
        ['Country', 'Code', f'Historical {year - 1}']
        + baseline_cols
        + adverse_cols
        + ['Adverse Cumulative', 'Adverse Minimum', f'Level Deviation {year + 2}']
    )
    if len(df.columns) != len(new_columns):
        raise ValueError(f"Expected {len(new_columns)} columns, but found {len(df.columns)} columns in the data.")
    df.columns = new_columns

    adverse_view = df[['Country'] + adverse_cols + ['Adverse Cumulative']]
    return df, adverse_view, year


def _country_choices(table):
    """Return the country names of *table*, skipping NaN and long note rows."""
    return [
        country
        for country in table.Country.values.tolist()
        if len(str(country)) < 20 and str(country) != "nan"
    ]


def process_and_compare(file1, sheet1, file2, sheet2):
    """Plot file 2 minus file 1 cumulative adverse growth per country.

    Side effects: stores the adverse-scenario tables in ``stored_df1`` /
    ``stored_df2`` and writes the bar chart to ``output/plot.png``.

    Returns:
        The plot path and one dropdown update per file with its countries.
    """
    global stored_df1, stored_df2

    df1, stored_df1, year1 = _load_adverse_table(file1, sheet1)
    df2, stored_df2, year2 = _load_adverse_table(file2, sheet2)

    # Fixed suffixes keep the overlapping columns distinguishable even when
    # both files refer to the same year, and the left/right order makes
    # "_file1" really belong to file 1.
    merged_df = pd.merge(df1, df2, on='Country', suffixes=('_file1', '_file2'))
    difference_col = 'Difference adverse cumulative growth'
    merged_df[difference_col] = (
        pd.to_numeric(merged_df['Adverse Cumulative_file2'], errors='coerce')
        - pd.to_numeric(merged_df['Adverse Cumulative_file1'], errors='coerce')
    )
    merged_df['Country'] = merged_df['Country'].astype(str)

    # tab20 only has 20 entries; cycle through it so every country gets both
    # a bar colour and a legend entry.
    palette = plt.get_cmap('tab20').colors
    num_countries = len(merged_df['Country'])
    bar_colors = [palette[i % len(palette)] for i in range(num_countries)]

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(merged_df['Country'], merged_df[difference_col], color=bar_colors)
    handles = [
        patches.Patch(color=color, label=country)
        for color, country in zip(bar_colors, merged_df['Country'])
    ]
    ax.legend(handles=handles, title='Countries', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_title(f'Histogram of Difference between Adverse cumulative growth of {year2} and {year1} for {sheet1}')
    ax.set_xlabel('Country')
    ax.set_ylabel('Difference')
    plt.xticks(rotation=90)

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    file_path = os.path.join(OUTPUT_FOLDER, 'plot.png')
    fig.savefig(file_path, format='png', bbox_inches='tight')
    plt.close(fig)

    return (
        file_path,
        gr.update(choices=_country_choices(stored_df1)),
        gr.update(choices=_country_choices(stored_df2)),
    )


def find_sentences_with_keywords(text, keywords):
    """Return the unique sentences of *text* containing any keyword as a whole word.

    Matching is case-insensitive. Sentences are returned in document order
    (first occurrence wins) so the result is deterministic. Empty text or an
    empty keyword list yields ``[]``.
    """
    if not text or not keywords:
        return []

    # Split after sentence-ending punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    keyword_pattern = re.compile(rf'\b(?:{alternatives})\b', re.IGNORECASE)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(s for s in sentences if keyword_pattern.search(s)))


def process_pdfs_and_analyze_sentiment(file1, file2, sheet):
    """Classify the sentences about *sheet*'s topic in the PDFs paired with two Excel files.

    Each PDF is expected to carry the Excel file's name with a ``.pdf``
    extension. Returns one list of ``(sentence, label)`` pairs per PDF, or
    two empty lists when a file is not selected.
    """
    if not file1 or not file2:
        return [], []

    pdf_file1 = file1.replace(".xlsx", ".pdf")
    pdf_file2 = file2.replace(".xlsx", ".pdf")
    text1, text2 = extract_and_paragraph(pdf_file1, pdf_file2, False)

    # Search terms associated with each sheet name.
    keywords = {
        'GDP': ['GDP'],
        'HICP': ['HICP'],
        'RRE prices': ['RRE', 'residential'],
        'CRE prices': ['CRE', 'commercial'],
        'Unemployment': ['unemployment'],
    }
    selected_keywords = keywords.get(sheet, [])

    text_pdf1 = "\n".join(find_sentences_with_keywords(text1, selected_keywords))
    text_pdf2 = "\n".join(find_sentences_with_keywords(text2, selected_keywords))
    return fin_ext(text_pdf1), fin_ext(text_pdf2)


def _format_row_for_prompt(row):
    """Render *row* as ``column: value`` lines, numbers as 6-decimal percentages."""
    lines = []
    for col, value in row.items():
        if col != 'Country' and isinstance(value, (int, float)):
            value = f"{value:.6f}%"
        lines.append(f"{col}: {value}")
    return "\n".join(lines)


def generate_text(df, country, theme):
    """Describe *country*'s adverse growth trend with flan-t5.

    *df* is an adverse-scenario table as stored by :func:`process_and_compare`
    (country, three yearly columns, cumulative column). It is not modified.
    *theme* is accepted for interface compatibility but is not part of the
    prompt currently sent to the model.

    Returns the generated description, or an explanatory message when no
    table is loaded or the country is absent.
    """
    if not isinstance(df, pd.DataFrame) or df.empty:
        return "No table loaded yet: run 'Compare Data' first."

    # The first yearly column name carries the starting year.
    year = _year_from_name(df.columns[1])
    # Work on a copy: the table is shared module state and must keep its
    # original column names.
    table = df.copy()
    table.columns = ['Country', f'{year}', f'{year+1}', f'{year+2}', 'Total']

    matches = table[table['Country'] == country]
    if matches.empty:
        return f"No data found for country {country!r}."
    row_str = _format_row_for_prompt(matches.iloc[0])

    # One-shot prompt: the example's "Answer:" marker must match the final
    # cue so the model continues the pattern.
    prompt = (
        "Example:\n"
        "Country: Australia\n"
        "1990: -0.43%\n"
        "1991: -1.99%\n"
        "1992: -1.20%\n"
        "Total: -3.57%\n"
        "Answer: In the adverse scenario, the growth in Australia was -0.43% in 1990. "
        "It worsened to -1.99% in 1991 and slightly improved to -1.20% in 1992. "
        "The total cumulative adverse growth was -3.57% from 1990 to 1992.\n"
        "\n"
        f"Now, using the following data in {country}, describe and provide how the adverse growth "
        "changed each year, whether it increased or decreased, worsened or improved:\n"
        f"{row_str}\n"
        "Answer:\n"
    )
    return table_to_text(prompt, max_length=240)[0]['generated_text']


# ---------------------------------------------------------------------------
# User interface
# ---------------------------------------------------------------------------

with gr.Blocks(theme='gradio/soft', js=js_func) as demo:
    with gr.Tab("Methodology"):
        gr.Markdown(METHODOLOGY_MD)

    with gr.Tab("Financial Report Text Analysis"):
        gr.Markdown("## Financial Report Paragraph Selection and Analysis on Adverse Macro-Economy Scenario")

        with gr.Row():
            # Upload PDFs
            with gr.Column():
                gr.Markdown("### Step 1: Upload PDF Files")
                upload_button = gr.File(label="Upload files", file_types=[".pdf"], file_count="multiple")
                pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
                pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")
                upload_button.upload(upload_file_and_update_dropdown, upload_button, [pdf1, pdf2])

            with gr.Column():
                gr.Markdown("### Step 2: Extract and Display Paragraphs")
                extract_btn = gr.Button("Extract and Display Paragraphs")
                paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
                paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")
                keyword_input = gr.Textbox(label="Enter keyword to search")

                # Buttons applying / removing the keyword filter
                with gr.Row():
                    search_button = gr.Button("Search")
                    clear_button = gr.Button("Clear")

                search_button.click(filter_paragraphs, inputs=keyword_input, outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
                clear_button.click(clear_paragraphs, inputs=[], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
                extract_btn.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])

        gr.Markdown("---")

        with gr.Row():
            # PDF 1 analysis
            with gr.Column():
                gr.Markdown("### PDF 1 Analysis")
                selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
                paragraph_1_dropdown.select(fn=show1, inputs=paragraph_1_dropdown, outputs=selected_paragraph_1)

                close_paragraph_1 = gr.Textbox(label="Closest Paragraph from PDF 2 to selected Paragraph PDF 1", lines=4)
                paragraph_1_dropdown.select(
                    # Search among the paragraphs of PDF 2.
                    fn=lambda p, keyword: filtered_close_paragraph(p, keyword, "2"),
                    inputs=[paragraph_1_dropdown, keyword_input],
                    outputs=close_paragraph_1
                )

                with gr.Group():
                    summarize_btn1 = gr.Button("Summarize Text from PDF 1")
                    summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
                    summarize_btn1.click(fn=process_paragraph_1_sum, inputs=paragraph_1_dropdown, outputs=summary_textbox_1)

                    sentiment_btn1 = gr.Button("Classify Financial Tone for paragraph from PDF 1")
                    sentiment_textbox_1 = gr.Label(label="Classification from PDF 1")
                    sentiment_btn1.click(fn=process_paragraph_1_sent, inputs=paragraph_1_dropdown, outputs=sentiment_textbox_1)

                    with gr.Accordion("Analyze Financial Tone on each sentence"):
                        analyze_btn1 = gr.Button("With FinBERT-tone")
                        fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1", color_map=color_map, show_legend=True)
                        analyze_btn1.click(fn=process_paragraph_1_sent_tone, inputs=paragraph_1_dropdown, outputs=fin_spans_1)

                        analyze_btn1_ = gr.Button("With ProsusAI/finbert")
                        fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 (Bis)", color_map=color_map1, show_legend=True)
                        analyze_btn1_.click(fn=process_paragraph_1_sent_tone_bis, inputs=paragraph_1_dropdown, outputs=fin_spans_1_)

            # PDF 2 analysis
            with gr.Column():
                gr.Markdown("### PDF 2 Analysis")
                selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
                paragraph_2_dropdown.select(fn=show2, inputs=paragraph_2_dropdown, outputs=selected_paragraph_2)

                close_paragraph_2 = gr.Textbox(label="Closest Paragraph from PDF 1 to selected Paragraph PDF 2", lines=4)
                paragraph_2_dropdown.select(
                    # Search among the paragraphs of PDF 1.
                    fn=lambda p, keyword: filtered_close_paragraph(p, keyword, "1"),
                    inputs=[paragraph_2_dropdown, keyword_input],
                    outputs=close_paragraph_2
                )

                with gr.Group():
                    # NOTE: the textbox is filled by the dropdown's select
                    # event above; no change handler writes the textbox back
                    # into itself, mirroring the PDF 1 column.
                    summarize_btn2 = gr.Button("Summarize Text from PDF 2")
                    summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)
                    summarize_btn2.click(fn=process_paragraph_2_sum, inputs=paragraph_2_dropdown, outputs=summary_textbox_2)

                    sentiment_btn2 = gr.Button("Classify Financial Tone for paragraph from PDF 2")
                    sentiment_textbox_2 = gr.Label(label="Classification from PDF 2")
                    sentiment_btn2.click(fn=process_paragraph_2_sent, inputs=paragraph_2_dropdown, outputs=sentiment_textbox_2)

                    with gr.Accordion("Analyze Financial Tone on each sentence"):
                        analyze_btn2 = gr.Button("With FinBERT-tone")
                        fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2", color_map=color_map, show_legend=True)
                        analyze_btn2.click(fn=process_paragraph_2_sent_tone, inputs=paragraph_2_dropdown, outputs=fin_spans_2)

                        analyze_btn2_ = gr.Button("With ProsusAI/finbert")
                        fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 (Bis)", color_map=color_map1, show_legend=True)
                        analyze_btn2_.click(fn=process_paragraph_2_sent_tone_bis, inputs=paragraph_2_dropdown, outputs=fin_spans_2_)

    with gr.Tab("Financial Report Table Analysis"):
        gr.Markdown("## Excel Data Comparison")

        with gr.Row():
            with gr.Column():
                file1 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 1")
                file2 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 2")
                sheet = gr.Dropdown(choices=["GDP", "HICP", "RRE prices", "Unemployment", "CRE prices"], label="Select Sheet for File 1 and 2")

            with gr.Column():
                result = gr.Image(label="Comparison Plot")

        b1 = gr.Button("Compare Data")
        b2 = gr.Button("Extract text information from PDFs")

        with gr.Row():
            with gr.Column():
                sentiment_results_pdf1 = gr.HighlightedText(label="Sentiment Analysis - PDF 1", color_map=color_map, show_legend=True)

            with gr.Column():
                sentiment_results_pdf2 = gr.HighlightedText(label="Sentiment Analysis - PDF 2", color_map=color_map, show_legend=True)

        with gr.Accordion("Adverse growth trends"):
            with gr.Row():
                with gr.Column():
                    country_1_dropdown = gr.Dropdown(label="Select Country from Excel File 1")
                    summarize_btn1_country = gr.Button("Summary for the selected country")
                    text_result_df1 = gr.Textbox(label="Sentence for excel file 1", lines=2)
                    # The lambda looks stored_df1 up at click time, after
                    # "Compare Data" has replaced it.
                    summarize_btn1_country.click(fn=lambda country, theme: generate_text(stored_df1, country, theme),
                                                 inputs=[country_1_dropdown, sheet],
                                                 outputs=text_result_df1)

                with gr.Column():
                    country_2_dropdown = gr.Dropdown(label="Select Country from Excel File 2")
                    summarize_btn2_country = gr.Button("Summary for the selected country")
                    text_result_df2 = gr.Textbox(label="Sentence for excel file 2", lines=2)
                    summarize_btn2_country.click(fn=lambda country, theme: generate_text(stored_df2, country, theme),
                                                 inputs=[country_2_dropdown, sheet],
                                                 outputs=text_result_df2)

        # The same sheet name is applied to both Excel files.
        b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=[result, country_1_dropdown, country_2_dropdown])
        b2.click(fn=process_pdfs_and_analyze_sentiment, inputs=[file1, file2, sheet], outputs=[sentiment_results_pdf1, sentiment_results_pdf2])

    with gr.Tab("Fed data analysis"):
        gr.Markdown("## Sentiment Analysis Overview")

        df = pd.read_csv(os.path.join(PDF_FOLDER, "2008_2024_minutes_corrected.csv"), header=0)
        # Ratios are computed over non-neutral paragraphs only.
        df['Total_paragraphs'] = df['Total_paragraphs'] - df['Neutral']
        df['Positive_ratio'] = df['Positive'] / df['Total_paragraphs'] * 100
        df['Negative_ratio'] = df['Negative'] / df['Total_paragraphs'] * 100
        df['Date'] = pd.to_datetime(df['Date'])
        start_date = df['Date'].min()
        end_date = df['Date'].max()
        # TODO: wire a date-range filter (gradio_calendar.Calendar) using
        # start_date / end_date as the reset bounds of the plot's x axis.

        line_plot = gr.LinePlot(
            df,
            x='Date',
            y='Positive_ratio',
            title="Positive Rate Over Time",
            y_lim=[0, 100],  # ratios are expressed in percent
        )


if __name__ == "__main__":
    demo.launch()