"""Gradio app: extract text from a PDF or image, run NER, highlight and summarize.

The module loads all models at import time and launches the Gradio UI at the
bottom of the file.
"""

import json
import os
from html import escape
from io import BytesIO

import gradio as gr
import pandas as pd
import PyPDF2
import pytesseract
import spacy
from PIL import Image
from PyPDF2 import PdfReader
from spacy import displacy
from transformers import AutoTokenizer, pipeline

# Display name (shown in the dropdown) -> model identifier.
ner_models = {
    'bert-large-NER': 'dslim/bert-large-NER',
    'bioNER': 'd4data/biomedical-ner-all',
    'SpaCy English NER': 'en_core_web_trf',
}

# Models are loaded once at import time and shared by every request.
ner_model = pipeline('token-classification', model=ner_models['bert-large-NER'])
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])
ner_model_bio = pipeline('token-classification', model=ner_models['bioNER'])
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

# Background colour used for any entity label missing from a colour mapping.
DEFAULT_ENTITY_COLOR = '#4D94FF'

SPACY_ENTITY_COLORS = {
    "DATE": "#4D94FF",  # Blue
    "PERSON": "#4CAF50",  # Green
    "EVENT": "#FF6666",  # Salmon
    "FAC": "#66B2FF",  # Sky Blue
    "GPE": "#FFCC99",  # Light Apricot
    "LANGUAGE": "#FF80BF",  # Pink
    "LAW": "#66FF99",  # Mint
    "LOC": "#809FFF",  # Lavender Blue
    "MONEY": "#FFFF99",  # Light Yellow
    "NORP": "#808000",  # Olive Green
    "ORDINAL": "#FF9999",  # Misty Rose
    "ORG": "#FFB366",  # Light Peach
    "PERCENT": "#FF99FF",  # Orchid
    "PRODUCT": "#FF6666",  # Salmon
    "QUANTITY": "#CC99FF",  # Pastel Purple
    "TIME": "#FFD54F",  # Amber
    "WORK_OF_ART": "#FFC266",  # Light Orange
    "CARDINAL": "#008080",  # Teal
}

# Token-level tags; the MISC keys were previously misspelled as 'B-MIS'/'I-MIS'
# and therefore never matched.
TOKEN_ENTITY_COLORS = {
    'O': 'pink',
    'B-MISC': 'red',
    'I-MISC': 'brown',
    'B-PER': 'green',
    'I-PER': '#FFD54F',
    'B-ORG': 'orange',
    'I-ORG': '#FF6666',
    'B-LOC': 'purple',
    'I-LOC': '#FFCC99',
}


def _read_file_bytes(file):
    """
    Returns the raw bytes of an uploaded file, whatever form it arrives in.

    Parameters:
    - file: A file path, raw bytes, an object exposing `getvalue()`, an object
      with a `name` attribute pointing at a file on disk, or a readable stream.

    Returns:
    - file_bytes (bytes): The content of the file.

    Raises:
    - ValueError: If `file` is None (nothing was uploaded).
    - TypeError: If the object cannot be read in any supported way.
    """
    if file is None:
        raise ValueError("No file was provided.")
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)
    if isinstance(file, (str, os.PathLike)):
        with open(file, 'rb') as file_stream:
            return file_stream.read()
    if hasattr(file, 'getvalue'):
        return file.getvalue()
    # Temp-file wrappers carry the on-disk path in `.name`; reopening it avoids
    # depending on the wrapper's current read position or open mode.
    path = getattr(file, 'name', None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, 'rb') as file_stream:
            return file_stream.read()
    if hasattr(file, 'read'):
        data = file.read()
        return data.encode() if isinstance(data, str) else data
    raise TypeError(f"Unsupported file input: {type(file).__name__}")


def extract_text_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file using PyPDF2.

    Parameters:
    - pdf_bytes (bytes): The content of the PDF file in bytes.

    Returns:
    - text (str): Extracted text from the PDF, pages concatenated in order.
    """
    pdf_reader = PdfReader(BytesIO(pdf_bytes))
    # `or ''` guards against a page yielding no text object at all.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)


def extract_text_from_image_or_pdf(file_bytes):
    """
    Extracts text from either a PDF or an image file using PyPDF2 and pytesseract.

    Parameters:
    - file_bytes (bytes): The content of the file in bytes.

    Returns:
    - text (str): Extracted text from the file. If extraction fails, a string
      starting with "Error extracting text:" is returned instead of raising.
    """
    try:
        if file_bytes.startswith(b'%PDF'):
            # PDF file
            return extract_text_from_pdf(file_bytes)
        # Anything else is assumed to be an image and sent through OCR.
        image = Image.open(BytesIO(file_bytes))
        return pytesseract.image_to_string(image)
    except Exception as e:
        return f"Error extracting text: {str(e)}"


def perform_ner(text, model_name):
    """
    Performs Named Entity Recognition (NER) on the given text using the specified NER model.

    Parameters:
    - text (str): The input text on which NER will be performed.
    - model_name (str): The name of the NER model to be used ('bert-large-NER',
      'bioNER', or 'SpaCy English NER'). Any other value falls back to the
      biomedical model.

    Returns:
    - extracted_entities (list): A list of dictionaries containing information
      about the recognized entities. Each dictionary has the keys: 'text',
      'type', 'start_index', 'end_index'.
    - error_message (str): If an error occurs during the NER process, an error
      message is returned instead of the list.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            return [
                {
                    'text': ent.text,
                    'type': ent.label_,
                    'start_index': ent.start_char,
                    'end_index': ent.end_char,
                }
                for ent in doc.ents
            ]

        # Both transformers pipelines return the same record shape.
        token_classifier = ner_model if model_name == 'bert-large-NER' else ner_model_bio
        return [
            {
                'text': entity['word'],
                'type': entity['entity'],
                'start_index': entity['start'],
                'end_index': entity['end'],
            }
            for entity in token_classifier(text)
        ]
    except Exception as e:
        return f"Error performing NER: {str(e)}"


def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
    """
    Wraps every entity span of `text` in a coloured HTML `<span>` followed by its label.

    All text taken from the document is HTML-escaped, because the result is
    rendered in an HTML component and the document content is untrusted.

    Parameters:
    - `text` (str): The raw input text.
    - `entities` (list): A list of dictionaries, each containing the start index
      (`start_index`), end index (`end_index`), and type (`type`) of an entity.
    - `color_mapping` (dict): A dictionary mapping entity labels to background
      colors for highlighting. Unknown labels use a default blue.
    - `tokenizer`: Unused; retained so existing callers keep working.

    Returns:
    - `highlighted_text` (str): The text with highlighted entities as HTML markup.
    """
    pieces = []
    cursor = 0
    for ent in sorted(entities, key=lambda item: item.get('start_index', 0)):
        start = ent.get('start_index', 0)
        end = ent.get('end_index', 0)
        label = ent.get('type', 'O')

        # Skip empty spans and spans overlapping an already emitted entity;
        # emitting them would duplicate or reorder parts of the text.
        if end <= start or start < cursor:
            continue

        # Non-entity text preceding this entity.
        pieces.append(escape(text[cursor:start]))

        color = color_mapping.get(label, DEFAULT_ENTITY_COLOR)
        pieces.append(
            f'<span style="background-color: {color}; padding: 0 0.2em; '
            f'border-radius: 0.2em;">{escape(text[start:end])} ({escape(str(label))})</span>'
        )
        cursor = end

    # Any remaining non-entity text.
    pieces.append(escape(text[cursor:]))
    return ''.join(pieces)


def highlight_entities(text, entities, model_name):
    """
    Highlights named entities in the given text and returns HTML with colored annotations.

    Parameters:
    - text (str): The input text containing named entities.
    - entities (list): A list of dictionaries containing information about the
      recognized entities. Each dictionary has the keys: 'text', 'type',
      'start_index', 'end_index'. An error string produced by `perform_ner` is
      returned unchanged.
    - model_name (str): The name of the NER model used; 'SpaCy English NER'
      selects displaCy rendering, anything else the span-based highlighter.

    Returns:
    - colored_text (str): HTML with colored annotations highlighting the recognized entities.
    - error_message (str): If an error occurs during the highlighting process,
      an error message is returned.
    """
    if isinstance(entities, str):
        # NER already failed; surface that message rather than a secondary error.
        return entities

    try:
        if model_name == 'SpaCy English NER':
            # Render from the entities already extracted (displaCy manual mode)
            # instead of running the spaCy model over the text a second time.
            spans = [
                {
                    'start': entity['start_index'],
                    'end': entity['end_index'],
                    'label': entity['type'],
                }
                for entity in entities
            ]
            options = {
                "ents": sorted({span['label'] for span in spans}),
                "colors": SPACY_ENTITY_COLORS,
            }
            return displacy.render(
                {"text": text, "ents": spans, "title": None},
                style="ent",
                manual=True,
                options=options,
                page=True,
            )

        return highlight_entities_with_colors_and_labels_tokenized(
            text, entities, TOKEN_ENTITY_COLORS, tokenizer
        )
    except Exception as e:
        return f"Error highlighting entities: {str(e)}"


def summarize_text(input_text):
    """
    Produces a concise summary of the input text with the summarization pipeline.

    Parameters:
    - input_text (str): The input text that needs to be summarized.

    Returns:
    - summarized_text (str): The summary generated with `max_length=150`,
      `min_length=50`, `length_penalty=2.0`, `num_beams=4` and
      `early_stopping=True`. Blank input yields an empty string.
    """
    if not input_text or not input_text.strip():
        return ""

    # truncation=True keeps long documents within the model's input limit
    # instead of failing inside the model.
    result = summarization_pipeline(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        truncation=True,
    )
    return result[0]['summary_text']


def image_ner_tool(file, model_name):
    """
    Perform Named Entity Recognition (NER) on the text extracted from an image or PDF file.
    The extracted text is highlighted with colored annotations based on recognized
    entities and is also summarized.

    Parameters:
    - file: The uploaded image or PDF, as a file path, a byte stream, or a
      temp-file object with a `name` attribute.
    - model_name (str): The name of the NER model to be used ('bert-large-NER',
      'bioNER', or 'SpaCy English NER').

    Returns (always four values, matching the four UI outputs):
    - text (str): Extracted text from the input file, or an error message.
    - highlighted_text (str): HTML with colored annotations highlighting the recognized entities.
    - reformatted_ner_output (str): JSON-formatted string containing information
      about the recognized entities.
    - summary (str): Summary of the extracted text.
    """
    reformatted_ner_output = ""
    try:
        file_bytes = _read_file_bytes(file)
        text = extract_text_from_image_or_pdf(file_bytes)
        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities, model_name)
        reformatted_ner_output = json.dumps(entities, indent=2)
        summary = summarize_text(text)
        return text, highlighted_text, reformatted_ner_output, summary
    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Four values are required here too; the click handler binds four outputs.
        return error_message, "", reformatted_ner_output, ""


def store_data_to_csv(inputs, outputs):
    """
    Appends the extracted text and entities of an uploaded file to `log.csv`.

    Parameters:
    - inputs: The uploaded file (same accepted forms as in `image_ner_tool`).
    - outputs (str): The name of the selected NER model.

    Returns:
    - None. One header-less row is appended to `log.csv`.
    """
    print(inputs)
    print(outputs)

    file_bytes = _read_file_bytes(inputs)
    extracted_text = extract_text_from_image_or_pdf(file_bytes)
    named_entities = perform_ner(extracted_text, outputs)

    df = pd.DataFrame({"Extracted Text": [extracted_text], "Extracted Entities": [named_entities]})
    df.to_csv("log.csv", mode='a', index=False, header=False)


with gr.Blocks() as demo:
    gr.Markdown(
        """
Intelligent Document Processing
Upload a PDF or an image file to extract text and identify named entities
"""
    )

    with gr.Row() as row:
        with gr.Column():
            text1 = gr.File(label="Upload File")
            model = gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
            btn = gr.Button("submit")

        with gr.Column():
            with gr.Tab("Extracted Text"):
                output1 = gr.Textbox(label="Extracted Text", container=True)
            with gr.Tab("Highlighted Entities"):
                output2 = gr.HTML(label="Highlighted Text")
            with gr.Tab("Summarized Text"):
                output3 = gr.HTML(label="Summarized text")
            with gr.Tab("Named Entities Extracted"):
                output4 = gr.HTML(label="Named Entities")
            store_button = gr.Button("Store Data to CSV")

    gr.Examples(
        [
            [
                "The year is 2043.pdf",  # Path to the example file
                "SpaCy English NER",  # Selected value for the dropdown menu
            ]
        ],
        [text1, model],
    )

    # Output order follows image_ner_tool's return order:
    # text, highlighted HTML, entity JSON, summary.
    btn.click(
        image_ner_tool,
        [text1, model],
        [output1, output2, output4, output3],
    )

    store_button.click(
        store_data_to_csv,
        [text1, model],
    )

demo.launch()