Spaces:

dperales
/

ITACA_Insurance_Core_v4

Running

App Files Files Community

dperales committed on Apr 18, 2023

Commit

b2fbe3d

1 Parent(s): c0b980f

Upload 12 files

Browse files

Files changed (12) hide show

OCR_Detector.py +12 -0
Object_Detector.py +146 -0
Roboto-Light.ttf +0 -0
appv2.py +320 -0
emotion_detection.py +67 -0
itaca_logo.png +0 -0
keyword_extraction.py +145 -0
models.py +26 -0
named_entity_recognition.py +60 -0
part_of_speech_tagging.py +24 -0
requirements.txt +18 -0
sentiment_analysis.py +78 -0

OCR_Detector.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import pandas as pd
+import numpy as np
+import streamlit as st
+import easyocr
+import PIL
+from PIL import Image, ImageDraw
+class OCRDetector:
+  def __init__(self):
+    # it will only detect the English and Spanish part of the image as text
+    self.reader = easyocr.Reader(['es','en'], gpu=False)

Object_Detector.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import os
+import tensorflow as tf
+import tensorflow_hub as hub
+# Load compressed models from tensorflow_hub
+os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+# For drawing onto the image.
+import numpy as np
+from tensorflow.python.ops.numpy_ops import np_config
+np_config.enable_numpy_behavior()
+from PIL import Image
+from PIL import ImageColor
+from PIL import ImageDraw
+from PIL import ImageFont
+import time
+import streamlit as st
+# For measuring the inference time.
+import time
+class ObjectDetector:
+    def __init__(self):
+        # Load Tokenizer & Model
+        # hub_location = 'cardiffnlp/twitter-roberta-base-sentiment'
+        # self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
+        # self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)
+        # Change model labels in config
+        # self.model.config.id2label[0] = "Negative"
+        # self.model.config.id2label[1] = "Neutral"
+        # self.model.config.id2label[2] = "Positive"
+        # self.model.config.label2id["Negative"] = self.model.config.label2id.pop("LABEL_0")
+        # self.model.config.label2id["Neutral"] = self.model.config.label2id.pop("LABEL_1")
+        # self.model.config.label2id["Positive"] = self.model.config.label2id.pop("LABEL_2")
+        # Instantiate explainer
+        # self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)
+        # module_handle = "https://tfhub.dev/google/faster_rcnn/openimages_v4/inception_resnet_v2/1"
+        module_handle = "https://tfhub.dev/google/openimages_v4/ssd/mobilenet_v2/1"
+        self.detector = hub.load(module_handle).signatures['default']
+    def run_detector(self, path):
+      img = path
+      converted_img  = tf.image.convert_image_dtype(img, tf.float32)[tf.newaxis, ...]
+      start_time = time.time()
+      result = self.detector(converted_img)
+      end_time = time.time()
+      result = {key:value.numpy() for key,value in result.items()}
+      primer = format(result["detection_class_entities"][0]) + ' ' + format(round(result["detection_scores"][0]*100)) + '%'
+      image_with_boxes = self.draw_boxes(
+        img, result["detection_boxes"],
+        result["detection_class_entities"], result["detection_scores"])
+      # display_image(image_with_boxes)
+      return image_with_boxes, primer
+    def display_image(self, image):
+      fig = plt.figure(figsize=(20, 15))
+      plt.grid(False)
+      plt.imshow(image)
+    def draw_bounding_box_on_image(self, image,
+                                   ymin,
+                                   xmin,
+                                   ymax,
+                                   xmax,
+                                   color,
+                                   font,
+                                   thickness=4,
+                                   display_str_list=()):
+      """Adds a bounding box to an image."""
+      draw = ImageDraw.Draw(image)
+      im_width, im_height = image.size
+      (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
+                                    ymin * im_height, ymax * im_height)
+      draw.line([(left, top), (left, bottom), (right, bottom), (right, top),
+                 (left, top)],
+                width=thickness,
+                fill=color)
+      # If the total height of the display strings added to the top of the bounding
+      # box exceeds the top of the image, stack the strings below the bounding box
+      # instead of above.
+      display_str_heights = [font.getsize(ds)[1] for ds in display_str_list]
+      # Each display_str has a top and bottom margin of 0.05x.
+      total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
+      if top > total_display_str_height:
+        text_bottom = top
+      else:
+        text_bottom = top + total_display_str_height
+      # Reverse list and print from bottom to top.
+      for display_str in display_str_list[::-1]:
+        text_width, text_height = font.getsize(display_str)
+        margin = np.ceil(0.05 * text_height)
+        draw.rectangle([(left, text_bottom - text_height - 2 * margin),
+                        (left + text_width, text_bottom)],
+                       fill=color)
+        draw.text((left + margin, text_bottom - text_height - margin),
+                  display_str,
+                  fill="black",
+                  font=font)
+        text_bottom -= text_height - 2 * margin
+    def draw_boxes(self, image, boxes, class_names, scores, max_boxes=10, min_score=0.4):
+      """Overlay labeled boxes on an image with formatted scores and label names."""
+      colors = list(ImageColor.colormap.values())
+      try:
+        font = ImageFont.truetype("./Roboto-Light.ttf", 24)
+      except IOError:
+        print("Font not found, using default font.")
+        font = ImageFont.load_default()
+      for i in range(min(boxes.shape[0], max_boxes)):
+        if scores[i] >= min_score:
+          ymin, xmin, ymax, xmax = tuple(boxes[i])
+          display_str = "{}: {}%".format(class_names[i].decode("ascii"),
+                                         int(100 * scores[i]))
+          color = colors[hash(class_names[i]) % len(colors)]
+          image_pil = Image.fromarray(np.uint8(image)).convert("RGB")
+          self.draw_bounding_box_on_image(
+              image_pil,
+              ymin,
+              xmin,
+              ymax,
+              xmax,
+              color,
+              font,
+              display_str_list=[display_str])
+          np.copyto(image, np.array(image_pil))
+      return image

Roboto-Light.ttf ADDED Viewed

Binary file (170 kB). View file

appv2.py ADDED Viewed

	@@ -0,0 +1,320 @@

+import os
+import pandas as pd
+import numpy as np
+import easyocr
+import streamlit as st
+from annotated_text import annotated_text
+from streamlit_option_menu import option_menu
+from sentiment_analysis import SentimentAnalysis
+from keyword_extraction import KeywordExtractor
+from part_of_speech_tagging import POSTagging
+from emotion_detection import EmotionDetection
+from named_entity_recognition import NamedEntityRecognition
+from Object_Detector import ObjectDetector
+from OCR_Detector import OCRDetector
+import PIL
+from PIL import Image
+from PIL import ImageColor
+from PIL import ImageDraw
+from PIL import ImageFont
+import time
+# Imports de Object Detection
+import tensorflow as tf
+import tensorflow_hub as hub
+# Load compressed models from tensorflow_hub
+os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+# For drawing onto the image.
+import numpy as np
+from tensorflow.python.ops.numpy_ops import np_config
+np_config.enable_numpy_behavior()
+import torch
+import librosa
+from models import infere_speech_emotion, infere_text_emotion, infere_voice2text
+# Use the wide page layout.
+st.set_page_config(layout="wide")
+# CSS that hides the elements matched by the #MainMenu and footer selectors.
+hide_streamlit_style = """
+            <style>
+            #MainMenu {visibility: hidden;}
+            footer {visibility: hidden;}
+            </style>
+            """
+# The <style> block is emitted as raw HTML, hence unsafe_allow_html=True.
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+@st.cache_resource
+def load_sentiment_model():
+    return SentimentAnalysis()
+@st.cache_resource
+def load_keyword_model():
+    return KeywordExtractor()
+@st.cache_resource
+def load_pos_model():
+    return POSTagging()
+@st.cache_resource
+def load_emotion_model():
+    return EmotionDetection()
+@st.cache_resource
+def load_ner_model():
+    return NamedEntityRecognition()
+@st.cache_resource
+def load_objectdetector_model():
+    return ObjectDetector()
+@st.cache_resource
+def load_ocrdetector_model():
+    return OCRDetector()
+sentiment_analyzer = load_sentiment_model()
+keyword_extractor = load_keyword_model()
+pos_tagger = load_pos_model()
+emotion_detector = load_emotion_model()
+ner = load_ner_model()
+objectdetector1 = load_objectdetector_model()
+ocrdetector1 = load_ocrdetector_model()
+def rectangle(image, result):
+    draw = ImageDraw.Draw(image)
+    for res in result:
+        top_left = tuple(res[0][0]) # top left coordinates as tuple
+        bottom_right = tuple(res[0][2]) # bottom right coordinates as tuple
+        draw.rectangle((top_left, bottom_right), outline="blue", width=2)
+    st.image(image)
+# Default text passed as `value=` to the text areas of the text-analysis pages.
+example_text = "My name is Daniel: The attention to detail, swift resolution, and accuracy demonstrated by ITACA Insurance Company in Spain  in handling my claim were truly impressive. This undoubtedly reflects their commitment to being a customer-centric insurance provider."
+# Sidebar: logo plus the page selector; `page` drives the if/elif chain below.
+with st.sidebar:
+    # `image` holds the logo and stays bound at module level after this block.
+    image = Image.open('./itaca_logo.png')
+    st.image(image,width=150) #use_column_width=True)
+    # `options` and `icons` both have eight entries — presumably paired by position; confirm against streamlit_option_menu.
+    page = option_menu(menu_title='Menu',
+                       menu_icon="robot",
+                       options=["Sentiment Analysis",
+                                "Keyword Extraction",
+                                "Part of Speech Tagging",
+                                "Emotion Detection",
+                                "Named Entity Recognition",
+                                "Speech & Text Emotion",
+                                "Object Detector",
+                                "OCR Detector"],
+                       icons=["chat-dots",
+                              "key",
+                              "tag",
+                              "emoji-heart-eyes",
+                              "building",
+                              "book",
+                              "camera",
+                              "list-task"],
+                       default_index=0
+                       )
+st.title('ITACA Insurance Core AI Module')
+# Replace '20px' with your desired font size
+# NOTE(review): font_size is not referenced anywhere else in the visible file.
+font_size = '20px'
+if page == "Sentiment Analysis":
+    st.header('Sentiment Analysis')
+    # st.markdown("![Alt Text](https://media.giphy.com/media/XIqCQx02E1U9W/giphy.gif)")
+    st.write(
+        """
+        """
+    )
+    text = st.text_area("Paste text here", value=example_text)
+    if st.button('🔥 Run!'):
+        with st.spinner("Loading..."):
+            preds, html = sentiment_analyzer.run(text)
+            st.success('All done!')
+            st.write("")
+            st.subheader("Sentiment Predictions")
+            st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
+            st.write("")
+            st.subheader("Sentiment Justification")
+            raw_html = html._repr_html_()
+            st.components.v1.html(raw_html, height=500)
+elif page == "Keyword Extraction":
+    st.header('Keyword Extraction')
+    # st.markdown("![Alt Text](https://media.giphy.com/media/xT9C25UNTwfZuk85WP/giphy-downsized-large.gif)")
+    st.write(
+        """
+        """
+    )
+    text = st.text_area("Paste text here", value=example_text)
+    max_keywords = st.slider('# of Keywords Max Limit', min_value=1, max_value=10, value=5, step=1)
+    if st.button('🔥 Run!'):
+        with st.spinner("Loading..."):
+            annotation, keywords = keyword_extractor.generate(text, max_keywords)
+            st.success('All done!')
+        if annotation:
+            st.subheader("Keyword Annotation")
+            st.write("")
+            annotated_text(*annotation)
+            st.text("")
+        st.subheader("Extracted Keywords")
+        st.write("")
+        df = pd.DataFrame(keywords, columns=['Extracted Keywords'])
+        csv = df.to_csv(index=False).encode('utf-8')
+        st.download_button('Download Keywords to CSV', csv, file_name='news_intelligence_keywords.csv')
+        data_table = st.table(df)
+elif page == "Part of Speech Tagging":
+    st.header('Part of Speech Tagging')
+    # st.markdown("![Alt Text](https://media.giphy.com/media/WoWm8YzFQJg5i/giphy.gif)")
+    st.write(
+        """
+        """
+    )
+    text = st.text_area("Paste text here", value=example_text)
+    if st.button('🔥 Run!'):
+        with st.spinner("Loading..."):
+            preds = pos_tagger.classify(text)
+            st.success('All done!')
+            st.write("")
+            st.subheader("Part of Speech tags")
+            annotated_text(*preds)
+            st.write("")
+            st.components.v1.iframe('https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html', height=1000)
+elif page == "Emotion Detection":
+    st.header('Emotion Detection')
+    # st.markdown("![Alt Text](https://media.giphy.com/media/fU8X6ozSszyEw/giphy.gif)")
+    st.write(
+        """
+        """
+    )
+    text = st.text_area("Paste text here", value=example_text)
+    if st.button('🔥 Run!'):
+        with st.spinner("Loading..."):
+            preds, html = emotion_detector.run(text)
+            st.success('All done!')
+            st.write("")
+            st.subheader("Emotion Predictions")
+            st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
+            raw_html = html._repr_html_()
+            st.write("")
+            st.subheader("Emotion Justification")
+            st.components.v1.html(raw_html, height=500)
+elif page == "Named Entity Recognition":
+    st.header('Named Entity Recognition')
+    # st.markdown("![Alt Text](https://media.giphy.com/media/lxO8wdWdu4tig/giphy.gif)")
+    st.write(
+        """
+        """
+    )
+    text = st.text_area("Paste text here", value=example_text)
+    if st.button('🔥 Run!'):
+        with st.spinner("Loading..."):
+            preds, ner_annotation = ner.classify(text)
+            st.success('All done!')
+            st.write("")
+            st.subheader("NER Predictions")
+            annotated_text(*ner_annotation)
+            st.write("")
+            st.subheader("NER Prediction Metadata")
+            st.write(preds)
+elif page == "Object Detector":
+    st.header('Object Detector')
+    st.write(
+    """
+    """
+    )
+    img_file_buffer = st.file_uploader("Load an image", type=["png", "jpg", "jpeg"])
+    if img_file_buffer is not None:
+        image = np.array(Image.open(img_file_buffer))
+    if st.button('🔥 Run!'):
+        with st.spinner("Loading..."):
+            img, primero = objectdetector1.run_detector(image)
+            st.success('The first image detected is: ' + primero)
+            st.image(img, caption="Imagen", use_column_width=True)
+elif page == "OCR Detector":
+    st.header('OCR Detector')
+    st.write(
+    """
+    """
+    )
+    file = st.file_uploader("Load an image", type=["png", "jpg", "jpeg"])
+    #read the csv file and display the dataframe
+    if file is not None:
+        image = Image.open(file) # read image with PIL library
+    if st.button('🔥 Run!'):
+        with st.spinner("Loading..."):
+            result = ocrdetector1.reader.readtext(np.array(image))  # turn image to numpy array
+            # collect the results in dictionary:
+            textdic_easyocr = {}
+            for idx in range(len(result)):
+                pred_coor = result[idx][0]
+                pred_text = result[idx][1]
+                pred_confidence = result[idx][2]
+                textdic_easyocr[pred_text] = {}
+                textdic_easyocr[pred_text]['pred_confidence'] = pred_confidence
+            # get boxes on the image
+            rectangle(image, result)
+            # create a dataframe which shows the predicted text and prediction confidence
+            df = pd.DataFrame.from_dict(textdic_easyocr).T
+            st.table(df)
+elif page == "Speech & Text Emotion":
+    st.header('Speech & Text Emotion')
+    st.write(
+    """
+    """
+    )
+    uploaded_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"])
+    if uploaded_file is not None:
+        st.audio(uploaded_file, format='audio/' + uploaded_file.type.split('/')[1])
+        st.write("Audio file uploaded and playing.")
+    else:
+        st.write("Please upload an audio file.")
+    if st.button("Analysis"):
+        with st.spinner("Loading..."):
+            st.header('Results of the Audio & Text analysis:')
+            samples, sample_rate = librosa.load(uploaded_file, sr=16000)
+            p_voice2text = infere_voice2text (samples)
+            p_speechemotion = infere_speech_emotion(samples)
+            p_textemotion = infere_text_emotion(p_voice2text)
+            st.subheader("Text from the Audio:")
+            st.write(p_voice2text)
+            st.write("---")
+            st.subheader("Speech emotion:")
+            st.write(p_speechemotion)
+            st.write("---")
+            st.subheader("Text emotion:")
+            st.write(p_textemotion)
+            st.write("---")

emotion_detection.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers_interpret import SequenceClassificationExplainer
+import torch
+import pandas as pd
+class EmotionDetection:
+    """
+    Emotion Detection on text data.
+    Attributes:
+        tokenizer: An instance of Hugging Face Tokenizer
+        model: An instance of Hugging Face Model
+        explainer: An instance of SequenceClassificationExplainer from Transformers interpret
+    """
+    def __init__(self):
+        hub_location = 'cardiffnlp/twitter-roberta-base-emotion'
+        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
+        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)
+        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)
+    def justify(self, text):
+        """
+        Get html annotation for displaying emotion justification over text.
+        Parameters:
+            text (str): The user input string to emotion justification
+        Returns:
+            html (hmtl): html object for plotting emotion prediction justification
+        """
+        word_attributions = self.explainer(text)
+        html = self.explainer.visualize("example.html")
+        return html
+    def classify(self, text):
+        """
+        Recognize Emotion in text.
+        Parameters:
+            text (str): The user input string to perform emotion classification on
+        Returns:
+            predictions (str): The predicted probabilities for emotion classes
+        """
+        tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
+        outputs = self.model(**tokens)
+        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
+        probs = probs.mean(dim=0).detach().numpy()
+        labels = list(self.model.config.id2label.values())
+        preds = pd.Series(probs, index=labels, name='Predicted Probability')
+        return preds
+    def run(self, text):
+        """
+        Classify and Justify Emotion in text.
+        Parameters:
+            text (str): The user input string to perform emotion classification on
+        Returns:
+            predictions (str): The predicted probabilities for emotion classes
+            html (hmtl): html object for plotting emotion prediction justification
+        """
+        preds = self.classify(text)
+        html = self.justify(text)
+        return preds, html

itaca_logo.png ADDED Viewed

keyword_extraction.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import nltk
+import pytextrank
+import re
+from operator import itemgetter
+import en_core_web_sm
+class KeywordExtractor:
+    """
+    Keyword Extraction on text data
+    Attributes:
+        nlp: An instance English pipeline optimized for CPU for spacy
+    """
+    def __init__(self):
+        self.nlp = en_core_web_sm.load()
+        self.nlp.add_pipe("textrank")
+    def get_keywords(self, text, max_keywords):
+        """
+        Extract keywords from text.
+        Parameters:
+            text (str): The user input string to extract keywords from
+        Returns:
+            kws (list): list of extracted keywords
+        """
+        doc = self.nlp(text)
+        kws = [i.text for i in doc._.phrases[:max_keywords]]
+        return kws
+    def get_keyword_indices(self, kws, text):
+        """
+        Extract keywords from text.
+        Parameters:
+            kws (list): list of extracted keywords
+            text (str): The user input string to extract keywords from
+        Returns:
+            keyword_indices (list): list of indices for keyword boundaries in text
+        """
+        keyword_indices = []
+        for s in kws:
+            indices = [[m.start(), m.end()] for m in re.finditer(re.escape(s), text)]
+            keyword_indices.extend(indices)
+        return keyword_indices
+    def merge_overlapping_indices(self, keyword_indices):
+        """
+        Merge overlapping keyword indices.
+        Parameters:
+            keyword_indices (list): list of indices for keyword boundaries in text
+        Returns:
+            keyword_indices (list): list of indices for keyword boundaries in with overlapping combined
+        """
+        # Sort the array on the basis of start values of intervals.
+        keyword_indices.sort()
+        stack = []
+        # insert first interval into stack
+        stack.append(keyword_indices[0])
+        for i in keyword_indices[1:]:
+            # Check for overlapping interval,
+            # if interval overlap
+            if (stack[-1][0] <= i[0] <= stack[-1][-1]) or (stack[-1][-1] == i[0]-1):
+                stack[-1][-1] = max(stack[-1][-1], i[-1])
+            else:
+                stack.append(i)
+        return stack
+    def merge_until_finished(self, keyword_indices):
+        """
+        Loop until no overlapping keyword indices left.
+        Parameters:
+            keyword_indices (list): list of indices for keyword boundaries in text
+        Returns:
+            keyword_indices (list): list of indices for keyword boundaries in with overlapping combined
+        """
+        len_indices = 0
+        while True:
+            # Merge overlapping indices
+            merged = self.merge_overlapping_indices(keyword_indices)
+            # Check to see if merging reduced number of annotation indices
+            # If merging did not reduce list return final indicies
+            if len_indices == len(merged):
+                out_indices = sorted(merged, key=itemgetter(0))
+                return out_indices
+            else:
+                len_indices = len(merged)
+    def get_annotation(self, text, keyword_indices):
+        """
+        Create text annotation for extracted keywords.
+        Parameters:
+            keyword_indices (list): list of indices for keyword boundaries in text
+        Returns:
+            annotation (list): list of tuples for generating html
+        """
+        # Turn list to numpy array
+        arr = list(text)
+        # Loop through indices in list and insert delimeters
+        for idx in sorted(keyword_indices, reverse=True):
+            arr.insert(idx[0], "<kw>")
+            arr.insert(idx[1]+1, "<!kw> <kw>")
+        # join array
+        joined_annotation = ''.join(arr)
+        # split array on delimeter
+        split = joined_annotation.split('<kw>')
+        # Create annotation for keywords in text
+        annotation = [(x.replace('<!kw> ', ''), "KEY", "#26aaef") if "<!kw>" in x else x for x in split]
+        return annotation
+    def generate(self, text, max_keywords):
+        """
+        Create text annotation for extracted keywords.
+        Parameters:
+            text (str): The user input string to extract keywords from
+            max_keywords (int): Limit on number of keywords to generate
+        Returns:
+            annotation (list): list of tuples for generating html
+            kws (list): list of extracted keywords
+        """
+        kws = self.get_keywords(text, max_keywords)
+        indices = list(self.get_keyword_indices(kws, text))
+        if indices:
+            indices_merged = self.merge_until_finished(indices)
+            annotation = self.get_annotation(text, indices_merged)
+        else:
+            annotation = None
+        return annotation, kws

models.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# Import the necessary libraries
+from transformers import pipeline
+# NOTE: the three pipelines below are created by module-level statements, so
+# they are built as soon as this module is imported.
+# Initialize the text classification model with a pre-trained model
+model_text_emotion = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
+# Initialize the audio classification model with a pre-trained SER model
+model_speech_emotion = pipeline("audio-classification", model="aherzberg/ser_model_fixed_label")
+# Initialize the automatic speech recognition model with a pre-trained model that is capable of converting speech to text
+model_voice2text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
+# A function that uses the initialized text classification model to predict the emotion of a given text input
+def infere_text_emotion(text):
+    return model_text_emotion(text)[0]["label"].capitalize()
+# A function that uses the initialized audio classification model to predict the emotion of a given speech input
+def infere_speech_emotion(text):
+    # Dict that maps the speech model emotions with the text's ones
+    emotions_dict = {"angry": "Anger", "disgust": "Disgust", "fear": "Fear", "happy": "Joy", "neutral": "Neutral", "sad": "Sadness"}
+    inference = model_speech_emotion(text)[0]["label"]
+    return emotions_dict[inference]
+# A function that uses the initialized automatic speech recognition model to convert speech (as an audio file) to text
+def infere_voice2text(audio_file):
+    return model_voice2text(audio_file)["text"]

named_entity_recognition.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from transformers import AutoTokenizer, AutoModelForTokenClassification
+from transformers import pipeline
+class NamedEntityRecognition:
+    """
+    Named Entity Recognition on text data.
+    Attributes:
+        tokenizer: An instance of Hugging Face Tokenizer
+        model: An instance of Hugging Face Model
+        nlp: An instance of Hugging Face Named Entity Recognition pipeline
+    """
+    def __init__(self):
+        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
+        model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
+        self.nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
+    def get_annotation(self, preds, text):
+        """
+        Get html annotation for displaying entities over text.
+        Parameters:
+            preds (dict): List of entities and their associated metadata
+            text (str): The user input string to generate entity tags for
+        Returns:
+            final_annotation (list): List of tuples to pass to text annotation html creator
+        """
+        splits = [0]
+        entities = {}
+        for i in preds:
+            splits.append(i['start'])
+            splits.append(i['end'])
+            entities[i['word']] = i['entity_group']
+        # Exclude bad preds
+        exclude = ['', '.', '. ', ' ']
+        for x in exclude:
+            if x in entities.keys():
+                entities.pop(x)
+        parts = [text[i:j] for i, j in zip(splits, splits[1:] + [None])]
+        final_annotation = [(x, entities[x], "") if x in entities.keys() else x for x in parts]
+        return final_annotation
+    def classify(self, text):
+        """
+        Recognize Named Entities in text.
+        Parameters:
+            text (str): The user input string to generate entity tags for
+        Returns:
+            predictions (str): The user input string to generate entity tags for
+            ner_annotation (str): The user input string to generate entity tags for
+        """
+        preds = self.nlp(text)
+        ner_annotation = self.get_annotation(preds, text)
+        return preds, ner_annotation

part_of_speech_tagging.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import nltk
+from nltk.tokenize import word_tokenize
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+class POSTagging:
+    """Part of Speech Tagging on text data"""
+    def __init__(self):
+       pass
+    def classify(self, text):
+        """
+        Generate Part of Speech tags.
+        Parameters:
+            text (str): The user input string to generate tags for
+        Returns:
+            predictions (list): list of tuples containing words and their respective tags
+        """
+        text = word_tokenize(text)
+        predictions = nltk.pos_tag(text)
+        return predictions

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+Pillow
+streamlit==1.21.0
+pandas
+numpy
+matplotlib
+tensorflow
+tensorflow-hub
+scikit-learn
+easyocr
+nltk~=3.5
+typing-extensions
+streamlit-option-menu~=0.3.2
+st-annotated-text~=3.0.0
+transformers-interpret~=0.7.2
+htbuilder==0.6.0
+pytextrank~=3.2.3
+spacy~=3.0.5
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
+# Imported directly by the app code (appv2.py, models.py, sentiment_analysis.py,
+# emotion_detection.py, named_entity_recognition.py) but not declared above.
+transformers
+torch
+librosa

sentiment_analysis.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers_interpret import SequenceClassificationExplainer
+import torch
+import pandas as pd
+class SentimentAnalysis:
+    """
+    Sentiment on text data.
+    Attributes:
+        tokenizer: An instance of Hugging Face Tokenizer
+        model: An instance of Hugging Face Model
+        explainer: An instance of SequenceClassificationExplainer from Transformers interpret
+    """
+    def __init__(self):
+        # Load Tokenizer & Model
+        hub_location = 'cardiffnlp/twitter-roberta-base-sentiment'
+        #hub_location = 'dccuchile/bert-base-spanish-wwm-uncased'
+        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
+        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)
+        # Change model labels in config
+        self.model.config.id2label[0] = "Negative"
+        self.model.config.id2label[1] = "Neutral"
+        self.model.config.id2label[2] = "Positive"
+        self.model.config.label2id["Negative"] = self.model.config.label2id.pop("LABEL_0")
+        self.model.config.label2id["Neutral"] = self.model.config.label2id.pop("LABEL_1")
+        self.model.config.label2id["Positive"] = self.model.config.label2id.pop("LABEL_2")
+        # Instantiate explainer
+        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)
+    def justify(self, text):
+        """
+        Get html annotation for displaying sentiment justification over text.
+        Parameters:
+            text (str): The user input string to sentiment justification
+        Returns:
+            html (hmtl): html object for plotting sentiment prediction justification
+        """
+        word_attributions = self.explainer(text)
+        html = self.explainer.visualize("example.html")
+        return html
+    def classify(self, text):
+        """
+        Recognize Sentiment in text.
+        Parameters:
+            text (str): The user input string to perform sentiment classification on
+        Returns:
+            predictions (str): The predicted probabilities for sentiment classes
+        """
+        tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
+        outputs = self.model(**tokens)
+        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
+        probs = probs.mean(dim=0).detach().numpy()
+        predictions = pd.Series(probs, index=["Negative", "Neutral", "Positive"], name='Predicted Probability')
+        return predictions
+    def run(self, text):
+        """
+        Classify and Justify Sentiment in text.
+        Parameters:
+            text (str): The user input string to perform sentiment classification on
+        Returns:
+            predictions (str): The predicted probabilities for sentiment classes
+            html (hmtl): html object for plotting sentiment prediction justification
+        """
+        predictions = self.classify(text)
+        html = self.justify(text)
+        return predictions, html