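"""Gradio app for review analysis.

Given an Excel file of reviews (one review per cell) or a single pasted
review, the app produces a BART summary, extracted keywords, a 1-5 star
rating, and an overall sentiment label.
"""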
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline
import torch
import gradio as gr
from openpyxl import load_workbook
from numpy import mean
import re

# Load tokenizers and models
tokenizer = AutoTokenizer.from_pretrained("suriya7/bart-finetuned-text-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("suriya7/bart-finetuned-text-summarization")

tokenizer_keywords = AutoTokenizer.from_pretrained("transformer3/H2-keywordextractor")
model_keywords = AutoModelForSeq2SeqLM.from_pretrained("transformer3/H2-keywordextractor")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 'roberta-rating' is a locally stored sequence-classification checkpoint whose
# labels LABEL_1 through LABEL_5 correspond to 1-5 star ratings
new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')
new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')

classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)

# Display strings for each star rating (kept for reference; not used by the interfaces below)
label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}
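
# Helper used by both pipelines below to rewrite first-person pronouns to the
# third person. The original chain of bare str.replace calls also rewrote
# matching letters inside words (e.g. "It" -> "He/Shet"), so word-boundary
# regexes are used instead; same substitutions, safer matching.
def to_third_person(text):
    text = re.sub(r"\bI\b", "He/She", text)
    text = re.sub(r"\bmy\b", "his/her", text)
    text = re.sub(r"\bme\b", "him/her", text)
    return text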

# Function to parse an Excel file and return every non-empty cell value as a string
def parse_xl(file_path):
    cells = []

    workbook = load_workbook(filename=file_path)
    for sheet in workbook.worksheets:
        for row in sheet.iter_rows():
            for cell in row:
                if cell.value is not None:
                    # Cast to str so numeric or date cells don't break the
                    # classifier and string concatenation downstream
                    cells.append(str(cell.value))

    return cells
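
# Example (hypothetical workbook): parse_xl("reviews.xlsx") might return
# ["Great battery life.", "Shipping was slow.", ...] -- one string per
# non-empty cell, across all sheets.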

# Function to evaluate reviews from an Excel file
def evaluate(file):
    reviews = parse_xl(file)
    ratings = []
    text = ""
    sentiments = []

    for review in reviews:
        # Classify each review once and reuse the label for both rating and sentiment
        label = classifier(review)[0]['label']
        rating = int(label.split('_')[1])
        ratings.append(rating)
        text += review + " "

        sentiment_label = "Positive" if label in ("LABEL_4", "LABEL_5") else "Negative" if label in ("LABEL_1", "LABEL_2") else "Neutral"
        sentiments.append(sentiment_label)

    # Majority vote across per-review sentiments
    overall_sentiment = "Positive" if sentiments.count("Positive") > sentiments.count("Negative") else "Negative" if sentiments.count("Negative") > sentiments.count("Positive") else "Neutral"

    # Summarize the concatenated reviews
    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=10, max_length=50)
    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Rewrite the summary in the third person
    summary = to_third_person(summary)

    # Extract keywords from the concatenated reviews
    inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids_keywords = model_keywords.generate(inputs_keywords["input_ids"], num_beams=2, min_length=0, max_length=100)
    keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    return round(mean(ratings), 2), summary, keywords, overall_sentiment

# Function to test a single text input
def test_area(text):
    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=10, max_length=50)
    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Rewrite the summary in the third person
    summary = to_third_person(summary)

    inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids_keywords = model_keywords.generate(inputs_keywords["input_ids"], num_beams=2, min_length=0, max_length=100)
    keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Classify once and reuse the label for both sentiment and rating
    label = classifier(text)[0]['label']
    sentiment_label = "Positive" if label in ("LABEL_4", "LABEL_5") else "Negative" if label in ("LABEL_1", "LABEL_2") else "Neutral"
    rating = int(label.split('_')[1])

    return rating, summary, keywords, sentiment_label

# Main interface
main_interface = gr.Interface(
    fn=evaluate,
    inputs=gr.File(label="Reviews"),
    outputs=[gr.Textbox(label="Overall Rating"), gr.Textbox(label="Summary"), gr.Textbox(label="Keywords"), gr.Textbox(label="Overall Sentiment")],
    title='Summarize Reviews',
    description="Evaluate and summarize collection of reviews. Reviews are submitted as an Excel file, where each review is in its own cell."
)

# Testing area interface
testing_interface = gr.Interface(
    fn=test_area,
    inputs=gr.Textbox(label="Input Text"),
    outputs=[gr.Textbox(label="Rating"), gr.Textbox(label="Summary"), gr.Textbox(label="Keywords"), gr.Textbox(label="Sentiment")],
    title='Testing Area',
    description="Test the summarization, keyword extraction, sentiment analysis, and rating on custom text input."
)

# Combine interfaces into a tabbed interface with a sidebar
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Sidebar")
            gr.Button("Button 1")
            gr.Button("Button 2")
        with gr.Column(scale=4):
            iface = gr.TabbedInterface(
                [main_interface, testing_interface],
                ["Summarize Reviews", "Testing Area"]
            )

# share=True additionally exposes the app through a temporary public Gradio link
demo.launch(share=True)
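
# Usage sketch (assuming this file is saved as app.py):
#   $ python app.py
# Then open the printed URL, upload an .xlsx of reviews under
# "Summarize Reviews", or paste text into "Testing Area".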