trttung1610 committed on
Commit
625ce4f
1 Parent(s): cadb6e2

Upload 2 files

Browse files
Files changed (2) hide show
  1. main.py +230 -0
  2. requirements.txt +8 -0
main.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import RobertaForSequenceClassification, AutoTokenizer, AutoModelForTokenClassification, pipeline
2
+ import torch
3
+ import nltk
4
+ import docx2txt
5
+ import pandas as pd
6
+ import os
7
+ import matplotlib.pyplot as plt
8
+ import openpyxl
9
+ from openpyxl.styles import Font, Color, PatternFill
10
+ from openpyxl.styles.colors import WHITE
11
+ import gradio as gr
12
+
13
# Fetch the NLTK 'punkt' resource when the module is imported.
nltk.download('punkt')

# Hugging Face checkpoint identifiers used by the loaders below.
_SENTIMENT_CHECKPOINT = "wonrax/phobert-base-vietnamese-sentiment"
_SEGMENTATION_CHECKPOINT = "NlpHUST/vi-word-segmentation"

# Sentiment classifier and its tokenizer (the slow tokenizer is requested explicitly).
senti_tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_CHECKPOINT, use_fast=False)
senti_model = RobertaForSequenceClassification.from_pretrained(_SENTIMENT_CHECKPOINT)

# Word-segmentation model, wrapped in a token-classification pipeline.
seg_model = AutoModelForTokenClassification.from_pretrained(_SEGMENTATION_CHECKPOINT)
seg_tokenizer = AutoTokenizer.from_pretrained(_SEGMENTATION_CHECKPOINT)
nlp = pipeline("token-classification", model=seg_model, tokenizer=seg_tokenizer)
23
+
24
+
25
+ # Word segmented
26
def segmentation(sentences):
    """Rebuild each sentence from the tokens predicted by the ``nlp`` pipeline.

    For every token returned by the pipeline:
      * a word containing ``"##"`` is appended directly (marker removed, no
        separator) -- presumably a sub-word continuation piece;
      * a token whose entity is ``"I"`` is attached with an underscore;
      * any other token is attached with a leading space.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        A list with one rebuilt, whitespace-stripped string per input sentence.
    """
    rebuilt = []
    for text in sentences:
        pieces = []
        for token in nlp(text):
            word = token["word"]
            if "##" in word:
                pieces.append(word.replace("##", ""))
            elif token["entity"] == "I":
                pieces.append("_" + word)
            else:
                pieces.append(" " + word)
        rebuilt.append("".join(pieces).strip())
    return rebuilt
40
+
41
+
42
+ # File read
43
def read_file(docx):
    """Extract the non-empty lines of text from a document.

    Args:
        docx: Value handed unchanged to ``docx2txt.process``.

    Returns:
        A list of stripped, non-empty lines. If extraction fails, the error
        is printed and an empty list is returned, so callers always receive
        a list instead of an implicit ``None``.
    """
    # Keep the try body to the single call that can fail on bad input.
    try:
        text = docx2txt.process(docx)
    except Exception as e:
        print(f"Error reading file: {e}")
        return []

    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if line]
52
+
53
+
54
+ # Define a function to analyze the sentiment of a text
55
def analyze(sentence, max_length=256):
    """Return the sentiment class probabilities for one sentence.

    Args:
        sentence: Text to classify (callers in this file pass word-segmented
            text).
        max_length: Maximum number of token ids fed to the model; longer
            inputs are truncated. The default of 256 follows the limit
            documented for PhoBERT-base models -- NOTE(review): confirm
            against the loaded checkpoint's configuration.

    Returns:
        A list of floats: the softmax over the model's logits for this
        sentence, in the model's label order.
    """
    # Truncate so a long paragraph cannot exceed the model's position limit,
    # which would otherwise raise inside the forward pass.
    token_ids = senti_tokenizer.encode(
        sentence, truncation=True, max_length=max_length
    )
    input_ids = torch.tensor([token_ids])
    with torch.no_grad():
        logits = senti_model(input_ids).logits
    return logits.softmax(dim=-1)[0].tolist()
61
+
62
+
63
def file_analysis(docx):
    """Score every line of a document with the sentiment model.

    The document is read with ``read_file``, the lines are passed through
    ``segmentation``, and each segmented line is scored by ``analyze``.

    Args:
        docx: Document reference forwarded to ``read_file``.

    Returns:
        A list containing one ``analyze`` result per segmented line.
    """
    segmented_lines = segmentation(read_file(docx))
    return [analyze(line) for line in segmented_lines]
74
+
75
+
76
def generate_pie_chart(df):
    """Draw a pie chart of the mean score per sentiment and save it as a PNG.

    Args:
        df: DataFrame with 'Negative', 'Positive' and 'Neutral' columns.

    Returns:
        The file name of the saved image ('pie_chart.png', a relative path).
    """
    sentiments = ['Negative', 'Positive', 'Neutral']
    # One colour per sentiment, in the same order as ``sentiments``.
    palette = ['#BDBDBD', '#9ACD32', '#87CEFA']

    # Mean of each sentiment column, collected into a small summary frame.
    avg_df = pd.DataFrame({
        'Sentiment': sentiments,
        'Score': [df[name].mean() for name in sentiments],
    })

    plt.pie(
        avg_df['Score'],
        labels=avg_df['Sentiment'],
        colors=palette,
        autopct='%1.1f%%',
    )
    plt.title('Average Scores by Sentiment')

    # Write the figure to disk, then close it.
    output_name = 'pie_chart.png'
    plt.savefig(output_name)
    plt.close()
    return output_name
99
+
100
+
101
def generate_excel_file(df):
    """Write the per-line sentiment scores to a colour-coded Excel workbook.

    Row 1 holds bold headers. Each following row holds the three scores and
    the text of one DataFrame row. Score cells equal to the row's highest
    score, and the text cell, are filled with the colour of the winning
    sentiment. Text on the grey 'Negative' fill is written in white; all
    other text is black.

    Args:
        df: DataFrame with 'Negative', 'Positive', 'Neutral' and 'Text'
            columns. Only these four columns are written.

    Returns:
        The file name of the saved workbook ('result.xlsx', a relative path).
    """
    sentiment_cols = ['Negative', 'Positive', 'Neutral']
    headers = sentiment_cols + ['Text']
    text_col_num = len(headers)

    wb = openpyxl.Workbook()
    ws = wb.active

    # Header row.
    for col_num, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_num)
        cell.value = header
        cell.font = Font(bold=True)

    # Fill colour for each sentiment.
    fill_dict = {
        'Negative': PatternFill(start_color='BDBDBD', end_color='BDBDBD', fill_type='solid'),
        'Positive': PatternFill(start_color='9ACD32', end_color='9ACD32', fill_type='solid'),
        'Neutral': PatternFill(start_color='87CEFA', end_color='87CEFA', fill_type='solid'),
    }

    # Number sheet rows by position (data starts at row 2) rather than by
    # DataFrame index label, so a non-default index cannot misplace rows.
    for sheet_row, (_, row_data) in enumerate(df.iterrows(), start=2):
        scores = [row_data[col] for col in sentiment_cols]
        max_score = max(scores)
        sentiment = sentiment_cols[scores.index(max_score)]
        fill = fill_dict[sentiment]

        # Score columns: highlight every cell that equals the row maximum.
        for col_num, score in enumerate(scores, 1):
            cell = ws.cell(row=sheet_row, column=col_num)
            cell.value = score
            if score == max_score:
                cell.fill = fill

        # Text column: always filled with the winning sentiment's colour.
        # The font colour is chosen from the sentiment name; comparing the
        # fill's stored colour with 'BDBDBD' never matches because openpyxl
        # keeps colours in aRGB form.
        text_cell = ws.cell(row=sheet_row, column=text_col_num)
        text_cell.value = row_data['Text']
        text_cell.fill = fill
        font_color = WHITE if sentiment == 'Negative' else Color('000000')
        text_cell.font = Font(color=font_color)

    excel_file_path = 'result.xlsx'
    wb.save(excel_file_path)

    return excel_file_path
149
+
150
+
151
def process_file(docx):
    """Analyse a document and produce the pie chart and Excel report.

    The document is read once; the same list of lines is used both for
    scoring and for the 'Text' column, so the two always line up.

    Args:
        docx: Document reference forwarded to ``read_file``.

    Returns:
        A ``(pie_chart_name, excel_file_path)`` tuple of output file names.

    Raises:
        ValueError: If no text could be read from the document.
    """
    sentences = read_file(docx)
    if not sentences:
        # Covers both a failed read and a document with no non-empty lines;
        # there is nothing to chart or report in either case.
        raise ValueError(f"No text could be read from {docx!r}")

    # Score each segmented line with the sentiment model.
    results = [analyze(segmented) for segmented in segmentation(sentences)]

    df = pd.DataFrame(results, columns=['Negative', 'Positive', 'Neutral'])
    df['Text'] = sentences

    pie_chart_name = generate_pie_chart(df)
    excel_file_path = generate_excel_file(df)

    return pie_chart_name, excel_file_path
164
+
165
def analyze_file(file, sentence):
    """Gradio callback: analyse an uploaded file and/or a typed sentence.

    Args:
        file: Uploaded file, or a falsy value when nothing was uploaded.
            Either an object exposing the path as ``.name`` or a plain path
            string is accepted.
        sentence: Text typed by the user, or a falsy value when empty.

    Returns:
        A ``(excel_file_path, pie_chart_name, output_text)`` tuple. The first
        two entries are ``None`` when no file was given, and the last is
        ``None`` when no sentence was given. All three are ``None`` when
        neither input is provided, so the caller always gets three values.
    """
    excel_file_path = None
    pie_chart_name = None
    output_text = None

    if file:
        # NOTE(review): the upload object's type depends on the Gradio
        # version; fall back to the value itself when it has no ``.name``.
        file_path = getattr(file, "name", file)
        pie_chart_name, excel_file_path = process_file(file_path)

    if sentence:
        segmented_sentence = segmentation([sentence])
        scores = analyze(segmented_sentence[0])

        # One "Label: score" line per sentiment, scores to two decimals.
        label_names = ['Negative', 'Positive', 'Neutral']
        output_text = "".join(
            f"{label}: {score:.2f}\n" for label, score in zip(label_names, scores)
        )

    return excel_file_path, pie_chart_name, output_text
209
+
210
# Input and output components for the Gradio UI. The top-level component
# classes are used because the legacy ``gr.inputs`` / ``gr.outputs``
# namespaces are deprecated and absent from newer Gradio releases.
inputs = [
    gr.File(label="Chọn Tệp Bạn Muốn Phân Tích"),
    gr.Textbox(label="Nhập Văn Bản"),
]
outputs = [
    gr.File(label="Kết Quả Phân Tích Excel"),
    gr.Image(type="filepath", label="Thông Số Phân Tích"),
    gr.Textbox(label="Kết Quả Phân Tích"),
]

# Wire the callback to the components; flagging is disabled.
interface = gr.Interface(
    fn=analyze_file,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Analysis",
    allow_flagging="never",
)


if __name__ == "__main__":
    interface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ nltk
4
+ docx2txt
5
+ pandas
6
+ matplotlib
7
+ openpyxl
8
+ gradio