DevBM AneriThakkar committed on
Commit
03f344d
1 Parent(s): 90f21b4

Update app into modularized components (#4)


- Update app into modularized components (ab9b628e0d6d7f771ba98507ffbec2e5b7625899)


Co-authored-by: Thakkar Aneri Pareshkumar <AneriThakkar@users.noreply.huggingface.co>

Files changed (1)
  1. app.py +27 -623
app.py CHANGED
@@ -1,50 +1,4 @@
1
  import streamlit as st
2
- from transformers import T5ForConditionalGeneration, T5Tokenizer
3
- import spacy
4
- import nltk
5
- from sklearn.feature_extraction.text import TfidfVectorizer
6
- from rake_nltk import Rake
7
- import pandas as pd
8
- from fpdf import FPDF
9
- import wikipediaapi
10
- from functools import lru_cache
11
- nltk.download('punkt')
12
- nltk.download('stopwords')
13
- nltk.download('brown')
14
- from nltk.tokenize import sent_tokenize
15
- nltk.download('wordnet')
16
- from nltk.corpus import wordnet
17
- import random
18
- import sense2vec
19
- from wordcloud import WordCloud
20
- import matplotlib.pyplot as plt
21
- import json
22
- import os
23
- from sentence_transformers import SentenceTransformer, util
24
- import textstat
25
- from spellchecker import SpellChecker
26
- from transformers import pipeline
27
- import re
28
- import pymupdf
29
- import uuid
30
- import time
31
- import asyncio
32
- import aiohttp
33
- from datetime import datetime
34
- import base64
35
- from io import BytesIO
36
- # '-----------------'
37
- import smtplib
38
- from email.mime.multipart import MIMEMultipart
39
- from email.mime.text import MIMEText
40
- from email.mime.base import MIMEBase
41
- from email.mime.application import MIMEApplication
42
- from email import encoders
43
- # '------------------'
44
- from gliner import GLiNER
45
- # -------------------
46
-
47
- print("***************************************************************")
48
 
49
  st.set_page_config(
50
  page_icon='cyclone',
@@ -55,62 +9,19 @@ st.set_page_config(
55
  }
56
  )
57
 
58
- st.set_option('deprecation.showPyplotGlobalUse',False)
59
-
60
- class QuestionGenerationError(Exception):
61
- """Custom exception for question generation errors."""
62
- pass
63
-
64
-
65
- # Initialize Wikipedia API with a user agent
66
- user_agent = 'QGen/1.2'
67
- wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
68
-
69
- def get_session_id():
70
- if 'session_id' not in st.session_state:
71
- st.session_state.session_id = str(uuid.uuid4())
72
- return st.session_state.session_id
73
-
74
- def initialize_state(session_id):
75
- if 'session_states' not in st.session_state:
76
- st.session_state.session_states = {}
77
-
78
- if session_id not in st.session_state.session_states:
79
- st.session_state.session_states[session_id] = {
80
- 'generated_questions': [],
81
- # add other state variables as needed
82
- }
83
- return st.session_state.session_states[session_id]
84
-
85
- def get_state(session_id):
86
- return st.session_state.session_states[session_id]
87
-
88
- def set_state(session_id, key, value):
89
- st.session_state.session_states[session_id][key] = value
90
-
91
-
92
- @st.cache_resource
93
- def load_model(modelname):
94
- model_name = modelname
95
- model = T5ForConditionalGeneration.from_pretrained(model_name)
96
- tokenizer = T5Tokenizer.from_pretrained(model_name)
97
- return model, tokenizer
98
-
99
- # Load Spacy Model
100
- @st.cache_resource
101
- def load_nlp_models():
102
- nlp = spacy.load("en_core_web_md")
103
- s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
104
- return nlp, s2v
105
 
106
- # Load Quality Assurance Models
107
- @st.cache_resource
108
- def load_qa_models():
109
- # Initialize BERT model for sentence similarity
110
- similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
111
 
112
- spell = SpellChecker()
113
- return similarity_model, spell
114
 
115
  with st.sidebar:
116
  select_model = st.selectbox("Select Model", ("T5-large","T5-small"))
@@ -118,518 +29,8 @@ if select_model == "T5-large":
118
  modelname = "DevBM/t5-large-squad"
119
  elif select_model == "T5-small":
120
  modelname = "AneriThakkar/flan-t5-small-finetuned"
121
- nlp, s2v = load_nlp_models()
122
- similarity_model, spell = load_qa_models()
123
- context_model = similarity_model
124
- model, tokenizer = load_model(modelname)
125
-
126
-
127
- # Info Section
128
- def display_info():
129
- st.sidebar.title("Information")
130
- st.sidebar.markdown("""
131
- ### Question Generator System
132
- This system is designed to generate questions based on the provided context. It uses various NLP techniques and models to:
133
- - Extract keywords from the text
134
- - Map keywords to sentences
135
- - Generate questions
136
- - Provide multiple choice options
137
- - Assess the quality of generated questions
138
-
139
- #### Key Features:
140
- - **Keyword Extraction:** Combines RAKE, TF-IDF, and spaCy for comprehensive keyword extraction.
141
- - **Question Generation:** Utilizes a pre-trained T5 model for generating questions.
142
- - **Options Generation:** Creates contextually relevant multiple-choice options.
143
- - **Question Assessment:** Scores questions based on relevance, complexity, and spelling correctness.
144
- - **Feedback Collection:** Allows users to rate the generated questions and provides statistics on feedback.
145
-
146
- #### Customization Options:
147
- - Number of beams for question generation
148
- - Context window size for mapping keywords to sentences
149
- - Number of questions to generate
150
- - Additional display elements (context, answer, options, entity link, QA scores)
151
-
152
- #### Outputs:
153
- - Generated questions with multiple-choice options
154
- - Download options for CSV and PDF formats
155
- - Visualization of overall scores
156
-
157
- """)
158
-
159
- def get_pdf_text(pdf_file):
160
- doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
161
- text = ""
162
- for page_num in range(doc.page_count):
163
- page = doc.load_page(page_num)
164
- text += page.get_text()
165
- return text
166
-
167
- def save_feedback_og(question, answer, rating, options, context):
168
- feedback_file = 'question_feedback.json'
169
- if os.path.exists(feedback_file):
170
- with open(feedback_file, 'r') as f:
171
- feedback_data = json.load(f)
172
- else:
173
- feedback_data = []
174
- tpl = {
175
- 'question' : question,
176
- 'answer' : answer,
177
- 'context' : context,
178
- 'options' : options,
179
- 'rating' : rating,
180
- }
181
- # feedback_data[question] = rating
182
- feedback_data.append(tpl)
183
- print(feedback_data)
184
- with open(feedback_file, 'w') as f:
185
- json.dump(feedback_data, f)
186
-
187
- return feedback_file
188
-
189
- # -----------------------------------------------------------------------------------------
190
- def send_email_with_attachment(email_subject, email_body, recipient_emails, sender_email, sender_password, attachment):
191
- smtp_server = "smtp.gmail.com" # Replace with your SMTP server
192
- smtp_port = 587 # Replace with your SMTP port
193
-
194
- # Create the email message
195
- message = MIMEMultipart()
196
- message['From'] = sender_email
197
- message['To'] = ", ".join(recipient_emails)
198
- message['Subject'] = email_subject
199
- message.attach(MIMEText(email_body, 'plain'))
200
-
201
- # Attach the feedback data if available
202
- if attachment:
203
- attachment_part = MIMEApplication(attachment.getvalue(), Name="feedback_data.json")
204
- attachment_part['Content-Disposition'] = f'attachment; filename="feedback_data.json"'
205
- message.attach(attachment_part)
206
-
207
- # Send the email
208
- try:
209
- with smtplib.SMTP(smtp_server, smtp_port) as server:
210
- server.starttls()
211
- print(sender_email)
212
- print(sender_password)
213
- server.login(sender_email, sender_password)
214
- text = message.as_string()
215
- server.sendmail(sender_email, recipient_emails, text)
216
- return True
217
- except Exception as e:
218
- st.error(f"Failed to send email: {str(e)}")
219
- return False
220
- # ----------------------------------------------------------------------------------
221
-
222
- def collect_feedback(i,question, answer, context, options):
223
- st.write("Please provide feedback for this question:")
224
- edited_question = st.text_input("Enter improved question",value=question,key=f'fdx1{i}')
225
- clarity = st.slider("Clarity", 1, 5, 3, help="1 = Very unclear, 5 = Very clear",key=f'fdx2{i}')
226
- difficulty = st.slider("Difficulty", 1, 5, 3, help="1 = Very easy, 5 = Very difficult",key=f'fdx3{i}')
227
- relevance = st.slider("Relevance", 1, 5, 3, help="1 = Not relevant, 5 = Highly relevant",key=f'fdx4{i}')
228
- option_quality = st.slider("Quality of Options", 1, 5, 3, help="1 = Poor options, 5 = Excellent options",key=f'fdx5{i}')
229
- overall_rating = st.slider("Overall Rating", 1, 5, 3, help="1 = Poor, 5 = Excellent",key=f'fdx6{i}')
230
- comments = st.text_input("Additional Comments", "",key=f'fdx7{i}')
231
-
232
- if st.button("Submit Feedback",key=f'fdx8{i}'):
233
- feedback = {
234
- "question": question,
235
- 'edited_question':edited_question,
236
- "answer": answer,
237
- "options": options,
238
- "clarity": clarity,
239
- "difficulty": difficulty,
240
- "relevance": relevance,
241
- "option_quality": option_quality,
242
- "overall_rating": overall_rating,
243
- "comments": comments
244
- }
245
- save_feedback(feedback)
246
- st.success("Thank you for your feedback!")
247
-
248
- def save_feedback(feedback):
249
- st.session_state.feedback_data.append(feedback)
250
-
251
- def analyze_feedback():
252
- if not st.session_state.feedback_data:
253
- st.warning("No feedback data available yet.")
254
- return
255
-
256
- df = pd.DataFrame(st.session_state.feedback_data)
257
-
258
- st.write("Feedback Analysis")
259
- st.write(f"Total feedback collected: {len(df)}")
260
-
261
- metrics = ['clarity', 'difficulty', 'relevance', 'option_quality', 'overall_rating']
262
-
263
- for metric in metrics:
264
- fig, ax = plt.subplots()
265
- df[metric].value_counts().sort_index().plot(kind='bar', ax=ax)
266
- plt.title(f"Distribution of {metric.capitalize()} Ratings")
267
- plt.xlabel("Rating")
268
- plt.ylabel("Count")
269
- st.pyplot(fig)
270
-
271
- st.write("Average Ratings:")
272
- st.write(df[metrics].mean())
273
-
274
- # Word cloud of comments
275
- comments = " ".join(df['comments'])
276
- if len(comments) > 1:
277
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(comments)
278
- fig, ax = plt.subplots()
279
- plt.imshow(wordcloud, interpolation='bilinear')
280
- plt.axis("off")
281
- st.pyplot(fig)
282
-
283
-
284
- def export_feedback_data():
285
- if not st.session_state.feedback_data:
286
- st.warning("No feedback data available.")
287
- return None
288
-
289
- # Convert feedback data to JSON
290
- json_data = json.dumps(st.session_state.feedback_data, indent=2)
291
-
292
- # Create a BytesIO object
293
- buffer = BytesIO()
294
- buffer.write(json_data.encode())
295
- buffer.seek(0)
296
-
297
- return buffer
298
-
299
- # Function to clean text
300
- def clean_text(text):
301
- text = re.sub(r"[^\x00-\x7F]", " ", text)
302
- text = re.sub(f"[\n]"," ", text)
303
- return text
304
-
305
- # Function to create text chunks
306
- def segment_text(text, max_segment_length=700, batch_size=7):
307
- sentences = sent_tokenize(text)
308
- segments = []
309
- current_segment = ""
310
-
311
- for sentence in sentences:
312
- if len(current_segment) + len(sentence) <= max_segment_length:
313
- current_segment += sentence + " "
314
- else:
315
- segments.append(current_segment.strip())
316
- current_segment = sentence + " "
317
-
318
- if current_segment:
319
- segments.append(current_segment.strip())
320
-
321
- # Create batches
322
- batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
323
- return batches
324
-
325
-
326
- # Function to extract keywords using combined techniques
327
- def extract_keywords(text, extract_all):
328
- try:
329
- gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
330
- labels = ["person", "organization", "email", "Award", "Date", "Competitions", "Teams", "location", "percentage", "money"]
331
- entities = gliner_model.predict_entities(text, labels, threshold=0.7)
332
-
333
- gliner_keywords = list(set([ent["text"] for ent in entities]))
334
- print(f"Gliner keywords:{gliner_keywords}")
335
- # Use Only Gliner Entities
336
- if extract_all is False:
337
- return list(gliner_keywords)
338
-
339
- doc = nlp(text)
340
- spacy_keywords = set([ent.text for ent in doc.ents])
341
- spacy_entities = spacy_keywords
342
- print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
343
-
344
- #
345
- # if extract_all is False:
346
- # return list(spacy_entities)
347
-
348
- # Use RAKE
349
- rake = Rake()
350
- rake.extract_keywords_from_text(text)
351
- rake_keywords = set(rake.get_ranked_phrases())
352
- print(f"\n\nRake Keywords: {rake_keywords} \n\n")
353
- # Use spaCy for NER and POS tagging
354
- spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
355
- print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")
356
- # Use TF-IDF
357
- vectorizer = TfidfVectorizer(stop_words='english')
358
- X = vectorizer.fit_transform([text])
359
- tfidf_keywords = set(vectorizer.get_feature_names_out())
360
- print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
361
-
362
- # Combine all keywords
363
- combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords).union(gliner_keywords)
364
-
365
- return list(combined_keywords)
366
- except Exception as e:
367
- raise QuestionGenerationError(f"Error in keyword extraction: {str(e)}")
368
-
369
- def get_similar_words_sense2vec(word, n=3):
370
- # Try to find the word with its most likely part-of-speech
371
- word_with_pos = word + "|NOUN"
372
- if word_with_pos in s2v:
373
- similar_words = s2v.most_similar(word_with_pos, n=n)
374
- return [word.split("|")[0] for word, _ in similar_words]
375
-
376
- # If not found, try without POS
377
- if word in s2v:
378
- similar_words = s2v.most_similar(word, n=n)
379
- return [word.split("|")[0] for word, _ in similar_words]
380
-
381
- return []
382
-
383
- def get_synonyms(word, n=3):
384
- synonyms = []
385
- for syn in wordnet.synsets(word):
386
- for lemma in syn.lemmas():
387
- if lemma.name() != word and lemma.name() not in synonyms:
388
- synonyms.append(lemma.name())
389
- if len(synonyms) == n:
390
- return synonyms
391
- return synonyms
392
-
393
- def generate_options(answer, context, n=3):
394
- options = [answer]
395
-
396
- # Add contextually relevant words using a pre-trained model
397
- context_embedding = context_model.encode(context)
398
- answer_embedding = context_model.encode(answer)
399
- context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
400
-
401
- # Compute similarity scores and sort context words
402
- similarity_scores = [util.pytorch_cos_sim(context_model.encode(word), answer_embedding).item() for word in context_words]
403
- sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
404
- options.extend(sorted_context_words[:n])
405
-
406
- # Try to get similar words based on sense2vec
407
- similar_words = get_similar_words_sense2vec(answer, n)
408
- options.extend(similar_words)
409
-
410
- # If we don't have enough options, try synonyms
411
- if len(options) < n + 1:
412
- synonyms = get_synonyms(answer, n - len(options) + 1)
413
- options.extend(synonyms)
414
-
415
- # If we still don't have enough options, extract other entities from the context
416
- if len(options) < n + 1:
417
- doc = nlp(context)
418
- entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
419
- options.extend(entities[:n - len(options) + 1])
420
-
421
- # If we still need more options, add some random words from the context
422
- if len(options) < n + 1:
423
- context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
424
- options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))
425
- print(f"\n\nAll Possible Options: {options}\n\n")
426
- # Ensure we have the correct number of unique options
427
- options = list(dict.fromkeys(options))[:n+1]
428
-
429
- # Shuffle the options
430
- random.shuffle(options)
431
-
432
- return options
433
-
434
- # Function to map keywords to sentences with customizable context window size
435
- def map_keywords_to_sentences(text, keywords, context_window_size):
436
- sentences = sent_tokenize(text)
437
- keyword_sentence_mapping = {}
438
- print(f"\n\nSentences: {sentences}\n\n")
439
- for keyword in keywords:
440
- for i, sentence in enumerate(sentences):
441
- if keyword in sentence:
442
- # Combine current sentence with surrounding sentences for context
443
- # start = max(0, i - context_window_size)
444
- # end = min(len(sentences), i + context_window_size + 1)
445
- start = max(0,i - context_window_size)
446
- context_sentenses = sentences[start:i+1]
447
- context = ' '.join(context_sentenses)
448
- # context = ' '.join(sentences[start:end])
449
- if keyword not in keyword_sentence_mapping:
450
- keyword_sentence_mapping[keyword] = context
451
- else:
452
- keyword_sentence_mapping[keyword] += ' ' + context
453
- return keyword_sentence_mapping
454
-
455
-
456
- # Function to perform entity linking using Wikipedia API
457
- @lru_cache(maxsize=128)
458
- def entity_linking(keyword):
459
- page = wiki_wiki.page(keyword)
460
- if page.exists():
461
- return page.fullurl
462
- return None
463
-
464
- async def generate_question_async(context, answer, num_beams):
465
- try:
466
- input_text = f"<context> {context} <answer> {answer}"
467
- print(f"\n{input_text}\n")
468
- input_ids = tokenizer.encode(input_text, return_tensors='pt')
469
- outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
470
- question = tokenizer.decode(outputs[0], skip_special_tokens=True)
471
- print(f"\n{question}\n")
472
- return question
473
- except Exception as e:
474
- raise QuestionGenerationError(f"Error in question generation: {str(e)}")
475
-
476
- async def generate_options_async(answer, context, n=3):
477
- try:
478
- options = [answer]
479
-
480
- # Add contextually relevant words using a pre-trained model
481
- context_embedding = await asyncio.to_thread(context_model.encode, context)
482
- answer_embedding = await asyncio.to_thread(context_model.encode, answer)
483
- context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
484
-
485
- # Compute similarity scores and sort context words
486
- similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
487
- sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
488
- options.extend(sorted_context_words[:n])
489
-
490
- # Try to get similar words based on sense2vec
491
- similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
492
- options.extend(similar_words)
493
-
494
- # If we don't have enough options, try synonyms
495
- if len(options) < n + 1:
496
- synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
497
- options.extend(synonyms)
498
-
499
- # Ensure we have the correct number of unique options
500
- options = list(dict.fromkeys(options))[:n+1]
501
-
502
- # Shuffle the options
503
- random.shuffle(options)
504
-
505
- return options
506
- except Exception as e:
507
- raise QuestionGenerationError(f"Error in generating options: {str(e)}")
508
-
509
-
510
- # Function to generate questions using beam search
511
- async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
512
- try:
513
- batches = segment_text(text)
514
- keywords = extract_keywords(text, extract_all_keywords)
515
- all_questions = []
516
-
517
- progress_bar = st.progress(0)
518
- status_text = st.empty()
519
-
520
- for i, batch in enumerate(batches):
521
- status_text.text(f"Processing batch {i+1} of {len(batches)}...")
522
- batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
523
- all_questions.extend(batch_questions)
524
- progress_bar.progress((i + 1) / len(batches))
525
-
526
- if len(all_questions) >= num_questions:
527
- break
528
-
529
- progress_bar.empty()
530
- status_text.empty()
531
-
532
- return all_questions[:num_questions]
533
- except QuestionGenerationError as e:
534
- st.error(f"An error occurred during question generation: {str(e)}")
535
- return []
536
- except Exception as e:
537
- st.error(f"An unexpected error occurred: {str(e)}")
538
- return []
539
-
540
- async def generate_fill_in_the_blank_questions(context,answer):
541
- answerSize = len(answer)
542
- replacedBlanks = ""
543
- for i in range(answerSize):
544
- replacedBlanks += "_"
545
- blank_q = context.replace(answer,replacedBlanks)
546
- return blank_q
547
-
548
- async def process_batch(batch, keywords, context_window_size, num_beams):
549
- questions = []
550
- for text in batch:
551
- keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
552
- for keyword, context in keyword_sentence_mapping.items():
553
- question = await generate_question_async(context, keyword, num_beams)
554
- options = await generate_options_async(keyword, context)
555
- blank_question = await generate_fill_in_the_blank_questions(context,keyword)
556
- overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
557
- if overall_score >= 0.5:
558
- questions.append({
559
- "question": question,
560
- "context": context,
561
- "answer": keyword,
562
- "options": options,
563
- "overall_score": overall_score,
564
- "relevance_score": relevance_score,
565
- "complexity_score": complexity_score,
566
- "spelling_correctness": spelling_correctness,
567
- "blank_question": blank_question,
568
- })
569
- return questions
570
-
571
- # Function to export questions to CSV
572
- def export_to_csv(data):
573
- # df = pd.DataFrame(data, columns=["Context", "Answer", "Question", "Options"])
574
- df = pd.DataFrame(data)
575
- # csv = df.to_csv(index=False,encoding='utf-8')
576
- csv = df.to_csv(index=False)
577
- return csv
578
-
579
- # Function to export questions to PDF
580
- def export_to_pdf(data):
581
- pdf = FPDF()
582
- pdf.add_page()
583
- pdf.set_font("Arial", size=12)
584
-
585
- for item in data:
586
- pdf.multi_cell(0, 10, f"Context: {item['context']}")
587
- pdf.multi_cell(0, 10, f"Question: {item['question']}")
588
- pdf.multi_cell(0, 10, f"Answer: {item['answer']}")
589
- pdf.multi_cell(0, 10, f"Options: {', '.join(item['options'])}")
590
- pdf.multi_cell(0, 10, f"Overall Score: {item['overall_score']:.2f}")
591
- pdf.ln(10)
592
-
593
- return pdf.output(dest='S').encode('latin-1')
594
-
595
- def display_word_cloud(generated_questions):
596
- word_frequency = {}
597
- for question in generated_questions:
598
- words = question.split()
599
- for word in words:
600
- word_frequency[word] = word_frequency.get(word, 0) + 1
601
-
602
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequency)
603
- plt.figure(figsize=(10, 5))
604
- plt.imshow(wordcloud, interpolation='bilinear')
605
- plt.axis('off')
606
- st.pyplot()
607
-
608
-
609
- def assess_question_quality(context, question, answer):
610
- # Assess relevance using cosine similarity
611
- context_doc = nlp(context)
612
- question_doc = nlp(question)
613
- relevance_score = context_doc.similarity(question_doc)
614
-
615
- # Assess complexity using token length (as a simple metric)
616
- complexity_score = min(len(question_doc) / 20, 1) # Normalize to 0-1
617
-
618
- # Assess Spelling correctness
619
- misspelled = spell.unknown(question.split())
620
- spelling_correctness = 1 - (len(misspelled) / len(question.split())) # Normalize to 0-1
621
-
622
- # Calculate overall score (you can adjust weights as needed)
623
- overall_score = (
624
- 0.4 * relevance_score +
625
- 0.4 * complexity_score +
626
- 0.2 * spelling_correctness
627
- )
628
-
629
- return overall_score, relevance_score, complexity_score, spelling_correctness
630
 
631
  def main():
632
- # Streamlit interface
633
  st.title(":blue[Question Generator System]")
634
  session_id = get_session_id()
635
  state = initialize_state(session_id)
@@ -637,18 +38,18 @@ def main():
637
  st.session_state.feedback_data = []
638
 
639
  with st.sidebar:
640
- show_info = st.toggle('Show Info',True)
641
  if show_info:
642
  display_info()
643
  st.subheader("Customization Options")
644
  # Customization options
645
  input_type = st.radio("Select Input Preference", ("Text Input","Upload PDF"))
646
  with st.expander("Choose the Additional Elements to show"):
647
- show_context = st.checkbox("Context",True)
648
  show_answer = st.checkbox("Answer",True)
649
- show_options = st.checkbox("Options",False)
650
  show_entity_link = st.checkbox("Entity Link For Wikipedia",True)
651
- show_qa_scores = st.checkbox("QA Score",False)
652
  show_blank_question = st.checkbox("Fill in the Blank Questions",True)
653
  num_beams = st.slider("Select number of beams for question generation", min_value=2, max_value=10, value=2)
654
  context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
@@ -674,15 +75,15 @@ def main():
674
  text = clean_text(text)
675
  with st.expander("Show text"):
676
  st.write(text)
 
677
  generate_questions_button = st.button("Generate Questions",help="This is the generate questions button")
678
  # st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
679
 
680
- # if generate_questions_button:
681
  if generate_questions_button and text:
682
  start_time = time.time()
683
  with st.spinner("Generating questions..."):
684
  try:
685
- state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
686
  if not state['generated_questions']:
687
  st.warning("No questions were generated. The text might be too short or lack suitable content.")
688
  else:
@@ -743,12 +144,16 @@ def main():
743
  # Export buttons
744
  # if st.session_state.generated_questions:
745
  if state['generated_questions']:
746
- with st.sidebar:
747
- csv_data = export_to_csv(state['generated_questions'])
748
- st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')
749
-
750
- pdf_data = export_to_pdf(state['generated_questions'])
751
- st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
752
 
753
  with st.expander("View Visualizations"):
754
  questions = [tpl['question'] for tpl in state['generated_questions']]
@@ -759,7 +164,6 @@ def main():
759
  overall_scores = pd.DataFrame(overall_scores,columns=['Overall Scores'])
760
  st.line_chart(overall_scores)
761
 
762
-
763
  # View Feedback Statistics
764
  with st.expander("View Feedback Statistics"):
765
  analyze_feedback()
 
1
  import streamlit as st
2
 
3
  st.set_page_config(
4
  page_icon='cyclone',
 
9
  }
10
  )
11
12
 
13
+ from text_processing import clean_text, get_pdf_text
14
+ from question_generation import generate_questions_async
15
+ from visualization import display_word_cloud
16
+ from data_export import export_to_csv, export_to_pdf
17
+ from feedback import collect_feedback, analyze_feedback, export_feedback_data
18
+ from utils import get_session_id, initialize_state, get_state, set_state, display_info, QuestionGenerationError, entity_linking
19
+ import asyncio
20
+ import time
21
+ import pandas as pd
22
+ from data_export import send_email_with_attachment
23
 
24
+ st.set_option('deprecation.showPyplotGlobalUse',False)
 
25
 
26
  with st.sidebar:
27
  select_model = st.selectbox("Select Model", ("T5-large","T5-small"))
 
29
  modelname = "DevBM/t5-large-squad"
30
  elif select_model == "T5-small":
31
  modelname = "AneriThakkar/flan-t5-small-finetuned"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def main():
 
34
  st.title(":blue[Question Generator System]")
35
  session_id = get_session_id()
36
  state = initialize_state(session_id)
 
38
  st.session_state.feedback_data = []
39
 
40
  with st.sidebar:
41
+ show_info = st.toggle('Show Info',False)
42
  if show_info:
43
  display_info()
44
  st.subheader("Customization Options")
45
  # Customization options
46
  input_type = st.radio("Select Input Preference", ("Text Input","Upload PDF"))
47
  with st.expander("Choose the Additional Elements to show"):
48
+ show_context = st.checkbox("Context",False)
49
  show_answer = st.checkbox("Answer",True)
50
+ show_options = st.checkbox("Options",True)
51
  show_entity_link = st.checkbox("Entity Link For Wikipedia",True)
52
+ show_qa_scores = st.checkbox("QA Score",True)
53
  show_blank_question = st.checkbox("Fill in the Blank Questions",True)
54
  num_beams = st.slider("Select number of beams for question generation", min_value=2, max_value=10, value=2)
55
  context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
 
75
  text = clean_text(text)
76
  with st.expander("Show text"):
77
  st.write(text)
78
+ # st.text(text)
79
  generate_questions_button = st.button("Generate Questions",help="This is the generate questions button")
80
  # st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
81
 
 
82
  if generate_questions_button and text:
83
  start_time = time.time()
84
  with st.spinner("Generating questions..."):
85
  try:
86
+ state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords,modelname))
87
  if not state['generated_questions']:
88
  st.warning("No questions were generated. The text might be too short or lack suitable content.")
89
  else:
 
144
  # Export buttons
145
  # if st.session_state.generated_questions:
146
  if state['generated_questions']:
147
+ with st.sidebar:
148
+ # Adding error handling while exporting the files
149
+ # ---------------------------------------------------------------------
150
+ try:
151
+ csv_data = export_to_csv(state['generated_questions'])
152
+ st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')
153
+ pdf_data = export_to_pdf(state['generated_questions'])
154
+ st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
155
+ except Exception as e:
156
+ st.error(f"Error exporting CSV: {e}")
157
 
158
  with st.expander("View Visualizations"):
159
  questions = [tpl['question'] for tpl in state['generated_questions']]
 
164
  overall_scores = pd.DataFrame(overall_scores,columns=['Overall Scores'])
165
  st.line_chart(overall_scores)
166
 
 
167
  # View Feedback Statistics
168
  with st.expander("View Feedback Statistics"):
169
  analyze_feedback()
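
For orientation, below is a minimal sketch of what the new text_processing module referenced by `from text_processing import clean_text, get_pdf_text` presumably contains. It is reconstructed from the clean_text and get_pdf_text definitions removed from app.py in this commit; the module file itself is not part of this diff, so treat it as an assumption rather than the committed implementation.

# text_processing.py -- hypothetical reconstruction; the real module is not shown in this commit
import re

import pymupdf  # PyMuPDF, used for PDF text extraction as in the removed app.py code


def clean_text(text):
    # Replace non-ASCII characters and newlines with spaces (mirrors the removed in-file version)
    text = re.sub(r"[^\x00-\x7F]", " ", text)
    text = re.sub(r"\n", " ", text)
    return text


def get_pdf_text(pdf_file):
    # Extract plain text from an uploaded PDF file object (mirrors the removed in-file version)
    doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

The other new imports (question_generation, visualization, data_export, feedback, utils) presumably follow the same pattern: the corresponding functions removed from app.py above are moved into a module of that name and re-imported.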