DevBM AneriThakkar committed on
Commit f7842f6 · 1 Parent(s): 03f344d

Upload files for modules/functions (#5)


- Upload files for modules/functions (7fafac42e4b5d1e40338e2b5ace7ef9fba805bff)


Co-authored-by: Thakkar Aneri Pareshkumar <AneriThakkar@users.noreply.huggingface.co>

data_export.py ADDED
@@ -0,0 +1,61 @@
+ import pandas as pd
+ from fpdf import FPDF
+ import streamlit as st
+ import smtplib
+ from email.mime.multipart import MIMEMultipart
+ from email.mime.text import MIMEText
+ # from email.mime.base import MIMEBase
+ from email.mime.application import MIMEApplication
+ # from email import encoders
+
+
+ def export_to_csv(data):
+     df = pd.DataFrame(data)
+     csv = df.to_csv(index=False)
+     return csv
+
+ def export_to_pdf(data):
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font("Arial", size=12)
+
+     for item in data:
+         pdf.multi_cell(0, 10, f"Context: {item['context']}")
+         pdf.multi_cell(0, 10, f"Question: {item['question']}")
+         pdf.multi_cell(0, 10, f"Answer: {item['answer']}")
+         pdf.multi_cell(0, 10, f"Options: {', '.join(item['options'])}")
+         pdf.multi_cell(0, 10, f"Overall Score: {item['overall_score']:.2f}")
+         pdf.ln(10)
+
+     return pdf.output(dest='S').encode('latin-1')
+
+ def send_email_with_attachment(email_subject, email_body, recipient_emails, sender_email, sender_password, attachment):
+     smtp_server = "smtp.gmail.com"  # Replace with your SMTP server
+     smtp_port = 587  # Replace with your SMTP port
+
+     # Create the email message
+     message = MIMEMultipart()
+     message['From'] = sender_email
+     message['To'] = ", ".join(recipient_emails)
+     message['Subject'] = email_subject
+     message.attach(MIMEText(email_body, 'plain'))
+
+     # Attach the feedback data if available
+     if attachment:
+         attachment_part = MIMEApplication(attachment.getvalue(), Name="feedback_data.json")
+         attachment_part['Content-Disposition'] = f'attachment; filename="feedback_data.json"'
+         message.attach(attachment_part)
+
+     # Send the email
+     try:
+         with smtplib.SMTP(smtp_server, smtp_port) as server:
+             server.starttls()
+             # print(sender_email)
+             # print(sender_password)
+             server.login(sender_email, sender_password)
+             text = message.as_string()
+             server.sendmail(sender_email, recipient_emails, text)
+         return True
+     except Exception as e:
+         st.error(f"Failed to send email: {str(e)}")
+         return False
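
For orientation, a minimal usage sketch of the two export helpers above (the question dict is illustrative and only needs the fields export_to_pdf reads):

    questions = [{
        "context": "Paris is the capital of France.",
        "question": "What is the capital of France?",
        "answer": "Paris",
        "options": ["Paris", "Lyon", "Nice", "Lille"],
        "overall_score": 0.82,
    }]
    csv_text = export_to_csv(questions)    # CSV string, e.g. for st.download_button
    pdf_bytes = export_to_pdf(questions)   # binary PDF content, ready for a download button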
feedback.py ADDED
@@ -0,0 +1,108 @@
+ import streamlit as st
+ import json
+ from io import BytesIO
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from wordcloud import WordCloud
+ import os
+
+ def save_feedback_og(feedback):
+     feedback_file = 'feedback_data.json'
+     if os.path.exists(feedback_file):
+         with open(feedback_file, 'r') as f:
+             feedback_data = json.load(f)
+     else:
+         feedback_data = []
+     # tpl = {
+     #     'question': question,
+     #     'answer': answer,
+     #     'context': context,
+     #     'options': options,
+     #     'rating': rating,
+     # }
+     # feedback_data[question] = rating
+     feedback_data.append(feedback)
+
+     print(feedback_data)
+     with open(feedback_file, 'w') as f:
+         json.dump(feedback_data, f)
+     st.session_state.feedback_data.append(feedback)
+     return feedback_file
+
+ def collect_feedback(i, question, answer, context, options):
+     st.write("Please provide feedback for this question:")
+     edited_question = st.text_input("Enter improved question", value=question, key=f'fdx1{i}')
+     clarity = st.slider("Clarity", 1, 5, 3, help="1 = Very unclear, 5 = Very clear", key=f'fdx2{i}')
+     difficulty = st.slider("Difficulty", 1, 5, 3, help="1 = Very easy, 5 = Very difficult", key=f'fdx3{i}')
+     relevance = st.slider("Relevance", 1, 5, 3, help="1 = Not relevant, 5 = Highly relevant", key=f'fdx4{i}')
+     option_quality = st.slider("Quality of Options", 1, 5, 3, help="1 = Poor options, 5 = Excellent options", key=f'fdx5{i}')
+     overall_rating = st.slider("Overall Rating", 1, 5, 3, help="1 = Poor, 5 = Excellent", key=f'fdx6{i}')
+     comments = st.text_input("Additional Comments", "", key=f'fdx7{i}')
+
+     if st.button("Submit Feedback", key=f'fdx8{i}'):
+         feedback = {
+             "context": context,
+             "question": question,
+             "edited_question": edited_question,
+             "answer": answer,
+             "options": options,
+             "clarity": clarity,
+             "difficulty": difficulty,
+             "relevance": relevance,
+             "option_quality": option_quality,
+             "overall_rating": overall_rating,
+             "comments": comments
+         }
+         # save_feedback(feedback)
+         save_feedback_og(feedback)
+
+         st.success("Thank you for your feedback!")
+
+ def analyze_feedback():
+     if not st.session_state.feedback_data:
+         st.warning("No feedback data available yet.")
+         return
+
+     df = pd.DataFrame(st.session_state.feedback_data)
+
+     st.write("Feedback Analysis")
+     st.write(f"Total feedback collected: {len(df)}")
+
+     metrics = ['clarity', 'difficulty', 'relevance', 'option_quality', 'overall_rating']
+
+     for metric in metrics:
+         fig, ax = plt.subplots()
+         df[metric].value_counts().sort_index().plot(kind='bar', ax=ax)
+         plt.title(f"Distribution of {metric.capitalize()} Ratings")
+         plt.xlabel("Rating")
+         plt.ylabel("Count")
+         st.pyplot(fig)
+
+     st.write("Average Ratings:")
+     st.write(df[metrics].mean())
+
+     # Word cloud of comments
+     comments = " ".join(df['comments'])
+     if len(comments) > 1:
+         wordcloud = WordCloud(width=800, height=400, background_color='white').generate(comments)
+         fig, ax = plt.subplots()
+         plt.imshow(wordcloud, interpolation='bilinear')
+         plt.axis("off")
+         st.pyplot(fig)
+
+
+ def export_feedback_data():
+     if not st.session_state.feedback_data:
+         st.warning("No feedback data available.")
+         return None
+
+     # Convert feedback data to JSON
+     json_data = json.dumps(st.session_state.feedback_data, indent=2)
+
+     # Create a BytesIO object
+     buffer = BytesIO()
+     buffer.write(json_data.encode())
+     buffer.seek(0)
+
+     return buffer
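
A hedged sketch of how these helpers might be wired into a Streamlit page that already holds generated questions (the variable names are illustrative; st.session_state.feedback_data must exist before save_feedback_og appends to it):

    if "feedback_data" not in st.session_state:
        st.session_state.feedback_data = []

    for i, q in enumerate(st.session_state.get("generated_questions", [])):
        st.write(q["question"])
        collect_feedback(i, q["question"], q["answer"], q["context"], q["options"])

    analyze_feedback()

    buffer = export_feedback_data()
    if buffer:
        st.download_button("Download feedback", buffer, file_name="feedback_data.json")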
fill_in_the_blanks_generation.py ADDED
@@ -0,0 +1,8 @@
+
+ async def generate_fill_in_the_blank_questions(context, answer):
+     answerSize = len(answer)
+     replacedBlanks = ""
+     for i in range(answerSize):
+         replacedBlanks += "_"
+     blank_q = context.replace(answer, replacedBlanks)
+     return blank_q
keyword_extraction.py ADDED
@@ -0,0 +1,133 @@
+ from nltk.corpus import stopwords
+ from rake_nltk import Rake
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import spacy
+ from transformers import pipeline
+ from gliner import GLiNER
+ from load_models import load_nlp_models
+
+ nlp, s2v = load_nlp_models()
+
+ def filter_keywords(extracted_keywords):
+     unwanted_keywords = [
+         # Common punctuation marks
+         '.', ',', '!', '?', ':', ';', '-', '_', '(', ')', '[', ']', '{', '}',
+         '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '<', '>',
+         '`', '~', '"', "'",
+
+         # Common contractions (if not already removed as stopwords)
+         "n't", "'s", "'m", "'re", "'ll", "'ve", "'d",
+
+         # Common abbreviations
+         'etc', 'eg', 'ie', 'ex', 'vs', 'viz',
+
+         'tbd', 'tba',  # To be determined/announced
+         'na', 'n/a',  # Not applicable
+
+         # Single characters
+         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+
+         # HTML-related tags (if the text contains any HTML content)
+         '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '<div>', '</div>', '<p>', '</p>', '<br>', '<hr>', '<h1>', '</h1>', '<h2>', '</h2>', '<h3>', '</h3>',
+
+         # Random technical or common abbreviations that aren't meaningful keywords
+         'etc', 'e.g', 'i.e', 'vs', 'ex', 'vol', 'sec', 'pg', 'id', 'ref', 'eq',
+
+         # Miscellaneous tokens
+         'www', 'com', 'http', 'https', 'ftp', 'pdf', 'doc', 'img', 'gif', 'jpeg', 'jpg', 'png', 'mp4', 'mp3', 'org', 'net', 'edu',
+         'untitled', 'noname', 'unknown', 'undefined',
+
+         # Roman numerals commonly used in bullet points or references
+         'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii',
+
+         # Common file extensions (if filenames are included in the text)
+         '.jpg', '.png', '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.csv', '.txt', '.zip', '.tar', '.gz', '.exe', '.bat', '.sh', '.py', '.cpp', '.java',
+
+         # Other tokens related to formatting or structure
+         'chapter', 'section', 'figure', 'table', 'appendix',
+
+         # Miscellaneous general noise terms
+         'note', 'item', 'items', 'number', 'numbers', 'figure', 'case', 'cases', 'example', 'examples', 'type', 'types', 'section', 'part', 'parts'
+     ]
+     # Convert both lists to sets for efficient lookup
+     extracted_set = set(extracted_keywords)
+     unwanted_set = set(unwanted_keywords)
+
+     # Remove unwanted keywords
+     filtered_keywords = extracted_set - unwanted_set
+
+     # Convert back to a list and sort (optional)
+     return sorted(list(filtered_keywords))
+
+
+ def remove_stopwords(keywords):
+     stop_words = set(stopwords.words('english'))
+     modified_keywords = [''.join(keyword.split()) for keyword in keywords]
+     filtered_keywords = [keyword for keyword in modified_keywords if keyword.lower() not in stop_words]
+     original_keywords = []
+     for keyword in filtered_keywords:
+         for original_keyword in keywords:
+             if ''.join(original_keyword.split()).lower() == keyword.lower():
+                 original_keywords.append(original_keyword)
+                 break
+     return original_keywords
+
+ def enhanced_ner(text):
+     nlp = spacy.load("en_core_web_trf")
+     ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
+     doc = nlp(text)
+     spacy_entities = set((ent.text, ent.label_) for ent in doc.ents)
+     hf_entities = set((ent['word'], ent['entity']) for ent in ner_pipeline(text))
+     combined_entities = spacy_entities.union(hf_entities)
+     keywords = [entity[0] for entity in combined_entities]
+     return list(keywords)
+
+ def extract_keywords(text, extract_all):
+     try:
+         text = text.lower()
+         enhanced_ner_entities = enhanced_ner(text)
+         print("Enhanced ner entities: ", enhanced_ner_entities)
+         enhanced_ner_entities = remove_stopwords(enhanced_ner_entities)
+         enhanced_ner_entities = filter_keywords(enhanced_ner_entities)
+         print("Enhanced ner entities after applying filter and stopwords removal: ", enhanced_ner_entities)
+
+         gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+         labels = ["person", "organization", "phone number", "address", "email", "date of birth",
+                   "mobile phone number", "medication", "ip address", "email address",
+                   "landline phone number", "blood type", "digital signature", "postal code",
+                   "date"]
+         entities = gliner_model.predict_entities(text, labels, threshold=0.5)
+
+         gliner_keywords = set(remove_stopwords([ent["text"] for ent in entities]))
+         print(f"Gliner keywords: {gliner_keywords}")
+
+         # if extract_all is False:
+         #     return list(gliner_keywords)
+
+         doc = nlp(text)
+         spacy_keywords = set(remove_stopwords([ent.text for ent in doc.ents]))
+         print(f"\n\nSpacy Entities: {spacy_keywords} \n\n")
+
+         if extract_all is False:
+             combined_keywords_without_all = list(spacy_keywords.union(gliner_keywords).union(enhanced_ner_entities))
+             filtered_results = filter_keywords(combined_keywords_without_all)
+             print("Keywords returned: ", filtered_results)
+             return list(filtered_results)
+
+         rake = Rake()
+         rake.extract_keywords_from_text(text)
+         rake_keywords = set(remove_stopwords(rake.get_ranked_phrases()))
+         print(f"\n\nRake Keywords: {rake_keywords} \n\n")
+
+         vectorizer = TfidfVectorizer(stop_words='english')
+         X = vectorizer.fit_transform([text])
+         tfidf_keywords = set(remove_stopwords(vectorizer.get_feature_names_out()))
+         print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
+
+         combined_keywords = list(rake_keywords.union(spacy_keywords).union(tfidf_keywords).union(gliner_keywords))
+         filtered_results = filter_keywords(combined_keywords)
+         print("Keywords returned: ", filtered_results)
+         return list(filtered_results)
+
+     except Exception as e:
+         raise Exception(f"Error in keyword extraction: {str(e)}")
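
An illustrative call, assuming the NLTK stopword data, the spaCy models (en_core_web_md via load_nlp_models plus en_core_web_trf), the Hugging Face NER checkpoint, and the GLiNER weights are all available locally:

    sample = "Marie Curie won the Nobel Prize in Physics in 1903 for her research on radioactivity."
    keywords = extract_keywords(sample, extract_all=True)
    print(keywords)  # filtered union of RAKE, TF-IDF, spaCy, and GLiNER keywords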
load_models.py ADDED
@@ -0,0 +1,45 @@
+ import streamlit as st
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ import spacy
+ import sense2vec
+ from sentence_transformers import SentenceTransformer
+ from spellchecker import SpellChecker
+ import wikipediaapi
+ from langchain_community.llms import Ollama
+ # import time
+
+ def load_llama():
+     llm = Ollama(model='llama3:latest')
+     return llm
+
+ @st.cache_resource
+ def load_model(modelname):
+     model_name = modelname
+     model = T5ForConditionalGeneration.from_pretrained(model_name)
+     tokenizer = T5Tokenizer.from_pretrained(model_name)
+     return model, tokenizer
+
+ # Load spaCy and sense2vec models
+ @st.cache_resource
+ def load_nlp_models():
+     nlp = spacy.load("en_core_web_md")
+     s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
+     return nlp, s2v
+
+ # Load quality-assurance models
+ @st.cache_resource
+ def load_qa_models():
+     # Initialize BERT model for sentence similarity
+     similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+     spell = SpellChecker()
+     return similarity_model, spell
+
+ def initialize_wikiapi():
+     # Initialize Wikipedia API with a user agent
+     user_agent = 'QGen/1.2'
+     wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')
+     return user_agent, wiki_wiki
mapping_keywords.py ADDED
@@ -0,0 +1,22 @@
+ from nltk.tokenize import sent_tokenize
+
+ # Function to map keywords to sentences with a customizable context window size
+ def map_keywords_to_sentences(text, keywords, context_window_size):
+     sentences = sent_tokenize(text)
+     keyword_sentence_mapping = {}
+     print(f"\n\nSentences: {sentences}\n\n")
+     for keyword in keywords:
+         for i, sentence in enumerate(sentences):
+             if keyword in sentence:
+                 # Use the preceding context_window_size sentences plus the current one as context
+                 # (a symmetric window would also include the following sentences):
+                 # end = min(len(sentences), i + context_window_size + 1)
+                 # context = ' '.join(sentences[start:end])
+                 start = max(0, i - context_window_size)
+                 context_sentences = sentences[start:i + 1]
+                 context = ' '.join(context_sentences)
+                 if keyword not in keyword_sentence_mapping:
+                     keyword_sentence_mapping[keyword] = context
+                 else:
+                     keyword_sentence_mapping[keyword] += ' ' + context
+     return keyword_sentence_mapping
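
An illustrative call, assuming NLTK's punkt tokenizer data is installed:

    text = "Transformers were introduced in 2017. They rely on self-attention. BERT builds on this idea."
    print(map_keywords_to_sentences(text, ["BERT"], context_window_size=1))
    # {'BERT': 'They rely on self-attention. BERT builds on this idea.'}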
option_generation.py ADDED
@@ -0,0 +1,135 @@
+ import nltk
+ import random
+ import asyncio
+ nltk.download('wordnet')
+ from nltk.corpus import wordnet
+ from sentence_transformers import util
+ from load_models import load_nlp_models, load_llama, load_qa_models
+ from utils import QuestionGenerationError
+
+ nlp, s2v = load_nlp_models()
+ llm = load_llama()
+ similarity_model, spell = load_qa_models()
+ context_model = similarity_model
+
+ def get_similar_words_sense2vec(word, n=3):
+     # Try to find the word with its most likely part-of-speech
+     word_with_pos = word + "|NOUN"
+     if word_with_pos in s2v:
+         similar_words = s2v.most_similar(word_with_pos, n=n)
+         return [word.split("|")[0] for word, _ in similar_words]
+
+     # If not found, try without POS
+     if word in s2v:
+         similar_words = s2v.most_similar(word, n=n)
+         return [word.split("|")[0] for word, _ in similar_words]
+
+     return []
+
+ def get_synonyms(word, n=3):
+     synonyms = []
+     for syn in wordnet.synsets(word):
+         for lemma in syn.lemmas():
+             if lemma.name() != word and lemma.name() not in synonyms:
+                 synonyms.append(lemma.name())
+                 if len(synonyms) == n:
+                     return synonyms
+     return synonyms
+
+ def gen_options(answer, context, question):
+     prompt = f'''Given the following context, question, and correct answer,
+     generate {4} incorrect but plausible answer options. The options should be:
+     1. Contextually related to the given context
+     2. Grammatically consistent with the question
+     3. Different from the correct answer
+     4. Not explicitly mentioned in the given context
+
+     Context: {context}
+     Question: {question}
+     Correct Answer: {answer}
+
+     Provide the options in a semicolon-separated list. Output must contain only the options and nothing else.
+     '''
+     options = [answer]
+     response = llm.invoke(prompt, stop=['<|eot_id|>'])
+     incorrect_options = [option.strip() for option in response.split(';')]
+     options.extend(incorrect_options)
+     random.shuffle(options)
+     print(options)
+     return options
+     # print(response)
+
+ def generate_options(answer, context, n=3):
+     options = [answer]
+
+     # Add contextually relevant words using a pre-trained model
+     context_embedding = context_model.encode(context)
+     answer_embedding = context_model.encode(answer)
+     context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+
+     # Compute similarity scores and sort context words
+     similarity_scores = [util.pytorch_cos_sim(context_model.encode(word), answer_embedding).item() for word in context_words]
+     sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
+     options.extend(sorted_context_words[:n])
+
+     # Try to get similar words based on sense2vec
+     similar_words = get_similar_words_sense2vec(answer, n)
+     options.extend(similar_words)
+
+     # If we don't have enough options, try synonyms
+     if len(options) < n + 1:
+         synonyms = get_synonyms(answer, n - len(options) + 1)
+         options.extend(synonyms)
+
+     # If we still don't have enough options, extract other entities from the context
+     if len(options) < n + 1:
+         doc = nlp(context)
+         entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
+         options.extend(entities[:n - len(options) + 1])
+
+     # If we still need more options, add some random words from the context
+     if len(options) < n + 1:
+         context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+         options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))
+     print(f"\n\nAll Possible Options: {options}\n\n")
+     # Ensure we have the correct number of unique options
+     options = list(dict.fromkeys(options))[:n+1]
+
+     # Shuffle the options
+     random.shuffle(options)
+
+     return options
+
+ async def generate_options_async(answer, context, n=3):
+     try:
+         options = [answer]
+
+         # Add contextually relevant words using a pre-trained model
+         context_embedding = await asyncio.to_thread(context_model.encode, context)
+         answer_embedding = await asyncio.to_thread(context_model.encode, answer)
+         context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+
+         # Compute similarity scores and sort context words
+         similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
+         sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
+         options.extend(sorted_context_words[:n])
+
+         # Try to get similar words based on sense2vec
+         similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
+         options.extend(similar_words)
+
+         # If we don't have enough options, try synonyms
+         if len(options) < n + 1:
+             synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
+             options.extend(synonyms)
+
+         # Ensure we have the correct number of unique options
+         options = list(dict.fromkeys(options))[:n+1]
+
+         # Shuffle the options
+         random.shuffle(options)
+
+         return options
+     except Exception as e:
+         raise QuestionGenerationError(f"Error in generating options: {str(e)}")
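
A sketch of exercising the async generator from a plain script, assuming the resources loaded at import time are available (sense2vec vectors in s2v_old, the SentenceTransformer weights, and an Ollama install for load_llama):

    import asyncio

    context = "Paris is the capital of France and its largest city."
    options = asyncio.run(generate_options_async("Paris", context, n=3))
    print(options)  # the correct answer plus up to three distractors, shuffled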
question_generation.py ADDED
@@ -0,0 +1,122 @@
+ import asyncio
+ import streamlit as st
+ from text_processing import segment_text
+ from keyword_extraction import extract_keywords
+ from utils import QuestionGenerationError
+ from mapping_keywords import map_keywords_to_sentences
+ from option_generation import gen_options, generate_options_async
+ from fill_in_the_blanks_generation import generate_fill_in_the_blank_questions
+ from load_models import load_nlp_models, load_qa_models, load_model
+
+ nlp, s2v = load_nlp_models()
+ similarity_model, spell = load_qa_models()
+
+
+ def assess_question_quality(context, question, answer):
+     # Assess relevance using cosine similarity
+     context_doc = nlp(context)
+     question_doc = nlp(question)
+     relevance_score = context_doc.similarity(question_doc)
+
+     # Assess complexity using token length (as a simple metric)
+     complexity_score = min(len(question_doc) / 20, 1)  # Normalize to 0-1
+
+     # Assess spelling correctness
+     misspelled = spell.unknown(question.split())
+     spelling_correctness = 1 - (len(misspelled) / len(question.split()))  # Normalize to 0-1
+
+     # Calculate overall score (you can adjust weights as needed)
+     overall_score = (
+         0.4 * relevance_score +
+         0.4 * complexity_score +
+         0.2 * spelling_correctness
+     )
+
+     return overall_score, relevance_score, complexity_score, spelling_correctness
+
+
+ async def process_batch(batch, keywords, context_window_size, num_beams, num_questions, modelname):
+     questions = []
+     print("inside process batch function")
+     flag = False
+     for text in batch:
+         if flag:
+             break
+         keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
+         print(keyword_sentence_mapping)
+         for keyword, context in keyword_sentence_mapping.items():
+             print("Length of questions list from process batch function: ", len(questions))
+             if len(questions) >= num_questions:
+                 flag = True
+                 break
+             question = await generate_question_async(context, keyword, num_beams, modelname)
+             options = await generate_options_async(keyword, context)
+             # options = gen_options(keyword, context, question)
+             blank_question = await generate_fill_in_the_blank_questions(context, keyword)
+             overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
+             if overall_score >= 0.5:
+                 questions.append({
+                     "question": question,
+                     "context": context,
+                     "answer": keyword,
+                     "options": options,
+                     "overall_score": overall_score,
+                     "relevance_score": relevance_score,
+                     "complexity_score": complexity_score,
+                     "spelling_correctness": spelling_correctness,
+                     "blank_question": blank_question,
+                 })
+     return questions
+
+
+ async def generate_question_async(context, answer, num_beams, modelname):
+     model, tokenizer = load_model(modelname)
+     try:
+         input_text = f"<context> {context} <answer> {answer}"
+         print(f"\n{input_text}\n")
+         input_ids = tokenizer.encode(input_text, return_tensors='pt')
+         outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
+         question = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         print(f"\n{question}\n")
+         # print(type(question))
+         return question
+     except Exception as e:
+         raise QuestionGenerationError(f"Error in question generation: {str(e)}")
+
+ # Function to generate questions using beam search
+ async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords, modelname):
+     try:
+         batches = segment_text(text.lower())
+         keywords = extract_keywords(text, extract_all_keywords)
+         all_questions = []
+
+         progress_bar = st.progress(0)
+         status_text = st.empty()
+         print("Final keywords:", keywords)
+         print("Number of questions that need to be generated: ", num_questions)
+         print("Total number of batches:", len(batches))
+         for i, batch in enumerate(batches):
+             print("Batch number: ", i + 1)
+             status_text.text(f"Processing batch {i+1} of {len(batches)}...")
+             batch_questions = await process_batch(batch, keywords, context_window_size, num_beams, num_questions, modelname)
+             all_questions.extend(batch_questions)
+             progress_bar.progress((i + 1) / len(batches))
+
+             print("Length of the all questions list: ", len(all_questions))
+
+             if len(all_questions) >= num_questions:
+                 break
+
+         progress_bar.empty()
+         status_text.empty()
+
+         return all_questions[:num_questions]
+     except QuestionGenerationError as e:
+         st.error(f"An error occurred during question generation: {str(e)}")
+         return []
+     except Exception as e:
+         st.error(f"An unexpected error occurred: {str(e)}")
+         return []
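
A hedged sketch of how a Streamlit page might drive this pipeline end to end; the checkpoint name passed as modelname is a placeholder for whatever T5 question-generation model the app actually uses:

    text = st.text_area("Enter source text")
    if st.button("Generate questions"):
        questions = asyncio.run(generate_questions_async(
            text,
            num_questions=5,
            context_window_size=1,
            num_beams=4,
            extract_all_keywords=False,
            modelname="your-t5-question-generation-checkpoint",  # placeholder
        ))
        for q in questions:
            st.write(q["question"], q["options"])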
text_processing.py ADDED
@@ -0,0 +1,41 @@
+ import re
+ import pymupdf
+ from nltk.tokenize import sent_tokenize
+
+ def get_pdf_text(pdf_file):
+     doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
+     text = ""
+     for page_num in range(doc.page_count):
+         page = doc.load_page(page_num)
+         text += page.get_text()
+     return text
+
+ def clean_text(text):
+     # Normalize common Unicode punctuation first; otherwise the non-ASCII
+     # filter below would turn these characters into spaces before the
+     # replacements can run
+     text = re.sub(r'[“”]', '"', text)
+     text = re.sub(r"[‘’]", "'", text)
+     text = text.replace('\xad', '')
+     text = re.sub(r'[‒–—―]', '-', text)
+     text = re.sub(r"[^\x00-\x7F]", " ", text)
+     text = re.sub(r"[\n]", " ", text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+ # Function to create text chunks
+ def segment_text(text, max_segment_length=700, batch_size=7):
+     sentences = sent_tokenize(text)
+     segments = []
+     current_segment = ""
+
+     for sentence in sentences:
+         if len(current_segment) + len(sentence) <= max_segment_length:
+             current_segment += sentence + " "
+         else:
+             segments.append(current_segment.strip())
+             current_segment = sentence + " "
+
+     if current_segment:
+         segments.append(current_segment.strip())
+
+     # Create batches
+     batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
+     return batches
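
An illustrative call showing the shape segment_text returns (a list of batches, each batch a list of sentence segments):

    sample = "First sentence. " * 60
    batches = segment_text(sample, max_segment_length=700, batch_size=7)
    print(len(batches), len(batches[0]))  # each segment stays at or below roughly 700 characters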
utils.py ADDED
@@ -0,0 +1,75 @@
+ import streamlit as st
+ import uuid
+ from load_models import initialize_wikiapi
+ from functools import lru_cache
+
+ class QuestionGenerationError(Exception):
+     """Custom exception for question generation errors."""
+     pass
+
+ def get_session_id():
+     if 'session_id' not in st.session_state:
+         st.session_state.session_id = str(uuid.uuid4())
+     return st.session_state.session_id
+
+ def initialize_state(session_id):
+     if 'session_states' not in st.session_state:
+         st.session_state.session_states = {}
+
+     if session_id not in st.session_state.session_states:
+         st.session_state.session_states[session_id] = {
+             'generated_questions': [],
+             # add other state variables as needed
+         }
+     return st.session_state.session_states[session_id]
+
+ def get_state(session_id):
+     return st.session_state.session_states[session_id]
+
+ def set_state(session_id, key, value):
+     st.session_state.session_states[session_id][key] = value
+
+
+ # Info Section
+ def display_info():
+     st.sidebar.title("Information")
+     st.sidebar.markdown("""
+ ### Question Generator System
+ This system is designed to generate questions based on the provided context. It uses various NLP techniques and models to:
+ - Extract keywords from the text
+ - Map keywords to sentences
+ - Generate questions
+ - Provide multiple choice options
+ - Assess the quality of generated questions
+
+ #### Key Features:
+ - **Keyword Extraction:** Combines RAKE, TF-IDF, and spaCy for comprehensive keyword extraction.
+ - **Question Generation:** Utilizes a pre-trained T5 model for generating questions.
+ - **Options Generation:** Creates contextually relevant multiple-choice options.
+ - **Question Assessment:** Scores questions based on relevance, complexity, and spelling correctness.
+ - **Feedback Collection:** Allows users to rate the generated questions and provides statistics on feedback.
+
+ #### Customization Options:
+ - Number of beams for question generation
+ - Context window size for mapping keywords to sentences
+ - Number of questions to generate
+ - Additional display elements (context, answer, options, entity link, QA scores)
+
+ #### Outputs:
+ - Generated questions with multiple-choice options
+ - Download options for CSV and PDF formats
+ - Visualization of overall scores
+ """)
+
+
+ # Function to perform entity linking using the Wikipedia API
+ @lru_cache(maxsize=128)
+ def entity_linking(keyword):
+     user_agent, wiki_wiki = initialize_wikiapi()
+     page = wiki_wiki.page(keyword)
+     if page.exists():
+         return page.fullurl
+     return None
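
For reference, entity_linking resolves a keyword to its Wikipedia URL when a matching page exists, and results are memoized via lru_cache:

    print(entity_linking("Python (programming language)"))
    # e.g. https://en.wikipedia.org/wiki/Python_(programming_language)
    print(entity_linking("zzz-nonexistent-keyword"))  # None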
visualization.py ADDED
@@ -0,0 +1,16 @@
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+ import streamlit as st
+
+ def display_word_cloud(generated_questions):
+     word_frequency = {}
+     for question in generated_questions:
+         words = question.split()
+         for word in words:
+             word_frequency[word] = word_frequency.get(word, 0) + 1
+
+     wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequency)
+     fig = plt.figure(figsize=(10, 5))
+     plt.imshow(wordcloud, interpolation='bilinear')
+     plt.axis('off')
+     st.pyplot(fig)  # pass the figure explicitly; a bare st.pyplot() call is deprecated
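
An illustrative call with a couple of generated question strings (the cloud is built from raw whitespace-split tokens, so punctuation stays attached to words):

    display_word_cloud([
        "What is the capital of France?",
        "Which gas do plants absorb during photosynthesis?",
    ])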