DevBM AneriThakkar committed on
Commit f7842f6 · 1 Parent(s): 03f344d

Upload files for modules/functions (#5)


- Upload files for modules/functions (7fafac42e4b5d1e40338e2b5ace7ef9fba805bff)


Co-authored-by: Thakkar Aneri Pareshkumar <AneriThakkar@users.noreply.huggingface.co>

data_export.py ADDED
@@ -0,0 +1,61 @@
+ import pandas as pd
+ from fpdf import FPDF
+ import streamlit as st
+ import smtplib
+ from email.mime.multipart import MIMEMultipart
+ from email.mime.text import MIMEText
+ # from email.mime.base import MIMEBase
+ from email.mime.application import MIMEApplication
+ # from email import encoders
+
+
+ def export_to_csv(data):
+     df = pd.DataFrame(data)
+     csv = df.to_csv(index=False)
+     return csv
+
+ def export_to_pdf(data):
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font("Arial", size=12)
+
+     for item in data:
+         pdf.multi_cell(0, 10, f"Context: {item['context']}")
+         pdf.multi_cell(0, 10, f"Question: {item['question']}")
+         pdf.multi_cell(0, 10, f"Answer: {item['answer']}")
+         pdf.multi_cell(0, 10, f"Options: {', '.join(item['options'])}")
+         pdf.multi_cell(0, 10, f"Overall Score: {item['overall_score']:.2f}")
+         pdf.ln(10)
+
+     return pdf.output(dest='S').encode('latin-1')
+
+ def send_email_with_attachment(email_subject, email_body, recipient_emails, sender_email, sender_password, attachment):
+     smtp_server = "smtp.gmail.com"  # Replace with your SMTP server
+     smtp_port = 587  # Replace with your SMTP port
+
+     # Create the email message
+     message = MIMEMultipart()
+     message['From'] = sender_email
+     message['To'] = ", ".join(recipient_emails)
+     message['Subject'] = email_subject
+     message.attach(MIMEText(email_body, 'plain'))
+
+     # Attach the feedback data if available
+     if attachment:
+         attachment_part = MIMEApplication(attachment.getvalue(), Name="feedback_data.json")
+         attachment_part['Content-Disposition'] = f'attachment; filename="feedback_data.json"'
+         message.attach(attachment_part)
+
+     # Send the email
+     try:
+         with smtplib.SMTP(smtp_server, smtp_port) as server:
+             server.starttls()
+             # print(sender_email)
+             # print(sender_password)
+             server.login(sender_email, sender_password)
+             text = message.as_string()
+             server.sendmail(sender_email, recipient_emails, text)
+         return True
+     except Exception as e:
+         st.error(f"Failed to send email: {str(e)}")
+         return False
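
For orientation, a minimal usage sketch of the two export helpers above (the question dict is illustrative and only needs the fields export_to_pdf reads):

    questions = [{
        "context": "Paris is the capital of France.",
        "question": "What is the capital of France?",
        "answer": "Paris",
        "options": ["Paris", "Lyon", "Nice", "Lille"],
        "overall_score": 0.82,
    }]
    csv_text = export_to_csv(questions)    # CSV string, e.g. for st.download_button
    pdf_bytes = export_to_pdf(questions)   # binary PDF content, ready for a download button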
feedback.py ADDED
@@ -0,0 +1,108 @@
+ import streamlit as st
+ import json
+ from io import BytesIO
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from wordcloud import WordCloud
+ import os
+
+ def save_feedback_og(feedback):
+     feedback_file = 'feedback_data.json'
+     if os.path.exists(feedback_file):
+         with open(feedback_file, 'r') as f:
+             feedback_data = json.load(f)
+     else:
+         feedback_data = []
+     # tpl = {
+     #     'question': question,
+     #     'answer': answer,
+     #     'context': context,
+     #     'options': options,
+     #     'rating': rating,
+     # }
+     # feedback_data[question] = rating
+     feedback_data.append(feedback)
+
+     print(feedback_data)
+     with open(feedback_file, 'w') as f:
+         json.dump(feedback_data, f)
+     st.session_state.feedback_data.append(feedback)
+     return feedback_file
+
+ def collect_feedback(i, question, answer, context, options):
+     st.write("Please provide feedback for this question:")
+     edited_question = st.text_input("Enter improved question", value=question, key=f'fdx1{i}')
+     clarity = st.slider("Clarity", 1, 5, 3, help="1 = Very unclear, 5 = Very clear", key=f'fdx2{i}')
+     difficulty = st.slider("Difficulty", 1, 5, 3, help="1 = Very easy, 5 = Very difficult", key=f'fdx3{i}')
+     relevance = st.slider("Relevance", 1, 5, 3, help="1 = Not relevant, 5 = Highly relevant", key=f'fdx4{i}')
+     option_quality = st.slider("Quality of Options", 1, 5, 3, help="1 = Poor options, 5 = Excellent options", key=f'fdx5{i}')
+     overall_rating = st.slider("Overall Rating", 1, 5, 3, help="1 = Poor, 5 = Excellent", key=f'fdx6{i}')
+     comments = st.text_input("Additional Comments", "", key=f'fdx7{i}')
+
+     if st.button("Submit Feedback", key=f'fdx8{i}'):
+         feedback = {
+             "context": context,
+             "question": question,
+             "edited_question": edited_question,
+             "answer": answer,
+             "options": options,
+             "clarity": clarity,
+             "difficulty": difficulty,
+             "relevance": relevance,
+             "option_quality": option_quality,
+             "overall_rating": overall_rating,
+             "comments": comments
+         }
+         # save_feedback(feedback)
+         save_feedback_og(feedback)
+
+         st.success("Thank you for your feedback!")
+
+ def analyze_feedback():
+     if not st.session_state.feedback_data:
+         st.warning("No feedback data available yet.")
+         return
+
+     df = pd.DataFrame(st.session_state.feedback_data)
+
+     st.write("Feedback Analysis")
+     st.write(f"Total feedback collected: {len(df)}")
+
+     metrics = ['clarity', 'difficulty', 'relevance', 'option_quality', 'overall_rating']
+
+     for metric in metrics:
+         fig, ax = plt.subplots()
+         df[metric].value_counts().sort_index().plot(kind='bar', ax=ax)
+         plt.title(f"Distribution of {metric.capitalize()} Ratings")
+         plt.xlabel("Rating")
+         plt.ylabel("Count")
+         st.pyplot(fig)
+
+     st.write("Average Ratings:")
+     st.write(df[metrics].mean())
+
+     # Word cloud of comments
+     comments = " ".join(df['comments'])
+     if len(comments) > 1:
+         wordcloud = WordCloud(width=800, height=400, background_color='white').generate(comments)
+         fig, ax = plt.subplots()
+         plt.imshow(wordcloud, interpolation='bilinear')
+         plt.axis("off")
+         st.pyplot(fig)
+
+
+ def export_feedback_data():
+     if not st.session_state.feedback_data:
+         st.warning("No feedback data available.")
+         return None
+
+     # Convert feedback data to JSON
+     json_data = json.dumps(st.session_state.feedback_data, indent=2)
+
+     # Create a BytesIO object
+     buffer = BytesIO()
+     buffer.write(json_data.encode())
+     buffer.seek(0)
+
+     return buffer
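
A hedged sketch of how these helpers might be wired into a Streamlit page that already holds generated questions (the variable names are illustrative; st.session_state.feedback_data must exist before save_feedback_og appends to it):

    if "feedback_data" not in st.session_state:
        st.session_state.feedback_data = []

    for i, q in enumerate(st.session_state.get("generated_questions", [])):
        st.write(q["question"])
        collect_feedback(i, q["question"], q["answer"], q["context"], q["options"])

    analyze_feedback()

    buffer = export_feedback_data()
    if buffer:
        st.download_button("Download feedback", buffer, file_name="feedback_data.json")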
fill_in_the_blanks_generation.py ADDED
@@ -0,0 +1,8 @@
+
+ async def generate_fill_in_the_blank_questions(context, answer):
+     answerSize = len(answer)
+     replacedBlanks = ""
+     for i in range(answerSize):
+         replacedBlanks += "_"
+     blank_q = context.replace(answer, replacedBlanks)
+     return blank_q
keyword_extraction.py ADDED
@@ -0,0 +1,133 @@
+ from nltk.corpus import stopwords
+ from rake_nltk import Rake
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import spacy
+ from transformers import pipeline
+ from gliner import GLiNER
+ from load_models import load_nlp_models
+
+ nlp, s2v = load_nlp_models()
+
+ def filter_keywords(extracted_keywords):
+     unwanted_keywords = [
+         # Common punctuation marks
+         '.', ',', '!', '?', ':', ';', '-', '_', '(', ')', '[', ']', '{', '}',
+         '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '<', '>',
+         '`', '~', '"', "'",
+
+         # Common contractions (if not already removed as stopwords)
+         "n't", "'s", "'m", "'re", "'ll", "'ve", "'d",
+
+         # Common abbreviations
+         'etc', 'eg', 'ie', 'ex', 'vs', 'viz',
+
+         'tbd', 'tba',  # To be determined/announced
+         'na', 'n/a',  # Not applicable
+
+         # Single characters
+         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+
+         # HTML-related tags (if the text contains any HTML content)
+         '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '<div>', '</div>', '<p>', '</p>', '<br>', '<hr>', '<h1>', '</h1>', '<h2>', '</h2>', '<h3>', '</h3>',
+
+         # Random technical or common abbreviations that aren't meaningful keywords
+         'etc', 'e.g', 'i.e', 'vs', 'ex', 'vol', 'sec', 'pg', 'id', 'ref', 'eq',
+
+         # Miscellaneous tokens
+         'www', 'com', 'http', 'https', 'ftp', 'pdf', 'doc', 'img', 'gif', 'jpeg', 'jpg', 'png', 'mp4', 'mp3', 'org', 'net', 'edu',
+         'untitled', 'noname', 'unknown', 'undefined',
+
+         # Roman numerals commonly used in bullet points or references
+         'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii',
+
+         # Common file extensions (if filenames are included in the text)
+         '.jpg', '.png', '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.csv', '.txt', '.zip', '.tar', '.gz', '.exe', '.bat', '.sh', '.py', '.cpp', '.java',
+
+         # Other tokens related to formatting or structure
+         'chapter', 'section', 'figure', 'table', 'appendix',
+
+         # Miscellaneous general noise terms
+         'note', 'item', 'items', 'number', 'numbers', 'figure', 'case', 'cases', 'example', 'examples', 'type', 'types', 'section', 'part', 'parts'
+     ]
+     # Convert both lists to sets for efficient lookup
+     extracted_set = set(extracted_keywords)
+     unwanted_set = set(unwanted_keywords)
+
+     # Remove unwanted keywords
+     filtered_keywords = extracted_set - unwanted_set
+
+     # Convert back to a list and sort (optional)
+     return sorted(list(filtered_keywords))
+
+
+ def remove_stopwords(keywords):
+     stop_words = set(stopwords.words('english'))
+     modified_keywords = [''.join(keyword.split()) for keyword in keywords]
+     filtered_keywords = [keyword for keyword in modified_keywords if keyword.lower() not in stop_words]
+     original_keywords = []
+     for keyword in filtered_keywords:
+         for original_keyword in keywords:
+             if ''.join(original_keyword.split()).lower() == keyword.lower():
+                 original_keywords.append(original_keyword)
+                 break
+     return original_keywords
+
+ def enhanced_ner(text):
+     nlp = spacy.load("en_core_web_trf")
+     ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
+     doc = nlp(text)
+     spacy_entities = set((ent.text, ent.label_) for ent in doc.ents)
+     hf_entities = set((ent['word'], ent['entity']) for ent in ner_pipeline(text))
+     combined_entities = spacy_entities.union(hf_entities)
+     keywords = [entity[0] for entity in combined_entities]
+     return list(keywords)
+
+ def extract_keywords(text, extract_all):
+     try:
+         text = text.lower()
+         enhanced_ner_entities = enhanced_ner(text)
+         print("Enhanced ner entities: ", enhanced_ner_entities)
+         enhanced_ner_entities = remove_stopwords(enhanced_ner_entities)
+         enhanced_ner_entities = filter_keywords(enhanced_ner_entities)
+         print("Enhanced ner entities after applying filter and stopwords removal: ", enhanced_ner_entities)
+
+         gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+         labels = ["person", "organization", "phone number", "address", "email", "date of birth",
+                   "mobile phone number", "medication", "ip address", "email address",
+                   "landline phone number", "blood type", "digital signature", "postal code",
+                   "date"]
+         entities = gliner_model.predict_entities(text, labels, threshold=0.5)
+
+         gliner_keywords = set(remove_stopwords([ent["text"] for ent in entities]))
+         print(f"Gliner keywords: {gliner_keywords}")
+
+         # if extract_all is False:
+         #     return list(gliner_keywords)
+
+         doc = nlp(text)
+         spacy_keywords = set(remove_stopwords([ent.text for ent in doc.ents]))
+         print(f"\n\nSpacy Entities: {spacy_keywords} \n\n")
+
+         if extract_all is False:
+             combined_keywords_without_all = list(spacy_keywords.union(gliner_keywords).union(enhanced_ner_entities))
+             filtered_results = filter_keywords(combined_keywords_without_all)
+             print("Keywords returned: ", filtered_results)
+             return list(filtered_results)
+
+         rake = Rake()
+         rake.extract_keywords_from_text(text)
+         rake_keywords = set(remove_stopwords(rake.get_ranked_phrases()))
+         print(f"\n\nRake Keywords: {rake_keywords} \n\n")
+
+         vectorizer = TfidfVectorizer(stop_words='english')
+         X = vectorizer.fit_transform([text])
+         tfidf_keywords = set(remove_stopwords(vectorizer.get_feature_names_out()))
+         print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
+
+         combined_keywords = list(rake_keywords.union(spacy_keywords).union(tfidf_keywords).union(gliner_keywords))
+         filtered_results = filter_keywords(combined_keywords)
+         print("Keywords returned: ", filtered_results)
+         return list(filtered_results)
+
+     except Exception as e:
+         raise Exception(f"Error in keyword extraction: {str(e)}")
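
An illustrative call, assuming the NLTK stopword data, the spaCy models (en_core_web_md via load_nlp_models plus en_core_web_trf), the Hugging Face NER checkpoint, and the GLiNER weights are all available locally:

    sample = "Marie Curie won the Nobel Prize in Physics in 1903 for her research on radioactivity."
    keywords = extract_keywords(sample, extract_all=True)
    print(keywords)  # filtered union of RAKE, TF-IDF, spaCy, and GLiNER keywords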
load_models.py ADDED
@@ -0,0 +1,45 @@
+ import streamlit as st
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ import spacy
+ import sense2vec
+ from sentence_transformers import SentenceTransformer
+ from spellchecker import SpellChecker
+ import wikipediaapi
+ from langchain_community.llms import Ollama
+ # import time
+
+ def load_llama():
+     llm = Ollama(model='llama3:latest')
+     return llm
+
+ @st.cache_resource
+ def load_model(modelname):
+     model_name = modelname
+     model = T5ForConditionalGeneration.from_pretrained(model_name)
+     tokenizer = T5Tokenizer.from_pretrained(model_name)
+     return model, tokenizer
+
+ # Load spaCy and sense2vec models
+ @st.cache_resource
+ def load_nlp_models():
+     nlp = spacy.load("en_core_web_md")
+     s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
+     return nlp, s2v
+
+ # Load quality-assurance models
+ @st.cache_resource
+ def load_qa_models():
+     # Initialize BERT model for sentence similarity
+     similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+     spell = SpellChecker()
+     return similarity_model, spell
+
+ def initialize_wikiapi():
+     # Initialize Wikipedia API with a user agent
+     user_agent = 'QGen/1.2'
+     wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')
+     return user_agent, wiki_wiki
mapping_keywords.py ADDED
@@ -0,0 +1,22 @@
+ from nltk.tokenize import sent_tokenize
+
+ # Function to map keywords to sentences with a customizable context window size
+ def map_keywords_to_sentences(text, keywords, context_window_size):
+     sentences = sent_tokenize(text)
+     keyword_sentence_mapping = {}
+     print(f"\n\nSentences: {sentences}\n\n")
+     for keyword in keywords:
+         for i, sentence in enumerate(sentences):
+             if keyword in sentence:
+                 # Use the preceding context_window_size sentences plus the current one as context
+                 # (a symmetric window would also include the following sentences):
+                 # end = min(len(sentences), i + context_window_size + 1)
+                 # context = ' '.join(sentences[start:end])
+                 start = max(0, i - context_window_size)
+                 context_sentences = sentences[start:i + 1]
+                 context = ' '.join(context_sentences)
+                 if keyword not in keyword_sentence_mapping:
+                     keyword_sentence_mapping[keyword] = context
+                 else:
+                     keyword_sentence_mapping[keyword] += ' ' + context
+     return keyword_sentence_mapping
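
An illustrative call, assuming NLTK's punkt tokenizer data is installed:

    text = "Transformers were introduced in 2017. They rely on self-attention. BERT builds on this idea."
    print(map_keywords_to_sentences(text, ["BERT"], context_window_size=1))
    # {'BERT': 'They rely on self-attention. BERT builds on this idea.'}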
option_generation.py ADDED
@@ -0,0 +1,135 @@
+ import nltk
+ import random
+ import asyncio
+ nltk.download('wordnet')
+ from nltk.corpus import wordnet
+ from sentence_transformers import util
+ from load_models import load_nlp_models, load_llama, load_qa_models
+ from utils import QuestionGenerationError
+
+ nlp, s2v = load_nlp_models()
+ llm = load_llama()
+ similarity_model, spell = load_qa_models()
+ context_model = similarity_model
+
+ def get_similar_words_sense2vec(word, n=3):
+     # Try to find the word with its most likely part-of-speech
+     word_with_pos = word + "|NOUN"
+     if word_with_pos in s2v:
+         similar_words = s2v.most_similar(word_with_pos, n=n)
+         return [word.split("|")[0] for word, _ in similar_words]
+
+     # If not found, try without POS
+     if word in s2v:
+         similar_words = s2v.most_similar(word, n=n)
+         return [word.split("|")[0] for word, _ in similar_words]
+
+     return []
+
+ def get_synonyms(word, n=3):
+     synonyms = []
+     for syn in wordnet.synsets(word):
+         for lemma in syn.lemmas():
+             if lemma.name() != word and lemma.name() not in synonyms:
+                 synonyms.append(lemma.name())
+                 if len(synonyms) == n:
+                     return synonyms
+     return synonyms
+
+ def gen_options(answer, context, question):
+     prompt = f'''Given the following context, question, and correct answer,
+     generate {4} incorrect but plausible answer options. The options should be:
+     1. Contextually related to the given context
+     2. Grammatically consistent with the question
+     3. Different from the correct answer
+     4. Not explicitly mentioned in the given context
+
+     Context: {context}
+     Question: {question}
+     Correct Answer: {answer}
+
+     Provide the options in a semicolon-separated list. Output must contain only the options and nothing else.
+     '''
+     options = [answer]
+     response = llm.invoke(prompt, stop=['<|eot_id|>'])
+     incorrect_options = [option.strip() for option in response.split(';')]
+     options.extend(incorrect_options)
+     random.shuffle(options)
+     print(options)
+     return options
+     # print(response)
+
+ def generate_options(answer, context, n=3):
+     options = [answer]
+
+     # Add contextually relevant words using a pre-trained model
+     context_embedding = context_model.encode(context)
+     answer_embedding = context_model.encode(answer)
+     context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+
+     # Compute similarity scores and sort context words
+     similarity_scores = [util.pytorch_cos_sim(context_model.encode(word), answer_embedding).item() for word in context_words]
+     sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
+     options.extend(sorted_context_words[:n])
+
+     # Try to get similar words based on sense2vec
+     similar_words = get_similar_words_sense2vec(answer, n)
+     options.extend(similar_words)
+
+     # If we don't have enough options, try synonyms
+     if len(options) < n + 1:
+         synonyms = get_synonyms(answer, n - len(options) + 1)
+         options.extend(synonyms)
+
+     # If we still don't have enough options, extract other entities from the context
+     if len(options) < n + 1:
+         doc = nlp(context)
+         entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
+         options.extend(entities[:n - len(options) + 1])
+
+     # If we still need more options, add some random words from the context
+     if len(options) < n + 1:
+         context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+         options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))
+     print(f"\n\nAll Possible Options: {options}\n\n")
+     # Ensure we have the correct number of unique options
+     options = list(dict.fromkeys(options))[:n+1]
+
+     # Shuffle the options
+     random.shuffle(options)
+
+     return options
+
+ async def generate_options_async(answer, context, n=3):
+     try:
+         options = [answer]
+
+         # Add contextually relevant words using a pre-trained model
+         context_embedding = await asyncio.to_thread(context_model.encode, context)
+         answer_embedding = await asyncio.to_thread(context_model.encode, answer)
+         context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+
+         # Compute similarity scores and sort context words
+         similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
+         sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
+         options.extend(sorted_context_words[:n])
+
+         # Try to get similar words based on sense2vec
+         similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
+         options.extend(similar_words)
+
+         # If we don't have enough options, try synonyms
+         if len(options) < n + 1:
+             synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
+             options.extend(synonyms)
+
+         # Ensure we have the correct number of unique options
+         options = list(dict.fromkeys(options))[:n+1]
+
+         # Shuffle the options
+         random.shuffle(options)
+
+         return options
+     except Exception as e:
+         raise QuestionGenerationError(f"Error in generating options: {str(e)}")
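
A sketch of exercising the async generator from a plain script, assuming the resources loaded at import time are available (sense2vec vectors in s2v_old, the SentenceTransformer weights, and an Ollama install for load_llama):

    import asyncio

    context = "Paris is the capital of France and its largest city."
    options = asyncio.run(generate_options_async("Paris", context, n=3))
    print(options)  # the correct answer plus up to three distractors, shuffled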
question_generation.py ADDED
@@ -0,0 +1,122 @@
+ import asyncio
+ import streamlit as st
+ from text_processing import segment_text
+ from keyword_extraction import extract_keywords
+ from utils import QuestionGenerationError
+ from mapping_keywords import map_keywords_to_sentences
+ from option_generation import gen_options, generate_options_async
+ from fill_in_the_blanks_generation import generate_fill_in_the_blank_questions
+ from load_models import load_nlp_models, load_qa_models, load_model
+
+ nlp, s2v = load_nlp_models()
+ similarity_model, spell = load_qa_models()
+
+
+ def assess_question_quality(context, question, answer):
+     # Assess relevance using cosine similarity
+     context_doc = nlp(context)
+     question_doc = nlp(question)
+     relevance_score = context_doc.similarity(question_doc)
+
+     # Assess complexity using token length (as a simple metric)
+     complexity_score = min(len(question_doc) / 20, 1)  # Normalize to 0-1
+
+     # Assess spelling correctness
+     misspelled = spell.unknown(question.split())
+     spelling_correctness = 1 - (len(misspelled) / len(question.split()))  # Normalize to 0-1
+
+     # Calculate overall score (you can adjust weights as needed)
+     overall_score = (
+         0.4 * relevance_score +
+         0.4 * complexity_score +
+         0.2 * spelling_correctness
+     )
+
+     return overall_score, relevance_score, complexity_score, spelling_correctness
+
+
+ async def process_batch(batch, keywords, context_window_size, num_beams, num_questions, modelname):
+     questions = []
+     print("inside process batch function")
+     flag = False
+     for text in batch:
+         if flag:
+             break
+         keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
+         print(keyword_sentence_mapping)
+         for keyword, context in keyword_sentence_mapping.items():
+             print("Length of questions list from process batch function: ", len(questions))
+             if len(questions) >= num_questions:
+                 flag = True
+                 break
+             question = await generate_question_async(context, keyword, num_beams, modelname)
+             options = await generate_options_async(keyword, context)
+             # options = gen_options(keyword, context, question)
+             blank_question = await generate_fill_in_the_blank_questions(context, keyword)
+             overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
+             if overall_score >= 0.5:
+                 questions.append({
+                     "question": question,
+                     "context": context,
+                     "answer": keyword,
+                     "options": options,
+                     "overall_score": overall_score,
+                     "relevance_score": relevance_score,
+                     "complexity_score": complexity_score,
+                     "spelling_correctness": spelling_correctness,
+                     "blank_question": blank_question,
+                 })
+     return questions
+
+
+ async def generate_question_async(context, answer, num_beams, modelname):
+     model, tokenizer = load_model(modelname)
+     try:
+         input_text = f"<context> {context} <answer> {answer}"
+         print(f"\n{input_text}\n")
+         input_ids = tokenizer.encode(input_text, return_tensors='pt')
+         outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
+         question = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         print(f"\n{question}\n")
+         # print(type(question))
+         return question
+     except Exception as e:
+         raise QuestionGenerationError(f"Error in question generation: {str(e)}")
+
+ # Function to generate questions using beam search
+ async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords, modelname):
+     try:
+         batches = segment_text(text.lower())
+         keywords = extract_keywords(text, extract_all_keywords)
+         all_questions = []
+
+         progress_bar = st.progress(0)
+         status_text = st.empty()
+         print("Final keywords:", keywords)
+         print("Number of questions that need to be generated: ", num_questions)
+         print("Total number of batches:", len(batches))
+         for i, batch in enumerate(batches):
+             print("Batch number: ", i + 1)
+             status_text.text(f"Processing batch {i+1} of {len(batches)}...")
+             batch_questions = await process_batch(batch, keywords, context_window_size, num_beams, num_questions, modelname)
+             all_questions.extend(batch_questions)
+             progress_bar.progress((i + 1) / len(batches))
+
+             print("Length of the all questions list: ", len(all_questions))
+
+             if len(all_questions) >= num_questions:
+                 break
+
+         progress_bar.empty()
+         status_text.empty()
+
+         return all_questions[:num_questions]
+     except QuestionGenerationError as e:
+         st.error(f"An error occurred during question generation: {str(e)}")
+         return []
+     except Exception as e:
+         st.error(f"An unexpected error occurred: {str(e)}")
+         return []
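
A hedged sketch of how a Streamlit page might drive this pipeline end to end; the checkpoint name passed as modelname is a placeholder for whatever T5 question-generation model the app actually uses:

    text = st.text_area("Enter source text")
    if st.button("Generate questions"):
        questions = asyncio.run(generate_questions_async(
            text,
            num_questions=5,
            context_window_size=1,
            num_beams=4,
            extract_all_keywords=False,
            modelname="your-t5-question-generation-checkpoint",  # placeholder
        ))
        for q in questions:
            st.write(q["question"], q["options"])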
text_processing.py ADDED
@@ -0,0 +1,41 @@
+ import re
+ import pymupdf
+ from nltk.tokenize import sent_tokenize
+
+ def get_pdf_text(pdf_file):
+     doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
+     text = ""
+     for page_num in range(doc.page_count):
+         page = doc.load_page(page_num)
+         text += page.get_text()
+     return text
+
+ def clean_text(text):
+     # Normalize common Unicode punctuation first; otherwise the non-ASCII
+     # filter below would turn these characters into spaces before the
+     # replacements can run
+     text = re.sub(r'[“”]', '"', text)
+     text = re.sub(r"[‘’]", "'", text)
+     text = text.replace('\xad', '')
+     text = re.sub(r'[‒–—―]', '-', text)
+     text = re.sub(r"[^\x00-\x7F]", " ", text)
+     text = re.sub(r"[\n]", " ", text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+ # Function to create text chunks
+ def segment_text(text, max_segment_length=700, batch_size=7):
+     sentences = sent_tokenize(text)
+     segments = []
+     current_segment = ""
+
+     for sentence in sentences:
+         if len(current_segment) + len(sentence) <= max_segment_length:
+             current_segment += sentence + " "
+         else:
+             segments.append(current_segment.strip())
+             current_segment = sentence + " "
+
+     if current_segment:
+         segments.append(current_segment.strip())
+
+     # Create batches
+     batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
+     return batches
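
An illustrative call showing the shape segment_text returns (a list of batches, each batch a list of sentence segments):

    sample = "First sentence. " * 60
    batches = segment_text(sample, max_segment_length=700, batch_size=7)
    print(len(batches), len(batches[0]))  # each segment stays at or below roughly 700 characters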
utils.py ADDED
@@ -0,0 +1,75 @@
+ import streamlit as st
+ import uuid
+ from load_models import initialize_wikiapi
+ from functools import lru_cache
+
+ class QuestionGenerationError(Exception):
+     """Custom exception for question generation errors."""
+     pass
+
+ def get_session_id():
+     if 'session_id' not in st.session_state:
+         st.session_state.session_id = str(uuid.uuid4())
+     return st.session_state.session_id
+
+ def initialize_state(session_id):
+     if 'session_states' not in st.session_state:
+         st.session_state.session_states = {}
+
+     if session_id not in st.session_state.session_states:
+         st.session_state.session_states[session_id] = {
+             'generated_questions': [],
+             # add other state variables as needed
+         }
+     return st.session_state.session_states[session_id]
+
+ def get_state(session_id):
+     return st.session_state.session_states[session_id]
+
+ def set_state(session_id, key, value):
+     st.session_state.session_states[session_id][key] = value
+
+
+ # Info Section
+ def display_info():
+     st.sidebar.title("Information")
+     st.sidebar.markdown("""
+ ### Question Generator System
+ This system is designed to generate questions based on the provided context. It uses various NLP techniques and models to:
+ - Extract keywords from the text
+ - Map keywords to sentences
+ - Generate questions
+ - Provide multiple choice options
+ - Assess the quality of generated questions
+
+ #### Key Features:
+ - **Keyword Extraction:** Combines RAKE, TF-IDF, and spaCy for comprehensive keyword extraction.
+ - **Question Generation:** Utilizes a pre-trained T5 model for generating questions.
+ - **Options Generation:** Creates contextually relevant multiple-choice options.
+ - **Question Assessment:** Scores questions based on relevance, complexity, and spelling correctness.
+ - **Feedback Collection:** Allows users to rate the generated questions and provides statistics on feedback.
+
+ #### Customization Options:
+ - Number of beams for question generation
+ - Context window size for mapping keywords to sentences
+ - Number of questions to generate
+ - Additional display elements (context, answer, options, entity link, QA scores)
+
+ #### Outputs:
+ - Generated questions with multiple-choice options
+ - Download options for CSV and PDF formats
+ - Visualization of overall scores
+ """)
+
+
+ # Function to perform entity linking using the Wikipedia API
+ @lru_cache(maxsize=128)
+ def entity_linking(keyword):
+     user_agent, wiki_wiki = initialize_wikiapi()
+     page = wiki_wiki.page(keyword)
+     if page.exists():
+         return page.fullurl
+     return None
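
For reference, entity_linking resolves a keyword to its Wikipedia URL when a matching page exists, and results are memoized via lru_cache:

    print(entity_linking("Python (programming language)"))
    # e.g. https://en.wikipedia.org/wiki/Python_(programming_language)
    print(entity_linking("zzz-nonexistent-keyword"))  # None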
visualization.py ADDED
@@ -0,0 +1,16 @@
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+ import streamlit as st
+
+ def display_word_cloud(generated_questions):
+     word_frequency = {}
+     for question in generated_questions:
+         words = question.split()
+         for word in words:
+             word_frequency[word] = word_frequency.get(word, 0) + 1
+
+     wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequency)
+     fig = plt.figure(figsize=(10, 5))
+     plt.imshow(wordcloud, interpolation='bilinear')
+     plt.axis('off')
+     st.pyplot(fig)  # pass the figure explicitly; a bare st.pyplot() call is deprecated
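
An illustrative call with a couple of generated question strings (the cloud is built from raw whitespace-split tokens, so punctuation stays attached to words):

    display_word_cloud([
        "What is the capital of France?",
        "Which gas do plants absorb during photosynthesis?",
    ])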