# Hugging Face Space: nontGcob/T2E_Vocabulary_Exam_Generator
# Space status: Sleeping
# File size: 5,960 Bytes

# Importing libraries
from nltk.corpus import wordnet
import nltk
import transformers
import pandas as pd
import json
import random
import torch

# Device handed to the Hugging Face pipeline below; fixed to the CPU.
device='cpu'

# Zero-shot classification pipeline that model() uses to rank the candidate
# definitions of a word against the sentence the word appears in. The weights
# are loaded from "simple_trained_wsd_pipeline" when this module is imported.
# NOTE(review): presumably a local model directory shipped with the app - confirm.
classifier = transformers.pipeline("zero-shot-classification", model="simple_trained_wsd_pipeline", device=device)

import spacy
# spaCy English pipeline; model() reads each token's text, lemma and
# part-of-speech tag from the documents it produces (POS tagging).
nlp = spacy.load("en_core_web_sm")

# Alternative way to load the same spaCy model: import it as an installed module.
# import en_core_web_sm
# nlp = en_core_web_sm.load()

# Printed once, at import time, after both models above have been loaded.
print('successfully download model')


def model(passage, level):
  """Pick CEFR-graded vocabulary from a passage and rank its definitions.

  The passage is split into sentences on ".", every sentence is tagged with
  spaCy, and each noun, verb, adjective or adverb whose lowercased lemma and
  part of speech appear in "cefr-vocab.csv" at the requested level is kept.
  For every kept word, all of its WordNet definitions are scored against the
  sentence by the zero-shot classifier.

  Args:
    passage: Text to extract vocabulary from.
    level: CEFR level to keep, compared for equality with the CSV's "CEFR"
      column, or "ALL" to keep every matched word.

  Returns:
    A dict mapping "<headword> = <sentence with the word in [brackets]>" to
    the word's WordNet definitions in the order returned by the classifier,
    with the first (top-ranked) definition prefixed by ">> ". Each headword
    appears at most once; when it occurs several times, the last match wins.
    Words that have no WordNet definition are left out.
  """
  # Runtime requirements: spacy (plus the en_core_web_sm model), transformers,
  # torch, nltk and pandas.
  nltk.download('wordnet')
  nltk.download('omw-1.4')

  cefr_vocab = "cefr-vocab.csv"

  # spaCy coarse POS tag -> part-of-speech name used in the CEFR word list.
  pos_names = {"NOUN": "noun", "VERB": "verb", "ADJ": "adjective", "ADV": "adverb"}

  # (lowercased lemma, pos name) -> sentences in passage order, each with that
  # one token wrapped in square brackets.
  occurrences = {}
  for sentence in (passage + ".").split("."):
    sentence = sentence.strip()
    if not sentence:
      continue
    for token in nlp(sentence):
      pos_name = pos_names.get(token.pos_)
      if pos_name is None:
        continue
      # Bracket by character offset so only this token is marked; a plain
      # str.replace would also hit the same letters inside other words.
      start = token.idx
      end = start + len(token.text)
      context = f"{sentence[:start]}[{token.text}]{sentence[end:]}"
      occurrences.setdefault((token.lemma_.lower(), pos_name), []).append(context)

  # (headword, pos name) -> CEFR level; a later CSV row overrides an earlier
  # duplicate. Tuple keys keep commas in headwords or sentences from being
  # mistaken for field separators.
  reference = pd.read_csv(cefr_vocab)
  cefr_by_word = {}
  for headword, pos_name, cefr in zip(reference["headword"], reference["pos"], reference["CEFR"]):
    cefr_by_word[(str(headword), str(pos_name))] = cefr

  # Walk the word list in CSV order and keep one context per headword; a later
  # match replaces the context but keeps the headword's original position.
  selected = {}
  for word_key, cefr in cefr_by_word.items():
    if level != "ALL" and cefr != level:
      continue
    for context in occurrences.get(word_key, ()):
      selected[word_key[0]] = context

  correct_def = {}
  for headword, context in selected.items():
    # NOTE(review): definitions from every part of speech are offered as
    # candidates; restricting the synsets to the matched part of speech would
    # narrow them but also change the options shown - confirm before changing.
    candidate_labels = [synset.definition() for synset in wordnet.synsets(headword)]
    if not candidate_labels:
      # The zero-shot pipeline rejects an empty label list, so a word unknown
      # to WordNet cannot be turned into a question.
      continue

    hypothesis_template = 'The meaning of [' + headword + '] is {}.'
    output = classifier(context, candidate_labels, hypothesis_template=hypothesis_template)

    # Flag the classifier's first label as the answer; the others stay as-is.
    correct_def[f"{headword} = {context}"] = [
      f">> {label}" if rank == 0 else label
      for rank, label in enumerate(output['labels'])
    ]

  return correct_def


# passage = "Computer is good"
# level = "A1"
# print(model(passage, level))