|
|
|
from nltk.corpus import wordnet |
|
import nltk |
|
import transformers |
|
import pandas as pd |
|
import json |
|
import random |
|
import torch |
|
|
|
# Inference device handed to the transformers pipeline.
device = "cpu"

# Zero-shot classification pipeline used to rank candidate word senses.
# "simple_trained_wsd_pipeline" is resolved by transformers as a model
# name or local path.
classifier = transformers.pipeline(
    task="zero-shot-classification",
    model="simple_trained_wsd_pipeline",
    device=device,
)

import spacy

# spaCy English pipeline; used for tokenisation, lemmas and POS tags.
nlp = spacy.load("en_core_web_sm")

# Printed once both pipelines have been constructed.
print("successfully download model")
|
|
|
|
|
# Coarse spaCy part-of-speech tags that are looked up in the CEFR vocabulary,
# mapped to the spelling compared against the CSV's "pos" column.
_CONTENT_POS = {
    "NOUN": "noun",
    "VERB": "verb",
    "ADJ": "adjective",
    "ADV": "adverb",
}


def _mark_token(sentence, start, text):
    """Return *sentence* with the token *text* at offset *start* in brackets."""
    end = start + len(text)
    return f"{sentence[:start]}[{text}]{sentence[end:]}"


def _collect_contexts(passage):
    """Group marked sentences of content words by ``(lowercased lemma, pos)``.

    The passage is split on "." and each non-empty fragment is run through
    the module-level spaCy pipeline ``nlp``. The result maps
    ``(lemma.lower(), pos_name)`` to an insertion-ordered collection (dict
    keys) of sentences in which exactly that token is wrapped in brackets.
    """
    tagged = {}
    for fragment in passage.split("."):
        sentence = fragment.strip()
        if not sentence:
            continue
        for token in nlp(sentence):
            # Bracket only this token, located by its character offset, so
            # that the same letters inside other words are left untouched.
            marked = _mark_token(sentence, token.idx, token.text)
            tagged[(token.lemma_, marked)] = token.pos_

    contexts = {}
    for (lemma, marked), pos in tagged.items():
        pos_name = _CONTENT_POS.get(pos)
        if pos_name is not None:
            contexts.setdefault((lemma.lower(), pos_name), {})[marked] = None
    return contexts


def _load_cefr_levels(csv_path):
    """Return ``{(headword, pos): CEFR}`` from the vocabulary CSV.

    The CSV must provide "headword", "pos" and "CEFR" columns. When a
    (headword, pos) pair occurs on several rows, the last row's level wins.
    """
    reference = pd.read_csv(csv_path)
    return {
        (str(headword), str(pos)): cefr
        for headword, pos, cefr in zip(
            reference["headword"], reference["pos"], reference["CEFR"]
        )
    }


def model(passage: str, level: str) -> dict:
    """Rank WordNet definitions for the CEFR-listed content words of a passage.

    Nouns, verbs, adjectives and adverbs found in *passage* are matched
    (by lowercased lemma and part of speech) against "cefr-vocab.csv".
    For every matching headword, all WordNet definitions of that headword
    are ranked by the module-level zero-shot ``classifier`` against the
    sentence in which the word occurs, with the word wrapped in brackets.

    Args:
        passage: Text to analyse. It is split into sentences on ".".
        level: CEFR level to keep, compared for equality with the CSV's
            "CEFR" column, or "ALL" to keep every level.

    Returns:
        A dict mapping "<headword> = <sentence with [word] marked>" to the
        list of definitions in the order returned by the classifier. The
        first (top-ranked) definition is prefixed with ">> ". The list is
        empty when WordNet has no definition for the headword. Each headword
        appears at most once; if it matches several times, the last match
        (vocabulary order, then text order) is used.

    Raises:
        FileNotFoundError: if "cefr-vocab.csv" cannot be found.
        KeyError: if the CSV lacks a "headword", "pos" or "CEFR" column.
    """
    # Make sure the WordNet corpora needed by `wordnet.synsets` are present.
    nltk.download('wordnet')
    nltk.download('omw-1.4')

    contexts = _collect_contexts(passage)

    # Walk the vocabulary in file order and pick up the contexts of matching
    # words. Data is kept in tuples rather than joined strings, so commas or
    # " = " inside a sentence can neither truncate nor break it.
    selected = {}
    for (headword, pos_name), cefr in _load_cefr_levels("cefr-vocab.csv").items():
        if level == "ALL" or cefr == level:
            for marked in contexts.get((headword, pos_name), ()):
                selected[headword] = marked

    correct_def = {}
    for headword, marked in selected.items():
        # Definitions of every part of speech are offered as candidates.
        definitions = [synset.definition() for synset in wordnet.synsets(headword)]
        entry = f"{headword} = {marked}"

        if not definitions:
            # The zero-shot pipeline rejects an empty label list.
            correct_def[entry] = []
            continue

        output = classifier(
            marked,
            definitions,
            hypothesis_template='The meaning of [' + headword + '] is {}.',
        )
        best, *others = output['labels']
        correct_def[entry] = [f">> {best}", *others]

    return correct_def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|