# Importing libraries
from nltk.corpus import wordnet
import nltk
import transformers
import pandas as pd
import json
import random
import torch

# Use a CUDA device if available, otherwise fall back to the CPU
device = 0 if torch.cuda.is_available() else 'cpu'

# Declare the (trained) model that will be used
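# "simple_trained_wsd_pipeline" is assumed to be a local directory containing a
# fine-tuned NLI checkpoint; transformers loads from disk when given a path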
classifier = transformers.pipeline("zero-shot-classification", model="simple_trained_wsd_pipeline", device=device)

# Alternative POS tagger (spaCy), kept for reference:
# import spacy
# nlp = spacy.load("en_core_web_sm")

import stanza

# Initialize the English pipeline
nlp = stanza.Pipeline('en')

print('Successfully loaded the Stanza English pipeline')


def model(passage, level):
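  """Suggest WordNet definitions for CEFR-level vocabulary in a passage.

  For every NOUN/VERB/ADJ/ADV in `passage` whose (lemma, pos) pair appears in
  the CEFR word list at the requested `level` ("A1"-"C2", or "ALL"), the
  zero-shot classifier ranks the word's WordNet definitions against the
  sentence the word appeared in.

  Returns a dict mapping "word = sentence" to the ranked definitions, with
  the top-ranked one prefixed by ">> ".
  """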
  # Setup (run once):
  # pip install torch transformers nltk pandas stanza
  # Optional, for the spaCy tagger alternative:
  # pip install spacy && python -m spacy download en_core_web_sm

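  # Make sure the WordNet data is available (a no-op if already downloaded)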
  nltk.download('wordnet')
  nltk.download('omw-1.4')

  # Path to the CEFR vocabulary list
  cefr_vocab = "cefr-vocab.csv"

  # The CEFR level comes in as a parameter ("A1"-"C2", or "ALL");
  # uppercase it, as the original interactive prompt did
  cefr_level = level.upper()

  # Make sure the passage ends with a terminator so the split below keeps the
  # final sentence
  if not passage.endswith((".", "!", "?")):
    txt = passage + "."
  else:
    txt = passage

  # Normalize sentence terminators, then split the passage into sentences
  txt = txt.replace("!", ".").replace("?", ".")
  txt = txt.split(".")

  # Map "lemma = sentence with the target [word] bracketed" -> UPOS tag
  text_dict = {}
  for n in txt:
    n = n.strip()
    if not n:
      continue # skip the empty chunk left after the final "."
    ex1 = nlp(n)

    for sentence in ex1.sentences:
      for word in sentence.words:
        # Bracket the target word inside its sentence, e.g. "The [cat] sat"
        # (str.replace marks every occurrence of the token)
        sentence_question_tag = n.replace(word.text, f"[{word.text}]")
        text_dict[f"{word.lemma} = {sentence_question_tag}"] = word.upos # stanza UPOS tag

  # Keep only content words: NOUN, VERB, ADJ, or ADV
  collector = {}
  for key, value in text_dict.items():
    if value in ("NOUN", "VERB", "ADJ", "ADV"):
      collector[key] = value

  # Collect the CEFR level of the words collected before
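  # cefr-vocab.csv is expected to provide "headword", "pos", and "CEFR"
  # columns, as accessed below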
  reference = pd.read_csv(cefr_vocab)

  matching = {}
  for row_idx in range(reference.shape[0]):
    row = reference.iloc[row_idx]
    key = f"{row.headword}, {row.pos}"
    matching[key] = row.CEFR

  # Lowercase the UPOS tag into the pos names used by the CEFR data set
  upos_to_cefr = {"NOUN": "noun", "VERB": "verb", "ADJ": "adjective", "ADV": "adverb"}
  for key1, value1 in collector.items():
    collector[key1] = upos_to_cefr[value1]

  # Match the two data sets on both the word and its part of speech
  ready2filter = {}
  for key, value in matching.items():
    first_key, second_key = key.split(", ")
    for key2, value2 in collector.items():
      key2 = key2.split(" = ")
      if first_key == key2[0].lower():
        if second_key == value2:
          ready2filter[f"{key} = {key2[1]}"] = value

  # Keep only the vocab at the CEFR level the user selected
  filtered0 = {}
  for key, value in ready2filter.items():
    if cefr_level == "ALL" or value == cefr_level:
      filtered0[key] = value

  # Rearrange the structure: "word, pos = sentence" -> {word: "pos = sentence"}
  filtered = {}
  for key in filtered0:
    new_key, new_value = key.split(', ', 1) # maxsplit=1 keeps commas in the sentence intact
    filtered[new_key] = new_value

  # Grab the definition of each vocab from the NLTK WordNet English dictionary
  def_filtered = {}
  for key3, value3 in filtered.items():
    partofspeech, context = value3.split(" = ")
    def_filtered[f"{key3} = {context}"] = []

    # Convert the CEFR pos name into the WordNet pos tag; WordNet's "a" also
    # covers satellite adjectives
    pos_map = {"noun": "n", "verb": "v", "adjective": "a", "adverb": "r"}

    # Look up synsets restricted to the detected part of speech
    syns = wordnet.synsets(key3, pos=pos_map[partofspeech])

    # Add every matching synset's definition as a candidate meaning
    for s in syns:
      def_filtered[f"{key3} = {context}"].append(s.definition())

  # For each word, rank its candidate definitions with the zero-shot classifier
  correct_def = {}
  for key4, value4 in def_filtered.items():
    vocab, context = key4.split(" = ")
    sequence_to_classify = context
    candidate_labels = value4
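    # Each WordNet definition becomes a candidate label; the NLI model scores
    # "The meaning of [word] is <definition>." against the sentence itself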
    if not candidate_labels:
      continue # no WordNet definition was found for this word

    correct_def_list = []
    hypothesis_template = 'The meaning of [' + vocab + '] is {}.'

    output = classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

    # The pipeline returns labels sorted by score (highest first), so the
    # top-scoring definition is marked with ">> "
    for i, label in enumerate(output['labels']):
      prefix = ">> " if i == 0 else ""
      correct_def_list.append(f"{prefix}{label}")

    correct_def[key4] = correct_def_list

  return correct_def



# Example usage (sample values from the original comments):
if __name__ == "__main__":
  passage = "Computer is good"
  level = "A1"
  print(model(passage, level))