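"""A small collocation "search engine" for an Italian text, served with Gradio.

Given a target word, it lists the verbs ("azioni"), adjectives
("caratteristiche") or nouns ("concetti") that co-occur with it in
text.txt, ranked by frequency.
"""
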
import gradio as gr
import nltk
import spacy
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize

# Punkt models for sentence tokenisation.
nltk.download('punkt')
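# Newer NLTK releases (>= 3.8.2) look up 'punkt_tab' instead; if that raises
# a LookupError, also run: nltk.download('punkt_tab')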

file = "text.txt"

# Small Italian spaCy pipeline, used for POS tagging and lemmatisation.
nlp_it = spacy.load("it_core_news_sm")
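# Assumes the model is installed in the environment, e.g. with
# `python -m spacy download it_core_news_sm`.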

def get_lists(file):
    """Read the corpus and return its sentences, original and lowercased."""
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    sent_tokenized_text = sent_tokenize(text, language='italian')
    sent_tokenized_text_lower = [sent.lower() for sent in sent_tokenized_text]

    return sent_tokenized_text, sent_tokenized_text_lower
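
# NLTK's Punkt ships a pretrained Italian model, so the language='italian'
# flag above works with the plain 'punkt' download.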

# Tokenise the corpus once at import time; each query then only scans the
# precomputed sentence lists.
sentences, sentences_lower = get_lists(file)

def search_engine_collocations(target='scarto', colloc='azioni', nlp=nlp_it,
                               sentences_lower=sentences_lower, sentences=sentences):
    verbs = []
    adjectives = []
    nouns = []
    result = 0

    # Collect the lemmas of all verbs, adjectives and nouns that occur in
    # sentences containing the target (matched as a lowercase substring).
    for sent in sentences_lower:
        if target.lower() in sent:
            result += 1
            doc = nlp(sent)
            for token in doc:
                if token.pos_ == 'VERB':
                    verbs.append(token.lemma_)
                elif token.pos_ == 'ADJ':
                    adjectives.append(token.lemma_)
                elif token.pos_ == 'NOUN':
                    nouns.append(token.lemma_)

    if result == 0:
        return f"Non ho trovato la parola '{target}'.\n"

    if colloc == 'azioni':
        if not verbs:
            return f"Non ho trovato azioni legate a '{target}'"
        verbs_fdist = FreqDist(verbs)
        stringed_results = ''
        for n, r in enumerate(verbs_fdist.most_common()):
            stringed_results += f'{n + 1}: {r}\n\n'
        return f"Ho trovato {len(verbs)} azioni legate a '{target}'\n{stringed_results}"

    if colloc == 'caratteristiche':
        if not adjectives:
            return f"Non ho trovato caratteristiche legate a '{target}'"
        adj_fdist = FreqDist(adjectives)
        stringed_results = ''
        for n, r in enumerate(adj_fdist.most_common()):
            stringed_results += f'{n + 1}: {r}\n\n'
        return f"Ho trovato {len(adjectives)} caratteristiche legate a '{target}'\n{stringed_results}"

    if colloc == 'concetti':
        if not nouns:
            return f"Non ho trovato concetti legati a '{target}'"
        nouns_fdist = FreqDist(nouns)
        stringed_results = ''
        for n, r in enumerate(nouns_fdist.most_common()):
            stringed_results += f'{n + 1}: {r}\n\n'
        return f"Ho trovato {len(nouns)} concetti legati a '{target}'\n{stringed_results}"

    # Fall through when no collocation type was selected.
    return "Scegli un tipo di collocazione: azioni, caratteristiche o concetti."

demo = gr.Interface(
    search_engine_collocations,
    [
        gr.Textbox(),
        gr.Radio(["azioni", "caratteristiche", "concetti"]),
    ],
    "text",
    examples=[
        ["scarto", "azioni"],
        ["rifiuto", "caratteristiche"],
        ["sostenibilità", "concetti"],
    ],
)
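# A quick sanity check outside the UI (assumes "text.txt" mentions "scarto"):
#   print(search_engine_collocations('scarto', 'azioni'))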

demo.launch()
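# For local testing, Gradio can also expose a temporary public URL:
#   demo.launch(share=True)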