import gradio as gr

import nltk
import spacy
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist

# Punkt sentence-tokenizer models, required by sent_tokenize
nltk.download('punkt')

# Text file analysed by the app
file = "text.txt"

# Italian spaCy pipeline used for POS tagging and lemmatisation
nlp_IT = spacy.load("it_core_news_sm")

def get_lists(file):
  """Read the text file and return its sentences, both as written and lower-cased."""
  with open(file, 'r', encoding='utf-8') as f:
    text = f.read()

  # Split the text into sentences with the Italian Punkt model
  sent_tokenized_text = sent_tokenize(text, language='italian')
  sent_tokenized_text_lower = [sent.lower() for sent in sent_tokenized_text]

  return sent_tokenized_text, sent_tokenized_text_lower

sentences, sentences_lower = get_lists(file)
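
# search_engine_collocations: scan the lower-cased sentences for the target word, POS-tag each
# matching sentence with spaCy, and report the most frequent verb, adjective or noun lemmas
# for the requested collocation type.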

def search_engine_collocations(target='scarto', colloc='azioni', nlp=nlp_IT,
                               sentences_lower=sentences_lower, sentences=sentences):

  verbs = []
  adjectives = []
  nouns = []
  result = 0

  # Collect the lemmas of verbs, adjectives and nouns from every sentence containing the target
  for sent in sentences_lower:
    if target.lower() in sent:
      result += 1
      doc = nlp(sent)
      for token in doc:
        if token.pos_ == 'VERB':
          verbs.append(token.lemma_)
        elif token.pos_ == 'ADJ':
          adjectives.append(token.lemma_)
        elif token.pos_ == 'NOUN':
          nouns.append(token.lemma_)

  if result == 0:
    return f"Non ho trovato la parola '{target}'.\n"

  # Rank the lemmas of the requested category by frequency and format them as a numbered list
  if colloc == 'azioni':
    if not verbs:
      return f"Non ho trovato azioni legate a '{target}'"
    verbs_fdist = FreqDist(verbs)
    stringed_results = ''
    for n, r in enumerate(verbs_fdist.most_common()):
      stringed_results += str(n + 1) + ': ' + str(r) + '\n\n'
    return f"Ho trovato {len(verbs)} azioni legate a '{target}'\n{stringed_results}"

  if colloc == 'caratteristiche':
    if not adjectives:
      return f"Non ho trovato caratteristiche legate a '{target}'"
    adj_fdist = FreqDist(adjectives)
    stringed_results = ''
    for n, r in enumerate(adj_fdist.most_common()):
      stringed_results += str(n + 1) + ': ' + str(r) + '\n\n'
    return f"Ho trovato {len(adjectives)} caratteristiche legate a '{target}'\n{stringed_results}"

  if colloc == 'concetti':
    if not nouns:
      return f"Non ho trovato concetti legati a '{target}'"
    nouns_fdist = FreqDist(nouns)
    stringed_results = ''
    for n, r in enumerate(nouns_fdist.most_common()):
      stringed_results += str(n + 1) + ': ' + str(r) + '\n\n'
    return f"Ho trovato {len(nouns)} concetti legati a '{target}'\n{stringed_results}"

  # Fallback for an unrecognised collocation type, so the interface always gets a string back
  return f"Tipo di collocazione non riconosciuto: '{colloc}'"

# Gradio interface: a textbox for the target word, a radio selector for the collocation type,
# and a plain-text output showing the frequency-ranked results
demo = gr.Interface(
    search_engine_collocations,
    [
        gr.Textbox(),
        gr.Radio(["azioni", "caratteristiche", "concetti"]),
    ],
    "text",
    examples=[
        ["scarto", "azioni"],
        ["rifiuto", "caratteristiche"],
        ["sostenibilità", "concetti"],
    ],
)

demo.launch()
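
# launch() starts a local web server and prints its URL; passing share=True would also create a
# temporary public link (useful when not running on a hosted platform such as Hugging Face Spaces).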