azioni_ICC / app.py
azaninello's picture
Update app.py
e51f461
raw
history blame
2.95 kB
import gradio as gr
import nltk
import simplemma
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from simplemma import text_lemmatizer
# Fetch the NLTK 'punkt' resource at start-up; presumably this is what
# sent_tokenize (used in get_lists) needs in order to split sentences.
nltk.download('punkt')
# Path of the corpus file that is read once at import time via get_lists().
file = "text.txt"
import spacy
# Italian spaCy pipeline; used as the default `nlp` argument of
# search_engine_collocations.
nlp_IT = spacy.load("it_core_news_sm")
def get_lists(file):
    """Read a UTF-8 text file and split its content into Italian sentences.

    Args:
        file: Path of the text file to read.

    Returns:
        A 2-tuple ``(sentences, sentences_lower)``: the sentences as produced
        by the tokenizer, and a parallel list with each sentence lowercased.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()

    original_sentences = sent_tokenize(raw_text, language='italian')
    # Same order and length as the original list, so indices line up.
    lowered_sentences = list(map(str.lower, original_sentences))
    return original_sentences, lowered_sentences
# Load and tokenize the corpus once at import time; both lists are bound as
# default arguments of search_engine_collocations below.
sentences, sentences_lower = get_lists(file)
def search_engine_collocations(target='scarto', colloc='azioni', nlp=nlp_IT, sentences_lower=sentences_lower, sentences=sentences):
    """Rank the lemmas that co-occur with ``target`` in the corpus sentences.

    Every sentence containing ``target`` (case-insensitive substring match) is
    parsed with ``nlp``; the lemmas of the tokens whose part of speech matches
    the requested category are counted and listed by decreasing frequency.

    Args:
        target: Text to look for. It is lowercased before being compared, so
            ``sentences_lower`` is expected to hold lowercased sentences. An
            empty string is a substring of every sentence and therefore
            matches the whole corpus.
        colloc: Category of collocates to report: ``'azioni'`` (verbs),
            ``'caratteristiche'`` (adjectives) or ``'concetti'`` (nouns).
        nlp: Callable that turns a sentence into an iterable of tokens
            exposing ``pos_`` and ``lemma_``.
        sentences_lower: Sentences to search.
        sentences: Unused; kept so existing callers keep working.

    Returns:
        A user-facing message (in Italian): either the ranked collocates, one
        ``rank: (lemma, count)`` entry per paragraph, or an explanation of why
        nothing could be reported.
    """
    # colloc -> (POS tag to collect, participle agreeing with the Italian label)
    kinds = {
        'azioni': ('VERB', 'legate'),
        'caratteristiche': ('ADJ', 'legate'),
        'concetti': ('NOUN', 'legati'),
    }
    wanted_pos, agreement = kinds.get(colloc, (None, None))

    needle = target.lower()
    matches = 0
    lemmas = []
    for sent in sentences_lower:
        if needle not in sent:
            continue
        matches += 1
        if wanted_pos is None:
            # Unknown category: nothing to collect, so skip the costly parse.
            continue
        lemmas.extend(
            token.lemma_ for token in nlp(sent) if token.pos_ == wanted_pos
        )

    if matches == 0:
        return f"Non ho trovato la parola '{target}'.\n\n"
    if wanted_pos is None:
        # E.g. the UI radio button was left unselected (colloc is None).
        return f"Tipo di ricerca non riconosciuto: '{colloc}'"
    if not lemmas:
        # Only the requested category decides this message; an empty list in
        # another category must not hijack the answer.
        return f"Non ho trovato {colloc} {agreement} a '{target}'"

    ranking = ''.join(
        f"{rank}: {entry}\n\n"
        for rank, entry in enumerate(FreqDist(lemmas).most_common(), start=1)
    )
    return f"Ho trovato {len(lemmas)} {colloc} {agreement} a '{target}'\n{ranking}"
# Web UI: a free-text box for the target word and a radio button for the
# collocate category; the function's returned message is shown as plain text.
demo = gr.Interface(
    fn=search_engine_collocations,
    inputs=[
        gr.Textbox(),
        gr.Radio(["azioni", "caratteristiche", "concetti"]),
    ],
    outputs="text",
    examples=[
        ["scarto", "azioni"],
        ["rifiuto", "caratteristiche"],
        ["sostenibilità", "concetti"],
    ],
)
# Start the Gradio server as soon as the module is executed.
demo.launch()