AIdeaText committed on
Commit 726d44a
1 Parent(s): 13d91b4

Create semantic_analysis.py

modules/text_analysis/semantic_analysis.py ADDED
@@ -0,0 +1,262 @@
+ #semantic_analysis.py
+ import streamlit as st
+ import spacy
+ import networkx as nx
+ import matplotlib.pyplot as plt
+ from collections import Counter, defaultdict
+
+ # Define colors for grammatical categories
+ POS_COLORS = {
+     'ADJ': '#FFA07A',    # Light Salmon
+     'ADP': '#98FB98',    # Pale Green
+     'ADV': '#87CEFA',    # Light Sky Blue
+     'AUX': '#DDA0DD',    # Plum
+     'CCONJ': '#F0E68C',  # Khaki
+     'DET': '#FFB6C1',    # Light Pink
+     'INTJ': '#FF6347',   # Tomato
+     'NOUN': '#90EE90',   # Light Green
+     'NUM': '#FAFAD2',    # Light Goldenrod Yellow
+     'PART': '#D3D3D3',   # Light Gray
+     'PRON': '#FFA500',   # Orange
+     'PROPN': '#20B2AA',  # Light Sea Green
+     'SCONJ': '#DEB887',  # Burlywood
+     'SYM': '#7B68EE',    # Medium Slate Blue
+     'VERB': '#FF69B4',   # Hot Pink
+     'X': '#A9A9A9',      # Dark Gray
+ }
+
+ POS_TRANSLATIONS = {
+     'es': {
+         'ADJ': 'Adjetivo',
+         'ADP': 'Adposición',
+         'ADV': 'Adverbio',
+         'AUX': 'Auxiliar',
+         'CCONJ': 'Conjunción Coordinante',
+         'DET': 'Determinante',
+         'INTJ': 'Interjección',
+         'NOUN': 'Sustantivo',
+         'NUM': 'Número',
+         'PART': 'Partícula',
+         'PRON': 'Pronombre',
+         'PROPN': 'Nombre Propio',
+         'SCONJ': 'Conjunción Subordinante',
+         'SYM': 'Símbolo',
+         'VERB': 'Verbo',
+         'X': 'Otro',
+     },
+     'en': {
+         'ADJ': 'Adjective',
+         'ADP': 'Adposition',
+         'ADV': 'Adverb',
+         'AUX': 'Auxiliary',
+         'CCONJ': 'Coordinating Conjunction',
+         'DET': 'Determiner',
+         'INTJ': 'Interjection',
+         'NOUN': 'Noun',
+         'NUM': 'Number',
+         'PART': 'Particle',
+         'PRON': 'Pronoun',
+         'PROPN': 'Proper Noun',
+         'SCONJ': 'Subordinating Conjunction',
+         'SYM': 'Symbol',
+         'VERB': 'Verb',
+         'X': 'Other',
+     },
+     'fr': {
+         'ADJ': 'Adjectif',
+         'ADP': 'Adposition',
+         'ADV': 'Adverbe',
+         'AUX': 'Auxiliaire',
+         'CCONJ': 'Conjonction de Coordination',
+         'DET': 'Déterminant',
+         'INTJ': 'Interjection',
+         'NOUN': 'Nom',
+         'NUM': 'Nombre',
+         'PART': 'Particule',
+         'PRON': 'Pronom',
+         'PROPN': 'Nom Propre',
+         'SCONJ': 'Conjonction de Subordination',
+         'SYM': 'Symbole',
+         'VERB': 'Verbe',
+         'X': 'Autre',
+     }
+ }
+ ########################################################################################################################################
+
+ # Define the entity labels and colors for each language
+ ENTITY_LABELS = {
+     'es': {
+         "Personas": "lightblue",
+         "Conceptos": "lightgreen",
+         "Lugares": "lightcoral",
+         "Fechas": "lightyellow"
+     },
+     'en': {
+         "People": "lightblue",
+         "Concepts": "lightgreen",
+         "Places": "lightcoral",
+         "Dates": "lightyellow"
+     },
+     'fr': {
+         "Personnes": "lightblue",
+         "Concepts": "lightgreen",
+         "Lieux": "lightcoral",
+         "Dates": "lightyellow"
+     }
+ }
+
+ #########################################################################################################
+ def count_pos(doc):
+     return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')
+
+ #####################################################################################################################
+
+ def create_semantic_graph(doc, lang):
+     G = nx.Graph()
+     word_freq = defaultdict(int)
+     lemma_to_word = {}
+     lemma_to_pos = {}
+
+     # Count lemma frequencies and map each lemma to a surface form and POS,
+     # preferring the surface form that matches the lemma itself
+     for token in doc:
+         if token.pos_ in ['NOUN', 'VERB']:
+             lemma = token.lemma_.lower()
+             word_freq[lemma] += 1
+             if lemma not in lemma_to_word or token.text.lower() == lemma:
+                 lemma_to_word[lemma] = token.text
+                 lemma_to_pos[lemma] = token.pos_
+
+     # Get the 20 most frequent lemmas
+     top_lemmas = [lemma for lemma, _ in sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]]
+
+     # Add nodes
+     for lemma in top_lemmas:
+         word = lemma_to_word[lemma]
+         G.add_node(word, pos=lemma_to_pos[lemma])
+
+     # Add edges between top lemmas linked by a dependency relation
+     for token in doc:
+         if token.lemma_.lower() in top_lemmas:
+             if token.head.lemma_.lower() in top_lemmas:
+                 source = lemma_to_word[token.lemma_.lower()]
+                 target = lemma_to_word[token.head.lemma_.lower()]
+                 if source != target:  # Avoid self-loops
+                     G.add_edge(source, target, label=token.dep_)
+
+     return G, word_freq
+
+ ############################################################################################################################################
+
+ def visualize_semantic_relations(doc, lang):
+     # Reuse create_semantic_graph instead of duplicating its graph-building logic
+     G, _ = create_semantic_graph(doc, lang)
+
+     fig, ax = plt.subplots(figsize=(36, 27))
+     pos = nx.spring_layout(G, k=0.7, iterations=50)
+
+     node_colors = [POS_COLORS.get(G.nodes[node]['pos'], '#CCCCCC') for node in G.nodes()]
+
+     nx.draw(G, pos, node_color=node_colors, with_labels=True,
+             node_size=10000,
+             font_size=16,
+             font_weight='bold',
+             arrows=True,
+             arrowsize=30,
+             width=3,
+             edge_color='gray',
+             ax=ax)
+
+     edge_labels = nx.get_edge_attributes(G, 'label')
+     nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=14, ax=ax)
+
+     title = {
+         'es': "Relaciones Semánticas Relevantes",
+         'en': "Relevant Semantic Relations",
+         'fr': "Relations Sémantiques Pertinentes"
+     }
+     ax.set_title(title[lang], fontsize=24, fontweight='bold')
+     ax.axis('off')
+
+     # Use a distinct loop variable so the layout dict `pos` is not shadowed
+     legend_elements = [plt.Rectangle((0, 0), 1, 1, fc=POS_COLORS.get(pos_tag, '#CCCCCC'), edgecolor='none',
+                                      label=f"{POS_TRANSLATIONS[lang].get(pos_tag, pos_tag)}")
+                        for pos_tag in ['NOUN', 'VERB']]
+     ax.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=16)
+
+     return fig
+
+ ############################################################################################################################################
+ def identify_and_contextualize_entities(doc, lang):
+     entities = []
+     for ent in doc.ents:
+         # Get the context (3 tokens before and after the entity)
+         start = max(0, ent.start - 3)
+         end = min(len(doc), ent.end + 3)
+         context = doc[start:end].text
+
+         entities.append({
+             'text': ent.text,
+             'label': ent.label_,
+             'start': ent.start,
+             'end': ent.end,
+             'context': context
+         })
+
+     # Identify key concepts (the most frequent nouns and verbs, excluding stop words)
+     word_freq = Counter([token.lemma_.lower() for token in doc if token.pos_ in ['NOUN', 'VERB'] and not token.is_stop])
+     key_concepts = word_freq.most_common(10)  # Top 10 key concepts
+
+     return entities, key_concepts
+
+
+ ############################################################################################################################################
+ def perform_semantic_analysis(text, nlp, lang):
+     doc = nlp(text)
+
+     # Identify entities and key concepts
+     entities, key_concepts = identify_and_contextualize_entities(doc, lang)
+
+     # Print entities for debugging
+     print(f"Entities found ({lang}):")
+     for ent in doc.ents:
+         print(f"{ent.text} - {ent.label_}")
+
+     # Build the semantic-relations figure
+     relations_graph = visualize_semantic_relations(doc, lang)
+     return {
+         'entities': entities,
+         'key_concepts': key_concepts,
+         'relations_graph': relations_graph
+     }
+
+ __all__ = ['visualize_semantic_relations', 'create_semantic_graph', 'POS_COLORS', 'POS_TRANSLATIONS', 'identify_and_contextualize_entities']
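
For context, a minimal usage sketch of the new module, not part of this commit: it shows how perform_semantic_analysis could be called from a Streamlit app, assuming a spaCy pipeline such as es_core_news_sm is installed (the model name, the text_area label, and the script name are illustrative assumptions).

# usage_sketch.py (hypothetical example, not part of this commit)
import spacy
import streamlit as st

from modules.text_analysis.semantic_analysis import perform_semantic_analysis

# Assumes the small Spanish pipeline is installed:
#   python -m spacy download es_core_news_sm
nlp = spacy.load("es_core_news_sm")

text = st.text_area("Text to analyze")
if text:
    results = perform_semantic_analysis(text, nlp, lang='es')

    # key_concepts is a list of (lemma, frequency) pairs
    st.write(results['key_concepts'])

    # relations_graph is the matplotlib figure returned by visualize_semantic_relations
    st.pyplot(results['relations_graph'])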