AIdeaText committed
Commit aee1800
1 Parent(s): e90add8

Update modules/semantic_analysis.py

Files changed (1)
  1. modules/semantic_analysis.py +27 -56
modules/semantic_analysis.py CHANGED
@@ -112,70 +112,41 @@ ENTITY_LABELS = {
 def count_pos(doc):
     return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')
 
-import spacy
-import networkx as nx
-import matplotlib.pyplot as plt
-from collections import Counter
-
-# Keep the POS_COLORS and POS_TRANSLATIONS definitions you already have
+#####################################################################################################################
 
-#############################################################################################################################
-def extract_entities(doc, lang):
-    entities = {label: [] for label in ENTITY_LABELS[lang].keys()}
-
-    for ent in doc.ents:
-        if ent.label_ == "PERSON":
-            entities[list(ENTITY_LABELS[lang].keys())[0]].append(ent.text)
-        elif ent.label_ in ["LOC", "GPE"]:
-            entities[list(ENTITY_LABELS[lang].keys())[2]].append(ent.text)
-        elif ent.label_ == "DATE":
-            entities[list(ENTITY_LABELS[lang].keys())[3]].append(ent.text)
-        else:
-            entities[list(ENTITY_LABELS[lang].keys())[1]].append(ent.text)
-
-    return entities
+def create_semantic_graph(doc, lang):
+    G = nx.Graph()
+    word_freq = defaultdict(int)
+    lemma_to_word = {}
+    lemma_to_pos = {}
 
-#####################################################################################################################
+    # Count frequencies of lemmas and map lemmas to their most common word form and POS
+    for token in doc:
+        if token.pos_ in ['NOUN', 'VERB']:
+            lemma = token.lemma_.lower()
+            word_freq[lemma] += 1
+            if lemma not in lemma_to_word or token.text.lower() == lemma:
+                lemma_to_word[lemma] = token.text
+                lemma_to_pos[lemma] = token.pos_
 
-#def visualize_context_graph(doc, lang):
-#    G = nx.Graph()
-#    entities = extract_entities(doc, lang)
-#    color_map = ENTITY_LABELS[lang]
+    # Get top 20 most frequent lemmas
+    top_lemmas = [lemma for lemma, _ in sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]]
 
     # Add nodes
-#    for category, items in entities.items():
-#        for item in items:
-#            G.add_node(item, category=category)
+    for lemma in top_lemmas:
+        word = lemma_to_word[lemma]
+        G.add_node(word, pos=lemma_to_pos[lemma])
 
     # Add edges
-#    for sent in doc.sents:
-#        sent_entities = [ent for ent in sent.ents if ent.text in G.nodes()]
-#        for i in range(len(sent_entities)):
-#            for j in range(i+1, len(sent_entities)):
-#                G.add_edge(sent_entities[i].text, sent_entities[j].text)
-
-    # Visualize
-#    plt.figure(figsize=(30, 22))  # Increased figure size
-#    pos = nx.spring_layout(G, k=0.7, iterations=50)  # Adjusted layout
-
-#    node_colors = [color_map[G.nodes[node]['category']] for node in G.nodes()]
-
-#    nx.draw(G, pos, node_color=node_colors, with_labels=True,
-#            node_size=10000,  # Increased node size
-#            font_size=18,  # Increased font size
-#            font_weight='bold',
-#            width=2,  # Increased edge width
-#            arrowsize=30)  # Increased arrow size
-
-    # Add a legend
-#    legend_elements = [plt.Rectangle((0,0),1,1,fc=color, edgecolor='none', label=category)
-#                       for category, color in color_map.items()]
-#    plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1, 1), fontsize=16)  # Increased legend font size
-
-#    plt.title("Análisis del Contexto" if lang == 'es' else "Context Analysis" if lang == 'en' else "Analyse du Contexte", fontsize=24)  # Increased title font size
-#    plt.axis('off')
+    for token in doc:
+        if token.lemma_.lower() in top_lemmas:
+            if token.head.lemma_.lower() in top_lemmas:
+                source = lemma_to_word[token.lemma_.lower()]
+                target = lemma_to_word[token.head.lemma_.lower()]
+                if source != target:  # Avoid self-loops
+                    G.add_edge(source, target, label=token.dep_)
 
-#    return plt
+    return G, word_freq
 
 ############################################################################################################################################
 
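For reference, here is a minimal usage sketch of the new create_semantic_graph function; it is not part of the commit. It assumes that networkx, collections.defaultdict, and Counter are already imported at the top of modules/semantic_analysis.py (the hunk starts at line 112, so those imports are not visible here), and the spaCy model name is an illustrative assumption. Note that lang is accepted by the function but is not referenced in the lines shown in this hunk.

import spacy
# Import path assumed from this repo's layout (modules/semantic_analysis.py).
from modules.semantic_analysis import create_semantic_graph

# Load a spaCy pipeline; "es_core_news_sm" is an assumed model name for illustration.
nlp = spacy.load("es_core_news_sm")
doc = nlp("El estudiante analiza el texto y construye un grafo semántico del contenido.")

# Build the graph of the 20 most frequent noun/verb lemmas and their dependency links.
G, word_freq = create_semantic_graph(doc, lang='es')

print(G.nodes(data=True))  # each node carries the POS of the lemma's most common surface form
print(G.edges(data=True))  # each edge carries the dependency relation (token.dep_) as 'label'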