Spaces:
Runtime error
Runtime error
Narendra291
committed on
Commit
•
78f76dd
1
Parent(s):
89ffd2f
Update app.py
Browse files — added duplicate-name removal functionality
app.py
CHANGED
@@ -309,6 +309,10 @@ TIME"""
|
|
309 |
# dedupe the entities, but only on an exact match of values, as occasionally it will assign an ORG entity to PER
|
310 |
deduplicated_entities = {frozenset(item.values()):
|
311 |
item for item in article_entity_list}.values()
|
|
|
|
|
|
|
|
|
312 |
# create a dataframe from the entities
|
313 |
for record in deduplicated_entities:
|
314 |
record_df = pd.DataFrame(record.items(), columns=["entity", "description"])
|
@@ -318,6 +322,20 @@ TIME"""
|
|
318 |
print('______________________')
|
319 |
return self.entity_df
|
320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
def entity_json(self):
|
322 |
"""Returns a JSON representation of an entity defined by the `entity_df` dataframe. The `entity_json` function
|
323 |
will return a JSON object with the following fields:
|
@@ -444,6 +462,10 @@ if st.session_state['parsed']:
|
|
444 |
'GPE':'Political Location',
|
445 |
'NORP':'Political or Religious Groups',
|
446 |
'LOC':'Non Political Location'})
|
|
|
|
|
|
|
|
|
447 |
gb = GridOptionsBuilder.from_dataframe(df_to_st)
|
448 |
gb.configure_pagination(paginationAutoPageSize=True) #Add pagination
|
449 |
gb.configure_side_bar() #Add a sidebar
|
|
|
309 |
# dedupe the entities, but only on an exact match of values, as occasionally it will assign an ORG entity to PER
|
310 |
deduplicated_entities = {frozenset(item.values()):
|
311 |
item for item in article_entity_list}.values()
|
312 |
+
|
313 |
+
#to remove duplicate names
|
314 |
+
fuzzy_remove_duplicate_ent(deduplicated_entities, threshold = 85, limit = 1)
|
315 |
+
|
316 |
# create a dataframe from the entities
|
317 |
for record in deduplicated_entities:
|
318 |
record_df = pd.DataFrame(record.items(), columns=["entity", "description"])
|
|
|
322 |
print('______________________')
|
323 |
return self.entity_df
|
324 |
|
325 |
+
|
326 |
+
def fuzzy_remove_duplicate_ent(deduped_ents, threshold=85, limit=1):
    """Drop entities that are near-duplicates of an entity seen earlier.

    Entities are visited in their original order. Each entity that is still
    kept is compared against every other kept entity via ``process.extract``;
    matches scoring strictly above ``threshold`` are discarded, so the first
    occurrence of each near-duplicate group survives.

    Args:
        deduped_ents: Iterable of entities to filter. It is copied, never
            mutated. NOTE(review): entries must be values ``process.extract``
            can score (presumably strings) -- confirm against the caller.
        threshold: Score a match must exceed to count as a duplicate.
        limit: Maximum number of matches requested from ``process.extract``
            for each entity.

    Returns:
        A new list with the surviving entities in their original order. The
        caller must use this return value; the input is left untouched.
    """
    survivors = list(deduped_ents)

    # Iterate over a snapshot so that rebinding ``survivors`` below does not
    # disturb the loop.
    for ent in list(survivors):
        if ent not in survivors:
            # Already discarded as a duplicate of an earlier entity; using it
            # as a query would also remove the entity we decided to keep.
            continue

        # list.remove() works in place and returns None, so build the
        # candidate list first and only then pass it on.
        candidates = list(survivors)
        candidates.remove(ent)
        if not candidates:
            break

        # NOTE(review): ``process`` is presumably fuzzywuzzy/thefuzz's
        # ``process`` module; each match is indexed as (choice, score, ...).
        matches = process.extract(ent, candidates, limit=limit)
        duplicates = [match[0] for match in matches if match[1] > threshold]

        if duplicates:
            # Keep the querying entity itself even if an equal value was
            # reported as its duplicate.
            survivors = [
                other for other in survivors
                if other is ent or other not in duplicates
            ]

    return survivors
|
337 |
+
|
338 |
+
|
339 |
def entity_json(self):
|
340 |
"""Returns a JSON representation of an entity defined by the `entity_df` dataframe. The `entity_json` function
|
341 |
will return a JSON object with the following fields:
|
|
|
462 |
'GPE':'Political Location',
|
463 |
'NORP':'Political or Religious Groups',
|
464 |
'LOC':'Non Political Location'})
|
465 |
+
|
466 |
+
# Replace name with zeroth element of related to
|
467 |
+
df_to_st['Name'] = df_to_st['Related to'].apply(lambda x : x[0])
|
468 |
+
|
469 |
gb = GridOptionsBuilder.from_dataframe(df_to_st)
|
470 |
gb.configure_pagination(paginationAutoPageSize=True) #Add pagination
|
471 |
gb.configure_side_bar() #Add a sidebar
|