Narendra291 committed on
Commit
78f76dd
1 Parent(s): 89ffd2f

Update app.py

Browse files

added duplicate names removal functionality

Files changed (1) hide show
  1. app.py +22 -0
app.py CHANGED
@@ -309,6 +309,10 @@ TIME"""
309
  # dedupe the entities, but only on an exact match of values, as occasionally it will assign an ORG entity to PER
310
  deduplicated_entities = {frozenset(item.values()):
311
  item for item in article_entity_list}.values()
 
 
 
 
312
  # create a dataframe from the entities
313
  for record in deduplicated_entities:
314
  record_df = pd.DataFrame(record.items(), columns=["entity", "description"])
@@ -318,6 +322,20 @@ TIME"""
318
  print('______________________')
319
  return self.entity_df
320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  def entity_json(self):
322
  """Returns a JSON representation of an entity defined by the `entity_df` dataframe. The `entity_json` function
323
  will return a JSON object with the following fields:
@@ -444,6 +462,10 @@ if st.session_state['parsed']:
444
  'GPE':'Political Location',
445
  'NORP':'Political or Religious Groups',
446
  'LOC':'Non Political Location'})
 
 
 
 
447
  gb = GridOptionsBuilder.from_dataframe(df_to_st)
448
  gb.configure_pagination(paginationAutoPageSize=True) #Add pagination
449
  gb.configure_side_bar() #Add a sidebar
 
309
  # dedupe the entities, but only on an exact match of values, as occasionally it will assign an ORG entity to PER
310
  deduplicated_entities = {frozenset(item.values()):
311
  item for item in article_entity_list}.values()
312
+
313
+ #to remove duplicate names
314
+ fuzzy_remove_duplicate_ent(deduplicated_entities, threshold = 85, limit = 1)
315
+
316
  # create a dataframe from the entities
317
  for record in deduplicated_entities:
318
  record_df = pd.DataFrame(record.items(), columns=["entity", "description"])
 
322
  print('______________________')
323
  return self.entity_df
324
 
325
+
326
def fuzzy_remove_duplicate_ent(deduped_ents, threshold=85, limit=1):
    """Return the entities with fuzzy near-duplicates removed.

    The first occurrence of each entity is kept; any later entity whose best
    fuzzy-match score against the already-kept entities is strictly greater
    than ``threshold`` is dropped.

    The input iterable is NOT modified. A new list is returned, so callers
    must use the return value to see the de-duplicated result.

    Args:
        deduped_ents: Iterable of entities to filter.
            NOTE(review): ``process.extract`` is presumably the
            fuzzywuzzy/thefuzz helper, which scores strings -- confirm that
            the caller passes string names rather than whole records.
        threshold: Score above which two entities count as duplicates.
        limit: Maximum number of best matches requested from
            ``process.extract`` for each entity.

    Returns:
        A list of the retained entities, in their original order.
    """
    kept = []

    for ent in deduped_ents:
        # Compare only against entities already accepted. This never matches
        # an entity against itself and guarantees that exactly one member of
        # each near-duplicate group survives (the earliest one).
        matches = process.extract(ent, kept, limit=limit) if kept else []

        # Each match is expected to be a (choice, score, ...) tuple.
        if any(match[1] > threshold for match in matches):
            continue

        kept.append(ent)

    return kept
337
+
338
+
339
  def entity_json(self):
340
  """Returns a JSON representation of an entity defined by the `entity_df` dataframe. The `entity_json` function
341
  will return a JSON object with the following fields:
 
462
  'GPE':'Political Location',
463
  'NORP':'Political or Religious Groups',
464
  'LOC':'Non Political Location'})
465
+
466
+ # Replace name with zeroth element of related to
467
+ df_to_st['Name'] = df_to_st['Related to'].apply(lambda x : x[0])
468
+
469
  gb = GridOptionsBuilder.from_dataframe(df_to_st)
470
  gb.configure_pagination(paginationAutoPageSize=True) #Add pagination
471
  gb.configure_side_bar() #Add a sidebar