pritamdeka committed on
Commit
895bc99
β€’
1 Parent(s): ffda8a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -11
app.py CHANGED
@@ -1,6 +1,9 @@
1
  import nltk
2
  import re
3
  import nltkmodule
 
 
 
4
 
5
  from nltk.tokenize import word_tokenize
6
  from sentence_transformers import SentenceTransformer
@@ -38,13 +41,35 @@ def remove_stopwords(sen):
38
  sen_new = " ".join([i for i in sen if i not in stop_words])
39
  return sen_new
40
 
41
- def keyphrase_generator(article, model_1, model_2, max_num_keywords):
42
  element=[]
 
43
  document=[]
44
- text=[]
 
 
45
  model_1 = SentenceTransformer(model_1)
46
  model_2 = SentenceTransformer(model_2)
 
 
 
47
  corpus=sent_tokenize(article)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ").tolist()
49
  corpus_embeddings = model_1.encode(clean_sentences_new)
50
  sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
@@ -54,21 +79,28 @@ def keyphrase_generator(article, model_1, model_2, max_num_keywords):
54
  sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1,768), corpus_embeddings[j].reshape(1,768))[0,0]
55
  nx_graph = nx.from_numpy_array(sim_mat)
56
  scores = nx.pagerank(nx_graph)
57
- ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(corpus)), reverse=True)
58
- for elem in ranked_sentences:
59
- element.append(elem[1])
60
- a=int((10*len(element))/100.0)
 
 
 
 
 
 
 
61
  if(a<5):
62
  total=5
63
  else:
64
  total=int(a)
65
  for i in range(total):
66
- document.append(element[i])
67
  doc=" ".join(document)
68
  for i in document:
69
  doc_1=nlp(i)
70
- text.append([X.text for X in doc_1.ents])
71
- entity_list = [item for sublist in text for item in sublist]
72
  entity_list = [word for word in entity_list if not word in all_stopwords]
73
  entity_list=list(dict.fromkeys(entity_list))
74
  doc_embedding = model_2.encode([doc])
@@ -80,9 +112,8 @@ def keyphrase_generator(article, model_1, model_2, max_num_keywords):
80
  keywords = '\n'.join(keyword_list)
81
  return keywords
82
 
83
-
84
  igen=gr.Interface(keyphrase_generator,
85
- inputs=[gr.inputs.Textbox(lines=10, placeholder="Provide article text here",default="", label="article text"),gr.inputs.Textbox(lines=1, placeholder="SBERT model",default="all-mpnet-base-v2", label="Model for TextRank (e.g. all-mpnet-base-v2)"),gr.inputs.Textbox(lines=1, placeholder="SBERT model",default="all-distilroberta-v1",label="Model for keyphrases (e.g. all-distilroberta-v1)"),gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
86
  outputs="text", theme="huggingface",
87
  title="Scientific Article Keyphrase Generator",
88
  description="Generates the keyphrases from an article which best describes the article.",
 
1
  import nltk
2
  import re
3
  import nltkmodule
4
+ from newspaper import Article
5
+ from newspaper import fulltext
6
+ import requests
7
 
8
  from nltk.tokenize import word_tokenize
9
  from sentence_transformers import SentenceTransformer
 
41
  sen_new = " ".join([i for i in sen if i not in stop_words])
42
  return sen_new
43
 
44
+ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
45
  element=[]
46
+ final_textrank_list=[]
47
  document=[]
48
+ text_doc=[]
49
+ score_list=[]
50
+ sum_list=[]
51
  model_1 = SentenceTransformer(model_1)
52
  model_2 = SentenceTransformer(model_2)
53
+ url = article_link
54
+ html = requests.get(url).text
55
+ article = fulltext(html)
56
  corpus=sent_tokenize(article)
57
+ indicator_list=['concluded','concludes','in a study', 'concluding','conclude','in sum','in a recent study','therefore','thus','so','hence',
58
+ 'as a result','accordingly','consequently','in short','proves that','shows that','suggests that','demonstrates that','found that','observed that',
59
+ 'indicated that','suggested that','demonstrated that']
60
+ count_dict={}
61
+ for l in corpus:
62
+ c=0
63
+ for l2 in indicator_list:
64
+ if l.find(l2)!=-1:#then it is a substring
65
+ c=1
66
+ break
67
+ if c:#
68
+ count_dict[l]=1
69
+ else:
70
+ count_dict[l]=0
71
+ for sent, score in count_dict.items():
72
+ score_list.append(score)
73
  clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ").tolist()
74
  corpus_embeddings = model_1.encode(clean_sentences_new)
75
  sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
 
79
  sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1,768), corpus_embeddings[j].reshape(1,768))[0,0]
80
  nx_graph = nx.from_numpy_array(sim_mat)
81
  scores = nx.pagerank(nx_graph)
82
+ sentences=((scores[i],s) for i,s in enumerate(corpus))
83
+
84
+ for elem in sentences:
85
+ element.append(elem[0])
86
+ for sc, lst in zip(score_list, element): ########## taking the scores from both the lists
87
+ sum1=sc+lst
88
+ sum_list.append(sum1)
89
+ x=sorted(((sum_list[i],s) for i,s in enumerate(corpus)), reverse=True)
90
+ for elem in x:
91
+ final_textrank_list.append(elem[1])
92
+ a=int((10*len(final_textrank_list))/100.0)
93
  if(a<5):
94
  total=5
95
  else:
96
  total=int(a)
97
  for i in range(total):
98
+ document.append(final_textrank_list[i])
99
  doc=" ".join(document)
100
  for i in document:
101
  doc_1=nlp(i)
102
+ text_doc.append([X.text for X in doc_1.ents])
103
+ entity_list = [item for sublist in text_doc for item in sublist]
104
  entity_list = [word for word in entity_list if not word in all_stopwords]
105
  entity_list=list(dict.fromkeys(entity_list))
106
  doc_embedding = model_2.encode([doc])
 
112
  keywords = '\n'.join(keyword_list)
113
  return keywords
114
 
 
115
  igen=gr.Interface(keyphrase_generator,
116
+ inputs=[gr.inputs.Textbox(lines=3, placeholder="Provide article link here",default="", label="article link"),gr.inputs.Textbox(lines=1, placeholder="SBERT model",default="all-mpnet-base-v2", label="Model for TextRank (e.g. all-mpnet-base-v2)"),gr.inputs.Textbox(lines=1, placeholder="SBERT model",default="all-distilroberta-v1",label="Model for keyphrases (e.g. all-distilroberta-v1)"),gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
117
  outputs="text", theme="huggingface",
118
  title="Scientific Article Keyphrase Generator",
119
  description="Generates the keyphrases from an article which best describes the article.",