Arhashmi committed
Commit 7b23c0c • 1 Parent(s): e2f752b

Upload 5 files

Files changed (6)
  1. .gitattributes +1 -0
  2. README.md +6 -6
  3. app.py +306 -0
  4. authors.csv +0 -0
  5. covid_abstracts.csv +3 -0
  6. requirements.txt +10 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+covid_abstracts.csv filter=lfs diff=lfs merge=lfs -text
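The new rule routes covid_abstracts.csv through Git LFS; a line in this form is typically produced by `git lfs track "covid_abstracts.csv"`, and it pairs with the LFS pointer committed for that file further down.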
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: KeyExtraction
-emoji: 🚀
-colorFrom: purple
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.13.0
+title: Keyextractionction
+emoji: 💻
+colorFrom: indigo
+colorTo: pink
+sdk: streamlit
+sdk_version: 1.29.0
 app_file: app.py
 pinned: false
 license: mit
app.py ADDED
@@ -0,0 +1,306 @@
import pandas
import nltk
nltk.download('wordnet')

# Load the dataset
dataset = pandas.read_csv('covid_abstracts.csv')
dataset.head()

# Fetch the word count for each title
dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
dataset[['title', 'word_count']].head()

# Descriptive statistics of word counts
dataset.word_count.describe()

# Identify common words
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]

# freq = pandas.Series(' '.join(dataset['title']).split()).value_counts()[:20]
freq

# Identify uncommon words
freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]

# freq1 = pandas.Series(' '.join(dataset['title']).split()).value_counts()[-20:]
freq1

from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "cryptogenic"
print("stemming:", stem.stem(word))
print("lemmatization:", lem.lemmatize(word, "v"))

import nltk
nltk.download('wordnet')

# Libraries for text preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
# nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# Create a list of stop words and add custom stop words
stop_words = set(stopwords.words("english"))
# Custom stop words
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)

print(stop_words)

print(new_words)
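# The loop below cleans each title: strip punctuation and digits, lowercase,
# drop stop words, and lemmatize. The hard-coded 3847 presumably matches the
# number of rows in covid_abstracts.csv; range(len(dataset)) would avoid the
# hard-coded count.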
corpus = []
for i in range(0, 3847):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', dataset['title'][i])

    # Convert to lowercase
    text = text.lower()

    # Remove tags
    text = re.sub("</?.*?>", " <> ", text)

    # Remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    # Convert from string to list
    text = text.split()

    # Stemming
    ps = PorterStemmer()
    # Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)

# View a corpus item
corpus[222]

# View another corpus item
corpus[300]
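# str(corpus) joins the whole cleaned corpus into a single string; WordCloud then
# draws the 100 most frequent remaining terms from it.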
# Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42
).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
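# The next section defines cv and X several times; each fit_transform overwrites
# the previous result, and the '...' entries are placeholders where further custom
# stop words can be added.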
from sklearn.feature_extraction.text import CountVectorizer
import re

# Assuming you have 'corpus' and 'stop_words' defined as in the previous code

# Create a CountVectorizer with the predefined English stop words
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

# Alternatively, use your custom stop words
custom_stop_words = ['same', 'hers', 'they', 'with', 'if', 'y', 'iv', 'new', ...]  # Add your custom stop words
cv = CountVectorizer(max_df=0.8, stop_words=custom_stop_words, max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

# from sklearn.feature_extraction.text import CountVectorizer
# import re
# cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=10000, ngram_range=(1, 3))
# X = cv.fit_transform(corpus)

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

custom_stop_words = ['from', 'to', 'against', 'each', 'own', ...]  # Add your custom stop words
cv = CountVectorizer(max_df=0.8, stop_words=custom_stop_words, max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

list(cv.vocabulary_.keys())[:10]
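# get_top_n_words fits a fresh CountVectorizer (unigrams, no stop-word filtering),
# sums the counts column-wise, and returns the n most frequent words.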
# Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Convert the most frequent words to a dataframe for plotting
top_words = get_top_n_words(corpus, n=20)
top_df = pandas.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]
# Bar plot of the most frequent words
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)

# Most frequently occurring bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top2_words = get_top_n2_words(corpus, n=20)
top2_df = pandas.DataFrame(top2_words)
top2_df.columns = ["Bi-gram", "Freq"]
print(top2_df)
# Bar plot of the most frequent bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)

# Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top3_words = get_top_n3_words(corpus, n=20)
top3_df = pandas.DataFrame(top3_words)
top3_df.columns = ["Tri-gram", "Freq"]
print(top3_df)
# Bar plot of the most frequent tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Assuming you already have the 'corpus' defined

# Create a CountVectorizer
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))

# Fit and transform the corpus
X = cv.fit_transform(corpus)

# Create a TfidfTransformer and fit it to the CountVectorizer output
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# Get feature names from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Fetch the document for which keywords need to be extracted
doc = corpus[82]

# Generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
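# sort_coo flattens the sparse 1 x n_features tf-idf row into (column index, score)
# pairs and sorts them by score, highest first.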
# Function for sorting tf-idf scores in descending order
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items."""

    # Use only the top n items from the vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    # Word index and corresponding tf-idf score
    for idx, score in sorted_items:
        # Keep track of the feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    # Create a dict of feature: score
    # results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results

# Sort the tf-idf vector by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())
# Extract only the top n; n here is 10
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

# Print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k, keywords[k])
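# Word2Vec is trained on the cleaned titles; most_similar() lists the terms whose
# embeddings are closest to "incidence". min_count=1 keeps every token, which is
# noisy but acceptable for a small demo corpus.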
from gensim.models import word2vec
tokenized_sentences = [sentence.split() for sentence in corpus]
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)

model.wv.most_similar(positive=["incidence"])

import nltk
# nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

wn.synsets('car')

wn.synset('car.n.01').definition()
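# The Gradio interface below only exposes the WordNet lookup; the keyword-extraction
# code above runs once when the app starts.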
import gradio as gr
from nltk.corpus import wordnet as wn

# Function to get the definition of the first synset for a given word
def get_synset_definition(word):
    synsets = wn.synsets(word)
    if synsets:
        first_synset = synsets[0]
        return first_synset.definition()
    else:
        return "No synsets found for the given word."

# Gradio interface
iface = gr.Interface(
    fn=get_synset_definition,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=True,
    title="Key Extraction By Daniyal Tabish",
    description="Enter a word to get the definition of its first WordNet synset.",
)

# Launch the Gradio interface
iface.launch()
authors.csv ADDED
The diff for this file is too large to render. See raw diff
 
covid_abstracts.csv ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4ab9415d8ef00b8f9512169a8ac4f2b720001beabdf3ff128bd25bb9317ace5c
size 16916623
requirements.txt ADDED
@@ -0,0 +1,10 @@
nltk
pandas
matplotlib
wordcloud

scikit-learn
gensim
gradio[oauth]==4.8.0
Pillow
seaborn