import pandas
import nltk
nltk.download('wordnet')
# load the dataset
dataset = pandas.read_csv('covid_abstracts.csv')
dataset.head()
#Fetch word count for each title
dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
dataset[['title','word_count']].head()
##Descriptive statistics of word counts
dataset.word_count.describe()
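# Optional illustrative plot (added example): distribution of title word counts
import matplotlib.pyplot as plt
dataset['word_count'].hist(bins=20)
plt.xlabel('Words per title')
plt.ylabel('Number of titles')
plt.show()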
#Identify common words
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]
#freq = pandas.Series(' '.join(dataset['title']).split()).value_counts()[:20]
freq
#Identify uncommon words
freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]
#freq1 = pandas.Series(' '.join(dataset['title']).split()).value_counts()[-20:]
freq1
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "cryptogenic"
print("stemming:",stem.stem(word))
print("lemmatization:", lem.lemmatize(word, "v"))
import nltk
nltk.download('wordnet')
# Libraries for text preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
##Creating a set of stop words and adding custom stopwords
stop_words = set(stopwords.words("english"))
##Creating a list of custom stopwords
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)
print(stop_words)
print(new_words)
corpus = []
ps = PorterStemmer()        # stemmer (instantiated here but not applied below)
lem = WordNetLemmatizer()   # lemmatiser applied to every token
for i in range(len(dataset)):
    #Remove punctuation (keep letters and spaces only)
    text = re.sub('[^a-zA-Z]', ' ', str(dataset['title'][i]))
    #Convert to lowercase
    text = text.lower()
    #Remove tags
    text = re.sub("</?.*?>", " <> ", text)
    #Remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    ##Convert the string to a list of tokens
    text = text.split()
    ##Lemmatise each token and drop stop words
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
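# Sanity check (illustrative): one cleaned string per dataset row
print(len(corpus), "documents in corpus")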
#View corpus item
corpus[222]
#View corpus item
corpus[300]
#Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42
).generate(" ".join(corpus))   # join the documents rather than str() of the list
fig = plt.figure(1)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
from sklearn.feature_extraction.text import CountVectorizer
import re
# 'corpus' and 'stop_words' are defined in the preprocessing steps above
# Create a CountVectorizer with predefined English stop words
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)
# Alternatively, reuse the custom stop-word set built earlier
# (CountVectorizer accepts any list/set of stop words)
cv = CountVectorizer(max_df=0.8, stop_words=list(stop_words), max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)
list(cv.vocabulary_.keys())[:10]
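# Illustrative check (added): dimensions of the document-term matrix
print(X.shape)   # (number of documents, number of n-gram features)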
#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
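# Quick illustrative check of get_top_n_words on a tiny toy corpus (added example)
toy_corpus = ["covid vaccine trial", "vaccine efficacy trial", "covid vaccine"]
print(get_top_n_words(toy_corpus, n=3))   # e.g. vaccine (3), covid (2), trial (2)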
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, n=20)
top_df = pandas.DataFrame(top_words)
top_df.columns=["Word", "Freq"]
#Barplot of most freq words
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
#Most frequently occurring Bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top2_words = get_top_n2_words(corpus, n=20)
top2_df = pandas.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
print(top2_df)
#Barplot of most freq Bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)
#Most frequently occurring Tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top3_words = get_top_n3_words(corpus, n=20)
top3_df = pandas.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
print(top3_df)
#Barplot of most freq Tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
# 'corpus' is the preprocessed text built above
# Create a CountVectorizer
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
# Fit and transform the corpus
X = cv.fit_transform(corpus)
# Create a TfidfTransformer and fit it to the CountVectorizer output
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)
# Get feature names from CountVectorizer
feature_names = cv.get_feature_names_out()
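# Illustrative peek (added) at the fitted IDF weights: the lowest-IDF terms are
# the n-grams that occur in the most documents
import numpy as np
lowest_idf = np.argsort(tfidf_transformer.idf_)[:10]
print([feature_names[i] for i in lowest_idf])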
# Fetch document for which keywords need to be extracted
doc = corpus[82]
# Generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
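# Tiny illustrative check (added): sort_coo should put the highest-scoring column first
from scipy.sparse import csr_matrix
demo = csr_matrix([[0.1, 0.0, 0.7, 0.3]]).tocoo()
print(sort_coo(demo))   # column 2 (score 0.7) first, then 3, then 0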
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    #use only top n items from vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    #create a dict of feature -> score
    #results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results
#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())
#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)
# now print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k, keywords[k])
from gensim.models import word2vec
tokenized_sentences = [sentence.split() for sentence in corpus]
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
model.wv.most_similar(positive=["incidence"])
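# Illustrative checks on the trained embeddings (added; assumes the gensim 4.x API)
print(len(model.wv.key_to_index))    # vocabulary size
print(model.wv['incidence'].shape)   # vector dimensionality (100 by default)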
import nltk
#nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
wn.synsets('car')
wn.synset('car.n.01').definition()
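# A couple of further illustrative WordNet lookups on the same synset (added)
wn.synset('car.n.01').lemma_names()   # synonyms grouped in this synset
wn.synset('car.n.01').hypernyms()     # more general (parent) concepts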
import gradio as gr
from nltk.corpus import wordnet as wn
# Function to get the definition of the first synset for a given word
def get_synset_definition(word):
    synsets = wn.synsets(word)
    if synsets:
        first_synset = synsets[0]
        return first_synset.definition()
    else:
        return "No synsets found for the given word."
# Gradio Interface
iface = gr.Interface(
    fn=get_synset_definition,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=True,
    title="Key Extraction",
    description="Enter a word to get the definition of its first WordNet synset.",
)
# Launch the Gradio interface
iface.launch()