import pandas
import nltk
nltk.download('wordnet')
# load the dataset
dataset = pandas.read_csv('covid_abstracts.csv')
dataset.head()
#Fetch word count for each title
dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
dataset[['title','word_count']].head()
##Descriptive statistics of word counts
dataset.word_count.describe()
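# Optional illustrative plot (added example): distribution of title word counts
import matplotlib.pyplot as plt
dataset['word_count'].hist(bins=20)
plt.xlabel('Words per title')
plt.ylabel('Number of titles')
plt.show()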
#Identify common words
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]
#freq = pandas.Series(' '.join(dataset['title']).split()).value_counts()[:20]
freq
#Identify uncommon words
freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]
#freq1 = pandas.Series(' '.join(dataset['title']).split()).value_counts()[-20:]
freq1
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "cryptogenic"
print("stemming:",stem.stem(word))
print("lemmatization:", lem.lemmatize(word, "v"))
import nltk
nltk.download('wordnet')
# Libraries for text preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
##Creating a set of stop words and adding custom stopwords
stop_words = set(stopwords.words("english"))
##Creating a list of custom stopwords
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)
print(stop_words)
print(new_words)
corpus = []
ps = PorterStemmer()        # stemmer (instantiated here but not applied below)
lem = WordNetLemmatizer()   # lemmatiser applied to every token
for i in range(len(dataset)):
    #Remove punctuation (keep letters and spaces only)
    text = re.sub('[^a-zA-Z]', ' ', str(dataset['title'][i]))
    #Convert to lowercase
    text = text.lower()
    #Remove tags
    text = re.sub("</?.*?>", " <> ", text)
    #Remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    ##Convert the string to a list of tokens
    text = text.split()
    ##Lemmatise each token and drop stop words
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
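# Sanity check (illustrative): one cleaned string per dataset row
print(len(corpus), "documents in corpus")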
#View corpus item
corpus[222]
#View corpus item
corpus[300]
#Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42
).generate(" ".join(corpus))   # join the documents rather than str() of the list
fig = plt.figure(1)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
from sklearn.feature_extraction.text import CountVectorizer
import re
# 'corpus' and 'stop_words' are defined in the preprocessing steps above
# Create a CountVectorizer with predefined English stop words
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)
# Alternatively, reuse the custom stop-word set built earlier
# (CountVectorizer accepts any list/set of stop words)
cv = CountVectorizer(max_df=0.8, stop_words=list(stop_words), max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)
list(cv.vocabulary_.keys())[:10]
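# Illustrative check (added): dimensions of the document-term matrix
print(X.shape)   # (number of documents, number of n-gram features)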
#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
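# Quick illustrative check of get_top_n_words on a tiny toy corpus (added example)
toy_corpus = ["covid vaccine trial", "vaccine efficacy trial", "covid vaccine"]
print(get_top_n_words(toy_corpus, n=3))   # e.g. vaccine (3), covid (2), trial (2)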
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, n=20)
top_df = pandas.DataFrame(top_words)
top_df.columns=["Word", "Freq"]
#Barplot of most freq words
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
#Most frequently occurring Bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top2_words = get_top_n2_words(corpus, n=20)
top2_df = pandas.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
print(top2_df)
#Barplot of most freq Bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)
#Most frequently occurring Tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top3_words = get_top_n3_words(corpus, n=20)
top3_df = pandas.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
print(top3_df)
#Barplot of most freq Tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
# 'corpus' is the preprocessed text built above
# Create a CountVectorizer
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
# Fit and transform the corpus
X = cv.fit_transform(corpus)
# Create a TfidfTransformer and fit it to the CountVectorizer output
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)
# Get feature names from CountVectorizer
feature_names = cv.get_feature_names_out()
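# Illustrative peek (added) at the fitted IDF weights: the lowest-IDF terms are
# the n-grams that occur in the most documents
import numpy as np
lowest_idf = np.argsort(tfidf_transformer.idf_)[:10]
print([feature_names[i] for i in lowest_idf])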
# Fetch document for which keywords need to be extracted
doc = corpus[82]
# Generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
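# Tiny illustrative check (added): sort_coo should put the highest-scoring column first
from scipy.sparse import csr_matrix
demo = csr_matrix([[0.1, 0.0, 0.7, 0.3]]).tocoo()
print(sort_coo(demo))   # column 2 (score 0.7) first, then 3, then 0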
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    #use only top n items from vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    #create a dict of feature -> score
    #results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results
#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())
#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)
# now print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k, keywords[k])
from gensim.models import word2vec
tokenized_sentences = [sentence.split() for sentence in corpus]
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
model.wv.most_similar(positive=["incidence"])
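# Illustrative checks on the trained embeddings (added; assumes the gensim 4.x API)
print(len(model.wv.key_to_index))    # vocabulary size
print(model.wv['incidence'].shape)   # vector dimensionality (100 by default)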
import nltk
#nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
wn.synsets('car')
wn.synset('car.n.01').definition()
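# A couple of further illustrative WordNet lookups on the same synset (added)
wn.synset('car.n.01').lemma_names()   # synonyms grouped in this synset
wn.synset('car.n.01').hypernyms()     # more general (parent) concepts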
import gradio as gr
from nltk.corpus import wordnet as wn
# Function to get the definition of the first synset for a given word
def get_synset_definition(word):
    synsets = wn.synsets(word)
    if synsets:
        first_synset = synsets[0]
        return first_synset.definition()
    else:
        return "No synsets found for the given word."
# Gradio Interface
iface = gr.Interface(
    fn=get_synset_definition,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=True,
    title="Key Extraction",
    description="Enter a word to get the definition of its first WordNet synset.",
)
# Launch the Gradio interface
iface.launch()