suwesh committed on
Commit
f1e5b3d
1 Parent(s): f356ea9

Upload 2 files

Files changed (2)
  1. engine.py +390 -0
  2. requirements.txt +4 -0
engine.py ADDED
@@ -0,0 +1,390 @@
+ import multiprocessing
+ import os
+ import pandas as pd
+ import requests
+ from bs4 import BeautifulSoup
+ import re
+ import string
+ import nltk
+ import time
+ nltk.download('punkt')
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+ nltk.download('cmudict')
+ from nltk.corpus import stopwords
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.corpus import cmudict
+
+ folderpath = r'C:\Users/suwes/SentimentEngine/'
+ textfile_path = f"{folderpath}inputtext/"
+ stopword_path = f"{folderpath}StopWords/"
+ masterdict_path = f"{folderpath}MasterDictionary/"
+
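+ # note: folderpath mixes '\' and '/'; Windows tolerates this, but building the sub-folder paths
+ # with os.path.join (or pathlib.Path) would be more portable than f-string concatenation.
+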
+ def createdf():
+     inputxlsx = os.path.join(folderpath, "Input.xlsx")
+     dfxlsx = pd.read_excel(inputxlsx)
+     print(dfxlsx)
+     df_urls = dfxlsx['URL']
+     #print(df_urls)
+     return dfxlsx
+
+ df = createdf()
+
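+ # note: pandas reads and writes .xlsx files through an optional engine such as openpyxl, which is
+ # not listed in requirements.txt; install it if read_excel or to_excel raises a missing-dependency error.
+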
+ def extract(df):
+     #extracting article text from urls
+     def extract_urltext(url):
+         response = requests.get(url)  # send GET request to url
+         soup = BeautifulSoup(response.content, 'html.parser')
+         article_title = soup.find('title').get_text()  # find and extract title of article
+         article_content = soup.find('div', class_='td-pb-span8 td-main-content')  # find and extract article text
+         article_text = ''
+         if article_content:
+             for para in article_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+                 article_text += para.get_text()
+         #print(article_title)
+         #print(article_text)
+         return article_title, article_text
+
+     #url = 'https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040/'
+     #extract_urltext(url)
+     #article_title, article_text = extract_urltext(url)
+
+     for index, row in df.iterrows():
+         url = row['URL']
+         url_id = row['URL_ID']
+         article_title, article_text = extract_urltext(url)
+         #save text to file
+         filename = f"{folderpath}inputtext/{url_id}.txt"
+         with open(filename, 'w', encoding='utf-8') as file:
+             file.write(article_title + '\n\n' + article_text)
+         print(f"text saved to file {filename}")
+
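+ # note: extract_urltext calls requests.get with no timeout or status check; a more defensive sketch
+ # would be `response = requests.get(url, timeout=30)` followed by `response.raise_for_status()`
+ # inside a try/except, so a slow or failing URL does not hang or abort the whole extraction loop.
+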
+ #extract data
+ extract(df)
+
+ def transform(df):
+     #cleaning stop words
+     #reading stop words from stopword files
+     def read_stopwords(stopword_folder):
+         stopwords = set()
+         filenames = os.listdir(stopword_folder)
+         # process each file
+         for filename in filenames:
+             filepath = os.path.join(stopword_folder, filename)
+             #read stop words from each file
+             with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
+                 stopwords.update(map(str.strip, file.readlines()))
+         return stopwords
+     #stop words
+     stopwords = read_stopwords(stopword_path)
+
+     #cleaning stop words from text
+     def clean_stopwords(text, stopwords):
+         #tokenize text
+         words = word_tokenize(text)
+         #remove stop words from text
+         cleaned_words = [word for word in words if word.lower() not in stopwords]
+         #reconstructing cleaned text
+         cleaned_text = ' '.join(cleaned_words)
+         return cleaned_text
+
+     #cleaning stop words from a directory/multiple files
+     def clean_stopwords_directory(directory, stopwords):
+         #list all files in directory
+         filenames = os.listdir(directory)
+         #cleaning each file
+         for filename in filenames:
+             filepath = os.path.join(directory, filename)
+             #read text from each file
+             with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
+                 text = file.read()
+             #clean stop words from file text
+             cleaned_text = clean_stopwords(text, stopwords)
+             #write back cleaned text
+             with open(filepath, 'w', encoding='utf-8', errors='ignore') as file:
+                 file.write(cleaned_text)
+             print(f"cleaned text from {filename}")
+
+     clean_stopwords_directory(textfile_path, stopwords)
+     #creating dictionary of positive and negative words
+     def create_posneg_dict(masterdict_path, stopwords):
+         poswords = set()
+         negwords = set()
+         #read positive-words file
+         with open(os.path.join(masterdict_path, 'positive-words.txt'), 'r', encoding='utf-8', errors='ignore') as file:
+             for line in file:
+                 words = line.strip().split()
+                 for word in words:
+                     if word.lower() not in stopwords:
+                         poswords.add(word.lower())
+         #read negative-words file
+         with open(os.path.join(masterdict_path, 'negative-words.txt'), 'r', encoding='utf-8', errors='ignore') as file:
+             for line in file:
+                 words = line.strip().split()
+                 for word in words:
+                     if word.lower() not in stopwords:
+                         negwords.add(word.lower())
+         return poswords, negwords
+
+     positivewords, negativewords = create_posneg_dict(masterdict_path, stopwords)
+     #print(positivewords)
+     #print(negativewords)
+     return stopwords, positivewords, negativewords
+
+ #cleaning/transforming data
+ stopwords, positivewords, negativewords = transform(df)
+
+ #load data
+ result_df = pd.DataFrame()
+ def loadoutput(folderpath, result_df):
+     exceloutfilepath = f"{folderpath}Output.xlsx"
+     result_df.to_excel(exceloutfilepath, index=False)
+     print(f"output file saved to {exceloutfilepath}")
+     print(f"analysis time: {int((time.time() - starttime)//3600)} hours {int(((time.time() - starttime)%3600)//60)} minutes {int((time.time() - starttime)%60)} seconds")
+
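+ # note: loadoutput reads `starttime`, which is only assigned later in the __main__ block; this works
+ # because loadoutput is called from that block, but passing the start time in as a parameter would
+ # make the dependency explicit.
+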
+ #process text files
+ def runengine(df, stopwords, files_subset, dflist):
+     #sentiment analysis
+     #calculating variables
+     def calculate_positivescore(words, positivewords):
+         positivescore = sum(1 for word in words if word.lower() in positivewords)
+         return positivescore
+
+     def calculate_negativescore(words, negativewords):
+         negativescore = (sum(-1 for word in words if word.lower() in negativewords))*(-1)
+         return negativescore
+
+     #analysis of readability
+     def calc_readibility(words, sentences):
+         #calculate average length of sentences
+         avg_sentencelen = len(words)/len(sentences) if sentences else 0
+         #calculate % of complex words
+         complexwords = [word for word in words if syllable_count(word) > 2]
+         percent_complexwords = len(complexwords)/len(words)*100 if words else 0
+         #calculate fog index
+         fog_index = 0.4*(avg_sentencelen + percent_complexwords)
+         return avg_sentencelen, percent_complexwords, fog_index
+
+     #average words per sentence
+     def avg_wordspersentence(words, sentences):
+         if len(sentences) > 0:
+             averagewords = len(words)/len(sentences)
+             return averagewords
+         else: return 0
+
+     #complex word count
+     def syllable_count(word):
+         d = cmudict.dict()
+         if word.lower() in d:
+             return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
+         else:
+             return 0
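+     # note: cmudict.dict() above appears to rebuild the pronouncing dictionary on every call, once
+     # per word; if this becomes a bottleneck, one option is to build it a single time per process
+     # (e.g. a module-level `cmu_dict = cmudict.dict()`) and look words up in that cached dict instead.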
+     def complexwords_count(words):
+         complexwords = [word for word in words if syllable_count(word) > 2]
+         return len(complexwords)
+
+     #clean words count
+     def cleanwords_count(words, stopwords):
+         punctuations = set(string.punctuation)
+         cleaned_words = [word.lower() for word in words if word.lower() not in stopwords and word.lower() not in punctuations]
+         return len(cleaned_words)
+
+     #syllable count per word
+     #vowel syllable count per word
+     def vowel_syllable(word):
+         vowels = 'aeiouy'
+         count = 0
+         endings = 'es', 'ed', 'e'
+         #exceptions for words with these endings
+         word = word.lower().strip()
+         if word.endswith(endings):
+             word = word[:-2]  # drop 2 characters from end of word
+         elif word.endswith('le'):
+             word = word[:-2]
+             endings = ''
+         elif word.endswith('ing'):
+             word = word[:-3]  # drop 3 characters from end of word
+             endings = ''
+         #counting vowels in word
+         if len(word) <= 3:
+             return 1
+         for index, letter in enumerate(word):
+             if letter in vowels and (index == 0 or word[index-1] not in vowels):
+                 count += 1
+         #handling y as vowel at end of word
+         if word.endswith('y') and word[-2] not in vowels:
+             count += 1
+         return count
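+     # note: vowel_syllable above does not appear to be called anywhere; the syllable totals and the
+     # complex-word checks below all go through the cmudict-based syllable_count instead.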
+     #per text
+     def vowel_syllable_perword(words):
+         total_syllables = sum(syllable_count(word) for word in words)
+         return total_syllables
+
+     #personal pronouns
+     def count_pronouns(text):
+         pattern = r'\b(?:I|we|my|ours|us)\b'  # regex pattern for matching pronouns
+         #find all matches
+         matches = re.findall(pattern, text, flags=re.IGNORECASE)
+         #exclude 'US' when referring to the USA
+         matches_fin = [match for match in matches if match.lower() != 'us']
+         countpron = len(matches_fin)  # count of pronouns
+         return countpron
+
+     #average word length
+     def calc_avg_wordlength(words):
+         total_chars = sum(len(word) for word in words)  # total characters in text
+         total_words = len(words)
+         if total_words != 0:
+             avg_wordlength = total_chars/total_words
+         else: avg_wordlength = 0
+         return avg_wordlength
+
+     def appendtodf(url_idkey, calc_values, process_df):
+         rowindex = df[df['URL_ID'] == url_idkey].index  # get index of row where URL_ID == url_idkey
+         if not rowindex.empty:
+             idx_toupdate = rowindex[0]
+             # Create a new row with the columns from the original DataFrame df
+             new_row = pd.DataFrame(columns=process_df.columns)
+             # Assign the existing values from df to the new row at the corresponding index
+             new_row.loc[0, process_df.columns[:2]] = df.loc[idx_toupdate, ['URL_ID', 'URL']]
+             # Update the new row with the calculated values
+             for col, value in calc_values.items():
+                 new_row[col] = value
+             # Add the new row to process_df
+             process_df = process_df._append(new_row, ignore_index=True)
+             print(f"Result updated for {url_idkey}")
+         else:
+             print(f"!not found {url_idkey}")
+         return process_df
+
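+     # note: appendtodf relies on DataFrame._append, which is a private pandas helper; an equivalent
+     # using only the public API would be `process_df = pd.concat([process_df, new_row], ignore_index=True)`.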
+     #process data/ processing each file
+     process_df = pd.DataFrame(columns=df.columns)
+     for filename in files_subset:
+         filepath = os.path.join(textfile_path, filename)
+         #to update values for each URL_ID
+         url_idkey = re.search(r'blackassign\d{4}', filepath).group()
+         if os.path.isfile(filepath):
+             with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
+                 text = file.read()
+             #tokenize text
+             words = word_tokenize(text)
+             sentences = sent_tokenize(text)
+             totalwords = len(words)
+
+             #calculate positive score
+             positive_score = calculate_positivescore(words, positivewords)
+             print(f"{filename} positive score: {positive_score}")
+
+             #calculate negative score
+             negative_score = calculate_negativescore(words, negativewords)
+             print(f"{filename} negative score: {negative_score}")
+
+             #calculate polarity score
+             polarity_score = (positive_score - negative_score)/((positive_score + negative_score) + 0.000001)
+             print(f"{filename} polarity score: {polarity_score}")
+
+             #calculate subjectivity score
+             subjectivity_score = (positive_score + negative_score)/((totalwords) + 0.000001)
+             print(f"{filename} subjectivity score: {subjectivity_score}")
+
+             #readability analysis
+             avg_sentencelen, percent_complexwords, fog_index = calc_readibility(words, sentences)
+             print(f"{filename} avg sentence length: {avg_sentencelen}")
+             #load(df, "AVG SENTENCE LENGTH", avg_sentencelen, url_idkey)
+             print(f"{filename} percentage of complex words: {percent_complexwords}")
+             #load(df, "PERCENTAGE OF COMPLEX WORDS", percent_complexwords, url_idkey)
+             print(f"{filename} Fog Index: {fog_index}")
+
+             #average number of words per sentence
+             avg_wordper_sentence = avg_wordspersentence(words, sentences)
+             print(f"{filename} avg words per sentence: {avg_wordper_sentence}")
+
+             #complex word count
+             complexword_count = complexwords_count(words)
+             print(f"{filename} complex words count: {complexword_count}")
+
+             #word count
+             cleanword_count = cleanwords_count(words, stopwords)
+             print(f"{filename} clean words count: {cleanword_count}")
+
+             #syllable count per word
+             syllablecount_perword = vowel_syllable_perword(words)
+             print(f"{filename} syllable count per word: {syllablecount_perword}")
+
+             #personal pronouns
+             pronouns_count = count_pronouns(text)
+             print(f"{filename} personal pronouns count: {pronouns_count}")
+
+             #avg word length
+             avg_wordlength = calc_avg_wordlength(words)
+             print(f"{filename} avg word length: {avg_wordlength}")
+         else: print(f"df not updated for {filename}!")
+
+         calc_values = {
+             "POSITIVE SCORE": positive_score,
+             "NEGATIVE SCORE": negative_score,
+             "POLARITY SCORE": polarity_score,
+             "SUBJECTIVITY SCORE": subjectivity_score,
+             "AVG SENTENCE LENGTH": avg_sentencelen,
+             "PERCENTAGE OF COMPLEX WORDS": percent_complexwords,
+             "FOG INDEX": fog_index,
+             "AVG NUMBER OF WORDS PER SENTENCE": avg_wordper_sentence,
+             "COMPLEX WORD COUNT": complexword_count,
+             "WORD COUNT": cleanword_count,
+             "SYLLABLE PER WORD": syllablecount_perword,
+             "PERSONAL PRONOUNS": pronouns_count,
+             "AVG WORD LENGTH": avg_wordlength
+         }
+         try:
+             process_df = appendtodf(url_idkey, calc_values, process_df)
+         except Exception as e:
+             print(e)
+     print(process_df)
+     dflist.append(process_df)
+
+
+
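+ # note: on Windows, multiprocessing starts each worker with the "spawn" method, which re-imports
+ # this module; the module-level nltk downloads and the createdf/extract/transform calls above will
+ # therefore run again in every child process unless they are moved under the __main__ guard below.
+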
+ #runengine(df, stopwords, files_subset, dflist)
+ if __name__ == '__main__':
+     starttime = time.time()
+     files_toprocess = os.listdir(textfile_path)
+     #files_toprocess = [r'blackassign0049.txt', r'blackassign0099.txt', r'blackassign0100.txt']
+     num_processes = multiprocessing.cpu_count()
+     print(str(num_processes) + " CPUs")
+     files_perprocess = len(files_toprocess) // num_processes
+     print(files_perprocess)
+
+     processes = []
+     # Create a Manager object to share a list among processes
+     manager = multiprocessing.Manager()
+     dflist = manager.list()
+
+     for i in range(num_processes):
+         try:
+             start = i*files_perprocess
+             end = (i+1)*files_perprocess if i != num_processes-1 else len(files_toprocess)
+             files_subset = files_toprocess[start:end]
+
+             p = multiprocessing.Process(target=runengine, args=(df, stopwords, files_subset, dflist))
+             processes.append(p)
+             p.start()
+         except Exception as e:
+             print(e)
+
+     print("waiting for all processes to end...")
+     for i in processes:
+         print(i)
+     for process in processes:
+         try:
+             process.join()
+         except Exception as e:
+             print(e)
+     for i in processes:
+         print(i)
+
+     print(str(len(dflist)) + " result dataframes obtained.")
+     result_df = pd.concat(dflist, ignore_index=True)
+     result_df = result_df.sort_values(by='URL_ID')
+     print(result_df)
+
+     loadoutput(folderpath, result_df)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ pandas==2.1.4
+ requests==2.31.0
+ beautifulsoup4==4.12.2
+ nltk==3.8.1