Upload 2 files
Browse files- engine.py +390 -0
- requirements.txt +4 -0
@@ -0,0 +1,390 @@
1 |
import multiprocessing
2 |
import os
3 |
import pandas as pd
4 |
import requests
5 |
from bs4 import BeautifulSoup
6 |
import re
7 |
import string
8 |
import nltk
9 |
import time
10 |
11 |
12 |
13 |
14 |
from nltk.corpus import stopwords
15 |
from nltk.tokenize import sent_tokenize, word_tokenize
16 |
from nltk.corpus import cmudict
17 |
18 |
folderpath = r'C:\Users/suwes/SentimentEngine/'
19 |
textfile_path = f"{folderpath}inputtext/"
20 |
stopword_path = f"{folderpath}StopWords/"
21 |
masterdict_path = f"{folderpath}MasterDictionary/"
22 |
23 |
def createdf():
    """Load the input workbook that lists the articles to analyse.

    Reads ``Input.xlsx`` from the module-level ``folderpath`` directory.

    Returns:
        pandas.DataFrame: the sheet as returned by ``pd.read_excel``.

    Raises:
        KeyError: if the sheet has no ``URL`` column.
    """
    inputxlsx = os.path.join(folderpath, "Input.xlsx")
    dfxlsx = pd.read_excel(inputxlsx)

    # The original bound dfxlsx['URL'] to a local that was never used; its only
    # effect was failing early on a missing column, so keep that check explicit.
    if 'URL' not in dfxlsx.columns:
        raise KeyError('URL')

    return dfxlsx
30 |
31 |
df = createdf()
32 |
33 |
def extract(df):
    """Download every article listed in *df* and save it as a text file.

    For each row the page at ``row['URL']`` is fetched, and its title plus
    article text are written to ``{folderpath}inputtext/{URL_ID}.txt``.

    Args:
        df: DataFrame providing ``URL`` and ``URL_ID`` columns.
    """
    heading_and_paragraph_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    def extract_urltext(url):
        """Return ``(article_title, article_text)`` scraped from *url*."""
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.find returns None when the page has no <title>; calling
        # get_text() on it used to raise AttributeError.
        title_tag = soup.find('title')
        article_title = title_tag.get_text() if title_tag else ''

        article_content = soup.find('div', class_='td-pb-span8 td-main-content')
        if not article_content:
            return article_title, ''

        article_text = ''.join(
            node.get_text()
            for node in article_content.find_all(heading_and_paragraph_tags)
        )
        return article_title, article_text

    for _, row in df.iterrows():
        url = row['URL']
        url_id = row['URL_ID']
        article_title, article_text = extract_urltext(url)
        # save text to file
        filename = f"{folderpath}inputtext/{url_id}.txt"
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(article_title + '\n\n' + article_text)
        print(f"text saved to file {filename}")
61 |
62 |
#extract data
63 |
64 |
65 |
def transform(df):
66 |
#cleaning stop words
67 |
#reading stop words from stopword files
68 |
def read_stopwords(stopword_folder):
    """Collect the stop words stored in every file of *stopword_folder*.

    Each non-empty line of each file becomes one entry. Entries are
    lower-cased because every lookup in this module compares against
    ``word.lower()``; un-normalised entries could never match.

    Args:
        stopword_folder: directory containing the stop-word text files.

    Returns:
        set[str]: the lower-cased, whitespace-stripped entries.
    """
    stopwords = set()
    for filename in os.listdir(stopword_folder):
        filepath = os.path.join(stopword_folder, filename)
        # os.listdir also returns sub-directories, which cannot be opened.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            for line in file:
                entry = line.strip().lower()
                if entry:
                    stopwords.add(entry)
    return stopwords
78 |
#stop words
79 |
stopwords = read_stopwords(stopword_path)
80 |
81 |
#cleaning stop words from text
82 |
def clean_stopwords(text, stopwords):
    """Return *text* with every stop-word token removed.

    The text is tokenised with ``word_tokenize``; tokens whose lower-cased
    form is in *stopwords* are dropped and the rest are re-joined with single
    spaces, so the original spacing is not preserved.

    Args:
        text: raw text to clean.
        stopwords: container of lower-case stop words.

    Returns:
        str: the remaining tokens joined by spaces.
    """
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stopwords
    ]
    return ' '.join(kept_tokens)
90 |
91 |
#cleaning stop words from a directory/multiple files
92 |
def clean_stopwords_directory(directory, stopwords):
    """Strip stop words from every file in *directory*, rewriting in place.

    Args:
        directory: folder whose files are cleaned.
        stopwords: container of lower-case stop words passed to
            ``clean_stopwords``.
    """
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        # read text from each file
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
        cleaned_text = clean_stopwords(text, stopwords)
        # Write the cleaned text back. Without this write, opening in 'w'
        # mode only truncates the file and the article is lost.
        with open(filepath, 'w', encoding='utf-8', errors='ignore') as file:
            file.write(cleaned_text)
        print(f"cleaned text from {filename}")
107 |
108 |
clean_stopwords_directory(textfile_path, stopwords)
109 |
#creating dictionary of positive and negative words
110 |
def create_posneg_dict(masterdict_path, stopwords):
    """Build the positive and negative word sets from the master dictionary.

    Reads ``positive-words.txt`` and ``negative-words.txt`` from
    *masterdict_path* and keeps every whitespace-separated word whose
    lower-cased form is not a stop word.

    Args:
        masterdict_path: directory containing the two dictionary files.
        stopwords: container of lower-case stop words to exclude.

    Returns:
        tuple[set[str], set[str]]: ``(poswords, negwords)``.
    """
    def load_words(filename):
        """Return the non-stop-word entries of one dictionary file."""
        collected = set()
        dictionary_file = os.path.join(masterdict_path, filename)
        with open(dictionary_file, 'r', encoding='utf-8', errors='ignore') as file:
            for line in file:
                for word in line.strip().split():
                    lowered = word.lower()
                    if lowered not in stopwords:
                        # NOTE(review): the statement that stored the word was
                        # missing here. The lower-cased form is stored because
                        # the scoring code looks words up via word.lower().
                        collected.add(lowered)
        return collected

    poswords = load_words('positive-words.txt')
    negwords = load_words('negative-words.txt')
    return poswords, negwords
128 |
129 |
positivewords, negativewords = create_posneg_dict(masterdict_path, stopwords)
130 |
131 |
132 |
return stopwords, positivewords, negativewords
133 |
134 |
#cleaning/transforming data
135 |
stopwords, positivewords, negativewords = transform(df)
136 |
137 |
#load data
138 |
result_df = pd.DataFrame()
139 |
def loadoutput(folderpath, result_df, started_at=None):
    """Write the results to ``Output.xlsx`` and report the elapsed time.

    Args:
        folderpath: output directory prefix; it is concatenated directly with
            the file name, so it must end with a path separator.
        result_df: object exposing ``to_excel(path, index=False)``.
        started_at: optional ``time.time()`` stamp of when the run began.
            When omitted, the module-level ``starttime`` is used if it
            exists; otherwise the timing line is skipped instead of raising
            ``NameError``.
    """
    exceloutfilepath = f"{folderpath}Output.xlsx"
    result_df.to_excel(exceloutfilepath, index=False)
    print(f"output file saved to {exceloutfilepath}")

    if started_at is None:
        # starttime is only assigned under the __main__ guard.
        started_at = globals().get('starttime')
    if started_at is None:
        return

    # Read the clock once so the three components describe the same instant.
    elapsed = int(time.time() - started_at)
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"analysis time: {hours} hours {minutes} minutes {seconds} seconds")
144 |
145 |
#process text files
146 |
def runengine(df, stopwords, files_subset, dflist):
147 |
#sentimental analysis
148 |
#calculating variables
149 |
def calculate_positivescore(words, positivewords):
    """Count the tokens in *words* whose lower-cased form is a positive word.

    Args:
        words: iterable of token strings.
        positivewords: container of lower-case positive words.

    Returns:
        int: number of matching tokens (repeats are counted each time).
    """
    return sum(1 for word in words if word.lower() in positivewords)
152 |
153 |
def calculate_negativescore(words, negativewords):
    """Count the tokens in *words* whose lower-cased form is a negative word.

    The result is a non-negative count; the original summed -1 per match and
    multiplied by -1, which is the same value.

    Args:
        words: iterable of token strings.
        negativewords: container of lower-case negative words.

    Returns:
        int: number of matching tokens (repeats are counted each time).
    """
    return sum(1 for word in words if word.lower() in negativewords)
156 |
157 |
#analysis of readability
158 |
def calc_readibility(words, sentences):
    """Compute the readability metrics for one tokenised text.

    A word is treated as complex when ``syllable_count`` reports more than
    two syllables for it.

    Args:
        words: list of word tokens.
        sentences: list of sentences.

    Returns:
        tuple: ``(avg_sentencelen, percent_complexwords, fog_index)`` where
        ``fog_index = 0.4 * (avg_sentencelen + percent_complexwords)``.
        The two averages are 0 when their denominator is empty.
    """
    # calculate average length of sentences
    avg_sentencelen = len(words) / len(sentences) if sentences else 0

    # calculate % of complex words
    if words:
        complex_total = sum(1 for word in words if syllable_count(word) > 2)
        percent_complexwords = complex_total / len(words) * 100
    else:
        percent_complexwords = 0

    # calculate fog index
    fog_index = 0.4 * (avg_sentencelen + percent_complexwords)
    return avg_sentencelen, percent_complexwords, fog_index
167 |
168 |
#average words per text
169 |
def avg_wordspersentence(words, sentences):
    """Return the mean number of words per sentence.

    Args:
        words: list of word tokens.
        sentences: list of sentences.

    Returns:
        float | int: ``len(words) / len(sentences)``, or 0 when there are no
        sentences.
    """
    if not sentences:
        return 0
    return len(words) / len(sentences)
174 |
175 |
#complex word count
176 |
def syllable_count(word):
    """Return the syllable count of *word* according to the CMU dictionary.

    Syllables are counted as the phonemes ending in a digit (the stress
    marker) in the first pronunciation listed for the lower-cased word.

    Args:
        word: token to look up.

    Returns:
        int: number of syllables, or 0 when the word is not in the dictionary.
    """
    # cmudict.dict() builds the whole pronunciation mapping. The original
    # rebuilt it on every call (once per word); it is now built once and kept
    # on the function object.
    pronunciations = getattr(syllable_count, '_pronunciations', None)
    if pronunciations is None:
        pronunciations = cmudict.dict()
        syllable_count._pronunciations = pronunciations

    variants = pronunciations.get(word.lower())
    if not variants:
        return 0
    return sum(1 for phoneme in variants[0] if phoneme[-1].isdigit())
182 |
def complexwords_count(words):
    """Count the complex words in *words*.

    A word is complex when ``syllable_count`` reports more than two syllables.

    Args:
        words: iterable of word tokens.

    Returns:
        int: number of complex words.
    """
    return sum(1 for word in words if syllable_count(word) > 2)
185 |
186 |
#clean words count
187 |
def cleanwords_count(words, stopwords):
    """Count tokens that are neither stop words nor single punctuation marks.

    Args:
        words: iterable of token strings.
        stopwords: container of lower-case stop words.

    Returns:
        int: number of tokens whose lower-cased form is not in *stopwords*
        and is not one of the characters in ``string.punctuation``.
    """
    punctuations = set(string.punctuation)
    total = 0
    for word in words:
        lowered = word.lower()
        if lowered not in stopwords and lowered not in punctuations:
            total += 1
    return total
191 |
192 |
#syllable count per word
193 |
#vowel syllable count per word
194 |
def vowel_syllable(word):
    """Estimate the syllables in *word* by counting vowel groups.

    This is a spelling heuristic, not a dictionary lookup: the word is
    lower-cased, a trailing ``es``/``ed``/``e`` (two characters) or ``ing``
    (three characters) is trimmed, and each run of consecutive vowels
    (``a e i o u y``) in what remains counts as one syllable.

    Args:
        word: token to analyse.

    Returns:
        int: estimated syllable count; 1 for any word that is three
        characters or shorter after trimming.
    """
    vowels = 'aeiouy'
    word = word.lower().strip()

    # exceptions for words with endings
    # The original also had a branch for 'le', but it was unreachable (such
    # words already end in 'e') and misspelled endswith; it trimmed the same
    # two characters, so dropping it changes nothing.
    if word.endswith(('es', 'ed', 'e')):
        word = word[:-2]
    elif word.endswith('ing'):
        word = word[:-3]

    if len(word) <= 3:
        return 1

    # Count the start of each vowel run. 'y' is already in `vowels`, so a
    # final 'y' is counted here; the original added it a second time
    # afterwards, e.g. reporting 3 syllables for "happy".
    count = 0
    previous_is_vowel = False
    for letter in word:
        is_vowel = letter in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel
    return count
218 |
#per text
219 |
def vowel_syllable_perword(words):
    """Return the total syllables across *words* using ``syllable_count``.

    NOTE(review): despite the name, this returns the sum over the whole text,
    not a per-word figure, and it uses the dictionary-based
    ``syllable_count`` rather than ``vowel_syllable``. Confirm this is what
    the "SYLLABLE PER WORD" output column is meant to hold.

    Args:
        words: iterable of word tokens.

    Returns:
        int: sum of ``syllable_count(word)`` over all tokens.
    """
    return sum(syllable_count(word) for word in words)
222 |
223 |
#personal pronouns
224 |
def count_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in *text*.

    Matching is case-insensitive and on whole words. The all-capitals form
    ``US`` is excluded because it is taken to mean the country.

    Args:
        text: text to scan.

    Returns:
        int: number of pronoun occurrences.
    """
    pattern = r'\b(?:I|we|my|ours|us)\b'  # regex pattern for matching pronouns
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    # Exclude only the upper-case 'US'. The original compared the lower-cased
    # match with 'us', which also dropped every real "us" pronoun, and it
    # appended the whole match list instead of the single match.
    pronouns = [match for match in matches if match != 'US']
    return len(pronouns)
232 |
233 |
#average word length
234 |
def calc_avg_wordlength(words):
    """Return the mean number of characters per token.

    Args:
        words: list of token strings.

    Returns:
        float | int: total characters divided by the number of tokens, or 0
        for an empty list.
    """
    if not words:
        return 0
    total_chars = sum(len(word) for word in words)  # total characters in text
    return total_chars / len(words)
241 |
242 |
def appendtodf(url_idkey, calc_values, process_df):
    """Return *process_df* with one result row added for *url_idkey*.

    The row carries ``URL_ID`` and ``URL`` copied from the first row of the
    input frame ``df`` (taken from the enclosing scope) whose ``URL_ID``
    equals *url_idkey*, plus every entry of *calc_values*.

    Args:
        url_idkey: identifier to look up in ``df['URL_ID']``.
        calc_values: mapping of output column name to computed value.
        process_df: results accumulated so far; it is not modified in place.

    Returns:
        pandas.DataFrame: the extended frame, or *process_df* unchanged when
        the identifier is not present in ``df``.
    """
    matching_index = df.index[df['URL_ID'] == url_idkey]
    if matching_index.empty:
        print(f"!not found {url_idkey}")
        return process_df

    source_row = df.loc[matching_index[0]]
    row_values = {'URL_ID': source_row['URL_ID'], 'URL': source_row['URL']}
    row_values.update(calc_values)
    new_row = pd.DataFrame([row_values])

    if process_df.empty:
        # Keep the existing column order without concatenating an empty
        # frame, which newer pandas versions warn about.
        ordered = list(process_df.columns) + [
            col for col in new_row.columns if col not in process_df.columns
        ]
        updated = new_row.reindex(columns=ordered)
    else:
        # pd.concat is the public replacement for the private
        # DataFrame._append the original relied on.
        updated = pd.concat([process_df, new_row], ignore_index=True)

    print(f"Result updated for {url_idkey}")
    return updated
259 |
260 |
#process data/ processing each file
261 |
# Per-file processing loop of runengine: computes every metric for each file in
# files_subset and collects one result row per article in process_df.
# NOTE(review): this span had lost its `try:` line, the closing brace of
# calc_values, the body of the `except` clause and whatever followed it. The
# error handling and the final dflist.append below are reconstructions; confirm
# them against the original file.
process_df = pd.DataFrame(columns=df.columns)
for filename in files_subset:
    filepath = os.path.join(textfile_path, filename)
    # to update values for each URL_ID
    id_match = re.search(r'blackassign\d{4}', filepath)
    # Skip entries that are not regular files or carry no URL_ID. The original
    # called .group() on a possible None, and for non-files it still built
    # calc_values from undefined or stale variables.
    if id_match is None or not os.path.isfile(filepath):
        print(f"df not updated for {filename}!")
        continue
    url_idkey = id_match.group()

    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
        # tokenize text
        words = word_tokenize(text)
        sentences = sent_tokenize(text)
        totalwords = len(words)

        # calculate positive score
        positive_score = calculate_positivescore(words, positivewords)
        print(f"{filename} positive socre: {positive_score}")

        # calculate negative score
        negative_score = calculate_negativescore(words, negativewords)
        print(f"{filename} negative socre: {negative_score}")

        # calculate polarity score (the small constant avoids division by zero)
        polarity_score = (positive_score - negative_score)/((positive_score+negative_score)+0.000001)
        print(f"{filename} polarity socre: {polarity_score}")

        # calculate subjective score
        subjectivity_score = (positive_score+negative_score)/((totalwords)+0.000001)
        print(f"{filename} subjectivity socre: {subjectivity_score}")

        # readability analysis
        avg_sentencelen, percent_complexwords, fog_index = calc_readibility(words, sentences)
        print(f"{filename} avg sentencelength: {avg_sentencelen}")
        print(f"{filename} percentage of complex words: {percent_complexwords}")
        print(f"{filename} Fog Index: {fog_index}")

        # average number of words per sentence
        avg_wordper_sentence = avg_wordspersentence(words, sentences)
        print(f"{filename} avg words per sentence: {avg_wordper_sentence}")

        # complex word count
        complexword_count = complexwords_count(words)
        print(f"{filename} complex words count: {complexword_count}")

        # word count
        cleanword_count = cleanwords_count(words, stopwords)
        print(f"{filename} clean words count: {cleanword_count}")

        # syllable count per word
        syllablecount_perword = vowel_syllable_perword(words)
        print(f"{filename} syllable count per word: {syllablecount_perword}")

        # personal pronouns
        pronouns_count = count_pronouns(text)
        print(f"{filename} personal pronouns count: {pronouns_count}")

        # avg word length
        avg_wordlength = calc_avg_wordlength(words)
        print(f"{filename} avg word length: {avg_wordlength}")

        calc_values = {
            "POSITIVE SCORE": positive_score,
            "NEGATIVE SCORE": negative_score,
            "POLARITY SCORE": polarity_score,
            "SUBJECTIVITY SCORE": subjectivity_score,
            "AVG SENTENCE LENGTH": avg_sentencelen,
            "PERCENTAGE OF COMPLEX WORDS": percent_complexwords,
            "FOG INDEX": fog_index,
            "AVG NUMBER OF WORDS PER SENTENCE": avg_wordper_sentence,
            "COMPLEX WORD COUNT": complexword_count,
            "WORD COUNT": cleanword_count,
            "SYLLABLE PER WORD": syllablecount_perword,
            "PERSONAL PRONOUNS": pronouns_count,
            "AVG WORD LENGTH": avg_wordlength,
        }

        process_df = appendtodf(url_idkey, calc_values, process_df)
    except Exception as e:
        # One unreadable article should not abort the rest of this worker's
        # share; report it and move on.
        print(f"error while processing {filename}: {e}")

# Hand this worker's rows to the parent through the shared list.
# NOTE(review): presumed from the caller, which concatenates the entries of
# dflist; the original statement was not visible.
dflist.append(process_df)
#runengine(df, stopwords, files_subset, dflist)
348 |
# Script entry point: split the article files across one worker process per
# CPU, wait for them, merge their result frames and write the output workbook.
# NOTE(review): this span had lost its `try:` lines and the bodies of several
# loops and `except` clauses (presumably start/append/join/terminate calls).
# Those statements are reconstructed below; confirm against the original file.
if __name__ == '__main__':
    starttime = time.time()
    files_toprocess = os.listdir(textfile_path)
    #files_toprocess = [r'blackassign0049.txt', r'blackassign0099.txt', r'blackassign0100.txt']
    num_processes = multiprocessing.cpu_count()
    print(str(num_processes)+ " CPUs")
    files_perprocess = len(files_toprocess) // num_processes

    processes = []
    # Create a Manager object to share a list among processes
    manager = multiprocessing.Manager()
    dflist = manager.list()

    for i in range(num_processes):
        start = i*files_perprocess
        # The last worker also takes the remainder of the integer division.
        end = (i+1)*files_perprocess if i != num_processes-1 else len(files_toprocess)
        files_subset = files_toprocess[start:end]
        # With fewer files than CPUs most slices are empty; do not spawn a
        # process that has nothing to do.
        if not files_subset:
            continue
        try:
            p = multiprocessing.Process(target=runengine, args=(df, stopwords, files_subset, dflist))
            p.start()
            processes.append(p)
        except Exception as e:
            print(f"could not start worker {i}: {e}")

    print("waiting for all processes to end...")
    try:
        for process in processes:
            process.join()
    except Exception as e:
        print(f"error while waiting for workers: {e}")
        for process in processes:
            process.terminate()

    print(str(len(dflist))+" result dataframes obtained.")
    # pd.concat raises ValueError on an empty sequence, so only merge when at
    # least one worker reported; otherwise the module-level empty result_df is
    # written as-is.
    if len(dflist) > 0:
        result_df = pd.concat(list(dflist), ignore_index=True)
        result_df = result_df.sort_values(by='URL_ID')

    loadoutput(folderpath, result_df)
@@ -0,0 +1,4 @@
1 |
2 |
3 |
4 |