Spaces:
Sleeping
Sleeping
import pandas as pd | |
import os | |
import nltk | |
from nltk.corpus import stopwords | |
import plotly.express as px | |
from collections import Counter | |
import re | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
# Lookup table from a place alias, as it may appear in text, to the canonical
# place name used elsewhere in this module. Entry order is kept as-is.
place_mapping = {
    # United States and the Americas
    'united states': 'United States',
    'u.s.': 'United States',
    'US': 'United States',  # the only key that is not lowercase
    'america': 'United States',
    'north america': 'North America',
    'usa': 'United States',
    'south america': 'South America',
    'american': 'United States',
    # Europe, Asia and the Middle East
    'europe': 'Europe',
    'eu': 'Europe',
    'china': 'China',
    'chinese': 'China',
    'russia': 'Russia',
    'arab': 'Arab Countries',
    'middle east': 'Middle East',
    'asia': 'Asia',
    'asian': 'Asia',
    # European countries
    'spain': 'Spain',
    'germany': 'Germany',
    'france': 'France',
    'uk': 'United Kingdom',
    'britain': 'United Kingdom',
    # Countries in the Americas
    'canada': 'Canada',
    'mexico': 'Mexico',
    'brazil': 'Brazil',
    'venezuela': 'Venezuela',
    # African countries
    'angola': 'Angola',
    'nigeria': 'Nigeria',
    'libya': 'Libya',
    # Middle East and Gulf
    'iraq': 'Iraq',
    'iran': 'Iran',
    'kuwait': 'Kuwait',
    'qatar': 'Qatar',
    'saudi arabia': 'Saudi Arabia',
    'gcc': 'Gulf Cooperation Council',
    # Additional regional aliases
    'asia-pacific': 'Asia',
    'southeast asia': 'Asia',
    'latin america': 'Latin America',
    'caribbean': 'Caribbean',
}
# Lookup table from a region name to the canonical names of the countries
# grouped under it. Add more regions as necessary.
region_mapping = {
    'North America': [
        'United States',
        'Canada',
        'Mexico',
    ],
    'South America': [
        'Brazil',
        'Venezuela',
    ],
    'Europe': [
        'United Kingdom',
        'Germany',
        'France',
        'Spain',
        'Russia',
    ],
    'Asia': [
        'China',
        'India',
        'Japan',
        'South Korea',
    ],
    'Middle East': [
        'Saudi Arabia',
        'Iran',
        'Iraq',
        'Qatar',
        'Kuwait',
    ],
    'Africa': [
        'Nigeria',
        'Libya',
        'Angola',
    ],
}
# Lookup table from an oil-and-gas term, as it may appear in text, to the
# label it is normalized to. Several terms share the 'Crude Oil' label.
nomenclature_mapping = {
    'petroleum': 'Petroleum',
    'energy': 'Energy',
    'fuel oil': 'Fuel Oil',
    'shale': 'Shale',
    'offshore': 'Offshore',
    'upstream': 'Upstream',
    'hsfo': 'HSFO',
    'downstream': 'Downstream',
    'crude oil': 'Crude Oil',
    'crude': 'Crude Oil',
    'refinery': 'Refinery',
    'oil field': 'Oil Field',
    'drilling': 'Drilling',
    'gas': 'Gas',
    'liquefied natural gas': 'LNG',
    'natural gas': 'NG',
    'oil': 'Crude Oil',
}
# Lookup table from a company alias, as it may appear in text, to the
# company name it is normalized to.
company_mapping = {
    'exxonmobil': 'ExxonMobil',
    'exxon': 'ExxonMobil',
    'chevron': 'Chevron',
    'bp': 'BP',
    'british petroleum': 'BP',
    'shell': 'Shell',
    'total energies': 'TotalEnergies',
    'conoco': 'ConocoPhillips',
    'halliburton': 'Halliburton',
    'slb': 'SLB',
    'schlumberger': 'SLB',
    'devon': 'Devon Energy',
    'occidental': 'Occidental Petroleum',
    'marathon': 'Marathon Oil',
    'valero': 'Valero Energy',
    'aramco': 'Aramco',
}
# Load the English stopword list. The corpus is downloaded only when it is
# not already installed, so importing this module does not contact the NLTK
# server on every start.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # NLTK raises LookupError when a corpus cannot be found locally.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
def tokenize(text):
    """Clean, tokenize and normalize ``text``, then drop stopwords.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, and the result is split on whitespace. Tokens
    are then normalized through ``place_mapping``, ``nomenclature_mapping``
    and ``company_mapping`` (in that order of precedence). Multi-word keys
    such as 'crude oil' or 'saudi arabia' are matched as whole phrases, the
    longest phrase winning, so that 'crude oil' yields a single 'Crude Oil'
    token. Words without a mapping are kept unchanged.

    Keys that contain punctuation or uppercase letters can never match,
    because the text is lowercased and stripped of punctuation before lookup.

    Args:
        text: The string to tokenize. Any non-string value (for example the
            NaN that pandas uses for a missing cell) yields an empty list.

    Returns:
        A list of normalized tokens with the entries of ``stop_words``
        removed.
    """
    if not isinstance(text, str):
        return []

    words = re.sub(r'[^\w\s]', '', text.lower()).split()

    # Later dicts override earlier ones, so places take precedence over
    # nomenclature, which takes precedence over companies.
    lookup = {**company_mapping, **nomenclature_mapping, **place_mapping}
    longest_phrase = max((len(key.split()) for key in lookup), default=1)

    mapped_words = []
    position = 0
    while position < len(words):
        # Try the longest candidate phrase first and shrink down to one word.
        for size in range(min(longest_phrase, len(words) - position), 0, -1):
            phrase = ' '.join(words[position:position + size])
            if phrase in lookup:
                mapped_words.append(lookup[phrase])
                position += size
                break
        else:
            # No mapping starts at this word; keep it as it is.
            mapped_words.append(words[position])
            position += 1

    return [word for word in mapped_words if word not in stop_words]
def generateChartBar(data, search_word, body=False):
    """Plot the 20 most common words in the rows that mention ``search_word``.

    Args:
        data: DataFrame with 'headline' and 'body' columns.
        search_word: Pattern looked for, ignoring case, in both columns. It
            is passed to ``Series.str.contains`` unchanged, so it is
            interpreted as a regular expression.
        body: When True, words from the 'body' column are counted in
            addition to the words from the 'headline' column.

    Returns:
        The bar chart figure produced by ``px.bar``.
    """
    # na=False makes missing cells count as "no match"; without it the mask
    # contains NaN and cannot be used to index the DataFrame.
    matches = (
        data['headline'].str.contains(search_word, case=False, na=False)
        | data['body'].str.contains(search_word, case=False, na=False)
    )
    filtered_df = data[matches]

    # Headlines are always counted; bodies only on request.
    columns = ['headline', 'body'] if body else ['headline']
    word_counts = Counter()
    for column in columns:
        # A row can match through one column while the other is missing.
        for text in filtered_df[column].dropna():
            word_counts.update(tokenize(text))

    top_words_df = pd.DataFrame(word_counts.most_common(20), columns=['word', 'frequency'])
    fig = px.bar(
        top_words_df,
        x='word',
        y='frequency',
        title=f'Top 20 Most Common Words (Excluding Stopwords) for "{search_word}"',
        labels={'word': 'Word', 'frequency': 'Frequency'},
        text='frequency',
    )
    return fig
def filterPlace(data, search_place):
    """Return the rows whose headline or body mentions ``search_place``.

    If ``search_place`` is a key of ``region_mapping``, the search covers the
    region itself and every country listed under it. Otherwise the input is
    resolved to its canonical name through ``place_mapping`` (falling back to
    the input itself when it has no entry). For each resulting place, the
    canonical name and every ``place_mapping`` alias that maps to it are
    searched as whole words or phrases, ignoring case.

    Args:
        data: DataFrame with 'headline' and 'body' columns.
        search_place: A region name, a canonical place name or a place alias.

    Returns:
        The matching rows as a DataFrame, or None (after printing a message)
        when nothing matches.
    """
    if search_place in region_mapping:
        targets = [search_place, *region_mapping[search_place]]
    else:
        targets = [place_mapping.get(search_place.lower(), search_place)]

    # Always search for the canonical names themselves: places without any
    # alias would otherwise produce an empty pattern, which matches every row.
    terms = {target.lower() for target in targets}
    terms.update(key for key, place in place_mapping.items() if place in targets)
    terms = {term for term in terms if term.strip()}

    if not terms:
        print(f'No data found for {search_place}. Please try a different location or region.')
        return None

    # Lookarounds are used instead of \b because \b fails after an alias that
    # ends in punctuation (such as 'u.s.') when whitespace follows it.
    alternatives = '|'.join(re.escape(term) for term in sorted(terms))
    synonyms_pattern = r'(?<!\w)(?:{})(?!\w)'.format(alternatives)

    # Filter the DataFrame for headlines or bodies containing any of the terms
    filtered_df = data[
        data['headline'].str.contains(synonyms_pattern, case=False, na=False)
        | data['body'].str.contains(synonyms_pattern, case=False, na=False)
    ]
    if filtered_df.empty:
        print(f'No data found for {search_place}. Please try a different location or region.')
        return None
    return filtered_df
def generateWordCloud(data):
    """Generate a word cloud from the headlines and bodies in ``data``.

    Args:
        data: DataFrame with 'headline' and 'body' columns.

    Returns:
        The object returned by ``WordCloud(...).generate`` for the combined
        text (800x400, white background).
    """
    # Headlines first, then bodies. Missing cells are dropped and the rest is
    # cast to str, because str.join raises TypeError on NaN values.
    texts = pd.concat([data['headline'], data['body']]).dropna().astype(str)
    text = ' '.join(texts)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    return wordcloud