"""Streamlit app for political speech analysis: moral foundations, sentiment and
emotion trajectories, readability, key phrases, semantic networks, and NER."""

import re

import streamlit as st
import pandas as pd
import numpy as np
import torch
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from textstat import flesch_reading_ease, flesch_kincaid_grade
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    pipeline,
    RobertaTokenizer,
    RobertaForSequenceClassification
)

# st.set_page_config must be the first Streamlit command in the script.
st.set_page_config(page_title="Political Speech Analysis", page_icon="🗣️", layout="wide")

# Fetch the NLTK resources used below (quiet=True keeps the UI clean).
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Load the large English spaCy model, failing with install instructions if absent.
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    st.error("Please install spaCy and the en_core_web_lg model:\n"
             "pip install spacy\n"
             "python -m spacy download en_core_web_lg")
    st.stop()

# The five foundations of Moral Foundations Theory (Haidt and colleagues),
# keyed by the labels the classifier below predicts.
MORAL_FOUNDATIONS = {
    'care': 'Care/Harm',
    'fairness': 'Fairness/Cheating',
    'loyalty': 'Loyalty/Betrayal',
    'authority': 'Authority/Subversion',
    'sanctity': 'Sanctity/Degradation'
}

# Surface-level lexical markers associated with common rhetorical devices.
RHETORICAL_DEVICES = {
    'analogy': ['like', 'as', 'similar to'],
    'repetition': ['repetitive', 'recurring'],
    'metaphor': ['as if', 'like', 'represents'],
    'hyperbole': ['always', 'never', 'absolute'],
    'rhetorical_question': ['?']
}
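
# NOTE: these marker lists are rough heuristics rather than a validated taxonomy;
# SpeechAnalyzer.detect_rhetorical_devices counts whole-word matches against them,
# so its output is an indicative signal, not a precise count of devices used.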


class SpeechAnalyzer:
    def __init__(self):
        # RoBERTa classifier fine-tuned to score the five moral foundations.
        self.moral_model_path = "MMADS/MoralFoundationsClassifier"
        self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
        self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)

        # Order must match the label order the moral foundations model predicts.
        self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']

        # Default Hugging Face sentiment model plus a BERT-based NER pipeline.
        self.sentiment_pipeline = pipeline("sentiment-analysis")
        self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)

        # DistilRoBERTa classifier for basic emotions (joy, sadness, anger, etc.).
        self.emotion_classifier = pipeline("text-classification",
                                           model="j-hartmann/emotion-english-distilroberta-base")

    def split_text(self, text, max_length=256, overlap=50):
        """Split long text into overlapping segments of at most max_length words."""
        words = text.split()
        segments = []
        current_segment = []

        for word in words:
            if len(current_segment) + 1 > max_length:
                segments.append(' '.join(current_segment))
                # Carry the last `overlap` words into the next segment for context.
                current_segment = current_segment[-overlap:] + [word]
            else:
                current_segment.append(word)

        if current_segment:
            segments.append(' '.join(current_segment))

        return segments
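
    # Rough worked example: with max_length=256 and overlap=50, a 600-word
    # speech yields three segments of at most 256 words each, and consecutive
    # segments share 50 words so context is not cut mid-thought.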

    def analyze_moral_foundations(self, text):
        """Analyze moral foundations using the RoBERTa-based classifier."""
        segments = self.split_text(text)

        foundation_scores = {
            'care': [], 'fairness': [], 'loyalty': [],
            'authority': [], 'sanctity': []
        }

        for segment in segments:
            inputs = self.moral_tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)

            # Inference only, so skip gradient tracking.
            with torch.no_grad():
                outputs = self.moral_model(**inputs)

            # Softmax turns the five logits into a probability distribution.
            probabilities = torch.softmax(outputs.logits, dim=1)

            for idx, label in enumerate(self.label_names):
                foundation = label.lower()
                if foundation in foundation_scores:
                    foundation_scores[foundation].append(probabilities[0][idx].item())

        # Average each foundation across segments, guarding against empty input.
        aggregated_scores = {
            foundation: float(np.mean(scores)) if scores else 0.0
            for foundation, scores in foundation_scores.items()
        }

        return aggregated_scores

    def analyze_emotional_trajectory(self, text):
        """Per-segment sentiment scores plus dominant basic emotions."""
        segments = self.split_text(text, max_length=512)
        sentiment_scores = []
        basic_emotions = []

        for segment in segments:
            # Map the classifier's confidence onto a single 0-1 axis:
            # 0.5-1.0 for positive segments, 0.0-0.5 for negative ones.
            sentiment_result = self.sentiment_pipeline(segment, truncation=True, max_length=512)
            score = sentiment_result[0]['score']
            if sentiment_result[0]['label'] == 'POSITIVE':
                score = 0.5 + (score * 0.5)
            else:
                score = 0.5 - (score * 0.5)
            sentiment_scores.append(score)

            emotion_result = self.emotion_classifier(segment, truncation=True, max_length=512)
            basic_emotions.append(emotion_result[0]['label'])

        return sentiment_scores, basic_emotions

    def detect_named_entities(self, text):
        """Detect named entities, chunking so each pass fits the model's 512-token limit."""
        entities = []
        for segment in self.split_text(text, max_length=256):
            entities.extend(self.ner_pipeline(segment))
        return entities
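
    # NOTE: without an aggregation strategy the NER pipeline emits IOB-tagged
    # subword tokens (e.g. 'B-PER' on 'Wash', 'I-PER' on '##ington'). If merged
    # spans are preferred, transformers accepts aggregation_strategy="simple" in
    # pipeline("ner", ...), though the output key becomes 'entity_group'.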

    def extract_key_phrases(self, text, top_n=10):
        """Extract key phrases using TF-IDF over unigrams and bigrams."""
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()

        # With a single document the IDF term is constant, so this effectively
        # ranks phrases by frequency after stop-word removal.
        sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
        return [feature_names[i] for i in sorted_idx[:top_n]]

    def calculate_readability(self, text):
        """Calculate readability metrics."""
        return {
            'Flesch Reading Ease': flesch_reading_ease(text),
            'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
        }
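
    # Rough interpretation of Flesch Reading Ease: 90-100 very easy, 60-70
    # plain English, 30-50 difficult, 0-30 college-graduate level; the
    # Flesch-Kincaid figure corresponds to a U.S. school grade.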

    def detect_rhetorical_devices(self, text):
        """Count simple lexical markers for each rhetorical device."""
        lowered = text.lower()
        devices_found = {}
        for device, markers in RHETORICAL_DEVICES.items():
            count = 0
            for marker in markers:
                if marker == '?':
                    count += lowered.count(marker)
                else:
                    # Whole-word matching so 'as' does not also count 'case' or 'task'.
                    count += len(re.findall(r'\b' + re.escape(marker) + r'\b', lowered))
            if count > 0:
                devices_found[device] = count
        return devices_found

    def create_semantic_network(self, text, top_n=20, window_size=10, chunk_size=10000):
        """Create a semantic network whose edges are weighted by co-occurrence."""
        # Process the text in chunks so spaCy's document-length limit is respected;
        # parse each chunk once and reuse the docs for both passes below.
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        docs = [nlp(chunk) for chunk in chunks]

        # First pass: find the most frequent nouns.
        noun_freq = nltk.FreqDist()
        for doc in docs:
            noun_freq.update(token.text.lower() for token in doc if token.pos_ == 'NOUN')

        top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]

        G = nx.Graph()
        cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
        noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}

        # Second pass: count how often each pair of top nouns shares a word window.
        for doc in docs:
            words = [token.text.lower() for token in doc]
            for i in range(len(words)):
                window = set(words[max(0, i - window_size):i + window_size + 1])
                present = [noun for noun in top_nouns if noun in window]
                for a in range(len(present)):
                    for b in range(a + 1, len(present)):
                        idx1, idx2 = noun_to_idx[present[a]], noun_to_idx[present[b]]
                        cooc_matrix[idx1][idx2] += 1
                        cooc_matrix[idx2][idx1] += 1

        for noun in top_nouns:
            G.add_node(noun, size=noun_freq[noun])

        max_weight = np.max(cooc_matrix) if cooc_matrix.size else 0
        if max_weight > 0:
            for i in range(len(top_nouns)):
                for j in range(i + 1, len(top_nouns)):
                    weight = cooc_matrix[i][j]
                    if weight > 0:
                        G.add_edge(top_nouns[i], top_nouns[j],
                                   weight=weight,
                                   width=3 * (weight / max_weight))

        # Fixed seed so the spring layout is reproducible across reruns.
        pos = nx.spring_layout(G, k=1, iterations=50, seed=42)
        for node in G.nodes():
            G.nodes[node]['pos'] = pos[node]

        return G
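
# Edge weights above are raw co-occurrence counts within a +/-window_size word
# window, normalized only for display (edge width and opacity); compare edges
# within a single graph rather than across different speeches.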


@st.cache_data
def process_all_analyses(text, _analyzer):
    # The leading underscore tells st.cache_data not to hash _analyzer (model
    # objects are unhashable); only `text` keys the cache.
    segments = _analyzer.split_text(text, max_length=512)
    segment_labels = [f"{i + 1}" for i in range(len(segments))]

    sentiment_scores, basic_emotions = _analyzer.analyze_emotional_trajectory(text)
    moral_trajectories = {foundation: [] for foundation in MORAL_FOUNDATIONS}
    for segment in segments:
        moral_scores = _analyzer.analyze_moral_foundations(segment)
        for foundation in moral_trajectories:
            moral_trajectories[foundation].append(moral_scores[foundation])

    return segments, segment_labels, sentiment_scores, basic_emotions, moral_trajectories


def main():
    st.title("🗣️ Political Text Analysis Toolkit")

    analyzer = SpeechAnalyzer()

    uploaded_file = st.file_uploader("Upload your document", type=['txt', 'docx', 'pdf'])

    if uploaded_file is not None:
        if uploaded_file.name.endswith('.txt'):
            text = uploaded_file.getvalue().decode('utf-8')
        elif uploaded_file.name.endswith('.docx'):
            import docx
            doc = docx.Document(uploaded_file)
            text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        elif uploaded_file.name.endswith('.pdf'):
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages.
            text = ' '.join(page.extract_text() or '' for page in pdf_reader.pages)

        progress_bar = st.progress(0)
        status_text = st.empty()
        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "Moral Foundations",
            "Emotional Analysis",
            "Linguistic Insights",
            "Semantic Network",
            "Advanced NLP"
        ])

        with tab1:
            status_text.text('Analyzing Moral Foundations...')
            progress_bar.progress(20)
            st.subheader("Moral Foundations Analysis")
            moral_scores = analyzer.analyze_moral_foundations(text)

            moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
            moral_df.index.name = 'Moral Foundation'
            moral_df = moral_df.reset_index()

            fig = px.bar(
                moral_df,
                x='Moral Foundation',
                y='Score',
                title='Moral Foundations Breakdown',
                color='Moral Foundation'
            )
            st.plotly_chart(fig)

            for foundation, score in moral_scores.items():
                st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")

        with tab2:
            status_text.text('Processing Emotional Trajectory...')
            progress_bar.progress(40)
            st.subheader("Speech Trajectory Analysis")

            segments, segment_labels, sentiment_scores, basic_emotions, moral_trajectories = process_all_analyses(text, analyzer)

            unified_fig = go.Figure()

            viz_options = st.multiselect(
                "Select analyses to display:",
                ["Sentiment Flow", "Moral Foundations Flow", "Basic Emotions Flow"],
                default=["Sentiment Flow"]
            )

            if "Sentiment Flow" in viz_options:
                unified_fig.add_trace(go.Scatter(
                    x=segment_labels,
                    y=sentiment_scores,
                    name='Sentiment',
                    mode='lines+markers',
                    line=dict(color='#1f77b4', width=3),
                    marker=dict(
                        size=8,
                        # Sentiment is on a 0-1 scale with 0.5 neutral, so colour
                        # markers red below 0.35 and green above 0.65.
                        color=['#ff4444' if score < 0.35 else '#44ff44' if score > 0.65 else '#888888'
                               for score in sentiment_scores]
                    )
                ))

            if "Moral Foundations Flow" in viz_options:
                colors = px.colors.qualitative.Set3[:5]
                for idx, (foundation, scores) in enumerate(moral_trajectories.items()):
                    unified_fig.add_trace(go.Scatter(
                        x=segment_labels,
                        y=scores,
                        name=MORAL_FOUNDATIONS[foundation],
                        mode='lines+markers',
                        line=dict(color=colors[idx], width=2),
                        marker=dict(size=6)
                    ))

            if "Basic Emotions Flow" in viz_options:
                emotion_colors = {
                    'joy': '#FFD700',
                    'sadness': '#4169E1',
                    'anger': '#FF4500',
                    'fear': '#800080',
                    'disgust': '#006400',
                    'surprise': '#FFA500'
                }

                # One bar per segment, coloured by that segment's dominant emotion.
                unified_fig.add_trace(go.Bar(
                    x=segment_labels,
                    y=[1] * len(basic_emotions),
                    name=f'Emotions Found: {", ".join(sorted(set(basic_emotions)))}',
                    marker=dict(
                        color=[emotion_colors.get(e.lower(), '#808080') for e in basic_emotions],
                        line=dict(width=1, color='#000000')
                    ),
                    opacity=0.8,
                    hovertemplate="Segment %{x}<br>Emotion: %{text}<extra></extra>",
                    text=basic_emotions,
                    textposition='auto'
                ))

            st.plotly_chart(unified_fig, use_container_width=True)

        with tab3:
            status_text.text('Analyzing Linguistic Features...')
            progress_bar.progress(60)
            st.subheader("Linguistic Analysis")
            readability = analyzer.calculate_readability(text)

            col1, col2 = st.columns(2)
            with col1:
                score = readability['Flesch Reading Ease']
                interpretation = "Complex" if score < 50 else "Standard" if score < 70 else "Easy"
                st.metric(
                    label="Reading Ease",
                    value=f"{score:.1f}/100",
                    delta=interpretation,
                    delta_color="normal"
                )

            with col2:
                grade = readability['Flesch-Kincaid Grade Level']
                st.metric(
                    label="Education Level",
                    value=f"Grade {grade:.1f}",
                    delta="Years of Education",
                    delta_color="normal"
                )

            st.subheader("Key Topics and Themes")
            key_phrases = analyzer.extract_key_phrases(text)

            # Render each phrase as a pill, fading as relevance decreases.
            cols = st.columns(3)
            for idx, phrase in enumerate(key_phrases):
                cols[idx % 3].markdown(
                    f"""<div style='
                        background-color: rgba(31, 119, 180, {0.9 - idx * 0.05});
                        color: white;
                        padding: 8px 15px;
                        margin: 5px 0;
                        border-radius: 15px;
                        text-align: center;
                    '>{phrase}</div>""",
                    unsafe_allow_html=True
                )

        with tab4:
            status_text.text('Building Semantic Network...')
            progress_bar.progress(80)
            st.subheader("Semantic Network")
            semantic_graph = analyzer.create_semantic_network(text)

            network_fig = go.Figure()

            # Compute the max weight once, guarding against edgeless graphs.
            edge_weights = [d['weight'] for _, _, d in semantic_graph.edges(data=True)]
            max_weight = max(edge_weights) if edge_weights else 1

            for edge in semantic_graph.edges():
                x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                weight = semantic_graph.edges[edge]['weight']
                normalized_weight = weight / max_weight

                # Stronger relationships get wider, more opaque edges.
                network_fig.add_trace(go.Scatter(
                    x=[x0, x1, None],
                    y=[y0, y1, None],
                    mode='lines',
                    line=dict(
                        width=2 + (normalized_weight * 8),
                        color=f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'
                    ),
                    hoverinfo='text',
                    hovertext=f'Relationship strength: {weight:.2f}'
                ))

            for node in semantic_graph.nodes():
                x, y = semantic_graph.nodes[node]['pos']
                size = semantic_graph.nodes[node]['size']

                network_fig.add_trace(go.Scatter(
                    x=[x],
                    y=[y],
                    mode='markers+text',
                    marker=dict(
                        size=15 + size / 2,
                        color='#ffffff',
                        line=dict(width=2, color='#1f77b4'),
                        symbol='circle'
                    ),
                    text=[node],
                    textposition="top center",
                    textfont=dict(size=12, color='black'),
                    hoverinfo='text',
                    hovertext=f'Term: {node}<br>Frequency: {size}'
                ))

            network_fig.update_layout(
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=20, r=20, t=20),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                plot_bgcolor='white',
                width=800,
                height=600
            )

            st.plotly_chart(network_fig, use_container_width=True)

        with tab5:
            status_text.text('Extracting Named Entities...')
            progress_bar.progress(100)
            st.subheader("Named Entity Recognition")
            named_entities = analyzer.detect_named_entities(text)

            if named_entities:
                entities_df = pd.DataFrame(named_entities)

                # Collapse the model's IOB tags into readable categories.
                type_mapping = {
                    'B-PER': 'Person',
                    'I-PER': 'Person',
                    'B-ORG': 'Organization',
                    'I-ORG': 'Organization',
                    'B-LOC': 'Location',
                    'I-LOC': 'Location',
                    'B-MISC': 'Other',
                    'I-MISC': 'Other'
                }

                display_df = pd.DataFrame({
                    'Term': entities_df['word'],
                    'Category': entities_df['entity'].map(type_mapping),
                    'Confidence': entities_df['score'].apply(lambda x: f"{x * 100:.1f}%")
                })

                grouped_df = display_df.groupby('Category').agg({
                    'Term': lambda x: ', '.join(set(x)),
                    'Confidence': 'count'
                }).reset_index()

                for category in grouped_df['Category'].unique():
                    category_data = grouped_df[grouped_df['Category'] == category]
                    st.write(f"### {category}")
                    st.markdown(f"**Found**: {category_data['Term'].iloc[0]}")
            else:
                st.info("No named entities were detected in this document.")


if __name__ == "__main__":
    main()
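
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py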