import streamlit as st
import pandas as pd
import numpy as np
import torch
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
import spacy
from scipy.signal import savgol_filter
from sklearn.feature_extraction.text import TfidfVectorizer

# Advanced NLP libraries
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    pipeline,
    RobertaTokenizer,
    RobertaForSequenceClassification
)
import nltk
from textstat import flesch_reading_ease, flesch_kincaid_grade

st.set_page_config(page_title="Advanced Political Speech Analysis",
                   page_icon="🗣️", layout="wide")

# Load spaCy model (requires a separate download)
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    st.error("Please install spaCy and the en_core_web_lg model:\n"
             "pip install spacy\n"
             "python -m spacy download en_core_web_lg")
    st.stop()

# Constants and configurations
MORAL_FOUNDATIONS = {
    'care': 'Care/Harm',
    'fairness': 'Fairness/Cheating',
    'loyalty': 'Loyalty/Betrayal',
    'authority': 'Authority/Subversion',
    'sanctity': 'Sanctity/Degradation'
}

# Surface markers for a rough, lexicon-based device count;
# substring matching is crude (e.g. 'as' also matches 'was')
RHETORICAL_DEVICES = {
    'analogy': ['like', 'as', 'similar to'],
    'repetition': ['repetitive', 'recurring'],
    'metaphor': ['as if', 'like', 'represents'],
    'hyperbole': ['always', 'never', 'absolute'],
    'rhetorical_question': ['?']
}


class SpeechAnalyzer:
    def __init__(self):
        # Load the moral-foundations classifier
        self.moral_model_path = "MMADS/MoralFoundationsClassifier"
        self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
        self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)

        # Label order is assumed to match the model's classification head
        self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']

        # General-purpose pipelines
        self.sentiment_pipeline = pipeline("sentiment-analysis")
        self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.ner_pipeline = pipeline("ner", model=self.ner_model,
                                     tokenizer=self.ner_tokenizer,
                                     aggregation_strategy="simple")

    def split_text(self, text, max_length=512, overlap=50):
        """Split long text into overlapping word-based segments."""
        words = text.split()
        segments = []
        current_segment = []

        for word in words:
            if len(current_segment) >= max_length:
                segments.append(' '.join(current_segment))
                # Carry the last `overlap` words into the next segment
                current_segment = current_segment[-overlap:]
            current_segment.append(word)

        if current_segment:
            segments.append(' '.join(current_segment))

        return segments

    def analyze_moral_foundations(self, text):
        """Analyze moral foundations using the RoBERTa-based classifier."""
        segments = self.split_text(text)

        foundation_scores = {foundation: [] for foundation in MORAL_FOUNDATIONS}

        for segment in segments:
            inputs = self.moral_tokenizer(segment, return_tensors="pt",
                                          truncation=True, max_length=512)
            with torch.no_grad():
                outputs = self.moral_model(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)

            for idx, label in enumerate(self.label_names):
                if label in foundation_scores:
                    foundation_scores[label].append(probabilities[0][idx].item())

        # Average the scores across segments
        aggregated_scores = {
            foundation: float(np.mean(scores)) if scores else 0.0
            for foundation, scores in foundation_scores.items()
        }
        return aggregated_scores

    def analyze_emotional_trajectory(self, text, window_size=5):
        """Track sentiment segment by segment across the speech."""
        segments = self.split_text(text, max_length=256)

        sentiment_scores = []
        for segment in segments:
            result = self.sentiment_pipeline(segment, truncation=True)[0]
            # Signed confidence: positive sentiment above zero, negative below
            score = result['score'] if result['label'] == 'POSITIVE' else -result['score']
            sentiment_scores.append(score)

        # Smooth the trajectory once there are enough points for the filter
        # (window_length must be odd and no larger than the series)
        if len(sentiment_scores) > window_size:
            return savgol_filter(sentiment_scores,
                                 window_length=window_size, polyorder=2)
        return sentiment_scores

    def detect_named_entities(self, text):
        """Detect named entities, chunking so long texts fit the model.

        Character offsets in the results are relative to each segment.
        """
        entities = []
        for segment in self.split_text(text, max_length=256, overlap=0):
            entities.extend(self.ner_pipeline(segment))
        return entities

    def extract_key_phrases(self, text, top_n=10):
        """Extract key phrases using TF-IDF over uni- and bigrams."""
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()

        # Rank phrases by TF-IDF score, highest first
        sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
        return [feature_names[i] for i in sorted_idx[:top_n]]

    def calculate_readability(self, text):
        """Calculate readability metrics."""
        return {
            'Flesch Reading Ease': flesch_reading_ease(text),
            'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
        }

    def detect_rhetorical_devices(self, text):
        """Count occurrences of the marker phrases for each device."""
        devices_found = {}
        for device, markers in RHETORICAL_DEVICES.items():
            count = sum(text.lower().count(marker) for marker in markers)
            if count > 0:
                devices_found[device] = count
        return devices_found

    def create_semantic_network(self, text, top_n=20):
        """Build a co-occurrence network of the most frequent nouns."""
        # Use spaCy for part-of-speech tagging and sentence segmentation
        doc = nlp(text)

        G = nx.Graph()

        # The most frequent nouns become the nodes
        nouns = [token.text for token in doc if token.pos_ == 'NOUN']
        noun_freq = nltk.FreqDist(nouns)
        top_nouns = [noun for noun, _ in noun_freq.most_common(top_n)]
        G.add_nodes_from(top_nouns)

        # Connect nouns that co-occur within the same sentence
        top_noun_set = set(top_nouns)
        for sent in doc.sents:
            sent_nouns = list({t.text for t in sent if t.text in top_noun_set})
            for i in range(len(sent_nouns)):
                for j in range(i + 1, len(sent_nouns)):
                    G.add_edge(sent_nouns[i], sent_nouns[j])

        # Precompute a layout so the caller can draw the graph
        pos = nx.spring_layout(G, seed=42)
        nx.set_node_attributes(G, pos, 'pos')

        return G


@st.cache_resource
def load_analyzer():
    """Load the model-heavy analyzer once and reuse it across reruns."""
    return SpeechAnalyzer()


def main():
    st.title("🗣️ Advanced Political Speech Analysis Toolkit")

    # Initialize analyzer (cached so models load only once)
    analyzer = load_analyzer()

    # File upload
    uploaded_file = st.file_uploader("Upload Political Speech",
                                     type=['txt', 'docx', 'pdf'])

    if uploaded_file is not None:
        # Read the file according to its extension
        if uploaded_file.name.endswith('.txt'):
            text = uploaded_file.getvalue().decode('utf-8')
        elif uploaded_file.name.endswith('.docx'):
            import docx
            doc = docx.Document(uploaded_file)
            text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        else:  # .pdf
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages
            text = ' '.join(page.extract_text() or '' for page in pdf_reader.pages)

        # Create tabs for the different analyses
        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "Moral Foundations",
            "Emotional Analysis",
            "Linguistic Insights",
            "Semantic Network",
            "Advanced NLP"
        ])

        with tab1:
            st.subheader("Moral Foundations Analysis")
            moral_scores = analyzer.analyze_moral_foundations(text)

            # Plotly bar chart
            moral_df = pd.DataFrame.from_dict(moral_scores, orient='index',
                                              columns=['Score'])
            moral_df.index.name = 'Moral Foundation'
            moral_df = moral_df.reset_index()
            fig = px.bar(
                moral_df,
                x='Moral Foundation',
                y='Score',
                title='Moral Foundations Breakdown',
                color='Moral Foundation'
            )
            st.plotly_chart(fig)

            # Detailed insights
            for foundation, score in moral_scores.items():
                st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")

        with tab2:
            st.subheader("Emotional Trajectory")
            emotional_trajectory = analyzer.analyze_emotional_trajectory(text)

            # Plotly line chart
            trajectory_fig = go.Figure(data=go.Scatter(
                y=emotional_trajectory,
                mode='lines+markers',
                name='Emotional Intensity'
            ))
            trajectory_fig.update_layout(
                title='Speech Emotional Trajectory',
                xaxis_title='Speech Segments',
                yaxis_title='Emotional Intensity'
            )
            st.plotly_chart(trajectory_fig)

        with tab3:
            st.subheader("Linguistic Complexity")
            readability = analyzer.calculate_readability(text)

            col1, col2 = st.columns(2)
            with col1:
                st.metric("Flesch Reading Ease",
                          f"{readability['Flesch Reading Ease']:.2f}")
            with col2:
                st.metric("Flesch-Kincaid Grade Level",
                          f"{readability['Flesch-Kincaid Grade Level']:.2f}")

            # Key phrases
            st.subheader("Key Phrases")
            key_phrases = analyzer.extract_key_phrases(text)
            st.write(", ".join(key_phrases))

        with tab4:
            st.subheader("Semantic Network")
            semantic_graph = analyzer.create_semantic_network(text)

            # Edge coordinates from the precomputed layout;
            # None separates one edge from the next so Plotly draws segments
            edge_x, edge_y = [], []
            for u, v in semantic_graph.edges():
                x0, y0 = semantic_graph.nodes[u]['pos']
                x1, y1 = semantic_graph.nodes[v]['pos']
                edge_x.extend([x0, x1, None])
                edge_y.extend([y0, y1, None])

            node_x = [semantic_graph.nodes[n]['pos'][0] for n in semantic_graph.nodes()]
            node_y = [semantic_graph.nodes[n]['pos'][1] for n in semantic_graph.nodes()]

            # Plotly network visualization: edges first, then labeled nodes
            network_fig = go.Figure()
            network_fig.add_trace(go.Scatter(
                x=edge_x, y=edge_y,
                mode='lines',
                line=dict(width=0.5, color='#888'),
                hoverinfo='none'
            ))
            network_fig.add_trace(go.Scatter(
                x=node_x, y=node_y,
                mode='markers+text',
                text=list(semantic_graph.nodes()),
                textposition='top center',
                marker=dict(size=10)
            ))
            st.plotly_chart(network_fig)

        with tab5:
            st.subheader("Advanced NLP Analysis")

            # Named entities
            st.write("### Named Entities")
            named_entities = analyzer.detect_named_entities(text)
            entities_df = pd.DataFrame(named_entities)
            st.dataframe(entities_df)

            # Rhetorical devices
            st.write("### Rhetorical Devices")
            rhetorical_devices = analyzer.detect_rhetorical_devices(text)
            for device, count in rhetorical_devices.items():
                st.write(f"**{device.capitalize()}**: {count} instances")


if __name__ == "__main__":
    main()
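# Usage sketch: assumes this script is saved as app.py and the packages below
# (their usual PyPI names, unpinned) are installed. The first run also
# downloads the Hugging Face models (MMADS/MoralFoundationsClassifier and
# dslim/bert-base-NER), so it needs network access:
#
#   pip install streamlit torch transformers spacy nltk scikit-learn scipy \
#       plotly networkx pandas numpy textstat python-docx PyPDF2
#   python -m spacy download en_core_web_lg
#   streamlit run app.py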