from typing import Any, Dict, Iterable

import numpy as np
import wordcloud
from pydantic import BaseModel
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud


class WordCloudExtractor:
    """Interface stub for turning a text corpus into a word cloud.

    NOTE(review): ``extract_from_corpus`` has no implementation here and
    returns ``None`` despite its return annotation. ``tfidf_params`` is
    only declared; nothing in this file assigns or reads it.
    """

    # Presumably keyword arguments for ``TfidfVectorizer`` -- TODO confirm
    # against whatever code populates this attribute.
    tfidf_params: Dict[str, Any]

    def extract_from_corpus(
        self, texts: Iterable[str], n_words: int
    ) -> wordcloud.WordCloud:
        """Placeholder: performs no work and implicitly returns ``None``."""
        pass


class TextVisualization:
    """TF-IDF helpers that prepare per-word weights for a word cloud."""

    @staticmethod
    def extract_from_corpus(
        texts: Iterable[str], max_features: int = 100
    ) -> Dict[str, float]:
        """Compute the mean TF-IDF weight of each term in a corpus.

        The corpus is vectorized with ``TfidfVectorizer`` (lower-cased,
        English stop words removed, vocabulary capped at ``max_features``
        terms). Each term's weight is then averaged over all documents,
        so the values are mean TF-IDF scores rather than raw counts.

        Args:
            texts: Iterable of raw text documents.
            max_features: Maximum number of terms kept in the vocabulary.

        Returns:
            Mapping of term to its mean TF-IDF score across ``texts``.
            The values are NumPy floating-point scalars.

        Raises:
            ValueError: Propagated from scikit-learn when the vocabulary
                ends up empty (for example, no documents, or documents
                containing only stop words).
        """
        vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            lowercase=True,
        )

        # Rows are documents, columns are vocabulary terms.
        tfidf_matrix = vectorizer.fit_transform(texts)
        feature_names = vectorizer.get_feature_names_out()

        # The column-wise mean of a sparse matrix comes back 2-D
        # (1 x n_terms); flatten it so it zips against the term names.
        mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

        return dict(zip(feature_names, mean_tfidf))