from typing import Any, Dict, Iterable

from sklearn.feature_extraction.text import TfidfVectorizer
from pydantic import BaseModel
import wordcloud


class WordCloudExtractor(BaseModel):
    tfidf_params: Dict[str, Any]

    def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
        pass
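
# A minimal sketch (not part of the original file) of one way the stub above could
# be fleshed out: score terms with TF-IDF, average the scores across documents, and
# pass the result to WordCloud.generate_from_frequencies. The helper name
# _tfidf_wordcloud_sketch and its defaults are hypothetical.
import numpy as np


def _tfidf_wordcloud_sketch(texts, n_words, tfidf_params=None):
    # Cap the vocabulary at n_words terms (TfidfVectorizer keeps the most frequent ones).
    vectorizer = TfidfVectorizer(max_features=n_words, **(tfidf_params or {}))
    matrix = vectorizer.fit_transform(texts)
    # Mean TF-IDF score per term across the corpus, flattened to a 1-D array.
    scores = np.asarray(matrix.mean(axis=0)).ravel()
    frequencies = dict(zip(vectorizer.get_feature_names_out(), scores))
    # generate_from_frequencies builds the cloud directly from a term -> weight dict.
    return wordcloud.WordCloud().generate_from_frequencies(frequencies)
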
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import numpy as np


class TextVisualization:
    @staticmethod
    def extract_from_corpus(texts, max_features=100):
        """
        Extract word frequencies from a corpus using TF-IDF vectorization
        and generate word cloud frequencies.

        Args:
            texts: List of text documents
            max_features: Maximum number of words to include

        Returns:
            Dictionary of word frequencies suitable for WordCloud
        """
        # Initialize TF-IDF vectorizer
        tfidf = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            lowercase=True
        )

        # Fit and transform the texts
        tfidf_matrix = tfidf.fit_transform(texts)

        # Get feature names (words)
        feature_names = tfidf.get_feature_names_out()

        # Calculate mean TF-IDF scores across documents
        mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()

        # Create frequency dictionary
        frequencies = dict(zip(feature_names, mean_tfidf))

        return frequencies
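
# Usage sketch (illustrative only): run extract_from_corpus on a tiny placeholder
# corpus and render the resulting frequencies with WordCloud. The sample texts and
# the output path "word_cloud.png" are assumptions, not part of the original file.
if __name__ == "__main__":
    sample_texts = [
        "machine learning extracts patterns from data",
        "word clouds visualize term importance at a glance",
        "tf idf weighs terms by how distinctive they are",
    ]
    freqs = TextVisualization.extract_from_corpus(sample_texts, max_features=50)
    cloud = WordCloud(width=800, height=400).generate_from_frequencies(freqs)
    cloud.to_file("word_cloud.png")  # placeholder output path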