from typing import Any, Dict, Iterable, Optional

import numpy as np
import wordcloud
from PIL import Image
from pydantic import BaseModel, Field
from sklearn.feature_extraction.text import TfidfVectorizer


class WordCloudExtractor(BaseModel):
    """Builds a word-cloud image from a corpus, weighting words by mean TF-IDF score."""

    max_words: int = 50
    wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
    tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})

    def extract_wordcloud_image(self, texts: Iterable[str]) -> Image.Image:
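        """Render `texts` as a word-cloud image, sizing words by mean TF-IDF weight."""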
        frequencies = self._extract_frequencies(
            texts, self.max_words, tfidf_params=self.tfidf_params
        )
        wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(frequencies)
        return wc.to_image()

    @classmethod
    def _extract_frequencies(
        cls,
        texts: Iterable[str],
        max_words: int = 100,
        tfidf_params: Optional[dict] = None,
    ) -> Dict[str, float]:
"""
Extract word frequencies from a corpus using TF-IDF vectorization
and generate word cloud frequencies.
Args:
texts: List of text documents
max_features: Maximum number of words to include
Returns:
Dictionary of word frequencies suitable for WordCloud
"""
        # Initialize TF-IDF vectorizer
        tfidf = TfidfVectorizer(
            max_features=max_words,
            **(tfidf_params or {}),
        )
        # Fit and transform the texts
        tfidf_matrix = tfidf.fit_transform(texts)
        # Get feature names (words)
        feature_names = tfidf.get_feature_names_out()
        # Calculate mean TF-IDF scores across documents
        mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
        # Create frequency dictionary
        frequencies = dict(zip(feature_names, mean_tfidf))
        return frequencies
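

# A minimal usage sketch, assuming the `wordcloud`, `scikit-learn`, `pydantic`,
# and `Pillow` packages are installed; the corpus and output path are illustrative.
if __name__ == "__main__":
    docs = [
        "The quick brown fox jumps over the lazy dog.",
        "A lazy dog naps while the fox runs through the field.",
        "Foxes and dogs are common subjects of typing exercises.",
    ]
    extractor = WordCloudExtractor(
        max_words=25,
        wordcloud_params={"width": 400, "height": 300},
    )
    image = extractor.extract_wordcloud_image(docs)
    image.save("wordcloud.png")  # a PIL.Image.Image, so any PIL method works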