|
from typing import Any, Dict, Iterable, Optional

import numpy as np
import PIL
import wordcloud
from pydantic import BaseModel, Field
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
class WordCloudExtractor(BaseModel):
    """Build a word-cloud image from a corpus of text documents.

    Word weights come from mean TF-IDF scores across the corpus rather
    than raw counts, so distinctive terms (not merely frequent ones)
    dominate the cloud.
    """

    # Maximum number of terms in the cloud (forwarded to TfidfVectorizer's max_features).
    max_words: int = 50
    # Extra keyword arguments forwarded to wordcloud.WordCloud(...).
    wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
    # Extra keyword arguments forwarded to TfidfVectorizer(...).
    tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})

    def extract_wordcloud_image(self, texts: Iterable[str]) -> PIL.Image.Image:
        """Render a word-cloud image for *texts*.

        Args:
            texts: Iterable of raw text documents.

        Returns:
            A PIL image of the generated word cloud.
        """
        frequencies = self._extract_frequencies(
            texts, self.max_words, tfidf_params=self.tfidf_params
        )
        wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(frequencies)
        return wc.to_image()

    @classmethod
    def _extract_frequencies(
        cls,
        texts: Iterable[str],
        max_words: int = 100,
        tfidf_params: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, float]:
        """Extract per-word weights from a corpus using TF-IDF vectorization.

        Args:
            texts: Iterable of text documents.
            max_words: Maximum number of words to include (passed to
                TfidfVectorizer as ``max_features``).
            tfidf_params: Extra keyword arguments for TfidfVectorizer;
                ``None`` means no extra parameters.

        Returns:
            Dictionary mapping word -> mean TF-IDF score, suitable for
            ``WordCloud.generate_from_frequencies``.
        """
        # None sentinel instead of a mutable `{}` default, which would be
        # shared (and mutable) across all calls.
        if tfidf_params is None:
            tfidf_params = {}

        tfidf = TfidfVectorizer(max_features=max_words, **tfidf_params)
        tfidf_matrix = tfidf.fit_transform(texts)
        feature_names = tfidf.get_feature_names_out()

        # Mean TF-IDF per term over all documents; the sparse-matrix mean
        # returns a (1, n_features) np.matrix, so flatten to a 1-D vector.
        mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()

        return dict(zip(feature_names, mean_tfidf))
|
|