github_search_visualizations / text_visualization.py
lambdaofgod's picture
fix: Add missing imports to resolve flake8 errors
869e1b8
raw
history blame
1.52 kB
from typing import Dict, Any, Iterable
from sklearn import TfIdfVectorizer
import wordcloud
from pydantic import BaseModel
class WordCloudExtractor:
    """Placeholder interface for producing a word cloud from a text corpus.

    The class only declares its shape: one annotated attribute and one
    method whose body is not implemented yet.
    """

    # Declared but never assigned in this class; presumably keyword
    # arguments for a TF-IDF vectorizer -- TODO confirm with the author.
    tfidf_params: Dict[str, Any]

    def extract_from_corpus(
        self, texts: Iterable[str], n_words: int
    ) -> wordcloud.WordCloud:
        """Unimplemented stub.

        Args:
            texts: Iterable of text documents.
            n_words: Word count setting (unused by the current body).

        Returns:
            Annotated as ``wordcloud.WordCloud``, but the current body does
            no work and returns ``None``.
        """
        return None
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import numpy as np
class TextVisualization:
    """Namespace for text-to-word-cloud helper routines."""

    @staticmethod
    def extract_from_corpus(texts, max_features=100):
        """
        Score corpus vocabulary by its average TF-IDF weight.

        A TF-IDF vectorizer (lowercasing enabled, English stop words
        removed, vocabulary capped at ``max_features`` terms) is fitted on
        the documents, and each retained term is mapped to the mean of its
        TF-IDF column over all documents.

        Args:
            texts: List of text documents
            max_features: Maximum number of words to include

        Returns:
            Dictionary mapping each word to its mean TF-IDF score, suitable
            as a frequency table for WordCloud
        """
        vectorizer = TfidfVectorizer(
            lowercase=True,
            stop_words='english',
            max_features=max_features,
        )

        # Rows are documents, columns are vocabulary terms.
        doc_term_weights = vectorizer.fit_transform(texts)
        vocabulary = vectorizer.get_feature_names_out()

        # Averaging over axis 0 yields a single-row 2-D result; collapse it
        # to a 1-D vector so it pairs element-wise with the vocabulary.
        column_means = doc_term_weights.mean(axis=0)
        average_weights = np.asarray(column_means).ravel()

        return dict(zip(vocabulary, average_weights))