lambdaofgod committed on
Commit
f5c0c01
1 Parent(s): 869e1b8

wordclouds

Browse files
Files changed (3) hide show
  1. app.py +14 -1
  2. pyproject.toml +11 -0
  3. text_visualization.py +16 -16
app.py CHANGED
@@ -5,6 +5,7 @@ import re
5
  from task_visualizations import TaskVisualizations
6
  import plotly.graph_objects as go
7
  from functools import partial
 
8
 
9
  logging.basicConfig(level=logging.INFO)
10
 
@@ -43,9 +44,21 @@ def display_representations(repo, representation1, representation2):
43
  return text1, text2
44
 
45
 
 
 
 
 
 
 
 
 
46
  def setup_repository_representations_tab(repos, representation_types):
47
- gr.Markdown("Select a repository and two representation types to compare them.")
48
 
 
 
 
 
 
49
  with gr.Row():
50
  repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
51
  representation1 = gr.Dropdown(
 
5
  from task_visualizations import TaskVisualizations
6
  import plotly.graph_objects as go
7
  from functools import partial
8
+ from text_visualization import WordCloudExtractor
9
 
10
  logging.basicConfig(level=logging.INFO)
11
 
 
44
  return text1, text2
45
 
46
 
47
def get_representation_wordclouds(representations, repos_df):
    """Build one word-cloud image per representation type.

    Args:
        representations: iterable of representation-type names to render.
        repos_df: DataFrame with (at least) a "representation" column used
            for filtering and a "text" column holding the documents.

    Returns:
        Dict mapping each representation name to the PIL image produced by
        ``WordCloudExtractor.extract_wordcloud_image`` over that type's texts.
    """
    # Hoist the extractor out of the loop: one instance serves every
    # representation type (the original rebuilt it per iteration).
    extractor = WordCloudExtractor()
    return {
        representation: extractor.extract_wordcloud_image(
            list(repos_df[repos_df["representation"] == representation]["text"])
        )
        for representation in representations
    }
53
+
54
+
55
  def setup_repository_representations_tab(repos, representation_types):
 
56
 
57
+ wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
58
+ gr.Markdown("## Wordclouds")
59
+ gr.Gallery([(wordcloud, representation_type) for representation_type, wordcloud in wordcloud_dict.items()])
60
+
61
+ gr.Markdown("Select a repository and two representation types to compare them.")
62
  with gr.Row():
63
  repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
64
  representation1 = gr.Dropdown(
pyproject.toml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "github-search-visualizations"
3
+ version = "0.1.0"
4
+ description = "Wordcloud and task visualizations for GitHub repository search"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "pydantic>=2.9.2",
9
+ "scikit-learn>=1.5.2",
10
+ "wordcloud>=1.9.3",
11
+ ]
text_visualization.py CHANGED
@@ -1,22 +1,23 @@
1
  from typing import Dict, Any, Iterable
2
- from sklearn import TfIdfVectorizer
3
  import wordcloud
4
- from pydantic import BaseModel
5
-
 
6
 
7
- class WordCloudExtractor:
8
 
9
- tfidf_params: Dict[str, Any]
 
 
 
10
 
11
- def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
12
- pass
13
- from sklearn.feature_extraction.text import TfidfVectorizer
14
- from wordcloud import WordCloud
15
- import numpy as np
16
 
17
- class TextVisualization:
18
- @staticmethod
19
- def extract_from_corpus(texts, max_features=100):
20
  """
21
  Extract word frequencies from a corpus using TF-IDF vectorization
22
  and generate word cloud frequencies.
@@ -30,9 +31,8 @@ class TextVisualization:
30
  """
31
  # Initialize TF-IDF vectorizer
32
  tfidf = TfidfVectorizer(
33
- max_features=max_features,
34
- stop_words='english',
35
- lowercase=True
36
  )
37
 
38
  # Fit and transform the texts
 
1
  from typing import Dict, Any, Iterable
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
  import wordcloud
4
+ from pydantic import BaseModel, Field
5
+ import numpy as np
6
+ import PIL
7
 
 
8
 
9
+ class WordCloudExtractor(BaseModel):
10
+ max_words: int = 50
11
+ wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
12
+ tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})
13
 
14
def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
    """Render *texts* as a single word-cloud PIL image.

    Word weights come from ``self._extract_frequencies`` (TF-IDF, capped
    at ``self.max_words``); the cloud itself is configured with
    ``self.wordcloud_params``.
    """
    freqs = self._extract_frequencies(
        texts, self.max_words, tfidf_params=self.tfidf_params
    )
    cloud = wordcloud.WordCloud(**self.wordcloud_params)
    return cloud.generate_from_frequencies(freqs).to_image()
 
18
 
19
+ @classmethod
20
+ def _extract_frequencies(cls, texts, max_words=100, tfidf_params: dict={}) -> Dict[str, float]:
 
21
  """
22
  Extract word frequencies from a corpus using TF-IDF vectorization
23
  and generate word cloud frequencies.
 
31
  """
32
  # Initialize TF-IDF vectorizer
33
  tfidf = TfidfVectorizer(
34
+ max_features=max_words,
35
+ **tfidf_params
 
36
  )
37
 
38
  # Fit and transform the texts