Spaces:
Runtime error
Runtime error
lambdaofgod
committed on
Commit
•
f5c0c01
1
Parent(s):
869e1b8
wordclouds
Browse files
- app.py +14 -1
- pyproject.toml +11 -0
- text_visualization.py +16 -16
app.py
CHANGED
@@ -5,6 +5,7 @@ import re
|
|
5 |
from task_visualizations import TaskVisualizations
|
6 |
import plotly.graph_objects as go
|
7 |
from functools import partial
|
|
|
8 |
|
9 |
logging.basicConfig(level=logging.INFO)
|
10 |
|
@@ -43,9 +44,21 @@ def display_representations(repo, representation1, representation2):
|
|
43 |
return text1, text2
|
44 |
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
def setup_repository_representations_tab(repos, representation_types):
|
47 |
-
gr.Markdown("Select a repository and two representation types to compare them.")
|
48 |
|
|
|
|
|
|
|
|
|
|
|
49 |
with gr.Row():
|
50 |
repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
|
51 |
representation1 = gr.Dropdown(
|
|
|
5 |
from task_visualizations import TaskVisualizations
|
6 |
import plotly.graph_objects as go
|
7 |
from functools import partial
|
8 |
+
from text_visualization import WordCloudExtractor
|
9 |
|
10 |
logging.basicConfig(level=logging.INFO)
|
11 |
|
|
|
44 |
return text1, text2
|
45 |
|
46 |
|
47 |
+
def get_representation_wordclouds(representations, repos_df):
|
48 |
+
wordclouds = dict()
|
49 |
+
for representation in representations:
|
50 |
+
texts = list(repos_df[repos_df["representation"] == representation]["text"])
|
51 |
+
wordclouds[representation] = WordCloudExtractor().extract_wordcloud_image(texts)
|
52 |
+
return wordclouds
|
53 |
+
|
54 |
+
|
55 |
def setup_repository_representations_tab(repos, representation_types):
|
|
|
56 |
|
57 |
+
wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
|
58 |
+
gr.Markdown("## Wordclouds")
|
59 |
+
gr.Gallery([(wordcloud, representation_type) for representation_type, wordcloud in wordcloud_dict.items()])
|
60 |
+
|
61 |
+
gr.Markdown("Select a repository and two representation types to compare them.")
|
62 |
with gr.Row():
|
63 |
repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
|
64 |
representation1 = gr.Dropdown(
|
pyproject.toml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "github-search-visualizations"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Add your description here"
|
5 |
+
readme = "README.md"
|
6 |
+
requires-python = ">=3.10"
|
7 |
+
dependencies = [
|
8 |
+
"pydantic>=2.9.2",
|
9 |
+
"scikit-learn>=1.5.2",
|
10 |
+
"wordcloud>=1.9.3",
|
11 |
+
]
|
text_visualization.py
CHANGED
@@ -1,22 +1,23 @@
|
|
1 |
from typing import Dict, Any, Iterable
|
2 |
-
from sklearn import
|
3 |
import wordcloud
|
4 |
-
from pydantic import BaseModel
|
5 |
-
|
|
|
6 |
|
7 |
-
class WordCloudExtractor:
|
8 |
|
9 |
-
|
|
|
|
|
|
|
10 |
|
11 |
-
def
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
import numpy as np
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
def extract_from_corpus(texts, max_features=100):
|
20 |
"""
|
21 |
Extract word frequencies from a corpus using TF-IDF vectorization
|
22 |
and generate word cloud frequencies.
|
@@ -30,9 +31,8 @@ class TextVisualization:
|
|
30 |
"""
|
31 |
# Initialize TF-IDF vectorizer
|
32 |
tfidf = TfidfVectorizer(
|
33 |
-
max_features=
|
34 |
-
|
35 |
-
lowercase=True
|
36 |
)
|
37 |
|
38 |
# Fit and transform the texts
|
|
|
1 |
from typing import Dict, Any, Iterable
|
2 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
import wordcloud
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
import numpy as np
|
6 |
+
import PIL
|
7 |
|
|
|
8 |
|
9 |
+
class WordCloudExtractor(BaseModel):
|
10 |
+
max_words: int = 50
|
11 |
+
wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
|
12 |
+
tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})
|
13 |
|
14 |
+
def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
|
15 |
+
frequencies = self._extract_frequencies(texts, self.max_words, tfidf_params=self.tfidf_params)
|
16 |
+
wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(frequencies)
|
17 |
+
return wc.to_image()
|
|
|
18 |
|
19 |
+
@classmethod
|
20 |
+
def _extract_frequencies(cls, texts, max_words=100, tfidf_params: dict={}) -> Dict[str, float]:
|
|
|
21 |
"""
|
22 |
Extract word frequencies from a corpus using TF-IDF vectorization
|
23 |
and generate word cloud frequencies.
|
|
|
31 |
"""
|
32 |
# Initialize TF-IDF vectorizer
|
33 |
tfidf = TfidfVectorizer(
|
34 |
+
max_features=max_words,
|
35 |
+
**tfidf_params
|
|
|
36 |
)
|
37 |
|
38 |
# Fit and transform the texts
|