Spaces:
Runtime error
Runtime error
Add CLI and refactor
Browse files- app.py +3 -90
- cli.py +68 -0
- perplexity_lenses/data.py +1 -1
- perplexity_lenses/engine.py +110 -0
- requirements.txt +3 -2
app.py
CHANGED
@@ -1,104 +1,17 @@
|
|
1 |
import logging
|
2 |
from functools import partial
|
3 |
-
from typing import Callable, Optional
|
4 |
|
5 |
-
import pandas as pd
|
6 |
import streamlit as st
|
7 |
-
from bokeh.plotting import Figure
|
8 |
from embedding_lenses.data import uploaded_file_to_dataframe
|
9 |
from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
|
10 |
-
from embedding_lenses.embedding import
|
11 |
-
from embedding_lenses.utils import encode_labels
|
12 |
-
from sentence_transformers import SentenceTransformer
|
13 |
|
14 |
from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
|
|
|
15 |
from perplexity_lenses.perplexity import KenlmModel
|
16 |
-
from perplexity_lenses.visualization import draw_interactive_scatter_plot
|
17 |
|
18 |
logging.basicConfig(level=logging.INFO)
|
19 |
logger = logging.getLogger(__name__)
|
20 |
-
EMBEDDING_MODELS = ["distiluse-base-multilingual-cased-v1", "all-mpnet-base-v2", "flax-sentence-embeddings/all_datasets_v3_mpnet-base"]
|
21 |
-
DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]
|
22 |
-
LANGUAGES = [
|
23 |
-
"af",
|
24 |
-
"ar",
|
25 |
-
"az",
|
26 |
-
"be",
|
27 |
-
"bg",
|
28 |
-
"bn",
|
29 |
-
"ca",
|
30 |
-
"cs",
|
31 |
-
"da",
|
32 |
-
"de",
|
33 |
-
"el",
|
34 |
-
"en",
|
35 |
-
"es",
|
36 |
-
"et",
|
37 |
-
"fa",
|
38 |
-
"fi",
|
39 |
-
"fr",
|
40 |
-
"gu",
|
41 |
-
"he",
|
42 |
-
"hi",
|
43 |
-
"hr",
|
44 |
-
"hu",
|
45 |
-
"hy",
|
46 |
-
"id",
|
47 |
-
"is",
|
48 |
-
"it",
|
49 |
-
"ja",
|
50 |
-
"ka",
|
51 |
-
"kk",
|
52 |
-
"km",
|
53 |
-
"kn",
|
54 |
-
"ko",
|
55 |
-
"lt",
|
56 |
-
"lv",
|
57 |
-
"mk",
|
58 |
-
"ml",
|
59 |
-
"mn",
|
60 |
-
"mr",
|
61 |
-
"my",
|
62 |
-
"ne",
|
63 |
-
"nl",
|
64 |
-
"no",
|
65 |
-
"pl",
|
66 |
-
"pt",
|
67 |
-
"ro",
|
68 |
-
"ru",
|
69 |
-
"uk",
|
70 |
-
"zh",
|
71 |
-
]
|
72 |
-
DOCUMENT_TYPES = ["Whole document", "Sentence"]
|
73 |
-
SEED = 0
|
74 |
-
|
75 |
-
|
76 |
-
def generate_plot(
|
77 |
-
df: pd.DataFrame,
|
78 |
-
text_column: str,
|
79 |
-
label_column: str,
|
80 |
-
sample: Optional[int],
|
81 |
-
dimensionality_reduction_function: Callable,
|
82 |
-
model: SentenceTransformer,
|
83 |
-
) -> Figure:
|
84 |
-
if text_column not in df.columns:
|
85 |
-
raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
|
86 |
-
if label_column not in df.columns:
|
87 |
-
df[label_column] = 0
|
88 |
-
df = df.dropna(subset=[text_column, label_column])
|
89 |
-
if sample:
|
90 |
-
df = df.sample(min(sample, df.shape[0]), random_state=SEED)
|
91 |
-
with st.spinner(text="Embedding text..."):
|
92 |
-
embeddings = embed_text(df[text_column].values.tolist(), model)
|
93 |
-
logger.info("Encoding labels")
|
94 |
-
encoded_labels = encode_labels(df[label_column])
|
95 |
-
with st.spinner("Reducing dimensionality..."):
|
96 |
-
embeddings_2d = dimensionality_reduction_function(embeddings)
|
97 |
-
logger.info("Generating figure")
|
98 |
-
plot = draw_interactive_scatter_plot(
|
99 |
-
df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
|
100 |
-
)
|
101 |
-
return plot
|
102 |
|
103 |
|
104 |
st.title("Perplexity Lenses")
|
@@ -150,7 +63,7 @@ if uploaded_file or hub_dataset:
|
|
150 |
# Round perplexity
|
151 |
df["perplexity"] = df["perplexity"].round().astype(int)
|
152 |
logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
|
153 |
-
plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model)
|
154 |
logger.info("Displaying plot")
|
155 |
st.bokeh_chart(plot)
|
156 |
logger.info("Done")
|
|
|
1 |
import logging
|
2 |
from functools import partial
|
|
|
3 |
|
|
|
4 |
import streamlit as st
|
|
|
5 |
from embedding_lenses.data import uploaded_file_to_dataframe
|
6 |
from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
|
7 |
+
from embedding_lenses.embedding import load_model
|
|
|
|
|
8 |
|
9 |
from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
|
10 |
+
from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
|
11 |
from perplexity_lenses.perplexity import KenlmModel
|
|
|
12 |
|
13 |
logging.basicConfig(level=logging.INFO)
|
14 |
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
st.title("Perplexity Lenses")
|
|
|
63 |
# Round perplexity
|
64 |
df["perplexity"] = df["perplexity"].round().astype(int)
|
65 |
logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
|
66 |
+
plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED, context_logger=st.spinner)
|
67 |
logger.info("Displaying plot")
|
68 |
st.bokeh_chart(plot)
|
69 |
logger.info("Done")
|
cli.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from functools import partial
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
import typer
|
6 |
+
from bokeh.plotting import output_file as bokeh_output_file
|
7 |
+
from bokeh.plotting import save
|
8 |
+
from embedding_lenses.data import uploaded_file_to_dataframe
|
9 |
+
from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
|
10 |
+
from embedding_lenses.embedding import load_model
|
11 |
+
|
12 |
+
from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
|
13 |
+
from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
|
14 |
+
from perplexity_lenses.perplexity import KenlmModel
|
15 |
+
|
16 |
+
logging.basicConfig(level=logging.INFO)
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
|
20 |
+
app = typer.Typer()
|
21 |
+
|
22 |
+
|
23 |
+
@app.command()
def main(
    dataset: str = typer.Option("mc4", help="The name of the hub dataset or local csv/tsv file."),
    dataset_config: Optional[str] = typer.Option("es", help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files."),
    dataset_split: Optional[str] = typer.Option("train", help="The dataset split. Does not apply to local csv/tsv files."),
    text_column: str = typer.Option("text", help="The text field name."),
    language: str = typer.Option("es", help=f"The language of the text. Options: {LANGUAGES}"),
    doc_type: str = typer.Option("sentence", help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}."),
    sample: int = typer.Option(1000, help="Maximum number of examples to use."),
    dimensionality_reduction: str = typer.Option(
        DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
        help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
    ),
    model_name: str = typer.Option(EMBEDDING_MODELS[0], help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}"),
    output_file: str = typer.Option("perplexity.html", help="The name of the output visualization HTML file."),
):
    """
    Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
    """
    # NOTE: the docstring above is kept verbatim because it doubles as the CLI help text.
    #
    # Pipeline: load the embedding model and the KenLM model, build a DataFrame that has a
    # "perplexity" column, round that column to integers, draw the plot and save it as HTML.
    logger.info("Loading embedding model...")
    model = load_model(model_name)
    # Any value other than "umap" (case-insensitive) selects t-SNE.
    dimensionality_reduction_function = (
        partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction.lower() == "umap" else partial(get_tsne_embeddings, random_state=SEED)
    )
    logger.info("Loading KenLM model...")
    kenlm_model = KenlmModel.from_pretrained(language)
    logger.info("Loading dataset...")
    if dataset.endswith((".csv", ".tsv")):
        # Local file: perplexity is computed here, row by row.
        df = uploaded_file_to_dataframe(dataset)
        if doc_type.lower() == "sentence":
            df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
        elif sample:
            # Whole-document mode used to ignore --sample for local files, so every row
            # was scored and embedded. Sample first so the limit is honoured and the
            # perplexity model only runs on the rows that will actually be plotted.
            df = df.sample(min(sample, df.shape[0]), random_state=SEED)
        df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
    else:
        # Hub dataset: sampling and perplexity are delegated to the helper.
        df = hub_dataset_to_dataframe(dataset, dataset_config, dataset_split, sample, text_column, kenlm_model, seed=SEED, doc_type=doc_type)
    # Round perplexity
    df["perplexity"] = df["perplexity"].round().astype(int)
    logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
    # The frame is already limited to `sample` rows at this point, so no further sampling is requested.
    plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED)
    logger.info("Saving plot")
    bokeh_output_file(output_file)
    save(plot)
    logger.info("Done")
|
65 |
+
|
66 |
+
|
67 |
+
if __name__ == "__main__":
|
68 |
+
app()
|
perplexity_lenses/data.py
CHANGED
@@ -17,7 +17,7 @@ def hub_dataset_to_dataframe(
|
|
17 |
if split:
|
18 |
load_dataset_fn = partial(load_dataset_fn, split=split)
|
19 |
dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
|
20 |
-
if doc_type == "
|
21 |
dataset = dataset.map(lambda x: [{text_column: sentence, "perplexity": model.get_perplexity(sentence)} for sentence in x[text_column].split("\n")])
|
22 |
else:
|
23 |
dataset = dataset.map(lambda x: {text_column: x[text_column], "perplexity": model.get_perplexity(x[text_column])})
|
|
|
17 |
if split:
|
18 |
load_dataset_fn = partial(load_dataset_fn, split=split)
|
19 |
dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
|
20 |
+
if doc_type.lower() == "sentence":
|
21 |
dataset = dataset.map(lambda x: [{text_column: sentence, "perplexity": model.get_perplexity(sentence)} for sentence in x[text_column].split("\n")])
|
22 |
else:
|
23 |
dataset = dataset.map(lambda x: {text_column: x[text_column], "perplexity": model.get_perplexity(x[text_column])})
|
perplexity_lenses/engine.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import time
|
3 |
+
from typing import Callable, Optional, Union
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
import streamlit as st
|
7 |
+
from bokeh.plotting import Figure
|
8 |
+
from embedding_lenses.embedding import embed_text
|
9 |
+
from embedding_lenses.utils import encode_labels
|
10 |
+
from sentence_transformers import SentenceTransformer
|
11 |
+
|
12 |
+
from perplexity_lenses.visualization import draw_interactive_scatter_plot
|
13 |
+
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
EMBEDDING_MODELS = ["distiluse-base-multilingual-cased-v1", "all-mpnet-base-v2", "flax-sentence-embeddings/all_datasets_v3_mpnet-base"]
|
16 |
+
DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]
|
17 |
+
DOCUMENT_TYPES = ["Whole document", "Sentence"]
|
18 |
+
SEED = 0
|
19 |
+
LANGUAGES = [
|
20 |
+
"af",
|
21 |
+
"ar",
|
22 |
+
"az",
|
23 |
+
"be",
|
24 |
+
"bg",
|
25 |
+
"bn",
|
26 |
+
"ca",
|
27 |
+
"cs",
|
28 |
+
"da",
|
29 |
+
"de",
|
30 |
+
"el",
|
31 |
+
"en",
|
32 |
+
"es",
|
33 |
+
"et",
|
34 |
+
"fa",
|
35 |
+
"fi",
|
36 |
+
"fr",
|
37 |
+
"gu",
|
38 |
+
"he",
|
39 |
+
"hi",
|
40 |
+
"hr",
|
41 |
+
"hu",
|
42 |
+
"hy",
|
43 |
+
"id",
|
44 |
+
"is",
|
45 |
+
"it",
|
46 |
+
"ja",
|
47 |
+
"ka",
|
48 |
+
"kk",
|
49 |
+
"km",
|
50 |
+
"kn",
|
51 |
+
"ko",
|
52 |
+
"lt",
|
53 |
+
"lv",
|
54 |
+
"mk",
|
55 |
+
"ml",
|
56 |
+
"mn",
|
57 |
+
"mr",
|
58 |
+
"my",
|
59 |
+
"ne",
|
60 |
+
"nl",
|
61 |
+
"no",
|
62 |
+
"pl",
|
63 |
+
"pt",
|
64 |
+
"ro",
|
65 |
+
"ru",
|
66 |
+
"uk",
|
67 |
+
"zh",
|
68 |
+
]
|
69 |
+
|
70 |
+
|
71 |
+
class ContextLogger:
    """Context manager that logs a message on entry and the elapsed time on exit.

    It is the default ``context_logger`` of ``generate_plot`` in this module, which
    calls it with the message either positionally or as ``text=...``.

    Args:
        text: Message logged at INFO level when the context is entered.
    """

    def __init__(self, text: str = ""):
        self.text = text
        # Set here as well so the attribute exists right after construction;
        # the value that is actually reported is refreshed in __enter__.
        self.start_time = time.time()

    def __enter__(self):
        # Restart the clock on entry so that any delay between constructing the
        # object and entering the ``with`` block is not counted as elapsed time.
        self.start_time = time.time()
        logger.info(self.text)
        # Returning self makes ``with ContextLogger(...) as ctx`` usable.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Parameters renamed from ``type``/``value`` to avoid shadowing the builtin.
        logger.info(f"Took: {time.time() - self.start_time:.4f} seconds")
        # Falls through returning None, so exceptions raised in the block propagate.
|
81 |
+
|
82 |
+
|
83 |
+
def generate_plot(
    df: pd.DataFrame,
    text_column: str,
    label_column: str,
    sample: Optional[int],
    dimensionality_reduction_function: Callable,
    model: SentenceTransformer,
    seed: int = 0,
    context_logger: Callable = ContextLogger,
) -> Figure:
    """Embed the texts in ``df``, reduce the embeddings and draw an interactive scatter plot.

    Args:
        df: Input data; must contain ``text_column``. It is not modified.
        text_column: Name of the column holding the texts to embed.
        label_column: Name of the column used for labels. If the column is missing,
            every row is given the label ``0``.
        sample: If truthy, at most this many rows are drawn with ``DataFrame.sample``.
            ``None`` or ``0`` keeps every row.
        dimensionality_reduction_function: Called with the embeddings; its result is
            indexed as ``[:, 0]`` and ``[:, 1]`` for the plot coordinates.
        model: Model passed to ``embed_text`` together with the list of texts.
        seed: ``random_state`` used when sampling rows.
        context_logger: Factory called with a message (positionally or as ``text=``)
            that returns a context manager wrapping the slow steps, for example
            ``st.spinner`` or the default ``ContextLogger``.

    Returns:
        The figure produced by ``draw_interactive_scatter_plot``.

    Raises:
        ValueError: If ``text_column`` is not a column of ``df``, or if no rows remain
            after rows with missing text or label are dropped.
    """
    if text_column not in df.columns:
        raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
    if label_column not in df.columns:
        # assign() returns a new frame, so the caller's DataFrame is left untouched
        # (the previous in-place column assignment leaked the dummy label column to the caller).
        df = df.assign(**{label_column: 0})
    df = df.dropna(subset=[text_column, label_column])
    if df.empty:
        # Fail early with a clear message instead of handing an empty list to the
        # embedding model and the dimensionality reduction.
        raise ValueError(f"No rows left to plot after dropping missing values in columns '{text_column}' and '{label_column}'.")
    if sample:
        df = df.sample(min(sample, df.shape[0]), random_state=seed)
    with context_logger(text="Embedding text..."):
        embeddings = embed_text(df[text_column].values.tolist(), model)
    logger.info("Encoding labels")
    encoded_labels = encode_labels(df[label_column])
    with context_logger("Reducing dimensionality..."):
        embeddings_2d = dimensionality_reduction_function(embeddings)
    logger.info("Generating figure")
    plot = draw_interactive_scatter_plot(
        df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
    )
    return plot
|
requirements.txt
CHANGED
@@ -6,5 +6,6 @@ sentence-transformers==2.0.0
|
|
6 |
bokeh==2.2.2
|
7 |
umap-learn==0.5.2
|
8 |
numpy==1.20.0
|
9 |
-
https://files.pythonhosted.org/packages/
|
10 |
-
https://github.com/kpu/kenlm/archive/master.zip
|
|
|
|
6 |
bokeh==2.2.2
|
7 |
umap-learn==0.5.2
|
8 |
numpy==1.20.0
|
9 |
+
https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
|
10 |
+
https://github.com/kpu/kenlm/archive/master.zip
|
11 |
+
typer==0.4.0
|