edugp committed on
Commit
86e673e
1 Parent(s): 9ec7b19

Add CLI and refactor

Browse files
Files changed (5) hide show
  1. app.py +3 -90
  2. cli.py +68 -0
  3. perplexity_lenses/data.py +1 -1
  4. perplexity_lenses/engine.py +110 -0
  5. requirements.txt +3 -2
app.py CHANGED
@@ -1,104 +1,17 @@
1
  import logging
2
  from functools import partial
3
- from typing import Callable, Optional
4
 
5
- import pandas as pd
6
  import streamlit as st
7
- from bokeh.plotting import Figure
8
  from embedding_lenses.data import uploaded_file_to_dataframe
9
  from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
10
- from embedding_lenses.embedding import embed_text, load_model
11
- from embedding_lenses.utils import encode_labels
12
- from sentence_transformers import SentenceTransformer
13
 
14
  from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
 
15
  from perplexity_lenses.perplexity import KenlmModel
16
- from perplexity_lenses.visualization import draw_interactive_scatter_plot
17
 
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
- EMBEDDING_MODELS = ["distiluse-base-multilingual-cased-v1", "all-mpnet-base-v2", "flax-sentence-embeddings/all_datasets_v3_mpnet-base"]
21
- DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]
22
- LANGUAGES = [
23
- "af",
24
- "ar",
25
- "az",
26
- "be",
27
- "bg",
28
- "bn",
29
- "ca",
30
- "cs",
31
- "da",
32
- "de",
33
- "el",
34
- "en",
35
- "es",
36
- "et",
37
- "fa",
38
- "fi",
39
- "fr",
40
- "gu",
41
- "he",
42
- "hi",
43
- "hr",
44
- "hu",
45
- "hy",
46
- "id",
47
- "is",
48
- "it",
49
- "ja",
50
- "ka",
51
- "kk",
52
- "km",
53
- "kn",
54
- "ko",
55
- "lt",
56
- "lv",
57
- "mk",
58
- "ml",
59
- "mn",
60
- "mr",
61
- "my",
62
- "ne",
63
- "nl",
64
- "no",
65
- "pl",
66
- "pt",
67
- "ro",
68
- "ru",
69
- "uk",
70
- "zh",
71
- ]
72
- DOCUMENT_TYPES = ["Whole document", "Sentence"]
73
- SEED = 0
74
-
75
-
76
- def generate_plot(
77
- df: pd.DataFrame,
78
- text_column: str,
79
- label_column: str,
80
- sample: Optional[int],
81
- dimensionality_reduction_function: Callable,
82
- model: SentenceTransformer,
83
- ) -> Figure:
84
- if text_column not in df.columns:
85
- raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
86
- if label_column not in df.columns:
87
- df[label_column] = 0
88
- df = df.dropna(subset=[text_column, label_column])
89
- if sample:
90
- df = df.sample(min(sample, df.shape[0]), random_state=SEED)
91
- with st.spinner(text="Embedding text..."):
92
- embeddings = embed_text(df[text_column].values.tolist(), model)
93
- logger.info("Encoding labels")
94
- encoded_labels = encode_labels(df[label_column])
95
- with st.spinner("Reducing dimensionality..."):
96
- embeddings_2d = dimensionality_reduction_function(embeddings)
97
- logger.info("Generating figure")
98
- plot = draw_interactive_scatter_plot(
99
- df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
100
- )
101
- return plot
102
 
103
 
104
  st.title("Perplexity Lenses")
@@ -150,7 +63,7 @@ if uploaded_file or hub_dataset:
150
  # Round perplexity
151
  df["perplexity"] = df["perplexity"].round().astype(int)
152
  logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
153
- plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model)
154
  logger.info("Displaying plot")
155
  st.bokeh_chart(plot)
156
  logger.info("Done")
 
1
  import logging
2
  from functools import partial
 
3
 
 
4
  import streamlit as st
 
5
  from embedding_lenses.data import uploaded_file_to_dataframe
6
  from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
7
+ from embedding_lenses.embedding import load_model
 
 
8
 
9
  from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
10
+ from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
11
  from perplexity_lenses.perplexity import KenlmModel
 
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  st.title("Perplexity Lenses")
 
63
  # Round perplexity
64
  df["perplexity"] = df["perplexity"].round().astype(int)
65
  logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
66
+ plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED, context_logger=st.spinner)
67
  logger.info("Displaying plot")
68
  st.bokeh_chart(plot)
69
  logger.info("Done")
cli.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from functools import partial
3
+ from typing import Optional
4
+
5
+ import typer
6
+ from bokeh.plotting import output_file as bokeh_output_file
7
+ from bokeh.plotting import save
8
+ from embedding_lenses.data import uploaded_file_to_dataframe
9
+ from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
10
+ from embedding_lenses.embedding import load_model
11
+
12
+ from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
13
+ from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
14
+ from perplexity_lenses.perplexity import KenlmModel
15
+
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ app = typer.Typer()
21
+
22
+
23
+ @app.command()
24
+ def main(
25
+ dataset: str = typer.Option("mc4", help="The name of the hub dataset or local csv/tsv file."),
26
+ dataset_config: Optional[str] = typer.Option("es", help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files."),
27
+ dataset_split: Optional[str] = typer.Option("train", help="The dataset split. Does not apply to local csv/tsv files."),
28
+ text_column: str = typer.Option("text", help="The text field name."),
29
+ language: str = typer.Option("es", help=f"The language of the text. Options: {LANGUAGES}"),
30
+ doc_type: str = typer.Option("sentence", help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}."),
31
+ sample: int = typer.Option(1000, help="Maximum number of examples to use."),
32
+ dimensionality_reduction: str = typer.Option(
33
+ DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
34
+ help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
35
+ ),
36
+ model_name: str = typer.Option(EMBEDDING_MODELS[0], help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}"),
37
+ output_file: str = typer.Option("perplexity.html", help="The name of the output visualization HTML file."),
38
+ ):
39
+ """
40
+ Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
41
+ """
42
+ logger.info("Loading embedding model...")
43
+ model = load_model(model_name)
44
+ dimensionality_reduction_function = (
45
+ partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction.lower() == "umap" else partial(get_tsne_embeddings, random_state=SEED)
46
+ )
47
+ logger.info("Loading KenLM model...")
48
+ kenlm_model = KenlmModel.from_pretrained(language)
49
+ logger.info("Loading dataset...")
50
+ if dataset.endswith(".csv") or dataset.endswith(".tsv"):
51
+ df = uploaded_file_to_dataframe(dataset)
52
+ if doc_type.lower() == "sentence":
53
+ df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
54
+ df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
55
+ else:
56
+ df = hub_dataset_to_dataframe(dataset, dataset_config, dataset_split, sample, text_column, kenlm_model, seed=SEED, doc_type=doc_type)
57
+ # Round perplexity
58
+ df["perplexity"] = df["perplexity"].round().astype(int)
59
+ logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
60
+ plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED)
61
+ logger.info("Saving plot")
62
+ bokeh_output_file(output_file)
63
+ save(plot)
64
+ logger.info("Done")
65
+
66
+
67
+ if __name__ == "__main__":
68
+ app()
perplexity_lenses/data.py CHANGED
@@ -17,7 +17,7 @@ def hub_dataset_to_dataframe(
17
  if split:
18
  load_dataset_fn = partial(load_dataset_fn, split=split)
19
  dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
20
- if doc_type == "Sentence":
21
  dataset = dataset.map(lambda x: [{text_column: sentence, "perplexity": model.get_perplexity(sentence)} for sentence in x[text_column].split("\n")])
22
  else:
23
  dataset = dataset.map(lambda x: {text_column: x[text_column], "perplexity": model.get_perplexity(x[text_column])})
 
17
  if split:
18
  load_dataset_fn = partial(load_dataset_fn, split=split)
19
  dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
20
+ if doc_type.lower() == "sentence":
21
  dataset = dataset.map(lambda x: [{text_column: sentence, "perplexity": model.get_perplexity(sentence)} for sentence in x[text_column].split("\n")])
22
  else:
23
  dataset = dataset.map(lambda x: {text_column: x[text_column], "perplexity": model.get_perplexity(x[text_column])})
perplexity_lenses/engine.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from typing import Callable, Optional, Union
4
+
5
+ import pandas as pd
6
+ import streamlit as st
7
+ from bokeh.plotting import Figure
8
+ from embedding_lenses.embedding import embed_text
9
+ from embedding_lenses.utils import encode_labels
10
+ from sentence_transformers import SentenceTransformer
11
+
12
+ from perplexity_lenses.visualization import draw_interactive_scatter_plot
13
+
14
+ logger = logging.getLogger(__name__)
15
+ EMBEDDING_MODELS = ["distiluse-base-multilingual-cased-v1", "all-mpnet-base-v2", "flax-sentence-embeddings/all_datasets_v3_mpnet-base"]
16
+ DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]
17
+ DOCUMENT_TYPES = ["Whole document", "Sentence"]
18
+ SEED = 0
19
+ LANGUAGES = [
20
+ "af",
21
+ "ar",
22
+ "az",
23
+ "be",
24
+ "bg",
25
+ "bn",
26
+ "ca",
27
+ "cs",
28
+ "da",
29
+ "de",
30
+ "el",
31
+ "en",
32
+ "es",
33
+ "et",
34
+ "fa",
35
+ "fi",
36
+ "fr",
37
+ "gu",
38
+ "he",
39
+ "hi",
40
+ "hr",
41
+ "hu",
42
+ "hy",
43
+ "id",
44
+ "is",
45
+ "it",
46
+ "ja",
47
+ "ka",
48
+ "kk",
49
+ "km",
50
+ "kn",
51
+ "ko",
52
+ "lt",
53
+ "lv",
54
+ "mk",
55
+ "ml",
56
+ "mn",
57
+ "mr",
58
+ "my",
59
+ "ne",
60
+ "nl",
61
+ "no",
62
+ "pl",
63
+ "pt",
64
+ "ro",
65
+ "ru",
66
+ "uk",
67
+ "zh",
68
+ ]
69
+
70
+
71
+ class ContextLogger:
72
+ def __init__(self, text: str = ""):
73
+ self.text = text
74
+ self.start_time = time.time()
75
+
76
+ def __enter__(self):
77
+ logger.info(self.text)
78
+
79
+ def __exit__(self, type, value, traceback):
80
+ logger.info(f"Took: {time.time() - self.start_time:.4f} seconds")
81
+
82
+
83
+ def generate_plot(
84
+ df: pd.DataFrame,
85
+ text_column: str,
86
+ label_column: str,
87
+ sample: Optional[int],
88
+ dimensionality_reduction_function: Callable,
89
+ model: SentenceTransformer,
90
+ seed: int = 0,
91
+ context_logger: Union[st.spinner, ContextLogger] = ContextLogger,
92
+ ) -> Figure:
93
+ if text_column not in df.columns:
94
+ raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
95
+ if label_column not in df.columns:
96
+ df[label_column] = 0
97
+ df = df.dropna(subset=[text_column, label_column])
98
+ if sample:
99
+ df = df.sample(min(sample, df.shape[0]), random_state=seed)
100
+ with context_logger(text="Embedding text..."):
101
+ embeddings = embed_text(df[text_column].values.tolist(), model)
102
+ logger.info("Encoding labels")
103
+ encoded_labels = encode_labels(df[label_column])
104
+ with context_logger("Reducing dimensionality..."):
105
+ embeddings_2d = dimensionality_reduction_function(embeddings)
106
+ logger.info("Generating figure")
107
+ plot = draw_interactive_scatter_plot(
108
+ df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
109
+ )
110
+ return plot
requirements.txt CHANGED
@@ -6,5 +6,6 @@ sentence-transformers==2.0.0
6
  bokeh==2.2.2
7
  umap-learn==0.5.2
8
  numpy==1.20.0
9
- https://files.pythonhosted.org/packages/f1/4a/ffebb6203694b9d2c9fdaafcdecba553ff039a0556804e88450c20ab9c73/embedding_lenses-0.4.0-py3-none-any.whl
10
- https://github.com/kpu/kenlm/archive/master.zip
 
 
6
  bokeh==2.2.2
7
  umap-learn==0.5.2
8
  numpy==1.20.0
9
+ https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
10
+ https://github.com/kpu/kenlm/archive/master.zip
11
+ typer==0.4.0