Spaces:
Runtime error
Runtime error
Add tests and fix a bug when splitting documents into sentences: the sample size is now capped at the total number of sentences, rather than at the number of original documents.
Browse files
- README.md +5 -1
- perplexity_lenses/data.py +1 -1
- tests/__init__.py +0 -0
- tests/test_data.py +13 -0
README.md
CHANGED
@@ -20,7 +20,7 @@ The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses).
|
|
20 |
python -m streamlit run app.py
|
21 |
```
|
22 |
|
23 |
-
# CLI
|
24 |
The CLI with no arguments defaults to running mc4 in Spanish.
|
25 |
For full usage:
|
26 |
```
|
@@ -40,3 +40,7 @@ python cli.py \
|
|
40 |
--model-name distiluse-base-multilingual-cased-v1 \
|
41 |
--output-file perplexity.html
|
42 |
```
|
|
|
|
|
|
|
|
|
|
20 |
python -m streamlit run app.py
|
21 |
```
|
22 |
|
23 |
+
# CLI:
|
24 |
The CLI with no arguments defaults to running mc4 in Spanish.
|
25 |
For full usage:
|
26 |
```
|
|
|
40 |
--model-name distiluse-base-multilingual-cased-v1 \
|
41 |
--output-file perplexity.html
|
42 |
```
|
43 |
+
# Tests:
|
44 |
+
```
|
45 |
+
python -m unittest discover -s ./tests/ -p "test_*.py"
|
46 |
+
```
|
perplexity_lenses/data.py
CHANGED
@@ -40,4 +40,4 @@ def hub_dataset_to_dataframe(
|
|
40 |
|
41 |
def documents_df_to_sentences_df(df: pd.DataFrame, text_column: str, sample: int, seed: int = 0):
|
42 |
df_sentences = pd.DataFrame({text_column: np.array(df[text_column].map(lambda x: x.split("\n")).values.tolist()).flatten()})
|
43 |
-
return df_sentences.sample(min(sample, df.shape[0]), random_state=seed)
|
|
|
40 |
|
41 |
def documents_df_to_sentences_df(df: pd.DataFrame, text_column: str, sample: int, seed: int = 0):
|
42 |
df_sentences = pd.DataFrame({text_column: np.array(df[text_column].map(lambda x: x.split("\n")).values.tolist()).flatten()})
|
43 |
+
return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)
|
tests/__init__.py
ADDED
File without changes
|
tests/test_data.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from perplexity_lenses.data import documents_df_to_sentences_df
|
6 |
+
|
7 |
+
|
8 |
+
class TestData(unittest.TestCase):
|
9 |
+
def test_documents_df_to_sentences_df(self):
|
10 |
+
input_df = pd.DataFrame({"text": ["foo\nbar"]})
|
11 |
+
expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
|
12 |
+
output_df = documents_df_to_sentences_df(input_df, "text", 100)
|
13 |
+
pd.testing.assert_frame_equal(output_df, expected_output_df, check_like=True, check_exact=True)
|