Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

sdiazlor HF staff committed 27 days ago

Commit

6a8a817

•

1 Parent(s): 229dcf3

feat: Address edge cases and improve textcat UI

Browse files

Files changed (6) hide show

src/distilabel_dataset_generator/apps/base.py +39 -33
src/distilabel_dataset_generator/apps/faq.py +1 -1
src/distilabel_dataset_generator/apps/sft.py +9 -6
src/distilabel_dataset_generator/apps/textcat.py +117 -72
src/distilabel_dataset_generator/pipelines/textcat.py +79 -45
src/distilabel_dataset_generator/utils.py +2 -2

src/distilabel_dataset_generator/apps/base.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import io
-import re
 import uuid
-from typing import Any, Callable, List, Optional, Tuple, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
-from datasets import Dataset, Features, ClassLabel, Value
 from distilabel.distiset import Distiset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
@@ -15,16 +14,12 @@ from src.distilabel_dataset_generator.utils import (
     _LOGGED_OUT_CSS,
     get_argilla_client,
     list_orgs,
 )
 TEXTCAT_TASK = "text_classification"
-SFT_TASK = "supervised_finetuning"
-def swap_visibilty(oauth_token: Optional[OAuthToken] = None):
-    if oauth_token:
-        return gr.update(elem_classes=["main_ui_logged_in"])
-    else:
-        return gr.update(elem_classes=["main_ui_logged_out"])
 def get_main_ui(
@@ -42,11 +37,22 @@ def get_main_ui(
                 return default_datasets[index]
         if task == TEXTCAT_TASK:
             result = fn_generate_dataset(
-                system_prompt, difficulty="mixed", clarity="mixed", labels=[], num_labels=1, num_rows=1, progress=progress, is_sample=True
             )
         else:
             result = fn_generate_dataset(
-                system_prompt, num_turns=1, num_rows=1, progress=progress, is_sample=True
             )
         return result
@@ -77,6 +83,7 @@ def get_main_ui(
                 default_dataset_descriptions=default_dataset_descriptions,
                 default_system_prompts=default_system_prompts,
                 default_datasets=default_datasets,
             )
             gr.Markdown("## Generate full dataset")
             gr.Markdown(
@@ -88,7 +95,7 @@ def get_main_ui(
             (
                 dataset_name,
                 add_to_existing_dataset,
-                btn_generate_full_dataset_copy,
                 btn_generate_and_push_to_argilla,
                 btn_push_to_argilla,
                 org_name,
@@ -99,7 +106,7 @@ def get_main_ui(
                 btn_push_to_hub,
                 final_dataset,
                 success_message,
-            ) = get_push_to_hub_ui(default_datasets)
         sample_dataset.change(
             fn=lambda x: x,
@@ -118,7 +125,7 @@ def get_main_ui(
             outputs=[sample_dataset],
             show_progress=True,
         )
         btn_generate_sample_dataset.click(
             fn=fn_generate_sample_dataset,
             inputs=[system_prompt],
@@ -141,7 +148,7 @@ def get_main_ui(
         btn_generate_sample_dataset,
         dataset_name,
         add_to_existing_dataset,
-        btn_generate_full_dataset_copy,
         btn_generate_and_push_to_argilla,
         btn_push_to_argilla,
         org_name,
@@ -185,12 +192,6 @@ def validate_argilla_user_workspace_dataset(
     return final_dataset
-def get_login_button():
-    return gr.LoginButton(
-        value="Sign in with Hugging Face!", size="lg", scale=2
-    ).activate()
 def get_org_dropdown(oauth_token: OAuthToken = None):
     orgs = list_orgs(oauth_token)
     return gr.Dropdown(
@@ -201,12 +202,12 @@ def get_org_dropdown(oauth_token: OAuthToken = None):
     )
-def get_push_to_hub_ui(default_datasets):
-    with gr.Column() as push_to_hub_ui:
         (
             dataset_name,
             add_to_existing_dataset,
-            btn_generate_full_dataset_copy,
             btn_generate_and_push_to_argilla,
             btn_push_to_argilla,
         ) = get_argilla_tab()
@@ -223,7 +224,7 @@ def get_push_to_hub_ui(default_datasets):
     return (
         dataset_name,
         add_to_existing_dataset,
-        btn_generate_full_dataset_copy,
         btn_generate_and_push_to_argilla,
         btn_push_to_argilla,
         org_name,
@@ -241,10 +242,11 @@ def get_iterate_on_sample_dataset_ui(
     default_dataset_descriptions: List[str],
     default_system_prompts: List[str],
     default_datasets: List[pd.DataFrame],
 ):
     with gr.Column():
         dataset_description = gr.TextArea(
-            label="Give a precise description of the assistant or tool. Don't describe the dataset",
             value=default_dataset_descriptions[0],
             lines=2,
         )
@@ -261,9 +263,9 @@ def get_iterate_on_sample_dataset_ui(
             gr.Column(scale=1)
         system_prompt = gr.TextArea(
-            label="System prompt for dataset generation. You can tune it and regenerate the sample",
             value=default_system_prompts[0],
-            lines=5,
         )
         with gr.Row():
@@ -315,7 +317,7 @@ def get_argilla_tab() -> Tuple[Any]:
                 dataset_name = gr.Textbox(
                     label="Dataset name",
                     placeholder="dataset_name",
-                    value=f"my-distiset-{uuid.uuid4()}", ######## CHANGE AFTER TESTING
                 )
                 add_to_existing_dataset = gr.Checkbox(
                     label="Allow adding records to existing dataset",
@@ -326,7 +328,7 @@ def get_argilla_tab() -> Tuple[Any]:
                 )
             with gr.Row(variant="panel"):
-                btn_generate_full_dataset_copy = gr.Button(
                     value="Generate", variant="primary", scale=2
                 )
                 btn_generate_and_push_to_argilla = gr.Button(
@@ -344,7 +346,7 @@ def get_argilla_tab() -> Tuple[Any]:
     return (
         dataset_name,
         add_to_existing_dataset,
-        btn_generate_full_dataset_copy,
         btn_generate_and_push_to_argilla,
         btn_push_to_argilla,
     )
@@ -418,8 +420,12 @@ def push_dataset_to_hub(
 ) -> pd.DataFrame:
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
     if task == TEXTCAT_TASK and num_labels == 1:
         distiset = Distiset(
             {
                 "default": Dataset.from_pandas(

 import io
 import uuid
+from typing import Any, Callable, List, Tuple, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
+from datasets import ClassLabel, Dataset, Features, Value
 from distilabel.distiset import Distiset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
     _LOGGED_OUT_CSS,
     get_argilla_client,
     list_orgs,
+    swap_visibilty,
+    get_login_button,
 )
 TEXTCAT_TASK = "text_classification"
+SFT_TASK = "supervised_fine_tuning"
 def get_main_ui(
                 return default_datasets[index]
         if task == TEXTCAT_TASK:
             result = fn_generate_dataset(
+                system_prompt=system_prompt,
+                difficulty="mixed",
+                clarity="mixed",
+                labels=[],
+                num_labels=1,
+                num_rows=1,
+                progress=progress,
+                is_sample=True,
             )
         else:
             result = fn_generate_dataset(
+                system_prompt=system_prompt,
+                num_turns=1,
+                num_rows=1,
+                progress=progress,
+                is_sample=True,
             )
         return result
                 default_dataset_descriptions=default_dataset_descriptions,
                 default_system_prompts=default_system_prompts,
                 default_datasets=default_datasets,
+                task=task,
             )
             gr.Markdown("## Generate full dataset")
             gr.Markdown(
             (
                 dataset_name,
                 add_to_existing_dataset,
+                btn_generate_full_dataset_argilla,
                 btn_generate_and_push_to_argilla,
                 btn_push_to_argilla,
                 org_name,
                 btn_push_to_hub,
                 final_dataset,
                 success_message,
+            ) = get_push_to_ui(default_datasets)
         sample_dataset.change(
             fn=lambda x: x,
             outputs=[sample_dataset],
             show_progress=True,
         )
         btn_generate_sample_dataset.click(
             fn=fn_generate_sample_dataset,
             inputs=[system_prompt],
         btn_generate_sample_dataset,
         dataset_name,
         add_to_existing_dataset,
+        btn_generate_full_dataset_argilla,
         btn_generate_and_push_to_argilla,
         btn_push_to_argilla,
         org_name,
     return final_dataset
 def get_org_dropdown(oauth_token: OAuthToken = None):
     orgs = list_orgs(oauth_token)
     return gr.Dropdown(
     )
+def get_push_to_ui(default_datasets):
+    with gr.Column() as push_to_ui:
         (
             dataset_name,
             add_to_existing_dataset,
+            btn_generate_full_dataset_argilla,
             btn_generate_and_push_to_argilla,
             btn_push_to_argilla,
         ) = get_argilla_tab()
     return (
         dataset_name,
         add_to_existing_dataset,
+        btn_generate_full_dataset_argilla,
         btn_generate_and_push_to_argilla,
         btn_push_to_argilla,
         org_name,
     default_dataset_descriptions: List[str],
     default_system_prompts: List[str],
     default_datasets: List[pd.DataFrame],
+    task: str,
 ):
     with gr.Column():
         dataset_description = gr.TextArea(
+            label="Give a precise description of your desired application. Check the examples for inspiration.",
             value=default_dataset_descriptions[0],
             lines=2,
         )
             gr.Column(scale=1)
         system_prompt = gr.TextArea(
+            label="System prompt for dataset generation. You can tune it and regenerate the sample.",
             value=default_system_prompts[0],
+            lines=2 if task == TEXTCAT_TASK else 5,
         )
         with gr.Row():
                 dataset_name = gr.Textbox(
                     label="Dataset name",
                     placeholder="dataset_name",
+                    value="my-distiset",
                 )
                 add_to_existing_dataset = gr.Checkbox(
                     label="Allow adding records to existing dataset",
                 )
             with gr.Row(variant="panel"):
+                btn_generate_full_dataset_argilla = gr.Button(
                     value="Generate", variant="primary", scale=2
                 )
                 btn_generate_and_push_to_argilla = gr.Button(
     return (
         dataset_name,
         add_to_existing_dataset,
+        btn_generate_full_dataset_argilla,
         btn_generate_and_push_to_argilla,
         btn_push_to_argilla,
     )
 ) -> pd.DataFrame:
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
     if task == TEXTCAT_TASK and num_labels == 1:
+        labels = [label.lower().strip() for label in labels]
+        dataframe["label"] = dataframe["label"].apply(
+            lambda x: x if x in labels else None
+        )
         distiset = Distiset(
             {
                 "default": Dataset.from_pandas(

src/distilabel_dataset_generator/apps/faq.py CHANGED Viewed

@@ -15,7 +15,7 @@ with gr.Blocks() as app:
                     <p>This tool simplifies the process of creating custom datasets, enabling you to:</p>
                     <ul>
                         <li>Define the characteristics of your desired application</li>
-                        <li>Generate system prompts automatically</li>
                         <li>Create sample datasets for quick iteration</li>
                         <li>Produce full-scale datasets with customizable parameters</li>
                         <li>Push your generated datasets directly to the Hugging Face Hub</li>

                     <p>This tool simplifies the process of creating custom datasets, enabling you to:</p>
                     <ul>
                         <li>Define the characteristics of your desired application</li>
+                        <li>Generate system prompts and tasks automatically</li>
                         <li>Create sample datasets for quick iteration</li>
                         <li>Produce full-scale datasets with customizable parameters</li>
                         <li>Push your generated datasets directly to the Hugging Face Hub</li>

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -67,9 +67,12 @@ def push_dataset_to_hub(
 ):
     original_dataframe = dataframe.copy(deep=True)
     dataframe = convert_dataframe_messages(dataframe)
-    push_to_hub_base(
-        dataframe, private, org_name, repo_name, oauth_token, progress, task=TASK
-    )
     return original_dataframe
@@ -297,7 +300,7 @@ def generate_dataset(
     progress(
         1,
         total=total_steps,
-        desc="(2/2) Generating responses",
     )
     # create distiset
@@ -344,7 +347,7 @@ def generate_dataset(
     btn_generate_sample_dataset,
     dataset_name,
     add_to_existing_dataset,
-    btn_generate_full_dataset_copy,
     btn_generate_and_push_to_argilla,
     btn_push_to_argilla,
     org_name,
@@ -391,7 +394,7 @@ with app:
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
-            btn_generate_full_dataset_copy.click,
         ],
         fn=hide_success_message,
         outputs=[success_message],

 ):
     original_dataframe = dataframe.copy(deep=True)
     dataframe = convert_dataframe_messages(dataframe)
+    try:
+        push_to_hub_base(
+            dataframe, private, org_name, repo_name, oauth_token, progress, task=TASK
+        )
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to the Hub: {e}")
     return original_dataframe
     progress(
         1,
         total=total_steps,
+        desc="(2/2) Creating dataset",
     )
     # create distiset
     btn_generate_sample_dataset,
     dataset_name,
     add_to_existing_dataset,
+    btn_generate_full_dataset_argilla,
     btn_generate_and_push_to_argilla,
     btn_push_to_argilla,
     org_name,
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
+            btn_generate_full_dataset_argilla.click,
         ],
         fn=hide_success_message,
         outputs=[success_message],

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -1,11 +1,10 @@
 import re
-from typing import Dict, List, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
-from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
@@ -34,13 +33,14 @@ from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_SYSTEM_PROMPTS,
     PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
-    get_textcat_generator,
-    get_prompt_generator,
     get_labeller_generator,
 )
 TASK = "text_classification"
 def push_dataset_to_hub(
     dataframe: pd.DataFrame,
     private: bool = True,
@@ -52,17 +52,20 @@ def push_dataset_to_hub(
     num_labels: int = 1,
 ):
     original_dataframe = dataframe.copy(deep=True)
-    push_to_hub_base(
-        dataframe,
-        private,
-        org_name,
-        repo_name,
-        oauth_token,
-        progress,
-        labels,
-        num_labels,
-        task=TASK,
-    )
     return original_dataframe
@@ -79,6 +82,7 @@ def push_dataset_to_argilla(
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
         hf_user = HfApi().whoami(token=oauth_token.token)["name"]
         settings = rg.Settings(
             fields=[
                 rg.TextField(
@@ -131,7 +135,35 @@ def push_dataset_to_argilla(
             rg_dataset = rg_dataset.create()
         progress(0.7, desc="Pushing dataset to Argilla")
         hf_dataset = Dataset.from_pandas(dataframe)
-        rg_dataset.records.log(records=hf_dataset)
         progress(1.0, desc="Dataset pushed to Argilla")
     except Exception as e:
         raise gr.Error(f"Error pushing dataset to Argilla: {e}")
@@ -166,15 +198,22 @@ def generate_dataset(
     system_prompt: str,
     difficulty: str,
     clarity: str,
-    labels: List[str] = [],
-    num_labels: int = 2,
     num_rows: int = 10,
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
-    textcat_generator = get_textcat_generator(difficulty, clarity, is_sample)
-    labeler_generator = get_labeller_generator(num_labels, labels, is_sample)
     total_steps: int = num_rows * 2
     batch_size = DEFAULT_BATCH_SIZE
@@ -197,48 +236,36 @@ def generate_dataset(
         result["text"] = result["input_text"]
     # label text classification data
-    progress(0.5, desc="(1/2) Labeling text classification data")
     if not is_sample:
         n_processed = 0
-        labeler_results = []
         while n_processed < num_rows:
             progress(
                 0.5 + 0.5 * n_processed / num_rows,
                 total=total_steps,
-                desc="(1/2) Generating text classification data",
             )
             batch = textcat_results[n_processed : n_processed + batch_size]
-            labels = list(labeler_generator.process(inputs=batch))
-            labeler_results.extend(labels[0])
             n_processed += batch_size
         progress(
             1,
             total=total_steps,
-            desc="(2/2) Labeling text classification data",
         )
     # create final dataset
     distiset_results = []
-    if is_sample:
-        for result in textcat_results:
-            record = {}
-            for relevant_keys in [
-                "text",
-                "label",
-            ]:
-                if relevant_keys in result:
-                    record[relevant_keys] = result[relevant_keys]
-            distiset_results.append(record)
-    else:
-        for result in labeler_results:
-            record = {}
-            for relevant_keys in [
-                "text",
-                "labels",
-            ]:
-                if relevant_keys in result:
-                    record[relevant_keys] = result[relevant_keys]
-            distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
     if num_labels == 1:
@@ -247,6 +274,23 @@ def generate_dataset(
     return dataframe
 (
     app,
     main_ui,
@@ -259,7 +303,7 @@ def generate_dataset(
     btn_generate_sample_dataset,
     dataset_name,
     add_to_existing_dataset,
-    btn_generate_full_dataset_copy,
     btn_generate_and_push_to_argilla,
     btn_push_to_argilla,
     org_name,
@@ -279,17 +323,6 @@ def generate_dataset(
     task=TASK,
 )
-def update_labels_based_on_checkbox(checked, system_prompt):
-    if checked:
-        pattern = r"'(\b\w+\b)'"
-        new_labels = re.findall(pattern, system_prompt)
-        gr.update(choices=new_labels)
-        return gr.update(value=new_labels)
-    else:
-        return gr.update(choices=[])
 with app:
     with main_ui:
         with custom_input_ui:
@@ -302,6 +335,7 @@ with app:
                 ],
                 value="mixed",
                 label="Difficulty",
             )
             clarity = gr.Dropdown(
                 choices=[
@@ -315,28 +349,35 @@ with app:
                 ],
                 value="mixed",
                 label="Clarity",
             )
-            with gr.Row(variant="default"):
                 labels = gr.Dropdown(
                     choices=[],
                     allow_custom_value=True,
                     interactive=True,
                     label="Labels",
                     multiselect=True,
                 )
-                suggested_labels = gr.Checkbox(
-                    label="Add suggested labels",
-                    value=False,
-                    interactive=True,
-                )
             num_labels = gr.Number(
-                label="Number of labels", value=1, minimum=1, maximum=10
             )
             num_rows = gr.Number(
                 label="Number of rows",
-                value=1,
                 minimum=1,
-                maximum=500,  ###### CHANGE AFTER TESTING
             )
         pipeline_code = get_pipeline_code_ui(
@@ -351,20 +392,24 @@ with app:
         )
     # define app triggers
-    suggested_labels.change(
-        update_labels_based_on_checkbox,
-        inputs=[suggested_labels, system_prompt],
         outputs=labels,
     )
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
-            btn_generate_full_dataset_copy.click,
         ],
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
         fn=generate_dataset,
         inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
         outputs=[final_dataset],
@@ -424,7 +469,7 @@ with app:
         outputs=[success_message],
     ).then(
         fn=push_dataset_to_hub,
-        inputs=[final_dataset, private, org_name, repo_name, labels],
         outputs=[final_dataset],
         show_progress=True,
     ).then(

 import re
+from typing import List, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
     DEFAULT_SYSTEM_PROMPTS,
     PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_labeller_generator,
+    get_prompt_generator,
+    get_textcat_generator,
 )
 TASK = "text_classification"
 def push_dataset_to_hub(
     dataframe: pd.DataFrame,
     private: bool = True,
     num_labels: int = 1,
 ):
     original_dataframe = dataframe.copy(deep=True)
+    try:
+        push_to_hub_base(
+            dataframe,
+            private,
+            org_name,
+            repo_name,
+            oauth_token,
+            progress,
+            labels,
+            num_labels,
+            task=TASK,
+        )
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to the Hub: {e}")
     return original_dataframe
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
         hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        labels = [label.lower().strip() for label in labels]
         settings = rg.Settings(
             fields=[
                 rg.TextField(
             rg_dataset = rg_dataset.create()
         progress(0.7, desc="Pushing dataset to Argilla")
         hf_dataset = Dataset.from_pandas(dataframe)
+        records = [
+            rg.Record(
+                fields={
+                    "text": sample["text"],
+                },
+                metadata={"text_length": sample["text_length"]},
+                vectors={"text_embeddings": sample["text_embeddings"]},
+                suggestions=(
+                    [
+                        rg.Suggestion(
+                            question_name="label" if num_labels == 1 else "labels",
+                            value=(
+                                sample["label"] if num_labels == 1 else sample["labels"]
+                            ),
+                        )
+                    ]
+                    if (
+                        (num_labels == 1 and sample["label"] in labels)
+                        or (
+                            num_labels > 1
+                            and all(label in labels for label in sample["labels"])
+                        )
+                    )
+                    else []
+                ),
+            )
+            for sample in hf_dataset
+        ]
+        rg_dataset.records.log(records=records)
         progress(1.0, desc="Dataset pushed to Argilla")
     except Exception as e:
         raise gr.Error(f"Error pushing dataset to Argilla: {e}")
     system_prompt: str,
     difficulty: str,
     clarity: str,
+    labels: List[str] = None,
+    num_labels: int = 1,
     num_rows: int = 10,
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
+    textcat_generator = get_textcat_generator(
+        difficulty=difficulty, clarity=clarity, is_sample=is_sample
+    )
+    labeller_generator = get_labeller_generator(
+        system_prompt=system_prompt,
+        labels=labels,
+        num_labels=num_labels,
+        is_sample=is_sample,
+    )
     total_steps: int = num_rows * 2
     batch_size = DEFAULT_BATCH_SIZE
         result["text"] = result["input_text"]
     # label text classification data
+    progress(0.5, desc="(1/2) Generating text classification data")
     if not is_sample:
         n_processed = 0
+        labeller_results = []
         while n_processed < num_rows:
             progress(
                 0.5 + 0.5 * n_processed / num_rows,
                 total=total_steps,
+                desc="(1/2) Labeling text classification data",
             )
             batch = textcat_results[n_processed : n_processed + batch_size]
+            labels = list(labeller_generator.process(inputs=batch))
+            labeller_results.extend(labels[0])
             n_processed += batch_size
         progress(
             1,
             total=total_steps,
+            desc="(2/2) Creating dataset",
         )
     # create final dataset
     distiset_results = []
+    source_results = textcat_results if is_sample else labeller_results
+    for result in source_results:
+        record = {
+            key: result[key]
+            for key in ["text", "label" if is_sample else "labels"]
+            if key in result
+        }
+        distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
     if num_labels == 1:
     return dataframe
+def update_suggested_labels(system_prompt):
+    """Suggest classification labels parsed from the system prompt.
+
+    Every single-quoted token made only of word characters and hyphens
+    (for example ``'positive'`` or ``'e-reader'``) is treated as a label.
+
+    Args:
+        system_prompt: Prompt text to scan for quoted labels.
+
+    Returns:
+        A Gradio update that sets both the dropdown choices and the selected
+        values to the labels found, or a no-op update when none were found.
+    """
+    # dict.fromkeys drops repeated labels while keeping first-seen order, so
+    # a label quoted twice in the prompt is only offered once.
+    new_labels = list(
+        dict.fromkeys(re.findall(r"'(\b[\w-]+\b)'", system_prompt or ""))
+    )
+    if not new_labels:
+        # gr.Warning only shows a toast; returning its result would hand None
+        # to the output component (presumably clearing labels the user has
+        # already entered -- confirm against the Gradio version in use).
+        gr.Warning(
+            "No labels found in the system prompt. Please add labels manually."
+        )
+        return gr.update()
+    return gr.update(choices=new_labels, value=new_labels)
+def validate_input_labels(labels):
+    """Ensure at least two labels were chosen before generation starts.
+
+    Args:
+        labels: The labels currently selected; may be empty or ``None``.
+
+    Returns:
+        The same ``labels`` value, unchanged, when validation passes.
+
+    Raises:
+        gr.Error: If fewer than two labels are selected.
+    """
+    # An empty or missing selection counts as zero labels.
+    selected_count = len(labels) if labels else 0
+    if selected_count < 2:
+        raise gr.Error(
+            f"Please select at least 2 labels to classify your text. You selected {selected_count}."
+        )
+    return labels
 (
     app,
     main_ui,
     btn_generate_sample_dataset,
     dataset_name,
     add_to_existing_dataset,
+    btn_generate_full_dataset_argilla,
     btn_generate_and_push_to_argilla,
     btn_push_to_argilla,
     org_name,
     task=TASK,
 )
 with app:
     with main_ui:
         with custom_input_ui:
                 ],
                 value="mixed",
                 label="Difficulty",
+                info="The difficulty of the text to be generated.",
             )
             clarity = gr.Dropdown(
                 choices=[
                 ],
                 value="mixed",
                 label="Clarity",
+                info="The clarity of the text to be generated.",
             )
+            with gr.Column():
                 labels = gr.Dropdown(
                     choices=[],
                     allow_custom_value=True,
                     interactive=True,
                     label="Labels",
                     multiselect=True,
+                    info="Add the labels to classify the text.",
                 )
+                with gr.Blocks():
+                    btn_suggested_labels = gr.Button(
+                        value="Add suggested labels",
+                        size="sm",
+                    )
             num_labels = gr.Number(
+                label="Number of labels",
+                value=1,
+                minimum=1,
+                maximum=10,
+                info="The number of labels to classify the text.",
             )
             num_rows = gr.Number(
                 label="Number of rows",
+                value=10,
                 minimum=1,
+                maximum=500,
+                info="More rows will take longer to generate.",
             )
         pipeline_code = get_pipeline_code_ui(
         )
     # define app triggers
+    btn_suggested_labels.click(
+        fn=update_suggested_labels,
+        inputs=[system_prompt],
         outputs=labels,
     )
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
+            btn_generate_full_dataset_argilla.click,
         ],
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
+        fn=validate_input_labels,
+        inputs=[labels],
+        outputs=[labels],
+    ).success(
         fn=generate_dataset,
         inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
         outputs=[final_dataset],
         outputs=[success_message],
     ).then(
         fn=push_dataset_to_hub,
+        inputs=[final_dataset, private, org_name, repo_name, labels, num_labels],
         outputs=[final_dataset],
         show_progress=True,
     ).then(

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,8 +1,12 @@
-import pandas as pd
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
-from distilabel.steps.tasks import GenerateTextClassificationData, TextClassification, TextGeneration
 from src.distilabel_dataset_generator.pipelines.base import (
     MODEL,
@@ -13,7 +17,9 @@ PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating ve
 Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else.
-The prompt you write should follow the same style and structure as the following example prompts, clearly specifying the possible classification labels where applicable:
 Classify the following customer review of a cinema as either 'positive' or 'negative'.
@@ -25,15 +31,15 @@ Identify the issue category for the following technical support ticket: 'billing
 Classify the following movie review into one of the following categories: 'critical', 'praise', 'disappointed', 'enthusiastic'.
-Determine the level of customer satisfaction from the following customer service transcript: 'satisfied', 'dissatisfied', 'highly satisfied', 'somewhat dissatisfied', 'indifferent'.
 Categorize the following product description into one of the following product types: 'smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones'.
 Classify the following tweet as expressing either 'support' or 'opposition' to the political event discussed.
-Classify the following restaurant review into one of the following categories: 'food quality', 'service', 'ambiance', or 'price'.
-Classify the following blog post based on its primary fashion trend or style: 'casual', 'formal', 'streetwear', 'vintage' or 'sustainable fashion'.
 User dataset description:
 """
@@ -70,76 +76,101 @@ DEFAULT_SYSTEM_PROMPTS = [
 ]
 def generate_pipeline_code(
     system_prompt: str,
-    difficulty: str,
-    clarity: str,
-    labels: List[str],
-    num_labels: int,
-    num_rows: int,
 ) -> str:
-    base = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
-from distilabel.steps import LoadDataFromDicts
-from distilabel.steps.tasks import GenerateTextClassificationData
 MODEL = "{MODEL}"
-TEXTCAT_TASK = "{system_prompt}"
 os.environ["HF_TOKEN"] = (
     "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 )
 with Pipeline(name="textcat") as pipeline:
     textcat_generation = GenerateTextClassificationData(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
             tokenizer_id=MODEL,
-            api_key=_get_next_api_key(),
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
             }},
         ),
-        difficulty={None if difficulty == "mixed" else difficulty},
-        clarity={None if clarity == "mixed" else clarity},
         num_generations={num_rows},
     )
     keep_columns = KeepColumns(
-        columns=["input_text", "model_name"],
     )
-    """
-    if num_labels > 1:
-        return base + """
-    textcat_generation >> keep_columns >> textcat_labeler
     if __name__ == "__main__":
         distiset = pipeline.run()
     """
-    return f"""
-textcat_labeler = TextClassification(
-    llm=InferenceEndpointsLLM(
-        model_id=MODEL,
-        tokenizer_id=MODEL,
-        api_key=_get_next_api_key(),
-        generation_kwargs={{
-            "temperature": 0.8,
-            "max_new_tokens": 2048,
-        }},
-    ),
-    n= {num_labels},
-    available_labels={labels},
-)
-textcat_generation >> keep_columns >> textcat_labeler
-if __name__ == "__main__":
-    distiset = pipeline.run()
-"""
 def get_textcat_generator(difficulty, clarity, is_sample):
     textcat_generator = GenerateTextClassificationData(
@@ -159,7 +190,8 @@ def get_textcat_generator(difficulty, clarity, is_sample):
     return textcat_generator
-def get_labeller_generator(num_labels, labels, is_sample):
     labeller_generator = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
@@ -170,8 +202,10 @@ def get_labeller_generator(num_labels, labels, is_sample):
                 "max_new_tokens": 256 if is_sample else 1024,
             },
         ),
-        n= num_labels,
         available_labels=labels,
     )
     labeller_generator.load()
     return labeller_generator

 from typing import List
+import pandas as pd
 from distilabel.llms import InferenceEndpointsLLM
+from distilabel.steps.tasks import (
+    GenerateTextClassificationData,
+    TextClassification,
+    TextGeneration,
+)
 from src.distilabel_dataset_generator.pipelines.base import (
     MODEL,
 Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else.
+The prompt you write should follow the same style and structure as the following example prompts, clearly specifying the possible classification labels.
+If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
 Classify the following customer review of a cinema as either 'positive' or 'negative'.
 Classify the following movie review into one of the following categories: 'critical', 'praise', 'disappointed', 'enthusiastic'.
+Determine the level of customer satisfaction from the following customer service transcript: 'satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent'.
 Categorize the following product description into one of the following product types: 'smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones'.
 Classify the following tweet as expressing either 'support' or 'opposition' to the political event discussed.
+Classify the following restaurant review into one of the following categories: 'food-quality', 'service', 'ambiance', or 'price'.
+Classify the following blog post based on its primary fashion trend or style: 'casual', 'formal', 'streetwear', 'vintage' or 'sustainable-fashion'.
 User dataset description:
 """
 ]
+from typing import List
+MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 def generate_pipeline_code(
     system_prompt: str,
     difficulty: str = None,
     clarity: str = None,
     labels: List[str] = None,
     num_labels: int = 1,
     num_rows: int = 10,
 ) -> str:
     """Return the source code of a standalone distilabel text-classification script.

     The returned string is meant to be shown to the user so the pipeline can
     be reproduced outside of the app; nothing is executed here.

     Args:
         system_prompt: Task description; emitted as the
             ``TEXT_CLASSIFICATION_TASK`` constant of the generated script.
         difficulty: Difficulty passed to the generation task. The value
             ``"mixed"`` is emitted as ``None``.
         clarity: Clarity passed to the generation task. The value
             ``"mixed"`` is emitted as ``None``.
         labels: Candidate labels; lower-cased and stripped before being
             emitted. ``None`` is treated as an empty list.
         num_labels: When exactly 1 the script only generates data; for any
             other value a ``TextClassification`` labelling step is appended.
         num_rows: Emitted as ``num_generations`` of the generation task.

     Returns:
         The generated script as a string.
     """
     # Normalize labels the same way the labeller does at runtime.
     labels = [label.lower().strip() for label in labels or []]
     # The labelling task is only imported when the script actually uses it.
     task_imports = (
         "GenerateTextClassificationData"
         if num_labels == 1
         else "GenerateTextClassificationData, TextClassification"
     )
     # `system_prompt` is free text typed by the user: emit it with `!r` so
     # quotes, backslashes and newlines are escaped and the generated script
     # always stays valid Python (same approach as `difficulty` / `clarity`).
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps import LoadDataFromDicts, KeepColumns
 from distilabel.steps.tasks import {task_imports}
 MODEL = "{MODEL}"
 TEXT_CLASSIFICATION_TASK = {system_prompt!r}
 os.environ["HF_TOKEN"] = (
     "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 )
 with Pipeline(name="textcat") as pipeline:
     task_generator = LoadDataFromDicts(data=[{{"task": TEXT_CLASSIFICATION_TASK}}])
     textcat_generation = GenerateTextClassificationData(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
             tokenizer_id=MODEL,
             api_key=os.environ["HF_TOKEN"],
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
             }},
         ),
         difficulty={None if difficulty == "mixed" else repr(difficulty)},
         clarity={None if clarity == "mixed" else repr(clarity)},
         num_generations={num_rows},
         output_mappings={{"input_text": "text"}},
     )
     """
     if num_labels == 1:
         # Single-label case: the generation task already produces the label.
         return (
             base_code
             + """
     keep_columns = KeepColumns(
         columns=["text", "label"],
     )
     # Connect steps in the pipeline
     task_generator >> textcat_generation >> keep_columns
     if __name__ == "__main__":
         distiset = pipeline.run()
     """
         )
     # Otherwise only the text is kept and a separate labelling step assigns
     # `num_labels` labels out of `labels`.
     return (
         base_code
         + f"""
     keep_columns = KeepColumns(
         columns=["text"],
     )
     textcat_labeller = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
             tokenizer_id=MODEL,
             api_key=os.environ["HF_TOKEN"],
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
             }},
         ),
         n={num_labels},
         available_labels={labels},
         context=TEXT_CLASSIFICATION_TASK,
         default_label="unknown"
     )
     task_generator >> textcat_generation >> keep_columns >> textcat_labeller
     if __name__ == "__main__":
         distiset = pipeline.run()
     """
     )
 def get_textcat_generator(difficulty, clarity, is_sample):
     textcat_generator = GenerateTextClassificationData(
     return textcat_generator
+def get_labeller_generator(system_prompt, labels, num_labels, is_sample):
+    labels = [label.lower().strip() for label in labels]
     labeller_generator = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
                 "max_new_tokens": 256 if is_sample else 1024,
             },
         ),
+        context=system_prompt,
         available_labels=labels,
+        n=num_labels,
+        default_label="unknown",
     )
     labeller_generator.load()
     return labeller_generator

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import Union
 import argilla as rg
 import gradio as gr
@@ -80,7 +80,7 @@ def get_token(oauth_token: OAuthToken = None):
         return ""
-def swap_visibilty(oauth_token: OAuthToken = None):
     if oauth_token:
         return gr.update(elem_classes=["main_ui_logged_in"])
     else:

 import os
+from typing import Union, Optional
 import argilla as rg
 import gradio as gr
         return ""
+def swap_visibilty(oauth_token: Optional[OAuthToken] = None):
     if oauth_token:
         return gr.update(elem_classes=["main_ui_logged_in"])
     else: