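# Gradio Space: score the first N rows of a Hub dataset's text column with
# nvidia/quality-classifier-deberta and plot the distribution of quality labels.
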
import gradio as gr
import pandas as pd
import polars as pl
import spaces
import torch
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import PyTorchModelHubMixin
from torch import nn
from transformers import AutoConfig, AutoModel, AutoTokenizer


class QualityModel(nn.Module, PyTorchModelHubMixin):
    """Pretrained encoder with a linear classification head, loadable from the Hub."""

    def __init__(self, config):
        super().__init__()
        self.model = AutoModel.from_pretrained(config["base_model"])
        self.dropout = nn.Dropout(config["fc_dropout"])
        self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))

    def forward(self, input_ids, attention_mask):
        features = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        dropped = self.dropout(features)
        outputs = self.fc(dropped)
        # Classify from the first ([CLS]) token and return class probabilities.
        return torch.softmax(outputs[:, 0, :], dim=1)


# Load the classifier once at startup.
device = "cuda" if torch.cuda.is_available() else "cpu"
config = AutoConfig.from_pretrained("nvidia/quality-classifier-deberta")
tokenizer = AutoTokenizer.from_pretrained("nvidia/quality-classifier-deberta")
model = QualityModel.from_pretrained("nvidia/quality-classifier-deberta").to(device)
model.eval()


@spaces.GPU
def predict(texts: list[str]) -> list[str]:
    inputs = tokenizer(
        texts, return_tensors="pt", padding="longest", truncation=True
    ).to(device)
    with torch.inference_mode():
        outputs = model(inputs["input_ids"], inputs["attention_mask"])
    predicted_classes = torch.argmax(outputs, dim=1)
    predicted_labels = [
        config.id2label[class_idx.item()]
        for class_idx in predicted_classes.cpu().numpy()
    ]
    return predicted_labels
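
# Expected usage sketch (labels come from the model's id2label mapping; the
# model card reports "Low" / "Medium" / "High"):
#   predict(["Some raw document text ..."])  # -> e.g. ["Medium"]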


def run_quality_check(dataset, column, n_samples):
    config_name = "default"  # TODO: make the dataset config a user input
    # Read the first shard of the Hub's auto-converted parquet branch
    # (refs/convert/parquet, exposed as the "~parquet" revision by HfFileSystem).
    data = pl.read_parquet(
        f"hf://datasets/{dataset}@~parquet/{config_name}/train/0000.parquet",
        columns=[column],
    )
    texts = data[column].to_list()
    predictions = predict(texts[: int(n_samples)])
    counts = pd.DataFrame({"quality": predictions}).value_counts().reset_index(name="count")
    return counts, counts
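
# Note: the counts frame (columns "quality" and "count") is returned twice so
# one click can update both the hidden table and the bar plot wired up below.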


with gr.Blocks() as demo:
    gr.Markdown("# 💫 Dataset Quality Checker 💫")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for a dataset id on the Hugging Face Hub",
        search_type="dataset",
        value="HuggingFaceFW/fineweb",
    )
    # config_name = "default"  # TODO: user input
    @gr.render(inputs=dataset_name)
    def embed(name):
        html_code = f"""
        <iframe
          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
          frameborder="0"
          width="100%"
          height="700px"
        ></iframe>
            """
        return gr.HTML(value=html_code)
    text_column = gr.Textbox(
        placeholder="text",
        label="Text column name to check (the column must contain non-nested, raw texts!)",
    )
    n_samples = gr.Number(label="Number of first samples to check", value=20, precision=0)
    gr_check_btn = gr.Button("Check Dataset")
    df = gr.DataFrame(visible=False)
    plot = gr.BarPlot(x="quality", y="count")
    gr_check_btn.click(
        run_quality_check,
        inputs=[dataset_name, text_column, n_samples],
        outputs=[df, plot],
    )

demo.launch()