Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

Dockerfile +26 -0
__init__.py +16 -0
__pycache__/__init__.cpython-38.pyc +0 -0
__pycache__/aws_helper.cpython-38.pyc +0 -0
app.py +176 -0
aws_helper.py +27 -0
config.json +26 -0
pytorch_model.bin +3 -0
requirements.txt +12 -0
serve.yaml +22 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +13 -0
vocab.txt +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.9
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory and put user-local binaries on PATH
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# app.py calls demo.launch() without server_name/server_port, so Gradio takes
+# the bind address and port from these environment variables
+ENV GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+# app.py is a click command that does not declare --server.port or
+# --server.address; passing them makes click abort with "No such option",
+# so the address/port are configured through the ENV block above instead
+CMD ["python", "app.py"]

__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#

__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (169 Bytes). View file

__pycache__/aws_helper.cpython-38.pyc ADDED Viewed

Binary file (965 Bytes). View file

app.py ADDED Viewed

	@@ -0,0 +1,176 @@

+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from os.path import dirname
+from typing import Optional
+import click
+import numpy as np
+import sagemaker
+from aws_helper import get_sagemaker_session
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from zenml.client import Client
+import gradio as gr
+@click.command()
+@click.option(
+    "--tokenizer_name_or_path",
+    default=None,
+    help="Name or the path of the tokenizer.",
+)
+@click.option(
+    "--model_name_or_path", default=None, help="Name or the path of the model."
+)
+@click.option(
+    "--labels", default="Negative,Positive", help="Comma-separated list of labels."
+)
+@click.option(
+    "--title", default="ZenML NLP Use-Case", help="Title of the Gradio interface."
+)
+@click.option(
+    "--description",
+    default="Text Classification - Sentiment Analysis - ZenML - Gradio",
+    help="Description of the Gradio interface.",
+)
+@click.option(
+    "--interpretation",
+    default="default",
+    help="Interpretation mode for the Gradio interface.",
+)
+@click.option(
+    "--examples",
+    default="This is an awesome journey, I love it!",
+    help="Comma-separated list of examples to show in the Gradio interface.",
+)
+@click.option(
+    "--pipeline_version",
+    default="3",
+    help="Which version of the deploy pipeline should be deployed.",
+    type=int
+)
+def sentiment_analysis(
+    tokenizer_name_or_path: Optional[str],
+    model_name_or_path: Optional[str],
+    labels: Optional[str],
+    title: Optional[str],
+    description: Optional[str],
+    interpretation: Optional[str],
+    pipeline_version: int,
+    examples: Optional[str]
+):
+    """Launches a Gradio interface for sentiment analysis.
+    This function launches a Gradio interface for text-classification.
+    It loads a model and a tokenizer from the provided paths and uses
+    them to predict the sentiment of the input text.
+    Args:
+        tokenizer_name_or_path (str): Name or the path of the tokenizer.
+        model_name_or_path (str): Name or the path of the model.
+        labels (str): Comma-separated list of labels.
+        title (str): Title of the Gradio interface.
+        description (str): Description of the Gradio interface.
+        interpretation (str): Interpretation mode for the Gradio interface.
+        pipeline_version (int): Which pipeline version to user
+        examples (str): Comma-separated list of examples to show in the Gradio interface.
+    """
+    labels = labels.split(",")
+    def preprocess(text: str) -> str:
+        """Preprocesses the text.
+        Args:
+            text (str): Input text.
+        Returns:
+            str: Preprocessed text.
+        """
+        new_text = []
+        for t in text.split(" "):
+            t = "@user" if t.startswith("@") and len(t) > 1 else t
+            t = "http" if t.startswith("http") else t
+            new_text.append(t)
+        return " ".join(new_text)
+    def softmax(x):
+        e_x = np.exp(x - np.max(x))
+        return e_x / e_x.sum(axis=0)
+    def analyze_text(inference_type, text):
+        if inference_type == "local":
+            cur_path = os.path.abspath(dirname(__file__))
+            model_path, tokenizer_path = cur_path, cur_path
+            if model_name_or_path:
+                model_path = f"{dirname(__file__)}/{model_name_or_path}/"
+            print(f"Loading model from {model_path}")
+            if tokenizer_name_or_path:
+                tokenizer_path = f"{dirname(__file__)}/{tokenizer_name_or_path}/"
+            print(f"Loading tokenizer from {tokenizer_path}")
+            tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+            model = AutoModelForSequenceClassification.from_pretrained(model_path)
+            text = preprocess(text)
+            encoded_input = tokenizer(text, return_tensors="pt")
+            output = model(**encoded_input)
+            scores_ = output[0][0].detach().numpy()
+            scores_ = softmax(scores_)
+            scores = {l: float(s) for (l, s) in zip(labels, scores_)}
+        else:
+            client = Client()
+            latest_run = client.get_pipeline(
+                "sentinment_analysis_deploy_pipeline", version=pipeline_version
+            ).runs[0]
+            endpoint_name = (
+                latest_run.steps["deploy_hf_to_sagemaker"]
+                .outputs["sagemaker_endpoint_name"]
+                .load()
+            )
+            predictor = sagemaker.Predictor(
+                endpoint_name=endpoint_name,
+                sagemaker_session=get_sagemaker_session(),
+                serializer=sagemaker.serializers.JSONSerializer(),
+                deserializer=sagemaker.deserializers.JSONDeserializer(),
+            )
+            res = predictor.predict({"inputs": text})
+            if res[0]["label"] == "LABEL_1":
+                scores = {"Negative": 1 - res[0]["score"], "Positive": res[0]["score"]}
+            else:
+                scores = {"Negative": res[0]["score"], "Positive": 1 - res[0]["score"]}
+        return scores
+    demo = gr.Interface(
+        fn=analyze_text,
+        inputs=[
+            gr.Dropdown(
+                ["local", "sagemaker"], label="Select inference type", value="sagemaker"
+            ),
+            gr.TextArea("Write your text or tweet here", label="Analyze Text"),
+        ],
+        outputs=["label"],
+        title=title,
+        description=description,
+        interpretation=interpretation,
+    )
+    demo.launch(share=True, debug=True)
+# Run the click command only when this file is executed as a script.
+if __name__ == "__main__":
+    sentiment_analysis()

aws_helper.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import os
+import boto3
+import sagemaker
+# Assign default value if env variable not fond
+REGION_NAME = os.getenv("AWS_REGION", "us-east-1")
+ROLE_NAME = os.getenv("AWS_ROLE_NAME", "hamza_connector")
+os.environ["AWS_DEFAULT_REGION"] = REGION_NAME
+auth_arguments = {
+    "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID", None),
+    "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY", None),
+    "aws_session_token": os.getenv("AWS_SESSION_TOKEN", None),
+    "region_name": REGION_NAME,
+}
+def get_sagemaker_role():
+    iam = boto3.client("iam", **auth_arguments)
+    role = iam.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
+    return role
+def get_sagemaker_session():
+    session = sagemaker.Session(boto3.Session(**auth_arguments))
+    return session

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "/var/folders/hr/bpgg5x394sq1qt_z8k7_dl740000gn/T/tmp3d28qn75",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_past": true,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.31.0",
+  "vocab_size": 28996
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:297383ad805bd5d73b7f98729631fe936bc1725faa2143b31eae299e6b8e5bf1
+size 263166698

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+nltk
+torch
+torchvision
+torchaudio
+gradio
+datasets==2.12.0
+numpy==1.22.4
+pandas==1.5.3
+session_info==1.0.0
+scikit-learn==1.2.2
+transformers==4.28.1
+IPython==7.34.0
+# Imported directly by app.py / aws_helper.py
+click
+boto3
+sagemaker
+zenml

serve.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+# Task name (optional), used for display purposes.
+name: ZenML NLP project
+resources:
+  cloud: gcp  # The cloud to use (optional).
+# Working directory (optional), synced to ~/sky_workdir on the remote cluster
+# each time launch or exec is run with the yaml file.
+#
+# Commands in "setup" and "run" will be executed under it.
+#
+# If a .gitignore file (or a .git/info/exclude file) exists in the working
+# directory, files and directories listed in it will be excluded from syncing.
+workdir: ./gradio
+setup: |
+  echo "Begin setup."
+  pip install -r requirements.txt
+  echo "Setup complete."
+run: |
+  echo 'Starting gradio app...'
+  python app.py

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff