Spaces:

Mat17892
/

iris

Runtime error

App Files Files Community

desert committed on Nov 28, 2024

Commit

f613acc

1 Parent(s): 77156ce

update with my model

Browse files

Files changed (1) hide show

app.py +41 -18

app.py CHANGED Viewed

@@ -1,12 +1,25 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -15,34 +28,44 @@ def respond(
     temperature,
     top_p,
 ):
     messages = [{"role": "system", "content": system_message}]
     for val in history:
         if val[0]:
             messages.append({"role": "user", "content": val[0]})
         if val[1]:
             messages.append({"role": "assistant", "content": val[1]})
     messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
         messages,
-        max_tokens=max_tokens,
-        stream=True,
         temperature=temperature,
         top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -59,6 +82,6 @@ demo = gr.ChatInterface(
     ],
 )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+from unsloth import FastLanguageModel
+import torch
+# Model settings (make sure to adjust the path to where your model is stored)
+max_seq_length = 2048  # Adjust as necessary
+load_in_4bit = True  # Enable 4-bit quantization for reduced memory usage
+model_path = "/content/drive/My Drive/llama_lora_model_1"  # Path to your custom model
+# Load the model and tokenizer
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=model_path,
+    max_seq_length=max_seq_length,
+    load_in_4bit=load_in_4bit,
+)
+# Move model to GPU if available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = model.to(device)
+# Respond function
 def respond(
     message,
     history: list[tuple[str, str]],
     temperature,
     top_p,
 ):
+    # Prepare the system message
     messages = [{"role": "system", "content": system_message}]
+    # Add history to the messages
     for val in history:
         if val[0]:
             messages.append({"role": "user", "content": val[0]})
         if val[1]:
             messages.append({"role": "assistant", "content": val[1]})
+    # Add the current message from the user
     messages.append({"role": "user", "content": message})
+    # Prepare the inputs for the model
+    inputs = tokenizer.apply_chat_template(
         messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_tensors="pt",
+    ).to(device)
+    # Generate the response using your model
+    outputs = model.generate(
+        input_ids=inputs["input_ids"],
+        max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
+        use_cache=True,
+    )
+    # Decode the generated output
+    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    # Return the response
+    return response[0]
+# Gradio interface setup
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
     ],
 )
 if __name__ == "__main__":
     demo.launch()