Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,100 +1,59 @@
 import os
-import requests
 import torch
-from
-from gradio import FileData
-import time
-import spaces
-
-hf_token = os.environ.get("HF_KEY")
     torch_dtype=torch.bfloat16,
-def
-    for i, msg in enumerate(history):
-        if isinstance(msg[0], tuple):
-            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
-            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
-            images.append(Image.open(msg[0][0]).convert("RGB"))
-        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
-            # messages are already handled
-            pass
-        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
-            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
-            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
-
-    # add current message
-    if len(message["files"]) == 1:
-        if isinstance(message["files"][0], str): # examples
-            image = Image.open(message["files"][0]).convert("RGB")
-        else: # regular input
-            image = Image.open(message["files"][0]["path"]).convert("RGB")
-        images.append(image)
-        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
-    else:
-        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-
-    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
-
-    if images == []:
-        inputs = processor(text=texts, return_tensors="pt").to("cuda")
-    else:
-        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
-    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-
-    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-    generated_text = ""
-
-    buffer = ""
-    [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]},
-    250],
     ],
-    )
-    ],
-    cache_examples=False,
-    description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
-    stop_btn="Stop Generation",
-    fill_height=True,
-    multimodal=True)
-
-demo.launch(debug=True)

+# Import required libraries
+import gradio as gr
 import os
 import torch
+from transformers import AutoProcessor, MllamaForConditionalGeneration
+from PIL import Image

+# Set up Hugging Face authentication
+hf_token = os.getenv("HF_KEY")  # Get token from environment variable
+if not hf_token:
+    raise ValueError("HF_KEY environment variable not set. Please set your Hugging Face token.")

+# Model configuration and loading
+model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+model = MllamaForConditionalGeneration.from_pretrained(
+    model_name,
+    use_auth_token=hf_token,
     torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained(model_name, use_auth_token=hf_token)
+
+# Define prediction function for image and text processing
+def predict(image, text):
+    # Prepare messages
+    messages = [
+        {"role": "user", "content": [
+            {"type": "image"},
+            {"type": "text", "text": text}
+        ]}
+    ]

+    # Create input text
+    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

+    # Process inputs and move to device
+    inputs = processor(image, input_text, return_tensors="pt").to(model.device)

+    # Generate model response
+    outputs = model.generate(**inputs, max_new_tokens=100)

+    # Decode output
+    response = processor.decode(outputs[0], skip_special_tokens=True)
+    return response
+
+# Setup Gradio interface
+interface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Image(type="pil", label="Image Input"),
+        gr.Textbox(label="Text Input")
     ],
+    outputs=gr.Textbox(label="Output"),
+    title="Llama 3.2 11B Vision Instruct Demo",
+    description="Meta's new model that generates a response based on an image and text input."
+)
+
+# Launch the interface
+interface.launch()
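
Both from_pretrained calls in the new app.py authenticate by passing use_auth_token=hf_token. Recent transformers releases also accept token= for the same purpose, and huggingface_hub.login() can register the token once per process; a minimal alternative sketch (not part of this commit), reading the same HF_KEY variable:

# Alternative authentication sketch (assumption, not what this commit does):
# register the HF_KEY token once via huggingface_hub, then load without
# passing use_auth_token explicitly.
import os
from huggingface_hub import login
from transformers import AutoProcessor, MllamaForConditionalGeneration

login(token=os.getenv("HF_KEY"))  # same env var / Space secret as app.py

model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
processor = AutoProcessor.from_pretrained(model_name)  # token resolved from login()
model = MllamaForConditionalGeneration.from_pretrained(model_name)  # likewise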
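
Because the Space status shows a runtime error, a quick sanity check is to call predict() once before interface.launch(). A hypothetical smoke test, assuming the examples/weather_events.png file referenced by the previous version is still in the repository:

# Hypothetical smoke test: drop these lines into app.py just before
# interface.launch(). Image and predict are already defined in app.py;
# the image path and question are reused from the previous app's examples.
test_image = Image.open("./examples/weather_events.png").convert("RGB")
test_answer = predict(test_image, "Where do the droughts happen according to this diagram?")
print(test_answer)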