Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,100 +1,59 @@
 import os
-import requests
 import torch
-from
-from gradio import FileData
-import time
-import spaces
-
-hf_token = os.environ.get("HF_KEY")
     torch_dtype=torch.bfloat16,
-def
-    for i, msg in enumerate(history):
-        if isinstance(msg[0], tuple):
-            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
-            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
-            images.append(Image.open(msg[0][0]).convert("RGB"))
-        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
-            # messages are already handled
-            pass
-        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
-            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
-            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
-
-    # add current message
-    if len(message["files"]) == 1:
-        if isinstance(message["files"][0], str): # examples
-            image = Image.open(message["files"][0]).convert("RGB")
-        else: # regular input
-            image = Image.open(message["files"][0]["path"]).convert("RGB")
-        images.append(image)
-        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
-    else:
-        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-
-    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
-
-    if images == []:
-        inputs = processor(text=texts, return_tensors="pt").to("cuda")
-    else:
-        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
-    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-
-    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-    generated_text = ""
-
-    buffer = ""
-    [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]},
-    250],
     ],
-    )
-    ],
-    cache_examples=False,
-    description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
-    stop_btn="Stop Generation",
-    fill_height=True,
-    multimodal=True)
-
-demo.launch(debug=True)

+# Import required libraries
+import gradio as gr
 import os
 import torch
+from transformers import AutoProcessor, MllamaForConditionalGeneration
+from PIL import Image

+# Set up Hugging Face authentication
+hf_token = os.getenv("HF_KEY")  # Get token from environment variable
+if not hf_token:
+    raise ValueError("HF_KEY environment variable not set. Please set your Hugging Face token.")

+# Model configuration and loading
+model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+model = MllamaForConditionalGeneration.from_pretrained(
+    model_name,
+    use_auth_token=hf_token,
     torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained(model_name, use_auth_token=hf_token)
+
+# Define prediction function for image and text processing
+def predict(image, text):
+    # Prepare messages
+    messages = [
+        {"role": "user", "content": [
+            {"type": "image"},
+            {"type": "text", "text": text}
+        ]}
+    ]

+    # Create input text
+    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

+    # Process inputs and move to device
+    inputs = processor(image, input_text, return_tensors="pt").to(model.device)

+    # Generate model response
+    outputs = model.generate(**inputs, max_new_tokens=100)

+    # Decode output
+    response = processor.decode(outputs[0], skip_special_tokens=True)
+    return response
+
+# Setup Gradio interface
+interface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Image(type="pil", label="Image Input"),
+        gr.Textbox(label="Text Input")
     ],
+    outputs=gr.Textbox(label="Output"),
+    title="Llama 3.2 11B Vision Instruct Demo",
+    description="Meta's new model that generates a response based on an image and text input."
+)
+
+# Launch the interface
+interface.launch()
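
Both from_pretrained calls in the new app.py authenticate by passing use_auth_token=hf_token. Recent transformers releases also accept token= for the same purpose, and huggingface_hub.login() can register the token once per process; a minimal alternative sketch (not part of this commit), reading the same HF_KEY variable:

# Alternative authentication sketch (assumption, not what this commit does):
# register the HF_KEY token once via huggingface_hub, then load without
# passing use_auth_token explicitly.
import os
from huggingface_hub import login
from transformers import AutoProcessor, MllamaForConditionalGeneration

login(token=os.getenv("HF_KEY"))  # same env var / Space secret as app.py

model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
processor = AutoProcessor.from_pretrained(model_name)  # token resolved from login()
model = MllamaForConditionalGeneration.from_pretrained(model_name)  # likewise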
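
Because the Space status shows a runtime error, a quick sanity check is to call predict() once before interface.launch(). A hypothetical smoke test, assuming the examples/weather_events.png file referenced by the previous version is still in the repository:

# Hypothetical smoke test: drop these lines into app.py just before
# interface.launch(). Image and predict are already defined in app.py;
# the image path and question are reused from the previous app's examples.
test_image = Image.open("./examples/weather_events.png").convert("RGB")
test_answer = predict(test_image, "Where do the droughts happen according to this diagram?")
print(test_answer)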