hgdgng committed on
Commit 8a0ad15
1 Parent(s): d2f66e0

Update app.py

Files changed (1)
  1. app.py +49 -90
app.py CHANGED
@@ -1,100 +1,59 @@
- from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
- from PIL import Image
  import os
- import requests
  import torch
- from threading import Thread
- import gradio as gr
- from gradio import FileData
- import time
- import spaces
-
- hf_token = os.environ.get("HF_KEY")
-
- ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-
- model = MllamaForConditionalGeneration.from_pretrained(ckpt,
      torch_dtype=torch.bfloat16,
-     token=hf_token).to("cuda")
- processor = AutoProcessor.from_pretrained(ckpt, token=hf_token)
-
-
- @spaces.GPU
- def bot_streaming(message, history, max_new_tokens=250):
-
-     txt = message["text"]
-     ext_buffer = f"{txt}"
-
-     messages= []
-     images = []
-
-     for i, msg in enumerate(history):
-         if isinstance(msg[0], tuple):
-             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
-             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
-             images.append(Image.open(msg[0][0]).convert("RGB"))
-         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
-             # messages are already handled
-             pass
-         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
-             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
-             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
-
-     # add current message
-     if len(message["files"]) == 1:
-
-         if isinstance(message["files"][0], str): # examples
-             image = Image.open(message["files"][0]).convert("RGB")
-         else: # regular input
-             image = Image.open(message["files"][0]["path"]).convert("RGB")
-         images.append(image)
-         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
-     else:
-         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-
-     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
-
-     if images == []:
-         inputs = processor(text=texts, return_tensors="pt").to("cuda")
-     else:
-         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
-     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-
-     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-     generated_text = ""
-
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-     buffer = ""
-
-     for new_text in streamer:
-         buffer += new_text
-         generated_text_without_prompt = buffer
-         time.sleep(0.01)
-         yield buffer
-
-
- demo = gr.ChatInterface(fn=bot_streaming, title="Multimodal Llama", examples=[
-     [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]},
-      200],
-     [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]},
-      250],
      ],
-     textbox=gr.MultimodalTextbox(),
-     additional_inputs = [gr.Slider(
-         minimum=10,
-         maximum=500,
-         value=250,
-         step=10,
-         label="Maximum number of new tokens to generate",
-     )
-     ],
-     cache_examples=False,
-     description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32).",
-     stop_btn="Stop Generation",
-     fill_height=True,
-     multimodal=True)
-
- demo.launch(debug=True)
+ # Import required libraries
+ import gradio as gr
  import os
  import torch
+ from transformers import AutoProcessor, MllamaForConditionalGeneration
+ from PIL import Image
+
+ # Set up Hugging Face authentication
+ hf_token = os.getenv("HF_KEY")  # Get token from environment variable
+ if not hf_token:
+     raise ValueError("HF_KEY environment variable not set. Please set your Hugging Face token.")
+
+ # Model configuration and loading
+ model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ model = MllamaForConditionalGeneration.from_pretrained(
+     model_name,
+     use_auth_token=hf_token,
      torch_dtype=torch.bfloat16,
+     device_map="auto",
+ )
+ processor = AutoProcessor.from_pretrained(model_name, use_auth_token=hf_token)
+
+ # Define prediction function for image and text processing
+ def predict(image, text):
+     # Prepare messages
+     messages = [
+         {"role": "user", "content": [
+             {"type": "image"},
+             {"type": "text", "text": text}
+         ]}
+     ]
+
+     # Create input text
+     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     # Process inputs and move to device
+     inputs = processor(image, input_text, return_tensors="pt").to(model.device)
+
+     # Generate model response
+     outputs = model.generate(**inputs, max_new_tokens=100)
+
+     # Decode output
+     response = processor.decode(outputs[0], skip_special_tokens=True)
+     return response
+
+ # Setup Gradio interface
+ interface = gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Image(type="pil", label="Image Input"),
+         gr.Textbox(label="Text Input")
      ],
+     outputs=gr.Textbox(label="Output"),
+     title="Llama 3.2 11B Vision Instruct Demo",
+     description="Meta's new model that generates a response based on an image and text input."
+ )
+
+ # Launch the interface
+ interface.launch()
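
Note: this revision passes use_auth_token= when loading the model and processor, whereas the previous revision used token=, which is the argument recent transformers releases prefer (use_auth_token= still works but is flagged as deprecated). A minimal sketch of the same loading step with the non-deprecated argument, assuming a transformers version new enough to ship Mllama support and the same HF_KEY environment variable, would be:

import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Same checkpoint, dtype, and device placement as app.py;
# token= replaces the deprecated use_auth_token= argument.
hf_token = os.getenv("HF_KEY")
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)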