Files changed (4)
  1. .gitattributes +1 -0
  2. Example1.webp +0 -0
  3. Example2.png +3 -0
  4. app.py +96 -94
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Example2.png filter=lfs diff=lfs merge=lfs -text
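The added rule is the pattern that git lfs track writes for a single file. Assuming the change was made with the standard git-lfs CLI (a sketch of a plausible sequence, not necessarily the exact commands used):

  git lfs track "Example2.png"          # appends the pattern above to .gitattributes
  git add .gitattributes Example2.png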
Example1.webp ADDED
Example2.png ADDED

Git LFS Details

  • SHA256: 7839e93dd753e5356176bf70d38c43bc56355099d8891ead7aaa342029369268
  • Pointer size: 132 Bytes
  • Size of remote file: 2.04 MB
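These details describe the Git LFS pointer committed to the repo rather than the image bytes themselves. A 132-byte pointer of this kind is a small text file roughly of the following form (sketch only; the exact byte count is not listed above, so it is left as a placeholder):

  version https://git-lfs.github.com/spec/v1
  oid sha256:7839e93dd753e5356176bf70d38c43bc56355099d8891ead7aaa342029369268
  size <exact size in bytes of the ~2.04 MB file>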
app.py CHANGED
@@ -1,95 +1,97 @@
- from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
- from PIL import Image
- import requests
- import torch
- from threading import Thread
- import gradio as gr
- from gradio import FileData
- import time
- import spaces
- import re
- ckpt = "Xkev/Llama-3.2V-11B-cot"
- model = MllamaForConditionalGeneration.from_pretrained(ckpt,
-     torch_dtype=torch.bfloat16).to("cuda")
- processor = AutoProcessor.from_pretrained(ckpt)
-
-
- @spaces.GPU
- def bot_streaming(message, history, max_new_tokens=250):
-
-     txt = message["text"]
-     ext_buffer = f"{txt}"
-
-     messages= []
-     images = []
-
-
-     for i, msg in enumerate(history):
-         if isinstance(msg[0], tuple):
-             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
-             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
-             images.append(Image.open(msg[0][0]).convert("RGB"))
-         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
-             # messages are already handled
-             pass
-         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
-             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
-             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
-
-     # add current message
-     if len(message["files"]) == 1:
-
-         if isinstance(message["files"][0], str): # examples
-             image = Image.open(message["files"][0]).convert("RGB")
-         else: # regular input
-             image = Image.open(message["files"][0]["path"]).convert("RGB")
-         images.append(image)
-         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
-     else:
-         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-
-
-     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
-
-     if images == []:
-         inputs = processor(text=texts, return_tensors="pt").to("cuda")
-     else:
-         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
-
-     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-
-     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-     generated_text = ""
-
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-     buffer = ""
-
-     for new_text in streamer:
-         buffer += new_text
-         generated_text_without_prompt = buffer
-         time.sleep(0.01)
-
-         buffer = re.sub(r"<(\w+)>", r"(Here begins the \1 stage)", buffer)
-         buffer = re.sub(r"</(\w+)>", r"(Here ends the \1 stage)", buffer)
-
-         yield buffer
-
-
- demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA-CoT",
-     textbox=gr.MultimodalTextbox(),
-     additional_inputs = [gr.Slider(
-         minimum=512,
-         maximum=1024,
-         value=512,
-         step=1,
-         label="Maximum number of new tokens to generate",
-     )
-     ],
-     cache_examples=False,
-     description="Upload an image, and start chatting about it. To learn more about LLaVA-CoT, visit [our GitHub page](https://github.com/PKU-YuanGroup/LLaVA-CoT). Note: Since Gradio currently does not support displaying the special markings in the output, we have replaced it with the expression (Here begins the X phase).",
-     stop_btn="Stop Generation",
-     fill_height=True,
-     multimodal=True)
-
+ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
+ from PIL import Image
+ import requests
+ import torch
+ from threading import Thread
+ import gradio as gr
+ from gradio import FileData
+ import time
+ import spaces
+ import re
+ ckpt = "Xkev/Llama-3.2V-11B-cot"
+ model = MllamaForConditionalGeneration.from_pretrained(ckpt,
+     torch_dtype=torch.bfloat16).to("cuda")
+ processor = AutoProcessor.from_pretrained(ckpt)
+
+
+ @spaces.GPU
+ def bot_streaming(message, history, max_new_tokens=250):
+
+     txt = message["text"]
+     ext_buffer = f"{txt}"
+
+     messages= []
+     images = []
+
+
+     for i, msg in enumerate(history):
+         if isinstance(msg[0], tuple):
+             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+             images.append(Image.open(msg[0][0]).convert("RGB"))
+         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+             # messages are already handled
+             pass
+         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
+             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
+
+     # add current message
+     if len(message["files"]) == 1:
+
+         if isinstance(message["files"][0], str): # examples
+             image = Image.open(message["files"][0]).convert("RGB")
+         else: # regular input
+             image = Image.open(message["files"][0]["path"]).convert("RGB")
+         images.append(image)
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
+     else:
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
+
+
+     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     if images == []:
+         inputs = processor(text=texts, return_tensors="pt").to("cuda")
+     else:
+         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
+
+     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+
+     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+     generated_text = ""
+
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+
+     for new_text in streamer:
+         buffer += new_text
+         generated_text_without_prompt = buffer
+         time.sleep(0.01)
+
+         buffer = re.sub(r"<(\w+)>", r"(Here begins the \1 stage)", buffer)
+         buffer = re.sub(r"</(\w+)>", r"(Here ends the \1 stage)", buffer)
+
+         yield buffer
+
+
+ demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA-CoT",
+     textbox=gr.MultimodalTextbox(),
+     additional_inputs = [gr.Slider(
+         minimum=512,
+         maximum=1024,
+         value=512,
+         step=1,
+         label="Maximum number of new tokens to generate",
+     )
+     ],
+     examples=[{"text": "What is on the flower?", "files": ["./Example1.webp"]},
+               {"text": "How to make this pastry?", "files": ["./Example2.png"]}],
+     cache_examples=False,
+     description="Upload an image, and start chatting about it. To learn more about LLaVA-CoT, visit [our GitHub page](https://github.com/PKU-YuanGroup/LLaVA-CoT). Note: Since Gradio currently does not support displaying the special markings in the output, we have replaced it with the expression (Here begins the X phase).",
+     stop_btn="Stop Generation",
+     fill_height=True,
+     multimodal=True)
+
  demo.launch(debug=True)
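The substantive change in app.py is the new examples= entry wired to the two uploaded images; the surrounding streaming logic is unchanged. For reference, the two re.sub calls in bot_streaming rewrite the model's stage tags into plain-text markers. A minimal sketch of that substitution, assuming the model emits tags such as <SUMMARY>...</SUMMARY> (the sample string below is hypothetical):

  import re

  # Hypothetical streamed output from the model.
  buffer = "<SUMMARY>The image shows a bee resting on a flower.</SUMMARY>"

  # Same substitutions used in bot_streaming above.
  buffer = re.sub(r"<(\w+)>", r"(Here begins the \1 stage)", buffer)
  buffer = re.sub(r"</(\w+)>", r"(Here ends the \1 stage)", buffer)

  print(buffer)
  # (Here begins the SUMMARY stage)The image shows a bee resting on a flower.(Here ends the SUMMARY stage)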