gordonhubackup committed
Commit 8bce163 • 1 parent: c40d27f

init
Files changed:
- .gitignore +5 -0
- app.py +6 -4
- llava/__pycache__/__init__.cpython-310.pyc +0 -0
- llava/__pycache__/chat.cpython-310.pyc +0 -0
- llava/__pycache__/constants.cpython-310.pyc +0 -0
- llava/__pycache__/conversation.cpython-310.pyc +0 -0
- llava/__pycache__/mm_utils.cpython-310.pyc +0 -0
- llava/__pycache__/utils.cpython-310.pyc +0 -0
- llava/chat.py +11 -12
- llava/model/__pycache__/__init__.cpython-310.pyc +0 -0
- llava/model/__pycache__/builder.cpython-310.pyc +0 -0
- llava/model/__pycache__/llava_arch.cpython-310.pyc +0 -0
- llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc +0 -0
- llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc +0 -0
- llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
- llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc +0 -0
- llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc +0 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
+# Python
+__pycache__
+*.pyc
+*.egg-info
+dist
app.py
CHANGED
@@ -49,6 +49,7 @@ def parse_args():
     parser.add_argument("--num_beams", type=int, default=1)
     parser.add_argument("--max_new_tokens", type=int, default=512)
     parser.add_argument("--num-visual-tokens", type=int, default=256)
+    parser.add_argument("--gpu-id", type=int, default=0)
     args = parser.parse_args()
     return args

@@ -68,7 +69,7 @@ disable_torch_init()

 model_name = get_model_name_from_path(args.model_path)
 tokenizer, model, image_processor, context_len = load_pretrained_model(
-    args.model_path, args.model_base, model_name
+    args.model_path, args.model_base, model_name, device_map=device, device=device
 )

 # vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
@@ -109,13 +110,14 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature, num_vis
         num_beams=num_beams,
         temperature=temperature,
         num_visual_tokens=num_visual_tokens,
-    )[0]
+    ) #[0]
     chatbot[-1][1] = llm_message[0]
     return chatbot, chat_state, img_list

 title = """<h1 align="center">Demo of MQT-LLaVA</h1>"""
-description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting
-example questions, click example image, hit upload, and press enter in the chatbox
+description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting! <br> To use
+example questions, click example image, hit upload & start chat, and press enter on your keyboard in the chatbox.
+<br> Due to limited memory constraint, we only support single turn conversation. To ask multiple questions, hit Restart and upload your image! </h3>"""
 article = """<p><a href='https://gordonhu608.github.io/mqt-llava/'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/gordonhu608/MQT-LLaVA'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://arxiv.org/abs/'><img src='https://img.shields.io/badge/Paper-ArXiv-red'></a></p>
 """
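Note that the updated load_pretrained_model call and the Chat constructor both use a device value that is not defined in the hunks shown here; presumably it is derived from the new --gpu-id argument. The following is a minimal sketch of that wiring, an assumption rather than a quote from app.py:

# Sketch only (not part of this commit): turning --gpu-id into the device
# string that load_pretrained_model(..., device_map=device, device=device)
# and Chat(..., device=device) would consume.
import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument("--gpu-id", type=int, default=0)
args, _ = parser.parse_known_args()

# Fall back to CPU when no GPU is visible; the exact fallback is an assumption.
device = f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu"
print(device)  # e.g. "cuda:0"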
llava/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/__init__.cpython-310.pyc and b/llava/__pycache__/__init__.cpython-310.pyc differ

llava/__pycache__/chat.cpython-310.pyc
ADDED
Binary file (13.3 kB).

llava/__pycache__/constants.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/constants.cpython-310.pyc and b/llava/__pycache__/constants.cpython-310.pyc differ

llava/__pycache__/conversation.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/conversation.cpython-310.pyc and b/llava/__pycache__/conversation.cpython-310.pyc differ

llava/__pycache__/mm_utils.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/mm_utils.cpython-310.pyc and b/llava/__pycache__/mm_utils.cpython-310.pyc differ

llava/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/utils.cpython-310.pyc and b/llava/__pycache__/utils.cpython-310.pyc differ
llava/chat.py
CHANGED
@@ -442,20 +442,21 @@ def load_images(image_files):
 class Chat:
     def __init__(self, model, tokenizer, image_processor, args, device='cuda:0'):
         self.device = device
-        self.model = model
+        self.model = model.to(device)
         self.tokenizer = tokenizer
         self.image_processor = image_processor
         self.args = args

     def ask(self, text, conv):
         #conv.messages = [] #hack not keeping history.
+        text = DEFAULT_IMAGE_TOKEN + "\n" + text
         conv.append_message(conv.roles[0], text)

     def answer(self, conv, img_list, num_visual_tokens=256, max_new_tokens=512, num_beams=1, temperature=0.0):
         conv.append_message(conv.roles[1], None)

         question = conv.get_prompt()
-        images = img_list[0] #torch.stack(img_list).to(self.device)
+        images = img_list #[0] #torch.stack(img_list).to(self.device)

         images_tensor = process_images(
             images,
@@ -466,7 +467,7 @@ class Chat:
         input_ids = (
             tokenizer_image_token(question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
             .unsqueeze(0)
-            .cuda()
+            .to(self.device) #cuda()
         )

         with torch.inference_mode():
@@ -488,21 +489,19 @@ class Chat:
         return output_text, ''

     def upload_img(self, image, conv, img_list):
-
-
-
-
-
-
-        # raw_image = raw_image.convert('RGB')
-        # image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
+
+        if isinstance(image, str): # is a image path
+            raw_image = Image.open(image).convert('RGB')
+        elif isinstance(image, Image.Image):
+            raw_image = image
+        raw_image = raw_image.convert('RGB')
         # elif isinstance(image, torch.Tensor):
         #     if len(image.shape) == 3:
         #         image = image.unsqueeze(0)
         #     image = image.to(self.device)

         #image_emb, _ = self.model.encode_img(image)
-        img_list.append(
+        img_list.append(raw_image)
         #conv.append_message(conv.roles[0], "")
         msg = "Received."
         # self.conv.append_message(self.conv.roles[1], msg)
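Not part of this commit, but a rough usage sketch of the reworked single-turn flow in Chat; the conversation template name, the conv_templates import path, and the pre-loaded model objects are assumptions based on how app.py drives this class:

# Hypothetical driver for Chat; model, tokenizer, image_processor, args are
# assumed to come from load_pretrained_model(...) / parse_args() as in app.py.
from PIL import Image
from llava.chat import Chat
from llava.conversation import conv_templates  # import path assumed from upstream LLaVA

conv = conv_templates["llava_v1"].copy()  # template name is an assumption
img_list = []

chat = Chat(model, tokenizer, image_processor, args, device="cuda:0")
chat.upload_img(Image.open("example.jpg"), conv, img_list)  # stores the PIL image in img_list
chat.ask("What is shown in this image?", conv)              # prepends DEFAULT_IMAGE_TOKEN to the prompt
output_text, _ = chat.answer(conv, img_list, num_visual_tokens=256, max_new_tokens=512)
print(output_text)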
llava/model/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/llava/model/__pycache__/__init__.cpython-310.pyc and b/llava/model/__pycache__/__init__.cpython-310.pyc differ

llava/model/__pycache__/builder.cpython-310.pyc
CHANGED
Binary files a/llava/model/__pycache__/builder.cpython-310.pyc and b/llava/model/__pycache__/builder.cpython-310.pyc differ

llava/model/__pycache__/llava_arch.cpython-310.pyc
CHANGED
Binary files a/llava/model/__pycache__/llava_arch.cpython-310.pyc and b/llava/model/__pycache__/llava_arch.cpython-310.pyc differ

llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc
CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc differ

llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc
CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc differ

llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc
CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc differ

llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc
CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc differ

llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc
CHANGED
Binary files a/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc differ