gordonhubackup committed on
Commit
8bce163
1 Parent(s): c40d27f
.gitignore ADDED
@@ -0,0 +1,5 @@
+# Python
+__pycache__
+*.pyc
+*.egg-info
+dist
app.py CHANGED
@@ -49,6 +49,7 @@ def parse_args():
     parser.add_argument("--num_beams", type=int, default=1)
     parser.add_argument("--max_new_tokens", type=int, default=512)
     parser.add_argument("--num-visual-tokens", type=int, default=256)
+    parser.add_argument("--gpu-id", type=int, default=0)
     args = parser.parse_args()
     return args
 
@@ -68,7 +69,7 @@ disable_torch_init()
 
 model_name = get_model_name_from_path(args.model_path)
 tokenizer, model, image_processor, context_len = load_pretrained_model(
-    args.model_path, args.model_base, model_name
+    args.model_path, args.model_base, model_name, device_map=device, device=device
 )
 
 # vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
@@ -109,13 +110,14 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature, num_vis
         num_beams=num_beams,
         temperature=temperature,
         num_visual_tokens=num_visual_tokens,
-    )[0]
+    ) #[0]
     chatbot[-1][1] = llm_message[0]
     return chatbot, chat_state, img_list
 
 title = """<h1 align="center">Demo of MQT-LLaVA</h1>"""
-description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting!. <br> To use
-example questions, click example image, hit upload, and press enter in the chatbox.</h3>"""
+description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting! <br> To use
+example questions, click example image, hit upload & start chat, and press enter on your keyboard in the chatbox.
+<br> Due to limited memory constraint, we only support single turn conversation. To ask multiple questions, hit Restart and upload your image! </h3>"""
 article = """<p><a href='https://gordonhu608.github.io/mqt-llava/'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/gordonhu608/MQT-LLaVA'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://arxiv.org/abs/'><img src='https://img.shields.io/badge/Paper-ArXiv-red'></a></p>
 """
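Taken together, the app.py changes move the demo onto a user-selected GPU: a new --gpu-id flag is added and a device value is threaded into load_pretrained_model. The hunks do not show where device comes from; the sketch below is one plausible wiring, where the device = ... line and the Chat(...) construction are assumptions rather than part of this commit.

# Sketch only: assumed glue code in app.py, not shown in this diff.
args = parse_args()
device = f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu"   # assumed derivation of `device` from the new flag

model_name = get_model_name_from_path(args.model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
    args.model_path, args.model_base, model_name, device_map=device, device=device
)
chat = Chat(model, tokenizer, image_processor, args, device=device)      # matches the updated Chat.__init__ in llava/chat.py below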
llava/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/__init__.cpython-310.pyc and b/llava/__pycache__/__init__.cpython-310.pyc differ
 
llava/__pycache__/chat.cpython-310.pyc ADDED
Binary file (13.3 kB)
 
llava/__pycache__/constants.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/constants.cpython-310.pyc and b/llava/__pycache__/constants.cpython-310.pyc differ
 
llava/__pycache__/conversation.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/conversation.cpython-310.pyc and b/llava/__pycache__/conversation.cpython-310.pyc differ
 
llava/__pycache__/mm_utils.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/mm_utils.cpython-310.pyc and b/llava/__pycache__/mm_utils.cpython-310.pyc differ
 
llava/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/utils.cpython-310.pyc and b/llava/__pycache__/utils.cpython-310.pyc differ
 
llava/chat.py CHANGED
@@ -442,20 +442,21 @@ def load_images(image_files):
 class Chat:
     def __init__(self, model, tokenizer, image_processor, args, device='cuda:0'):
         self.device = device
-        self.model = model
+        self.model = model.to(device)
         self.tokenizer = tokenizer
         self.image_processor = image_processor
         self.args = args
 
     def ask(self, text, conv):
         #conv.messages = [] #hack not keeping history.
+        text = DEFAULT_IMAGE_TOKEN + "\n" + text
         conv.append_message(conv.roles[0], text)
 
     def answer(self, conv, img_list, num_visual_tokens=256, max_new_tokens=512, num_beams=1, temperature=0.0):
         conv.append_message(conv.roles[1], None)
 
         question = conv.get_prompt()
-        images = img_list[0] #torch.stack(img_list).to(self.device)
+        images = img_list #[0] #torch.stack(img_list).to(self.device)
 
         images_tensor = process_images(
             images,
@@ -466,7 +467,7 @@ class Chat:
         input_ids = (
             tokenizer_image_token(question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
             .unsqueeze(0)
-            .cuda()
+            .to(self.device) #cuda()
         )
 
         with torch.inference_mode():
@@ -488,21 +489,19 @@ class Chat:
         return output_text, ''
 
     def upload_img(self, image, conv, img_list):
-        images = load_images([image])
-        # if isinstance(image, str): # is a image path
-        #     raw_image = Image.open(image).convert('RGB')
-        #     image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
-        # elif isinstance(image, Image.Image):
-        #     raw_image = image
-        #     raw_image = raw_image.convert('RGB')
-        #     image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
+
+        if isinstance(image, str): # is a image path
+            raw_image = Image.open(image).convert('RGB')
+        elif isinstance(image, Image.Image):
+            raw_image = image
+            raw_image = raw_image.convert('RGB')
         # elif isinstance(image, torch.Tensor):
         #     if len(image.shape) == 3:
         #         image = image.unsqueeze(0)
         #     image = image.to(self.device)
 
         #image_emb, _ = self.model.encode_img(image)
-        img_list.append(images[0])
+        img_list.append(raw_image)
         #conv.append_message(conv.roles[0], "")
         msg = "Received."
         # self.conv.append_message(self.conv.roles[1], msg)
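With these changes, the demo keeps raw PIL images in img_list and prepends the image token when a question is asked. A rough single-turn use of the updated Chat class would look like the sketch below; the conversation-template name, the example image path, and the driver code are assumptions, not part of this commit.

# Sketch only: assumed single-turn driver for the updated Chat class.
from llava.conversation import conv_templates

conv = conv_templates["llava_v1"].copy()           # template name is an assumption
img_list = []

chat = Chat(model, tokenizer, image_processor, args, device=device)
chat.upload_img("example.jpg", conv, img_list)     # hypothetical path; stores a PIL image in img_list
chat.ask("What is unusual about this image?", conv)  # DEFAULT_IMAGE_TOKEN is prepended inside ask()
output_text, _ = chat.answer(conv, img_list, num_visual_tokens=256, max_new_tokens=512)
print(output_text)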
llava/model/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/llava/model/__pycache__/__init__.cpython-310.pyc and b/llava/model/__pycache__/__init__.cpython-310.pyc differ
 
llava/model/__pycache__/builder.cpython-310.pyc CHANGED
Binary files a/llava/model/__pycache__/builder.cpython-310.pyc and b/llava/model/__pycache__/builder.cpython-310.pyc differ
 
llava/model/__pycache__/llava_arch.cpython-310.pyc CHANGED
Binary files a/llava/model/__pycache__/llava_arch.cpython-310.pyc and b/llava/model/__pycache__/llava_arch.cpython-310.pyc differ
 
llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc differ
 
llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc differ
 
llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc differ
 
llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc differ
 
llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc CHANGED
Binary files a/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc differ