moondream2-batch-processing

Running on Zero

App Files Files Community

Csplk committed on Oct 5

Commit

3068721

•

1 Parent(s): 04e1dd3

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -17

app.py CHANGED Viewed

@@ -2,38 +2,43 @@ import spaces
 import torch
 import re
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from PIL import Image
-if torch.cuda.is_available():
-    device, dtype = "cuda", torch.float16
-else:
-    device, dtype = "cpu", torch.float32
 model_id = "vikhyatk/moondream2"
-revision = "2024-07-23"
 tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
 moondream = AutoModelForCausalLM.from_pretrained(
-    model_id, trust_remote_code=True, revision=revision, torch_dtype=dtype
-).to(device=device)
 moondream.eval()
-@spaces.GPU(120)
 def answer_questions(image_tuples, prompt_text):
     result = ""
     Q_and_A = ""
     prompts = [p.strip() for p in prompt_text.split(',')]
     image_embeds = [img[0] for img in image_tuples if img[0] is not None]
-    #print(f"\nprompts: {prompts}\n\n")
     answers = []
     for prompt in prompts:
-        image_answers = moondream.batch_answer(
-            images=[img.convert("RGB") for img in image_embeds],
-            prompts=[prompt] * len(image_embeds),
-            tokenizer=tokenizer,
         )
         answers.append(image_answers)
     for i, prompt in enumerate(prompts):
         Q_and_A += f"### Q: {prompt}\n"
@@ -43,7 +48,7 @@ def answer_questions(image_tuples, prompt_text):
             Q_and_A += f"**{image_name} A:** \n {answer_text} \n\n"
     result = {'headers': prompts, 'data': answers}
-    #print(f"result\n{result}\n\nQ_and_A\n{Q_and_A}\n\n")
     return Q_and_A, result
 with gr.Blocks() as demo:

 import torch
 import re
 import gradio as gr
+from threading import Thread
+from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
+from PIL import ImageDraw
+from torchvision.transforms.v2 import Resize
+import subprocess
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 model_id = "vikhyatk/moondream2"
+revision = "2024-08-26"
 tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
 moondream = AutoModelForCausalLM.from_pretrained(
+    model_id, trust_remote_code=True, revision=revision,
+    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
+    attn_implementation="flash_attention_2"
+)
 moondream.eval()
+@spaces.GPU
 def answer_questions(image_tuples, prompt_text):
     result = ""
     Q_and_A = ""
     prompts = [p.strip() for p in prompt_text.split(',')]
     image_embeds = [img[0] for img in image_tuples if img[0] is not None]
     answers = []
     for prompt in prompts:
+        thread = Thread(
+            image_answers = moondream.batch_answer(
+                images=[img.convert("RGB") for img in image_embeds],
+                prompts=[prompt] * len(image_embeds),
+                tokenizer=tokenizer
+            )
         )
         answers.append(image_answers)
+    thread.start()
     for i, prompt in enumerate(prompts):
         Q_and_A += f"### Q: {prompt}\n"
             Q_and_A += f"**{image_name} A:** \n {answer_text} \n\n"
     result = {'headers': prompts, 'data': answers}
+    print(f"result\n{result}\n\nQ_and_A\n{Q_and_A}\n\n")
     return Q_and_A, result
 with gr.Blocks() as demo: