lehduong committed on
Commit 39732f4
1 Parent(s): d4ffd77

Update app.py

Files changed (1): app.py +30 -30
app.py CHANGED
@@ -8,7 +8,7 @@ import base64
 import io
 from PIL import Image
 from transformers import (
-    LlavaNextProcessor, LlavaNextForConditionalGeneration,
+    # LlavaNextProcessor, LlavaNextForConditionalGeneration,
     T5EncoderModel, T5Tokenizer
 )
 from transformers import (
@@ -53,34 +53,34 @@ TASK2SPECIAL_TOKENS = {
 NEGATIVE_PROMPT = "monochrome, greyscale, low-res, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation"


-class LlavaCaptionProcessor:
-    def __init__(self):
-        model_name = "llava-hf/llama3-llava-next-8b-hf"
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-        self.processor = LlavaNextProcessor.from_pretrained(model_name)
-        self.model = LlavaNextForConditionalGeneration.from_pretrained(
-            model_name, torch_dtype=dtype, low_cpu_mem_usage=True
-        ).to(device)
-        self.SPECIAL_TOKENS = "assistant\n\n\n"
-
-    def generate_response(self, image: Image.Image, msg: str) -> str:
-        conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": msg}]}]
-        with torch.no_grad():
-            prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
-            inputs = self.processor(prompt, image, return_tensors="pt").to(self.model.device)
-            output = self.model.generate(**inputs, max_new_tokens=200)
-            response = self.processor.decode(output[0], skip_special_tokens=True)
-        return response.split(msg)[-1].strip()[len(self.SPECIAL_TOKENS):]
-
-    def process(self, images: List[Image.Image], msg: str = None) -> List[str]:
-        if msg is None:
-            msg = f"Describe the contents of the photo in 150 words or fewer."
-        try:
-            return [self.generate_response(img, msg) for img in images]
-        except Exception as e:
-            print(f"Error in process: {str(e)}")
-            raise
+# class LlavaCaptionProcessor:
+#     def __init__(self):
+#         model_name = "llava-hf/llama3-llava-next-8b-hf"
+#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#         dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+#         self.processor = LlavaNextProcessor.from_pretrained(model_name)
+#         self.model = LlavaNextForConditionalGeneration.from_pretrained(
+#             model_name, torch_dtype=dtype, low_cpu_mem_usage=True
+#         ).to(device)
+#         self.SPECIAL_TOKENS = "assistant\n\n\n"
+#
+#     def generate_response(self, image: Image.Image, msg: str) -> str:
+#         conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": msg}]}]
+#         with torch.no_grad():
+#             prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+#             inputs = self.processor(prompt, image, return_tensors="pt").to(self.model.device)
+#             output = self.model.generate(**inputs, max_new_tokens=200)
+#             response = self.processor.decode(output[0], skip_special_tokens=True)
+#         return response.split(msg)[-1].strip()[len(self.SPECIAL_TOKENS):]
+#
+#     def process(self, images: List[Image.Image], msg: str = None) -> List[str]:
+#         if msg is None:
+#             msg = f"Describe the contents of the photo in 150 words or fewer."
+#         try:
+#             return [self.generate_response(img, msg) for img in images]
+#         except Exception as e:
+#             print(f"Error in process: {str(e)}")
+#             raise


 class MolmoCaptionProcessor:
@@ -756,7 +756,7 @@ def delete_all_images():

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Start the Gradio demo with specified captioner.')
-    parser.add_argument('--captioner', type=str, choices=['molmo', 'llava', 'disable'], default='disable', help='Captioner to use: molmo, llava, disable.')
+    parser.add_argument('--captioner', type=str, choices=['molmo', 'llava', 'disable'], default='molmo', help='Captioner to use: molmo, llava, disable.')
     args = parser.parse_args()

     # Initialize models with the specified captioner
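For context, the net effect of this commit is that MolmoCaptionProcessor is the only captioner class left in app.py: the LlavaCaptionProcessor import and class body are commented out, and the CLI default moves from 'disable' to 'molmo'. A minimal sketch of the selection logic this implies; the helper name build_captioner and the branch bodies are illustrative assumptions, since the actual initialization code is not part of this diff:

    def build_captioner(name: str):
        # 'molmo' is the only captioner still defined in app.py after this commit
        if name == 'molmo':
            return MolmoCaptionProcessor()
        if name == 'llava':
            # LlavaCaptionProcessor is commented out above, so this choice
            # can no longer be honored and should fail loudly
            raise ValueError("The llava captioner is disabled in this build.")
        return None  # 'disable': run the demo without a captioner

    captioner = build_captioner(args.captioner)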
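On the command line, the flipped default means a bare launch now enables Molmo captioning (usage sketch; the exact launch command depends on the deployment):

    python app.py                        # now equivalent to: python app.py --captioner molmo
    python app.py --captioner disable    # previous default behavior

Note that 'llava' remains in the argparse choices even though the class is commented out, so selecting it would presumably fail at initialization.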