davanstrien
/

Molmo-7B-D-0924

Image-Text-to-Text

text-generation

Inference Endpoints

Model card Files Files and versions Community

davanstrien HF staff committed on Oct 4

Commit

5fd9231

•

1 Parent(s): 2dce2dc

Create handler.py

Files changed (1) hide show

handler.py +47 -0

handler.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from typing import Dict, List, Any
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from PIL import Image
+import requests
+import torch
+class EndpointHandler:
+    def __init__(self, path=""):
+        self.processor = AutoProcessor.from_pretrained(
+            path, trust_remote_code=True, torch_dtype="auto", device_map="auto"
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            path, trust_remote_code=True, torch_dtype="auto", device_map="auto"
+        )
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        # Extract inputs from the request data
+        image_url = data.get("image_url")
+        text_prompt = data.get("text_prompt", "Describe this image.")
+        # Download and process the image
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        # Process the image and text
+        inputs = self.processor.process(images=[image], text=text_prompt)
+        # Move inputs to the correct device and make a batch of size 1
+        inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}
+        # Generate output
+        with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
+            output = self.model.generate_from_batch(
+                inputs,
+                GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+                tokenizer=self.processor.tokenizer,
+            )
+        # Decode the generated tokens
+        generated_tokens = output[0, inputs["input_ids"].size(1) :]
+        generated_text = self.processor.tokenizer.decode(
+            generated_tokens, skip_special_tokens=True
+        )
+        return [{"generated_text": generated_text}]