"""Gradio demo that transcribes Thai handwriting with a vision-language model."""

import os

import gradio as gr
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

# Login to Hugging Face Hub (only when a token is provided in the environment).
token = os.environ.get('HUGGING_FACE_HUB_TOKEN')
if token:
    login(token=token)


def load_model():
    """Load the model and its processor from the Hugging Face Hub.

    The processor is loaded from the base model repository, while the model
    weights are loaded from the handwriting model repository.

    Returns:
        A ``(model, processor)`` tuple.
    """
    base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    hub_model_path = "Aekanun/thai-handwriting-llm"

    processor = AutoProcessor.from_pretrained(base_model_path, token=token)
    model = AutoModelForVision2Seq.from_pretrained(hub_model_path, token=token)
    return model, processor


# Loaded once at import time so every request reuses the same objects.
model, processor = load_model()


def process_image(image) -> str:
    """Transcribe the Thai handwritten text found in ``image``.

    Args:
        image: A ``PIL.Image.Image``, an array accepted by
            ``Image.fromarray``, or ``None`` when nothing was uploaded.

    Returns:
        The transcription with surrounding whitespace stripped, or a Thai
        message asking the user to upload an image when ``image`` is ``None``.
    """
    if image is None:
        return "กรุณาอัพโหลดรูปภาพ"

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    prompt = "Transcribe the Thai handwritten text from the provided image.\nOnly return the transcription in Thai language."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image},
            ],
        }
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # starts a reply instead of continuing the user turn.
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = processor(text=text, images=image, return_tensors="pt")
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
            pad_token_id=processor.tokenizer.pad_token_id,
        )

    # The generated sequence starts with the prompt tokens; drop them so only
    # the newly generated transcription is decoded and returned.
    generated_tokens = outputs[0][prompt_length:]
    transcription = processor.decode(generated_tokens, skip_special_tokens=True)
    return transcription.strip()


demo = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Thai Handwriting OCR",
)

if __name__ == "__main__":
    demo.launch()