Jawi Collection: models for historical documents in Jawi (an adaptation of the Perso-Arabic script for the Malay language).
This model is a fine-tuned version of Qwen2.5-VL-3B-Instruct for optical character recognition of Jawi script. Jawi is an Arabic-based writing system traditionally used for writing Malay and other languages in Southeast Asia.
import torch
import pandas as pd
from PIL import Image
from datasets import load_dataset
from transformers import AutoModelForImageTextToText, AutoProcessor

# Instruction given to the model for every image
prompt = "Transcribe the Jawi script in this image into Jawi text"

# Load the fine-tuned weights; the processor comes from the base model
model = AutoModelForImageTextToText.from_pretrained("culturalheritagenus/Jawi-OCR-Qwen-v2")
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
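If GPU memory is tight, the model can instead be loaded in half precision with automatic device placement. This is an optional sketch using standard from_pretrained arguments (torch_dtype and device_map, the latter requiring accelerate to be installed); it is not part of the original recipe.

# Optional alternative to the two model-loading lines above (assumes accelerate is installed)
model = AutoModelForImageTextToText.from_pretrained(
    "culturalheritagenus/Jawi-OCR-Qwen-v2",
    torch_dtype=torch.bfloat16,  # half-precision weights to roughly halve GPU memory
    device_map="auto",           # place layers on the available GPU(s) automatically
)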
def process_image(image):
    try:
        # Build a single-turn chat message containing the image and the prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt}
                ]
            },
        ]
        # Tokenize the chat template (text + image) and move tensors to the model's device
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        # Generate up to 128 new tokens; greedy decoding (temperature has no effect when do_sample=False)
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            temperature=1.0,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id
        )
        # Strip the prompt tokens so only the newly generated transcription remains
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
        ]
        generated_text = processor.tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
        return generated_text
    except Exception as e:
        print(f"Error during generation: {e}")
        return "Error: Could not process image"
To run inference at scale, assume an HF-style dataset ds whose rows contain a PIL Image, an Identifier, and a reference Text; it can be loaded from the Hub and then iterated row by row, as sketched below.
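The load_dataset import above suggests the data comes from the Hub; the repository id and split in this loading sketch are placeholders, not a real dataset name.

# Placeholder repository id; substitute the actual Jawi OCR dataset
ds = load_dataset("your-org/jawi-manuscripts", split="test")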
results = []
for i, row in enumerate(ds):
    img: Image.Image = row["Image"]  # PIL image from the HF dataset
    identifier = row["Identifier"]   # document/page identifier
    text = row["Text"]               # reference transcription (ground truth)
    # process_image accepts the PIL image directly; no base64 round-trip is needed
    jawi = process_image(img)
    results.append({"Identifier": identifier, "Reference": text, "Prediction": jawi})
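Since pandas is already imported, one natural way to keep the predictions alongside the references is a DataFrame; the output filename below is an assumption, not something the model card specifies.

# Assumption: persist predictions and references to CSV for later inspection or evaluation
df = pd.DataFrame(results)
df.to_csv("jawi_predictions.csv", index=False)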
Base model: Qwen/Qwen2.5-VL-3B-Instruct