Jawi Collection: models for historical documents in Jawi (an adaptation of the Perso-Arabic script for the Malay language).
This model is a fine-tuned version of Qwen2.5-VL-3B-Instruct for optical character recognition of Jawi script. Jawi is an Arabic-based writing system traditionally used for writing Malay and other languages in Southeast Asia.
import torch
import pandas as pd
from PIL import Image
from datasets import load_dataset
from transformers import AutoModelForImageTextToText, AutoProcessor

# Instruction given to the model for every image
prompt = "Transcribe the Jawi script in this image into Jawi text"

# Load the fine-tuned weights; the processor comes from the base model
model = AutoModelForImageTextToText.from_pretrained("culturalheritagenus/Jawi-OCR-Qwen-v2")
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
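If GPU memory is tight, the model can instead be loaded in half precision with automatic device placement. This is an optional sketch using standard from_pretrained arguments (torch_dtype and device_map, the latter requiring accelerate to be installed); it is not part of the original recipe.

# Optional alternative to the two model-loading lines above (assumes accelerate is installed)
model = AutoModelForImageTextToText.from_pretrained(
    "culturalheritagenus/Jawi-OCR-Qwen-v2",
    torch_dtype=torch.bfloat16,  # half-precision weights to roughly halve GPU memory
    device_map="auto",           # place layers on the available GPU(s) automatically
)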
def process_image(image):
    try:
        # Build a single-turn chat message containing the image and the prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt}
                ]
            },
        ]
        # Tokenize the chat template (text + image) and move tensors to the model's device
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        # Generate up to 128 new tokens; greedy decoding (temperature has no effect when do_sample=False)
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            temperature=1.0,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id
        )
        # Strip the prompt tokens so only the newly generated transcription remains
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
        ]
        generated_text = processor.tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
        return generated_text
    except Exception as e:
        print(f"Error during generation: {e}")
        return "Error: Could not process image"
To run inference at scale, assume an HF-style dataset ds whose rows contain a PIL Image, an Identifier, and a reference Text; it can be loaded from the Hub and then iterated row by row, as sketched below.
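The load_dataset import above suggests the data comes from the Hub; the repository id and split in this loading sketch are placeholders, not a real dataset name.

# Placeholder repository id; substitute the actual Jawi OCR dataset
ds = load_dataset("your-org/jawi-manuscripts", split="test")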
results = []
for i, row in enumerate(ds):
    img: Image.Image = row["Image"]  # PIL image from the HF dataset
    identifier = row["Identifier"]   # document/page identifier
    text = row["Text"]               # reference transcription (ground truth)
    # process_image accepts the PIL image directly; no base64 round-trip is needed
    jawi = process_image(img)
    results.append({"Identifier": identifier, "Reference": text, "Prediction": jawi})
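Since pandas is already imported, one natural way to keep the predictions alongside the references is a DataFrame; the output filename below is an assumption, not something the model card specifies.

# Assumption: persist predictions and references to CSV for later inspection or evaluation
df = pd.DataFrame(results)
df.to_csv("jawi_predictions.csv", index=False)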
Base model: Qwen/Qwen2.5-VL-3B-Instruct