Aekanun committed
Commit 948e2eb · 1 Parent(s): 925d635

fixing app.py

Files changed (1):
app.py +29 -19
app.py CHANGED

@@ -2,12 +2,12 @@ import os
 import warnings
 import torch
 import gc
-from transformers import LlavaForConditionalGeneration, LlavaProcessor
+from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
 from PIL import Image
 import gradio as gr
 from huggingface_hub import login
 
-# Basic settings
+# Basic setup
 warnings.filterwarnings('ignore')
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
@@ -15,7 +15,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 model = None
 processor = None
 
-# Clear CUDA cache
+# Clear the CUDA cache
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
     gc.collect()
@@ -34,24 +34,35 @@ def load_model_and_processor():
     print("Loading model and processor...")
 
     try:
-        # Model paths
+        # Define the paths
+        base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
         hub_model_path = "Aekanun/thai-handwriting-llm"
 
-        # Load processor and model directly using LLaVA classes
-        processor = LlavaProcessor.from_pretrained(
-            hub_model_path,
-            trust_remote_code=True
+        # Configure BitsAndBytes
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16
+        )
+
+        # Load the processor from the base model
+        processor = AutoProcessor.from_pretrained(
+            base_model_path,
+            use_auth_token=os.environ.get('HUGGING_FACE_HUB_TOKEN')
         )
 
+        # Load the model from the Hub
         print("Loading model from the Hub...")
-        model = LlavaForConditionalGeneration.from_pretrained(
+        model = AutoModelForVision2Seq.from_pretrained(
             hub_model_path,
             device_map="auto",
             torch_dtype=torch.bfloat16,
+            quantization_config=bnb_config,
             trust_remote_code=True,
-            load_in_4bit=True
+            use_auth_token=os.environ.get('HUGGING_FACE_HUB_TOKEN')
         )
-        print("Model loaded successfully!")
+        print("Model loaded from the Hub successfully!")
 
         return True
     except Exception as e:
@@ -73,12 +84,12 @@ def process_handwriting(image):
     # Convert to RGB if needed
     if image.mode != "RGB":
         image = image.convert("RGB")
-
-    # Create prompt
+
+    # Create a prompt for the transcription
     prompt = """Transcribe the Thai handwritten text from the provided image.
 Only return the transcription in Thai language."""
 
-    # Create model inputs
+    # Create the input for the model
     messages = [
         {
             "role": "user",
@@ -89,12 +100,12 @@ Only return the transcription in Thai language."""
         }
     ]
 
-    # Process with model
+    # Build the inputs directly from the processor
    text = processor.apply_chat_template(messages, tokenize=False)
     inputs = processor(text=text, images=image, return_tensors="pt")
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-    # Generate
+    # Predict
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -103,7 +114,7 @@ Only return the transcription in Thai language."""
             pad_token_id=processor.tokenizer.pad_token_id
         )
 
-    # Decode output
+    # Convert the output
     transcription = processor.decode(outputs[0], skip_special_tokens=True)
     return transcription.strip()
 
@@ -126,5 +137,4 @@ if load_model_and_processor():
     if __name__ == "__main__":
         demo.launch()
 else:
-    print("Could not start the application")
-
+    print("Could not start the application")
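One note on the new loading code: recent releases of transformers deprecate the `use_auth_token` argument in favor of `token`. A minimal sketch of the same loading path under that assumption, reusing the model IDs and the 4-bit NF4 config from the commit:

```python
import os
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig

# Same 4-bit NF4 quantization settings as in the commit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# `token=` replaces the deprecated `use_auth_token=` in newer transformers
processor = AutoProcessor.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    token=os.environ.get("HUGGING_FACE_HUB_TOKEN"),
)
model = AutoModelForVision2Seq.from_pretrained(
    "Aekanun/thai-handwriting-llm",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=os.environ.get("HUGGING_FACE_HUB_TOKEN"),
)
```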
 
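The hunks also reference a Gradio `demo` object (`demo.launch()`) whose definition falls outside the diff. For context, a plausible minimal wiring of `process_handwriting` into Gradio; the labels and title here are illustrative assumptions, not taken from the commit:

```python
import gradio as gr

# Hypothetical sketch: the real `demo` definition sits outside the changed hunks
demo = gr.Interface(
    fn=process_handwriting,            # returns the Thai transcription string
    inputs=gr.Image(type="pil"),       # hands the function a PIL image
    outputs=gr.Textbox(label="Transcription"),
    title="Thai Handwriting Recognition",
)
```

With `type="pil"`, Gradio passes a `PIL.Image` into `process_handwriting`, which matches the `image.mode` / `image.convert("RGB")` handling in the diff.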