Spaces:

sounar
/

ContactDoctor-API

Paused

App Files Community

sounar committed on Nov 19, 2024

Commit

d16c5f3

verified ·

1 Parent(s): b37e8c8

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -28

app.py CHANGED Viewed

@@ -3,15 +3,7 @@ import torch
 from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
 import gradio as gr
 from PIL import Image
-# First, let's check if flash-attn is installed
-try:
-    import flash_attn
-    FLASH_ATTN_AVAILABLE = True
-except ImportError:
-    FLASH_ATTN_AVAILABLE = False
-    print("Flash Attention is not installed. Using default attention mechanism.")
-    print("To install Flash Attention, run: pip install flash-attn --no-build-isolation")
 # Get API token from environment variable
 api_token = os.getenv("HF_TOKEN").strip()
@@ -24,23 +16,15 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.float16
 )
-# Initialize model with conditional Flash Attention
-model_args = {
-    "quantization_config": bnb_config,
-    "device_map": "auto",
-    "torch_dtype": torch.float16,
-    "trust_remote_code": True,
-    "token": api_token
-}
-# Only add flash attention if available
-if FLASH_ATTN_AVAILABLE:
-    model_args["attn_implementation"] = "flash_attention_2"
 # Initialize model and tokenizer
 model = AutoModel.from_pretrained(
     "ContactDoctor/Bio-Medical-MultiModal-Llama-3-8B-V1",
-    **model_args
 )
 tokenizer = AutoTokenizer.from_pretrained(
@@ -100,11 +84,6 @@ demo = gr.Interface(
 # Launch the Gradio app
 if __name__ == "__main__":
-    # Print installation instructions if Flash Attention is not available
-    if not FLASH_ATTN_AVAILABLE:
-        print("\nTo enable Flash Attention 2 for better performance, please install it using:")
-        print("pip install flash-attn --no-build-isolation")
     demo.launch(
         share=True,
         server_name="0.0.0.0",

 from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
 import gradio as gr
 from PIL import Image
+from torchvision.transforms import ToTensor
 # Get API token from environment variable
 api_token = os.getenv("HF_TOKEN").strip()
     bnb_4bit_compute_dtype=torch.float16
 )
 # Initialize model and tokenizer
 model = AutoModel.from_pretrained(
     "ContactDoctor/Bio-Medical-MultiModal-Llama-3-8B-V1",
+    quantization_config=bnb_config,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    trust_remote_code=True,
+    attn_implementation="flash_attention_2",
+    token=api_token
 )
 tokenizer = AutoTokenizer.from_pretrained(
 # Launch the Gradio app
 if __name__ == "__main__":
     demo.launch(
         share=True,
         server_name="0.0.0.0",