import spaces
import gradio as gr
import torch
from diffusers import StableDiffusion3Pipeline
from huggingface_hub import snapshot_download,login
from transformers import pipeline
from PIL import Image
import os


# Retrieve the API token from the environment variable.
# An empty value (for example a secret that was declared but left blank) is
# treated the same as a missing one, so the problem is reported here with a
# clear message instead of being handed on to login().
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
if not huggingface_token:
    raise ValueError("HUGGINGFACE_TOKEN environment variable is not set or is empty.")

# Log in to Hugging Face
login(token=huggingface_token)

# Pick the compute device: default to the CPU and switch to the GPU only
# when torch reports that CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Report which device was selected
if device != "cuda":
    print("CUDA is not available. Using CPU.")
else:
    print("CUDA is available. Using GPU.")

# Download the Stable Diffusion model snapshot into a local directory
model_path = snapshot_download(
    repo_id="stabilityai/stable-diffusion-3-medium",
    revision="refs/pr/26",
    repo_type="model",
    ignore_patterns=["*.md", "*.gitattributes"],
    local_dir="stable-diffusion-3-medium",
    token=huggingface_token,
)

# Half precision is only used on the GPU. diffusers warns that pipelines
# loaded in float16 cannot run on the CPU, so fall back to float32 there.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the pipeline; text_encoder_3 and tokenizer_3 are explicitly passed
# as None, exactly as before.
image_gen = StableDiffusion3Pipeline.from_pretrained(
    model_path,
    text_encoder_3=None,
    tokenizer_3=None,
    torch_dtype=model_dtype,
)

# Move the pipeline to the selected device
image_gen = image_gen.to(device)

# Build the image-to-text (captioning) pipeline on the selected device
caption_image = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)


@spaces.GPU(enable_queue=True)
def generate_image_from_caption(image, num_inference_steps=50, guidance_scale=7.5):
    """Caption an image, then generate a new image from that caption.

    Args:
        image: The input picture to caption. ``None`` is rejected; the UI
            passes ``None`` when the form is submitted without an upload.
        num_inference_steps: Number of denoising steps. Coerced to a whole
            number of at least 1, because UI sliders may deliver floats.
        guidance_scale: Guidance scale forwarded to the diffusion pipeline.

    Returns:
        The first image in the diffusion pipeline's result.

    Raises:
        gr.Error: If no image was provided.
    """
    # Fail early with a user-visible message instead of letting the
    # captioning pipeline choke on a missing input.
    if image is None:
        raise gr.Error("Please upload an image first.")

    # Slider values can arrive as floats (e.g. 50.0 or 50.3); the step count
    # must be an integer.
    num_inference_steps = max(1, int(round(num_inference_steps)))
    guidance_scale = float(guidance_scale)

    # Generate the caption
    caption = caption_image(image)[0]['generated_text']
    print("Generated Caption:", caption)

    # Generate the image from the caption
    result = image_gen(
        prompt=caption,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        negative_prompt="blurred, ugly, watermark, low resolution, blurry",
        height=512,
        width=512,
    )

    # Get the generated image
    return result.images[0]

# Create the Gradio interface
iface = gr.Interface(
    fn=generate_image_from_caption,
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        # step=1 keeps the step count a whole number; without an explicit
        # step Gradio derives a fractional increment from the slider range.
        gr.Slider(
            label="Number of inference steps",
            minimum=1,
            maximum=100,
            value=50,
            step=1,
        ),
        gr.Slider(label="Guidance scale", minimum=1.0, maximum=20.0, value=7.5),
    ],
    outputs=gr.Image(label="Generated Image"),
    title="Image-to-Image Generator using Caption",
    description="Upload an image to generate a caption, and then use the caption as a prompt to generate a new image using Stable Diffusion."
)

# Launch the Gradio app
iface.launch()