"""Gradio app: caption an uploaded image, then regenerate it from that caption.

Pipeline:
  1. BLIP (``Salesforce/blip-image-captioning-large``) produces a caption for
     the uploaded image.
  2. Stable Diffusion 3 Medium generates a new image using the caption as the
     prompt.

Requires the ``HUGGINGFACE_TOKEN`` environment variable, which is used both to
log in to the Hugging Face Hub and to download the model snapshot.
"""
# NOTE: `spaces` is intentionally imported first, as in the original file;
# keep it ahead of `torch` so the ZeroGPU integration can initialise before
# any CUDA-related import.
import spaces
import gradio as gr
import torch
from diffusers import StableDiffusion3Pipeline
from huggingface_hub import snapshot_download, login
from transformers import pipeline
from PIL import Image
import os
from typing import Optional

SD3_REPO_ID = "stabilityai/stable-diffusion-3-medium"
SD3_REVISION = "refs/pr/26"
SD3_LOCAL_DIR = "stable-diffusion-3-medium"
CAPTION_MODEL_ID = "Salesforce/blip-image-captioning-large"
NEGATIVE_PROMPT = "blurred, ugly, watermark, low resolution, blurry"
OUTPUT_HEIGHT = 512
OUTPUT_WIDTH = 512

# Retrieve the API token from the environment variable
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
if huggingface_token is None:
    raise ValueError("HUGGINGFACE_TOKEN environment variable is not set.")

# Log in to Hugging Face
login(token=huggingface_token)

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print("CUDA is available. Using GPU.")
else:
    print("CUDA is not available. Using CPU.")

# Half precision is only appropriate on the GPU; fall back to float32 on CPU,
# where many operators have no float16 implementation.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Download and load the Stable Diffusion model
model_path = snapshot_download(
    repo_id=SD3_REPO_ID,
    revision=SD3_REVISION,
    repo_type="model",
    ignore_patterns=["*.md", "*.gitattributes"],
    local_dir=SD3_LOCAL_DIR,
    token=huggingface_token,
)

# The third text encoder/tokenizer are explicitly disabled, as in the
# original configuration.
image_gen = StableDiffusion3Pipeline.from_pretrained(
    model_path,
    text_encoder_3=None,
    tokenizer_3=None,
    torch_dtype=torch_dtype,
)
image_gen = image_gen.to(device)

# Load the image-to-text pipeline
caption_image = pipeline("image-to-text", model=CAPTION_MODEL_ID, device=device)


@spaces.GPU(enable_queue=True)
def generate_image_from_caption(
    image: Optional[Image.Image],
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
):
    """Caption ``image`` and generate a new image from that caption.

    Args:
        image: The uploaded picture (the Gradio input is configured with
            ``type="pil"``). ``None`` when the user submitted without
            uploading anything.
        num_inference_steps: Number of denoising steps. Coerced to a positive
            integer because UI sliders may deliver float values.
        guidance_scale: Guidance scale forwarded to the diffusion pipeline.

    Returns:
        The first image produced by the Stable Diffusion pipeline.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of an opaque traceback
        # from the captioning pipeline.
        raise gr.Error("Please upload an image first.")

    # The pipeline needs an integer step count; sliders can hand over floats.
    steps = max(1, int(round(num_inference_steps)))

    # Generate the caption
    caption = caption_image(image)[0]["generated_text"]
    print("Generated Caption:", caption)

    # Generate the image from the caption
    result = image_gen(
        prompt=caption,
        num_inference_steps=steps,
        guidance_scale=float(guidance_scale),
        negative_prompt=NEGATIVE_PROMPT,
        height=OUTPUT_HEIGHT,
        width=OUTPUT_WIDTH,
    )

    # Get the generated image
    return result.images[0]


# Create the Gradio interface
iface = gr.Interface(
    fn=generate_image_from_caption,
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        # step=1 keeps the slider on whole numbers of inference steps.
        gr.Slider(
            label="Number of inference steps",
            minimum=1,
            maximum=100,
            step=1,
            value=50,
        ),
        gr.Slider(label="Guidance scale", minimum=1.0, maximum=20.0, value=7.5),
    ],
    outputs=gr.Image(label="Generated Image"),
    title="Image-to-Image Generator using Caption",
    description="Upload an image to generate a caption, and then use the caption as a prompt to generate a new image using Stable Diffusion.",
)

# Launch the Gradio app
iface.launch()