Spaces:

salomonsky
/

flux3

Running

App Files Files Community

flux3 / app.py

vilarin

Update app.py

7b555f0 verified 5 months ago

raw

history blame

3.03 kB

	import os
	import random
	from glob import glob

	import gradio as gr
	import numpy as np
	import soundfile as sf
	import spaces
	import torch
	from diffusers import StableAudioPipeline
	from huggingface_hub import hf_hub_download
	from translatepy import Translator

	# Shared translator instance; generate_image uses it to translate the
	# user's prompt into English before running the pipeline.
	translator = Translator()

	# Constants
	# Hugging Face Hub repository id passed to StableAudioPipeline.from_pretrained.
	model = "stabilityai/stable-audio-open-1.0"
	# MAX_SEED = np.iinfo(np.int32).max

	# Custom stylesheet passed to gr.Blocks(css=...): caps the container width
	# at 690px and hides the page footer.
	CSS = """
	.gradio-container {
	max-width: 690px !important;
	}
	footer {
	visibility: hidden;
	}
	"""

	# Script passed to gr.Blocks(js=...): unless the current URL already ends
	# with '?__theme=dark', it replaces the location with that suffix appended.
	# NOTE(review): the suffix is appended verbatim, so a URL that already has a
	# query string or fragment would end up malformed - confirm this is acceptable.
	JS = """function () {
	gradioURL = window.location.href
	if (!gradioURL.endsWith('?__theme=dark')) {
	window.location.replace(gradioURL + '?__theme=dark');
	}
	}"""
	# Introductory blurb rendered through gr.Markdown inside the accordion at
	# the top of the interface.
	DESCRIPTION = """
	<center>
	Stable Audio Open 1.0 generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. \
	It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, \
	a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.
	</center>
	"""

	# Load the pipeline once, at import time (not inside the GPU-decorated
	# function), so every request reuses the same weights.
	#
	# `pipe` is always bound: previously it was only assigned when CUDA was
	# available, so on any other host the first request failed with a NameError.
	# Half precision is used only on CUDA; CPU inference falls back to float32.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	pipe = StableAudioPipeline.from_pretrained(
	    model,
	    low_cpu_mem_usage=True,
	    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
	).to(device)


	# Function
	@spaces.GPU(duration=120)
	def generate_image(
	    prompt: str,
	    negative: str = "low quality",
	    second: float = 10.0,
	) -> str:
	    """Generate an audio clip from a text prompt and return its file path.

	    Despite the name, this function produces audio: it runs the module-level
	    ``pipe`` and writes the first returned clip to ``outputs/NNNNNN.wav``.

	    Args:
	        prompt: Text description of the sound; translated to English with the
	            module-level ``translator`` before inference.
	        negative: Negative prompt forwarded as ``negative_prompt``.
	        second: Forwarded to the pipeline as ``audio_end_in_s``.

	    Returns:
	        Path of the WAV file that was written.
	    """
	    prompt = str(translator.translate(prompt, 'English'))

	    print(f'prompt:{prompt}')

	    audio = pipe(
	        prompt,
	        negative_prompt=negative,
	        audio_end_in_s=second,
	    ).audios

	    os.makedirs("outputs", exist_ok=True)
	    # Count the files this function actually writes (*.wav). The previous
	    # "*.mp4" pattern never matched, so the index stayed at 0 and every
	    # request overwrote outputs/000000.wav.
	    base_count = len(glob(os.path.join("outputs", "*.wav")))
	    audio_path = os.path.join("outputs", f"{base_count:06d}.wav")

	    # Transpose the first clip before writing - presumably from
	    # (channels, samples) to the (samples, channels) layout soundfile
	    # expects; confirm against the diffusers StableAudioPipeline example.
	    waveform = audio[0].T.float().cpu().numpy()
	    # The attribute is `sampling_rate`; the former `samping_rate` typo raised
	    # AttributeError on every call.
	    sf.write(audio_path, waveform, pipe.vae.sampling_rate)

	    return audio_path

	# Gradio Interface

	with gr.Blocks(theme='soft', css=CSS, js=JS, title="Stable Audio Open") as iface:
	    with gr.Accordion(""):
	        gr.Markdown(DESCRIPTION)
	    with gr.Row():
	        # Audio player that receives the generated file path
	        output = gr.Audio(label="Podcast", type="filepath", interactive=False, autoplay=True, elem_classes="audio")
	    with gr.Row():
	        prompt = gr.Textbox(label="Prompt", placeholder="1000 BPM percussive sound of water drops")
	    with gr.Row():
	        negative = gr.Textbox(label="Negative prompt", placeholder="Low quality")
	        # No trailing comma here: it turned `second` into a 1-tuple, which is
	        # not a valid input component. The maximum is capped at 47 s to match
	        # the model limit stated in DESCRIPTION (it was 60 s).
	        second = gr.Slider(5.0, 47.0, value=10.0, label="Second", step=0.1)
	    with gr.Row():
	        submit_btn = gr.Button("🚀 Send")  # Create a submit button
	        clear_btn = gr.ClearButton(output, value="🗑️ Clear")  # Create a clear button

	    # Set up the event listeners. The handler is generate_image; the
	    # previously referenced `main` is not defined anywhere in this file.
	    submit_btn.click(generate_image, inputs=[prompt, negative, second], outputs=output)


	#gr.close_all()

	iface.queue().launch(show_api=False)  # Launch the Gradio interface