Spaces:

amphion
/

PicoAudio

Running on Zero

App Files Community

ZeyuXie committed on Jul 18, 2024

Commit

101c1cd

verified ·

1 Parent(s): 9fd4c53

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -11

app.py CHANGED Viewed

@@ -33,7 +33,7 @@ class InferRunner:
         self.scheduler = DDPMScheduler.from_pretrained(train_args.scheduler_name, subfolder="scheduler")
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# runner = InferRunner(device)
 event_list = get_event()
 def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
     with torch.no_grad():
@@ -49,7 +49,6 @@ def preprocess(caption):
     return output, output
 def update_textbox(event_name, current_text):
-    print(event_name, current_text)
     event = event_name + ' two times.'
     if current_text:
         return current_text.strip('.') + ' then ' + event
@@ -60,7 +59,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("## PicoAudio")
     with gr.Row():
-        description_text = f"18 events supported :"
         gr.Markdown(description_text)
@@ -80,20 +79,22 @@ with gr.Blocks() as demo:
     with gr.Row():
-        gr.Markdown("## Step1")
     with gr.Row():
-        preprocess_description_text = f"Preprocess: transfer free-text into timestamp caption via LLM. "+\
             "This demo uses Gemini as the preprocessor. If any errors occur, please try a few more times. "+\
                 "We also provide the GPT version consistent with the paper in the file 'Files/llm_reprocessing.py'. You can use your own api_key to modify and run 'Files/inference.py' for local inference."
         gr.Markdown(preprocess_description_text)
     with gr.Row():
         with gr.Column():
-            freetext_prompt = gr.Textbox(label="Prompt: Input your free-text caption here. (e.g. a dog barks three times.)",
                 value="a dog barks three times.",)
-            preprocess_run_button = gr.Button()
             prompt = None
         with gr.Column():
-            freetext_prompt_out = gr.Textbox(label="Preprocess output")
     with gr.Row():
         with gr.Column():
             gr.Examples(
@@ -108,15 +109,17 @@ with gr.Blocks() as demo:
     with gr.Row():
-        gr.Markdown("## Step2")
     with gr.Row():
         generate_description_text = f"Generate audio based on timestamp caption."
         gr.Markdown(generate_description_text)
     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
                 value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
-            generate_run_button = gr.Button()
             with gr.Accordion("Advanced options", open=False):
                 num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
                 guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)

         self.scheduler = DDPMScheduler.from_pretrained(train_args.scheduler_name, subfolder="scheduler")
 device = "cuda" if torch.cuda.is_available() else "cpu"
+runner = InferRunner(device)
 event_list = get_event()
 def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
     with torch.no_grad():
     return output, output
 def update_textbox(event_name, current_text):
     event = event_name + ' two times.'
     if current_text:
         return current_text.strip('.') + ' then ' + event
     with gr.Row():
         gr.Markdown("## PicoAudio")
     with gr.Row():
+        description_text = f"18 events supported:"
         gr.Markdown(description_text)
     with gr.Row():
+        gr.Markdown("## Step1-Preprocess")
     with gr.Row():
+        preprocess_description_text = f"Transfer free-text into timestamp caption via LLM. "+\
             "This demo uses Gemini as the preprocessor. If any errors occur, please try a few more times. "+\
                 "We also provide the GPT version consistent with the paper in the file 'Files/llm_reprocessing.py'. You can use your own api_key to modify and run 'Files/inference.py' for local inference."
         gr.Markdown(preprocess_description_text)
     with gr.Row():
         with gr.Column():
+            freetext_prompt = gr.Textbox(label="Free-text Prompt: Input your free-text caption here. (e.g. a dog barks three times.)",
                 value="a dog barks three times.",)
+            with gr.Row():
+                preprocess_run_button = gr.Button()
+                preprocess_run_clear = gr.ClearButton([freetext_prompt])
             prompt = None
         with gr.Column():
+            freetext_prompt_out = gr.Textbox(label="Timestamp Caption: Preprocess output")
     with gr.Row():
         with gr.Column():
             gr.Examples(
     with gr.Row():
+        gr.Markdown("## Step2-Generate")
     with gr.Row():
         generate_description_text = f"Generate audio based on timestamp caption."
         gr.Markdown(generate_description_text)
     with gr.Row():
         with gr.Column():
+            prompt = gr.Textbox(label="Timestamp Caption: Specify your timestamp caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
                 value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
+            with gr.Row():
+                generate_run_button = gr.Button()
+                generate_run_clear = gr.ClearButton([prompt])
             with gr.Accordion("Advanced options", open=False):
                 num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
                 guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)