|
import gradio as gr |
|
from gradio_client import Client |
|
import json |
|
import re |
|
|
|
def get_caption(image_in):
    """Caption an image with the hosted Kosmos-2 Space.

    Sends ``image_in`` to the Space with the "Detailed" option, loads the
    JSON file whose path is the second element of the returned result, joins
    the first item of every JSON entry with single spaces, and strips the
    leading "Describe this image in detail:" prompt from that sentence.

    Args:
        image_in: Image input forwarded unchanged to the Space.

    Returns:
        The description without the prompt prefix. When the prefix cannot be
        located, the whole reconstructed sentence (stripped of surrounding
        whitespace) is returned instead of failing.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

    kosmos2_result = kosmos2_client.predict(
        image_in,
        "Detailed",
        fn_index=4
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(kosmos2_result[1], 'r', encoding='utf-8') as f:
        data = json.load(f)

    # assumes every entry is a sequence whose first item is a text
    # fragment of the caption -- TODO confirm against the Space's output
    full_sentence = ' '.join([sublist[0] for sublist in data])

    pattern = r'^Describe this image in detail:\s*(.*)$'

    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Previously `description` was left unassigned on this path, so the
        # return statement raised UnboundLocalError. Use the caption as-is.
        description = full_sentence.strip()

    return description
|
|
|
def get_magnet(prompt):
    """Request a sound effect for ``prompt`` from the hosted MAGNeT Space.

    The prompt is prefixed with "No Music. " and sent, together with a fixed
    set of settings, to the Space's ``/predict_full`` endpoint.

    Args:
        prompt: Text describing the sound to generate.

    Returns:
        The first element of the result returned by the Space.
    """
    # NOTE(review): these values are passed positionally and verbatim; what
    # each one controls is defined by the remote Space's API, not by this
    # file -- confirm there before changing any of them.
    magnet_inputs = (
        "facebook/magnet-small-10secs",
        None,
        f"No Music. {prompt}",
        3,
        0.9,
        10,
        1,
        20,
        10,
        10,
        10,
        "prod-stride1 (new!)",
    )

    # NOTE(review): the URL pins a specific "--replicas/<id>" path;
    # presumably that id can change when the Space restarts -- verify.
    magnet_client = Client("https://fffiloni-magnet.hf.space/--replicas/oo8sb/")
    output = magnet_client.predict(*magnet_inputs, api_name="/predict_full")

    print(output)
    return output[0]
|
|
|
def get_audioldm(prompt):
    """Request a sound effect for ``prompt`` from the hosted AudioLDM2 Space.

    Args:
        prompt: Text describing the sound to generate.

    Returns:
        The result returned by the Space, unchanged.
    """
    audioldm_client = Client(
        "https://haoheliu-audioldm2-text2audio-text2music.hf.space/"
    )

    # NOTE(review): positional inputs are forwarded verbatim; the second one
    # reads like a negative prompt and the numbers like generation settings,
    # but their meaning is defined by the remote Space -- confirm there.
    audioldm_inputs = (prompt, "Low quality. Music.", 5, 0, 5, 1)
    output = audioldm_client.predict(*audioldm_inputs, fn_index=1)

    print(output)
    return output
|
|
|
def infer(image_in):
    """Caption the image, then generate audio from that caption twice.

    Args:
        image_in: Image input handed to ``get_caption``.

    Returns:
        A ``(magnet_result, audioldm_result)`` tuple holding the outputs of
        ``get_magnet`` and ``get_audioldm`` for the same caption.
    """
    prompt = get_caption(image_in)
    # Tuple elements are evaluated left to right, so MAGNeT is still queried
    # before AudioLDM2.
    return get_magnet(prompt), get_audioldm(prompt)
|
|
|
# Build the page: a header, an image upload with a submit button on the
# left, and the two generated outputs on the right.
with gr.Blocks() as demo:
    with gr.Column():
        gr.HTML("""
        <h2 style="text-align: center;">
        Image to SFX
        </h2>
        <p style="text-align: center;">
        Compare MAGNet and AudioLDM2 sound effects generation from image caption (Kosmos2)
        </p>
        """)

        with gr.Row():
            # Input side.
            with gr.Column():
                image_in = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    label="Image input",
                )
                submit_btn = gr.Button("Submit")

            # Output side: one player per model.
            with gr.Column():
                magnet_o = gr.Video(label="MAGNet output")
                audioldm2_o = gr.Video(label="AudioLDM2 output")

    # Clicking "Submit" runs the whole caption -> audio pipeline.
    submit_btn.click(
        fn=infer,
        inputs=[image_in],
        outputs=[magnet_o, audioldm2_o],
    )

# Queue holds at most 10 pending requests; then start the server.
demo.queue(max_size=10)
demo.launch(debug=True)