Spaces:
Sleeping
Sleeping
import gradio as gr | |
import json | |
import re | |
from gradio_client import Client | |
# Captioning backend: a remote Space reached through gradio_client.
# Two other Spaces were tried earlier and are kept here for reference only:
#   FuseCap: https://noamrot-fusecap-image-captioning.hf.space/
#   Fuyu-8B: https://adept-fuyu-8b-demo.hf.space/
KOSMOS2_SPACE_URL = "https://ydshieh-kosmos-2.hf.space/"

# Created once at import time and reused by get_caption().
kosmos2_client = Client(KOSMOS2_SPACE_URL)
def get_caption(image_in):
    """Return a detailed caption for an image via the remote Kosmos-2 client.

    The client is called with the "Detailed" description type. The second
    element of its result is opened as a JSON file; the first item of every
    entry in that file is joined with spaces to rebuild the full text.

    Args:
        image_in: Filepath or URL of the image, forwarded as-is to the client.

    Returns:
        The caption with the leading "Describe this image in detail:"
        instruction removed. If the rebuilt text does not start with that
        instruction, the whole rebuilt text (stripped) is returned instead of
        failing.
    """
    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4,
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # kosmos2_result[1] is used as a path to a JSON file of entries.
    with open(kosmos2_result[1], "r", encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = " ".join(sublist[0] for sublist in data)

    # DOTALL lets the capture run to the end of the text even when the
    # caption itself contains a newline.
    match = re.search(
        r"^Describe this image in detail:\s*(.*)$",
        full_sentence,
        flags=re.DOTALL,
    )
    if match:
        description = match.group(1)
        print(description)
    else:
        # Previously this path left `description` unbound and the return
        # statement raised UnboundLocalError; fall back to the raw text.
        print("Unable to locate valid description.")
        description = full_sentence.strip()

    return description
def get_caption_from_MD(image_in):
    """Ask the remote moondream1 Space to describe an image.

    Args:
        image_in: Filepath of the image, forwarded as-is to the Space.

    Returns:
        Whatever the Space's "/answer_question" endpoint returns for the
        fixed question "Describe precisely the image.".
    """
    moondream = Client("https://vikhyatk-moondream1.hf.space/")

    # Positional inputs, in the order the endpoint expects them:
    # the image ('image' Image component), then the question ('Question' Textbox).
    endpoint_inputs = (image_in, "Describe precisely the image.")
    answer = moondream.predict(*endpoint_inputs, api_name="/answer_question")

    print(answer)
    return answer
def get_magnet(prompt):
    """Send a text prompt to the remote MAGNeT Space and return its output.

    Args:
        prompt: Text describing the music to generate; it is formatted to a
            string and passed as the 'Input Text' field.

    Returns:
        The element at index 1 of the Space's "/predict_full" result
        (presumably the generated audio, since infer() routes it to the
        gr.Audio output — confirm against the Space's API).
    """
    text_for_model = f"{prompt}"
    print(text_for_model)

    magnet_space = Client("https://fffiloni-magnet.hf.space/")

    # Positional inputs in the order of the Space's UI components.
    magnet_inputs = (
        "facebook/magnet-medium-10secs",  # 'Model' Radio
        "",                               # 'Model Path (custom models)' Textbox
        text_for_model,                   # 'Input Text' Textbox
        3,                                # 'Temperature' Number
        0.9,                              # 'Top-p' Number
        10,                               # 'Max CFG coefficient' Number
        1,                                # 'Min CFG coefficient' Number
        20,                               # 'Decoding Steps (stage 1)' Number
        10,                               # 'Decoding Steps (stage 2)' Number
        10,                               # 'Decoding Steps (stage 3)' Number
        10,                               # 'Decoding Steps (stage 4)' Number
        "prod-stride1 (new!)",            # 'Span Scoring' Radio
    )
    generation = magnet_space.predict(*magnet_inputs, api_name="/predict_full")

    print(generation)
    return generation[1]
import re | |
import torch | |
from transformers import pipeline | |
# Model identifiers. Only the Mixtral one is loaded below; the Zephyr id is
# kept as an alternative.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Text-generation pipeline built once at import time and shared by infer().
_pipeline_options = {
    "model": mixtral_model,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
pipe = pipeline("text-generation", **_pipeline_options)
# System message given to the language model: turn an image description into
# a single musical prompt. The text is sent verbatim, so it is left unedited.
agent_maker_sys = """
You are an AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly in a friendly tone, write a musical prompt for an music generation model.
For example, if a user says, "a picture of a man in a black suit and tie riding a black dragon", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"A grand orchestral arrangement with thunderous percussion, epic brass fanfares, and soaring strings, creating a cinematic atmosphere fit for a heroic battle"
"""

# Prompt prefix: the system turn followed by an opening user tag; infer()
# appends the image caption after it.
# NOTE(review): these tags look like a Zephyr-style chat template while the
# loaded pipeline uses the Mixtral model — confirm the template matches.
instruction = "\n<|system|>\n" + agent_maker_sys + "</s>\n<|user|>\n"
def infer(image_in):
    """Turn an image into a musical prompt and a generated music clip.

    Steps: caption the image with get_caption(), ask the text-generation
    pipeline for a musical prompt matching that caption, then pass the
    prompt to get_magnet().

    Args:
        image_in: Filepath of the input image (the gr.Image component is
            configured with type="filepath").

    Returns:
        A tuple ``(musical_prompt, music)`` feeding the Textbox and Audio
        outputs respectively.

    Raises:
        gr.Error: If no image was provided or the language model produced
            no text.
    """
    if not image_in:
        # Clicking the button without an image would otherwise fail deep
        # inside the remote captioning call with an unhelpful error.
        raise gr.Error("Please provide an image first.")

    gr.Info("Getting image caption with Kosmos2...")
    user_prompt = get_caption(image_in)

    prompt = f"{instruction.strip()}\n{user_prompt}</s>"

    gr.Info("Building a system according to the image caption ...")
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        # Ask only for the newly generated text. The old code tried to cut
        # the prompt off with a regex ending at "<|assistant|>", but the
        # prompt never contains that tag, so whenever the model did not emit
        # it the whole system prompt leaked into the musical prompt.
        return_full_text=False,
    )
    generated = outputs[0]["generated_text"]

    # If the model opens its reply with an assistant tag, keep what follows.
    _, marker, after_marker = generated.partition("<|assistant|>")
    musical_prompt = (after_marker if marker else generated).strip()
    if not musical_prompt:
        raise gr.Error("The language model returned an empty musical prompt.")

    print(f"SUGGESTED Musical prompt: {musical_prompt}")
    # The same cleaned text is sent to the music model and shown to the user.
    music_o = get_magnet(musical_prompt)
    return musical_prompt, music_o
# Page heading and tagline, interpolated into the header HTML of the UI.
# `title` must be a plain string: a trailing comma here used to turn it into
# a one-element tuple, which rendered as "('Image to Music V2',)".
title = "Image to Music V2"
description = "Get music from a picture"

# Custom CSS passed to gr.Blocks: centers the main column and caps its width.
css = """
#col-container{
margin: 0 auto;
max-width: 780px;
text-align: left;
}
"""
# NOTE(review): the nesting below was reconstructed from a copy of this file
# that had lost its indentation — confirm the layout matches the intended UI.

# Header markup shown at the top of the page.
_header_html = f"""
<h2 style="text-align: center;">{title}</h2>
<p style="text-align: center;">{description}</p>
"""

# Sample images offered to the user; each example row holds one input value.
_example_rows = [
    [f"examples/{filename}"]
    for filename in (
        "monalisa.png",
        "santa.png",
        "ocean_poet.jpeg",
        "winter_hiking.png",
        "teatime.jpeg",
        "news_experts.jpeg",
        "chicken_adobo.jpeg",
    )
]

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(_header_html)

        with gr.Row():
            # Left side: image input and the trigger button.
            with gr.Column():
                image_in = gr.Image(
                    label="Image reference",
                    type="filepath",
                    elem_id="image-in",
                )
                submit_btn = gr.Button("Make LLM system from my pic !")

            # Right side: generated musical prompt and the resulting audio.
            with gr.Column():
                caption = gr.Textbox(label="Musical prompt")
                result = gr.Audio(label="Music")

        with gr.Row():
            gr.Examples(
                examples=_example_rows,
                fn=infer,
                inputs=[image_in],
                outputs=[caption, result],
                cache_examples=False,
            )

    # Run the full image -> prompt -> music chain on click.
    submit_btn.click(fn=infer, inputs=[image_in], outputs=[caption, result])

demo.queue().launch(show_api=False)