Spaces:

sohojoe
/

soho-clip-embeddings-explorer

Running

App Files Files Community

sohojoe commited on Feb 4, 2023

Commit

dcd6afb

1 Parent(s): 489d13e

switch to use clip retreval's clip implementation

Browse files

Files changed (29) hide show

app.py +482 -0
images/371739.jpeg +0 -0
images/452650.jpeg +0 -0
images/540554.jpeg +0 -0
images/557922.jpeg +0 -0
images/Anya Taylor-Joy 003.jpg +0 -0
images/ColorWheel001 BW.jpg +0 -0
images/ColorWheel001.jpg +0 -0
images/ColorWheel002 BW.jpg +0 -0
images/ColorWheel002.jpg +0 -0
images/Donkey.jpg +0 -0
images/Lizzo 001.jpeg +0 -0
images/Mirai.jpg +0 -0
images/OnChainMonkey #2278.jpeg +0 -0
images/OnChainMonkey-2278.jpg +0 -0
images/Ray-Liotta-Goodfellas.jpg +0 -0
images/Snoop Dogg.jpg +0 -0
images/SohoJoeEth + Donkey.jpeg +0 -0
images/SohoJoeEth + Ray.jpeg +0 -0
images/SohoJoeEth + Snoop Dogg.jpeg +0 -0
images/SohoJoeEth.jpeg +0 -0
images/Wassie 4498.jpeg +0 -0
images/billie eilish 004.jpeg +0 -0
images/pup1.jpg +0 -0
images/pup2.jpg +0 -0
images/pup3.jpg +0 -0
images/pup4.jpeg +0 -0
images/pup5.jpg +0 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,482 @@

+import gradio as gr
+import torch
+from PIL import Image
+from torchvision import transforms
+# from diffusers import StableDiffusionPipeline, StableDiffusionImageVariationPipeline, DiffusionPipeline
+import numpy as np
+import pandas as pd
+import math
+from transformers import CLIPTextModel, CLIPTokenizer
+import os
+from clip_retrieval.clip_client import ClipClient, Modality
+# clip_model_id = "openai/clip-vit-large-patch14-336"
+# clip_retrieval_indice_name, clip_model_id ="laion5B-L-14", "/laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
+clip_retrieval_service_url = "https://knn.laion.ai/knn-service"
+# available models = ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
+# clip_model="ViT-B/32"
+clip_model="ViT-L/14"
+clip_model_id ="laion5B-L-14"
+max_tabs = 10
+input_images = [None for i in range(max_tabs)]
+input_prompts = [None for i in range(max_tabs)]
+embedding_plots = [None for i in range(max_tabs)]
+embedding_powers = [1. for i in range(max_tabs)]
+# global embedding_base64s
+embedding_base64s = [None for i in range(max_tabs)]
+# embedding_base64s = gr.State(value=[None for i in range(max_tabs)])
+def image_to_embedding(input_im):
+    # approch A:
+    tform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Resize(
+        (336, 336),
+        interpolation=transforms.InterpolationMode.BICUBIC,
+        antialias=False,
+        ),
+        transforms.Normalize(
+          [0.48145466, 0.4578275, 0.40821073],
+          [0.26862954, 0.26130258, 0.27577711]),
+    ])
+    input = tform(input_im).to(device)
+    # approch B: convert input_im to torch
+    # inp = torch.from_numpy(np.array(input_im)).to(device)
+    # inp = torch.from_numpy(np.array(input_im)).permute(2, 0, 1).to(device)
+    # dtype = torch.float32
+    # input = input.to(device=device, dtype=dtype)
+    input = input.unsqueeze(0)
+    # image_embeddings = pipe.image_encoder(image).image_embeds
+    # image_embeddings = image_embeddings[0]
+    with torch.no_grad():
+        # image_embeddings_np = model.get_text_features(prompt_tokens.to(device))
+        image_embeddings = model.get_image_features(input)
+    # image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+    image_embeddings_np = image_embeddings.cpu().detach().numpy()
+    return image_embeddings_np
+def prompt_to_embedding(prompt):
+    # inputs = processor(prompt, images=imgs, return_tensors="pt", padding=True)
+    inputs = processor(prompt, return_tensors="pt", padding='max_length', max_length=77)
+    # labels = torch.tensor(labels)
+    # prompt_tokens = inputs.input_ids[0]
+    prompt_tokens = inputs.input_ids
+    # image = inputs.pixel_values
+    with torch.no_grad():
+        prompt_embededdings = model.get_text_features(prompt_tokens.to(device))
+    # prompt_embededdings /= prompt_embededdings.norm(dim=-1, keepdim=True)
+    prompt_embededdings = prompt_embededdings[0].cpu().detach().numpy()
+    return prompt_embededdings
+def embedding_to_image(embeddings):
+    size = math.ceil(math.sqrt(embeddings.shape[0]))
+    image_embeddings_square = np.pad(embeddings, (0, size**2 - embeddings.shape[0]), 'constant')
+    image_embeddings_square.resize(size,size)
+    embedding_image = Image.fromarray(image_embeddings_square, mode="L")
+    return embedding_image
+def embedding_to_base64(embeddings):
+    import base64
+    # ensure float16
+    embeddings = embeddings.astype(np.float16)
+    embeddings_b64 = base64.urlsafe_b64encode(embeddings).decode()
+    return embeddings_b64
+def base64_to_embedding(embeddings_b64):
+    import base64
+    embeddings = base64.urlsafe_b64decode(embeddings_b64)
+    embeddings = np.frombuffer(embeddings, dtype=np.float16)
+    # embeddings = torch.tensor(embeddings)
+    return embeddings
+def main(
+    # input_im,
+    embeddings,
+    scale=3.0,
+    n_samples=4,
+    steps=25,
+    seed=None
+    ):
+    if seed == None:
+        seed = np.random.randint(2147483647)
+    # if device contains cuda
+    if device.type == 'cuda':
+        generator = torch.Generator(device=device).manual_seed(int(seed))
+    else:
+        generator = torch.Generator().manual_seed(int(seed)) # use cpu as does not work on mps
+    embeddings = base64_to_embedding(embeddings)
+    embeddings = torch.tensor(embeddings, dtype=torch_size).to(device)
+    images_list = pipe(
+        # inp.tile(n_samples, 1, 1, 1),
+        # [embeddings * n_samples],
+        embeddings,
+        guidance_scale=scale,
+        num_inference_steps=steps,
+        generator=generator,
+        )
+    images = []
+    for i, image in enumerate(images_list["images"]):
+        images.append(image)
+    # images.append(embedding_image)
+    return images
+def on_image_load_update_embeddings(image_data):
+    # image to embeddings
+    if image_data is None:
+        # embeddings = prompt_to_embedding('')
+        # embeddings_b64 = embedding_to_base64(embeddings)
+        # return gr.Text.update(embeddings_b64)
+        return gr.Text.update('')
+    embeddings = image_to_embedding(image_data)
+    embeddings_b64 = embedding_to_base64(embeddings)
+    return gr.Text.update(embeddings_b64)
+def on_prompt_change_update_embeddings(prompt):
+    # prompt to embeddings
+    if prompt is None or prompt == "":
+        embeddings = prompt_to_embedding('')
+        embeddings_b64 = embedding_to_base64(embeddings)
+        return gr.Text.update(embedding_to_base64(embeddings))
+    embeddings = prompt_to_embedding(prompt)
+    embeddings_b64 = embedding_to_base64(embeddings)
+    return gr.Text.update(embeddings_b64)
+def update_average_embeddings(embedding_base64s_state, embedding_powers):
+    final_embedding = None
+    num_embeddings = 0
+    for i, embedding_base64 in enumerate(embedding_base64s_state):
+        if embedding_base64 is None or embedding_base64 == "":
+            continue
+        embedding = base64_to_embedding(embedding_base64)
+        embedding = embedding * embedding_powers[i]
+        if final_embedding is None:
+            final_embedding = embedding
+        else:
+            final_embedding = final_embedding + embedding
+        num_embeddings += 1
+    if final_embedding is None:
+        # embeddings = prompt_to_embedding('')
+        # embeddings_b64 = embedding_to_base64(embeddings)
+        # return gr.Text.update(embeddings_b64)
+        return gr.Text.update('')
+    # TODO toggle this to support average or sum
+    final_embedding = final_embedding / num_embeddings
+    embeddings_b64 = embedding_to_base64(final_embedding)
+    return embeddings_b64
+def on_power_change_update_average_embeddings(embedding_base64s_state, embedding_power_state, power, idx):
+    embedding_power_state[idx] = power
+    embeddings_b64 = update_average_embeddings(embedding_base64s_state, embedding_power_state)
+    return gr.Text.update(embeddings_b64)
+def on_embeddings_changed_update_average_embeddings(embedding_base64s_state, embedding_power_state, embedding_base64, idx):
+    embedding_base64s_state[idx] = embedding_base64 if embedding_base64 != '' else None
+    embeddings_b64 = update_average_embeddings(embedding_base64s_state, embedding_power_state)
+    return gr.Text.update(embeddings_b64)
+def on_embeddings_changed_update_plot(embeddings_b64):
+    # plot new embeddings
+    if embeddings_b64 is None or embeddings_b64 == "":
+        data = pd.DataFrame({
+            'embedding': [],
+            'index': []})
+        return gr.LinePlot.update(data,
+            x="index",
+            y="embedding",
+            # color="country",
+            title="Embeddings",
+            # stroke_dash="cluster",
+            # x_lim=[1950, 2010],
+            tooltip=['index', 'embedding'],
+            # stroke_dash_legend_title="Country Cluster",
+            # height=300,
+            width=0)
+    embeddings = base64_to_embedding(embeddings_b64)
+    data = pd.DataFrame({
+            'embedding': embeddings,
+            'index': [n for n in range(len(embeddings))]})
+    return gr.LinePlot.update(data,
+            x="index",
+            y="embedding",
+            # color="country",
+            title="Embeddings",
+            # stroke_dash="cluster",
+            # x_lim=[1950, 2010],
+            tooltip=['index', 'embedding'],
+            # stroke_dash_legend_title="Country Cluster",
+            # height=300,
+            width=embeddings.shape[0])
+def on_example_image_click_set_image(input_image, image_url):
+    input_image.value = image_url
+# device = torch.device("mps" if torch.backends.mps.is_available() else "cuda:0" if torch.cuda.is_available() else "cpu")
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_size = torch.float16 if device == ('cuda') else torch.float32
+# torch_size = torch.float32
+# pipe = StableDiffusionPipeline.from_pretrained(
+#     model_id,
+#     custom_pipeline="pipeline.py",
+#     torch_dtype=torch_size,
+#     # , revision="fp16",
+#     requires_safety_checker = False, safety_checker=None,
+#     text_encoder = CLIPTextModel,
+#     tokenizer = CLIPTokenizer,
+#     )
+# pipe = pipe.to(device)
+from transformers import AutoProcessor, AutoModel
+# processor = AutoProcessor.from_pretrained(clip_model_id)
+# model = AutoModel.from_pretrained(clip_model_id)
+# model = model.to(device)
+from clip_retrieval.load_clip import load_clip, get_tokenizer
+# model, preprocess = load_clip(clip_model, use_jit=True, device=device)
+model, preprocess = load_clip(clip_model, use_jit=True, device=device)
+tokenizer = get_tokenizer(clip_model)
+test_url = "https://placekitten.com/400/600"
+test_caption = "an image of a cat"
+test_image_1 = "tests/test_clip_inference/test_images/123_456.jpg"
+test_image_2 = "tests/test_clip_inference/test_images/416_264.jpg"
+# clip_retrieval_service_url = "https://knn.laion.ai/knn-service"
+clip_retrieval_client = ClipClient(
+    url=clip_retrieval_service_url,
+    indice_name=clip_model_id,
+    use_safety_model = False,
+    use_violence_detector = False,
+    )
+# results = clip_retrieval_client.query(text="an image of a cat")
+# results[0]
+examples = [
+    ["SohoJoeEth.jpeg", "Ray-Liotta-Goodfellas.jpg", "SohoJoeEth + Ray.jpeg"],
+    # ["SohoJoeEth.jpeg", "Donkey.jpg", "SohoJoeEth + Donkey.jpeg"],
+    # ["SohoJoeEth.jpeg", "Snoop Dogg.jpg", "SohoJoeEth + Snoop Dogg.jpeg"],
+]
+tile_size = 100
+# image_folder = os.path.join("file", "images")
+image_folder ="images"
+# image_examples = {
+#         "452650": "452650.jpeg",
+#         "Prompt 1": "a college dorm with a desk and bunk beds",
+#         "371739": "371739.jpeg",
+#         "Prompt 2": "a large banana is placed before a stuffed monkey.",
+#         "557922": "557922.jpeg",
+#         "Prompt 3": "a person sitting on a bench using a cell phone",
+# }
+tabbed_examples = {
+    "CoCo": {
+        "452650": "452650.jpeg",
+        "Prompt 1": "a college dorm with a desk and bunk beds",
+        "371739": "371739.jpeg",
+        "Prompt 2": "a large banana is placed before a stuffed monkey.",
+        "557922": "557922.jpeg",
+        "Prompt 3": "a person sitting on a bench using a cell phone",
+        "540554": "540554.jpeg",
+        "Prompt 4": "two trains are coming down the tracks, a steam engine and a modern train.",
+    },
+    "Transforms": {
+        "ColorWheel001": "ColorWheel001.jpg",
+        "ColorWheel001 BW": "ColorWheel001 BW.jpg",
+        "ColorWheel002": "ColorWheel002.jpg",
+        "ColorWheel002 BW": "ColorWheel002 BW.jpg",
+    },
+    "Portraits": {
+        "Snoop": "Snoop Dogg.jpg",
+        "Snoop Prompt": "Snoop Dogg",
+        "Ray": "Ray-Liotta-Goodfellas.jpg",
+        "Ray Prompt": "Ray Liotta, Goodfellas",
+        "Anya": "Anya Taylor-Joy 003.jpg",
+        "Anya Prompt": "Anya Taylor-Joy, The Queen's Gambit",
+        "Billie": "billie eilish 004.jpeg",
+        "Billie Prompt": "Billie Eilish, blonde hair",
+        "Lizzo": "Lizzo 001.jpeg",
+        "Lizzo Prompt": "Lizzo,",
+        "Donkey": "Donkey.jpg",
+        "Donkey Prompt": "Donkey, from Shrek",
+    },
+    "NFT's": {
+        "SohoJoe": "SohoJoeEth.jpeg",
+        "SohoJoe Prompt": "SohoJoe.Eth",
+        "Mirai": "Mirai.jpg",
+        "Mirai Prompt": "Mirai from White Rabbit, @shibuyaxyz",
+        "OnChainMonkey": "OnChainMonkey-2278.jpg",
+        "OCM Prompt": "On Chain Monkey",
+        "Wassie": "Wassie 4498.jpeg",
+        "Wassie Prompt": "Wassie by Wassies",
+    },
+    "Pups": {
+        "Pup1": "pup1.jpg",
+        "Prompt": "Teacup Yorkies",
+        "Pup2": "pup2.jpg",
+        "Pup3": "pup3.jpg",
+        "Pup4": "pup4.jpeg",
+        "Pup5": "pup5.jpg",
+    },
+}
+image_examples_tile_size = 50
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column(scale=5):
+            gr.Markdown(
+"""
+# Soho-Clip
+A tool for exploring CLIP embedding spaces.
+Try uploading a few images and/or add some text prompts and click generate images.
+""")
+        with gr.Column(scale=2, min_width=(tile_size+20)*3):
+            with gr.Row():
+                with gr.Column(scale=1, min_width=tile_size):
+                    gr.Markdown("## Input 1")
+                with gr.Column(scale=1, min_width=tile_size):
+                    gr.Markdown("## Input 2")
+                with gr.Column(scale=1, min_width=tile_size):
+                    gr.Markdown("## Generates:")
+            for example in examples:
+                with gr.Row():
+                    for example in example:
+                        with gr.Column(scale=1, min_width=tile_size):
+                            local_path = os.path.join(image_folder, example)
+                            gr.Image(
+                                value = local_path, shape=(tile_size,tile_size),
+                                show_label=False, interactive=False) \
+                                .style(height=tile_size, width=tile_size)
+    with gr.Row():
+        for i in range(max_tabs):
+            with gr.Tab(f"Input {i+1}"):
+                with gr.Row():
+                    with gr.Column(scale=1, min_width=240):
+                        input_images[i] = gr.Image(label="Image Prompt", show_label=True)
+                    with gr.Column(scale=3, min_width=600):
+                        embedding_plots[i] = gr.LinePlot(show_label=False).style(container=False)
+                        # input_image.change(on_image_load, inputs= [input_image, plot])
+                with gr.Row():
+                    with gr.Column(scale=2, min_width=240):
+                        input_prompts[i] = gr.Textbox(label="Text Prompt", show_label=True)
+                    with gr.Column(scale=3, min_width=600):
+                        with gr.Row():
+                            # with gr.Slider(min=-5, max=5, value=1, label="Power", show_label=True):
+                            #     embedding_powers[i] = gr.Slider.value
+                            embedding_powers[i] = gr.Slider(minimum=-3, maximum=3, value=1, label="Power", show_label=True, interactive=True)
+                        with gr.Row():
+                            with gr.Accordion(f"Embeddings (base64)", open=False):
+                                embedding_base64s[i] = gr.Textbox(show_label=False)
+                for idx, (tab_title, examples) in enumerate(tabbed_examples.items()):
+                    with gr.Tab(tab_title):
+                        with gr.Row():
+                            for idx, (title, example) in enumerate(examples.items()):
+                                if example.endswith(".jpg") or example.endswith(".jpeg"):
+                                    # add image example
+                                    local_path = os.path.join(image_folder, example)
+                                    with gr.Column(scale=1, min_width=image_examples_tile_size):
+                                        gr.Examples(
+                                            examples=[local_path],
+                                            inputs=input_images[i],
+                                            label=title,
+                                        )
+                                else:
+                                    # add text example
+                                    with gr.Column(scale=1, min_width=image_examples_tile_size*2):
+                                        gr.Examples(
+                                            examples=[example],
+                                            inputs=input_prompts[i],
+                                            label=title,
+                                        )
+    with gr.Row():
+        average_embedding_plot = gr.LinePlot(show_label=True, label="Average Embeddings (base64)").style(container=False)
+    with gr.Row():
+        with gr.Accordion(f"Avergage embeddings in base 64", open=False):
+            average_embedding_base64 = gr.Textbox(show_label=False)
+    with gr.Row():
+        submit = gr.Button("Generate images")
+    with gr.Row():
+        with gr.Column(scale=1, min_width=200):
+            scale = gr.Slider(0, 25, value=3, step=1, label="Guidance scale")
+        with gr.Column(scale=1, min_width=200):
+            n_samples = gr.Slider(1, 4, value=1, step=1, label="Number images")
+        with gr.Column(scale=1, min_width=200):
+            steps = gr.Slider(5, 50, value=25, step=5, label="Steps")
+        with gr.Column(scale=1, min_width=200):
+            seed = gr.Number(None, label="Seed (blank = random)", precision=0)
+    with gr.Row():
+        output = gr.Gallery(label="Generated variations")
+    embedding_base64s_state = gr.State(value=[None for i in range(max_tabs)])
+    embedding_power_state = gr.State(value=[1. for i in range(max_tabs)])
+    for i in range(max_tabs):
+        input_images[i].change(on_image_load_update_embeddings, input_images[i], [embedding_base64s[i]])
+        input_prompts[i].change(on_prompt_change_update_embeddings, input_prompts[i], [embedding_base64s[i]])
+        embedding_base64s[i].change(on_embeddings_changed_update_plot, embedding_base64s[i], [embedding_plots[i]])
+        idx_state = gr.State(value=i)
+        embedding_base64s[i].change(on_embeddings_changed_update_average_embeddings, [embedding_base64s_state, embedding_power_state, embedding_base64s[i], idx_state], average_embedding_base64)
+        embedding_powers[i].change(on_power_change_update_average_embeddings, [embedding_base64s_state, embedding_power_state, embedding_powers[i], idx_state], average_embedding_base64)
+    average_embedding_base64.change(on_embeddings_changed_update_plot, average_embedding_base64, average_embedding_plot)
+    # submit.click(main, inputs= [embedding_base64s[0], scale, n_samples, steps, seed], outputs=output)
+    submit.click(main, inputs= [average_embedding_base64, scale, n_samples, steps, seed], outputs=output)
+    output.style(grid=2)
+    with gr.Row():
+        gr.Markdown(
+"""
+My interest is to use CLIP for image/video understanding (see [CLIP_visual-spatial-reasoning](https://github.com/Sohojoe/CLIP_visual-spatial-reasoning).)
+### Initial Features
+- Combine up to 10 Images and/or text inputs to create an average embedding space.
+- View embedding spaces as graph
+- Generate a new image based on the average embedding space
+### Known limitations
+- Text input is a little off (requires fine tuning and I'm having issues with that at the moment)
+- It can only generate a single image at a time
+- Not easy to use the sample images
+### Acknowledgements
+- I heavily build on Justin Pinkney's [Experiments in Image Variation](https://www.justinpinkney.com/image-variation-experiments). Please credit them if you use this work.
+- [CLIP](https://openai.com/blog/clip/)
+- [Stable Diffusion](https://github.com/CompVis/stable-diffusion)
+""")
+# ![Alt Text](file/pup1.jpg)
+# <img src="file/pup1.jpg" width="100" height="100">
+# ![Alt Text](file/pup1.jpg){height=100 width=100}
+if __name__ == "__main__":
+    demo.launch()

images/371739.jpeg ADDED Viewed

images/452650.jpeg ADDED Viewed

images/540554.jpeg ADDED Viewed

images/557922.jpeg ADDED Viewed

images/Anya Taylor-Joy 003.jpg ADDED Viewed

images/ColorWheel001 BW.jpg ADDED Viewed

images/ColorWheel001.jpg ADDED Viewed

images/ColorWheel002 BW.jpg ADDED Viewed

images/ColorWheel002.jpg ADDED Viewed

images/Donkey.jpg ADDED Viewed

images/Lizzo 001.jpeg ADDED Viewed

images/Mirai.jpg ADDED Viewed

images/OnChainMonkey #2278.jpeg ADDED Viewed

images/OnChainMonkey-2278.jpg ADDED Viewed

images/Ray-Liotta-Goodfellas.jpg ADDED Viewed

images/Snoop Dogg.jpg ADDED Viewed

images/SohoJoeEth + Donkey.jpeg ADDED Viewed

images/SohoJoeEth + Ray.jpeg ADDED Viewed

images/SohoJoeEth + Snoop Dogg.jpeg ADDED Viewed

images/SohoJoeEth.jpeg ADDED Viewed

images/Wassie 4498.jpeg ADDED Viewed

images/billie eilish 004.jpeg ADDED Viewed

images/pup1.jpg ADDED Viewed

images/pup2.jpg ADDED Viewed

images/pup3.jpg ADDED Viewed

images/pup4.jpeg ADDED Viewed

images/pup5.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+--extra-index-url https://download.pytorch.org/whl/cu113
+torch
+torchvision
+numpy
+transformers
+# diffusers
+# ftfy
+gradio
+accelerate
+clip-retrieval