Spaces: Running on Zero
Add application file
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +1 -0
- README.md +5 -7
- __init__.py +0 -0
- app.py +244 -0
- batch_generation.py +223 -0
- demo.py +263 -0
- eval_perplexity.py +127 -0
- eval_video_perplexity.py +134 -0
- eval_videos.py +160 -0
- generate_videos.py +168 -0
- inference.py +240 -0
- prompts/.DS_Store +0 -0
- prompts/Composition/Slide1.png +3 -0
- prompts/Composition/Slide10.png +3 -0
- prompts/Composition/Slide11.png +3 -0
- prompts/Composition/Slide12.png +3 -0
- prompts/Composition/Slide13.png +3 -0
- prompts/Composition/Slide14.png +3 -0
- prompts/Composition/Slide15.png +3 -0
- prompts/Composition/Slide2.png +3 -0
- prompts/Composition/Slide3.png +3 -0
- prompts/Composition/Slide4.png +3 -0
- prompts/Composition/Slide5.png +3 -0
- prompts/Composition/Slide6.png +3 -0
- prompts/Composition/Slide7.png +3 -0
- prompts/Composition/Slide8.png +3 -0
- prompts/Composition/Slide9.png +3 -0
- prompts/Depth Estimation/1.png +3 -0
- prompts/Depth Estimation/1_depth.png +3 -0
- prompts/Depth Estimation/2.png +3 -0
- prompts/Depth Estimation/2_depth.png +3 -0
- prompts/Depth Estimation/3.png +3 -0
- prompts/Depth Estimation/3_depth.png +3 -0
- prompts/Depth Estimation/4.png +3 -0
- prompts/Depth Estimation/4_depth.png +3 -0
- prompts/Depth Estimation/5.png +3 -0
- prompts/Depth Estimation/5_depth.png +3 -0
- prompts/Depth Estimation/6.png +3 -0
- prompts/Depth Estimation/6_depth.png +3 -0
- prompts/Depth Estimation/7.png +3 -0
- prompts/Depth Estimation/7_depth.png +3 -0
- prompts/Depth Estimation/8.png +3 -0
- prompts/Eaten Apples/1.png +3 -0
- prompts/Eaten Apples/10.png +3 -0
- prompts/Eaten Apples/2.png +3 -0
- prompts/Eaten Apples/3.png +3 -0
- prompts/Eaten Apples/4.png +3 -0
- prompts/Eaten Apples/5.png +3 -0
- prompts/Eaten Apples/6.png +3 -0
- prompts/Eaten Apples/7.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,11 @@
 ---
-title:
-emoji:
-colorFrom: yellow
-colorTo:
+title: VQLM Demo
+emoji: 🎨
+colorFrom: "yellow"
+colorTo: "blue"
 sdk: gradio
-sdk_version: 4.
+sdk_version: "4.29.0"
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,244 @@
import gradio as gr
import numpy as np
import mlxu
import os
import re
import torch

from io import BytesIO
from natsort import natsorted
from PIL import Image

from inference import LocalInferenceModel

FLAGS, _ = mlxu.define_flags_with_default(
    host='0.0.0.0',
    port=5000,
    dtype='float16',
    checkpoint='Emma02/LVM_ckpts',
    torch_devices='',
    context_frames=16,
)


def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]


def load_example_image_groups(directory):
    example_groups = {}
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)
        if os.path.isdir(subdir_path):
            example_groups[subdir] = []
            images = [f for f in os.listdir(subdir_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            images = natsorted(images, key=natural_sort_key)
            for filename in images:
                img = Image.open(os.path.join(subdir_path, filename))
                example_groups[subdir].append(img)
    return example_groups


def main(_):
    assert FLAGS.checkpoint != ''

    model = LocalInferenceModel(
        checkpoint=FLAGS.checkpoint,
        torch_device=torch.device("cuda"),
        dtype=FLAGS.dtype,
        context_frames=FLAGS.context_frames,
        use_lock=False,
    )

    checkerboard_r1 = np.concatenate([np.zeros((8, 8, 3)), np.ones((8, 8, 3)), np.zeros((8, 8, 3))], axis=1)
    checkerboard_r2 = np.concatenate([np.ones((8, 8, 3)), np.zeros((8, 8, 3)), np.ones((8, 8, 3))], axis=1)
    checkerboard = np.concatenate([checkerboard_r1, checkerboard_r2] * 16, axis=0).astype(np.float32)

    def generate_images(input_images, n_new_frames, n_candidates, temperature=1.0, top_p=0.9):
        assert len(input_images) > 0
        input_images = [
            np.array(img.convert('RGB').resize((256, 256)), dtype=np.float32) / 255.0
            for img in input_images
        ]
        input_images = np.stack(input_images, axis=0)
        output_images = model([input_images], n_new_frames, n_candidates, temperature, top_p)[0]

        generated_images = []
        for candidate in output_images:
            concatenated_image = []
            for i, img in enumerate(candidate):
                concatenated_image.append(img)
                if i < len(candidate) - 1:
                    concatenated_image.append(checkerboard)
            generated_images.append(
                Image.fromarray(
                    (np.concatenate(concatenated_image, axis=1) * 255).astype(np.uint8)
                )
            )

        return generated_images

    with gr.Blocks(css="""
        .small-button {
            padding: 5px 10px;
            min-width: 80px;
        }
        .large-gallery img {
            width: 100%;
            height: auto;
            max-height: 150px;
        }
    """) as demo:
        with gr.Column():
            image_list = gr.State([])
            gr.Markdown('# VQLM Demo')
            gr.Markdown(f'Serving model: {FLAGS.checkpoint}')
            gr.Markdown('## Inputs')
            with gr.Row():
                upload_drag = gr.File(
                    type='binary',
                    file_types=['image'],
                    file_count='multiple',
                )
                with gr.Column():
                    gen_length_slider = gr.Slider(
                        label='Generation length',
                        minimum=1,
                        maximum=32,
                        value=1,
                        step=1,
                        interactive=True,
                    )
                    n_candidates_slider = gr.Slider(
                        label='Number of candidates',
                        minimum=1,
                        maximum=10,
                        value=1,
                        step=1,
                        interactive=True,
                    )
                    temp_slider = gr.Slider(
                        label='Temperature',
                        minimum=0,
                        maximum=2.0,
                        value=1.0,
                        interactive=True,
                    )
                    top_p_slider = gr.Slider(
                        label='Top p',
                        minimum=0,
                        maximum=1.0,
                        value=0.9,
                        interactive=True,
                    )
                    clear_btn = gr.Button(
                        value='Clear',
                        elem_classes=['small-button'],
                    )
                    generate_btn = gr.Button(
                        value='Generate',
                        interactive=False,
                        elem_classes=['small-button'],
                    )
            input_gallery = gr.Gallery(
                columns=7,
                rows=1,
                object_fit='scale-down',
                label="Input image sequence"
            )
            gr.Markdown('## Outputs')
            output_gallery = gr.Gallery(
                columns=4,
                object_fit='scale-down',
                label="Output image"
            )

        def upload_image_fn(files, images):
            for file in files:
                images.append(Image.open(BytesIO(file)))

            return {
                upload_drag: None,
                image_list: images,
                input_gallery: images,
                generate_btn: gr.update(interactive=True),
            }

        def clear_fn():
            return {
                image_list: [],
                input_gallery: [],
                generate_btn: gr.update(interactive=False),
                output_gallery: [],
            }

        def disable_generate_btn():
            return {
                generate_btn: gr.update(interactive=False),
            }

        def generate_fn(images, n_candidates, gen_length, temperature, top_p):
            new_images = generate_images(
                images,
                gen_length,
                n_candidates=n_candidates,
                temperature=temperature,
                top_p=top_p,
            )
            return {
                output_gallery: new_images,
                generate_btn: gr.update(interactive=True),
            }

        upload_drag.upload(
            upload_image_fn,
            inputs=[upload_drag, image_list],
            outputs=[upload_drag, image_list, input_gallery, generate_btn],
        )
        clear_btn.click(
            clear_fn,
            inputs=None,
            outputs=[image_list, input_gallery, generate_btn, output_gallery],
        )
        generate_btn.click(
            disable_generate_btn,
            inputs=None,
            outputs=[generate_btn],
        ).then(
            generate_fn,
            inputs=[image_list, n_candidates_slider, gen_length_slider, temp_slider, top_p_slider],
            outputs=[output_gallery, generate_btn],
        )

        example_groups = load_example_image_groups('prompts')

        def add_image_group_fn(group_name, images):
            new_images = images + example_groups[group_name]
            return {
                image_list: new_images,
                input_gallery: new_images,
                generate_btn: gr.update(interactive=True),
            }

        for group_name, group_images in example_groups.items():
            with gr.Row():
                with gr.Column(scale=3):
                    add_button = gr.Button(value=f'Add {group_name}', elem_classes=['small-button'])
                with gr.Column(scale=7):
                    group_gallery = gr.Gallery(
                        value=[Image.fromarray(np.array(img)) for img in group_images],
                        columns=5,
                        rows=1,
                        object_fit='scale-down',
                        label=group_name,
                        elem_classes=['large-gallery'],
                    )

            add_button.click(
                add_image_group_fn,
                inputs=[gr.State(group_name), image_list],
                outputs=[image_list, input_gallery, generate_btn],
            )

    demo.launch()


if __name__ == "__main__":
    mlxu.run(main)
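For readers who want to exercise the model outside the Gradio UI, the sketch below drives LocalInferenceModel directly with the same preprocessing that generate_images applies above (RGB, 256x256, values scaled to [0, 1]). It is a minimal sketch and not part of the committed files: the checkpoint string mirrors the FLAGS default, and the two prompt paths are simply examples taken from the prompts/ folder.

# Sketch only (not part of the Space): call LocalInferenceModel directly,
# reusing the preprocessing from app.py's generate_images.
import numpy as np
import torch
from PIL import Image

from inference import LocalInferenceModel

model = LocalInferenceModel(
    checkpoint='Emma02/LVM_ckpts',      # same default as FLAGS.checkpoint above
    torch_device=torch.device('cuda'),
    dtype='float16',
    context_frames=16,
    use_lock=False,
)

# Example prompt frames from the prompts/ folder; any ordered image sequence works.
paths = ['prompts/Eaten Apples/1.png', 'prompts/Eaten Apples/2.png']
frames = np.stack([
    np.array(Image.open(p).convert('RGB').resize((256, 256)), dtype=np.float32) / 255.0
    for p in paths
], axis=0)

# One input sequence, two new frames, one candidate continuation.
candidates = model([frames], n_new_frames=2, n_candidates=1, temperature=1.0, top_p=0.9)
new_frames = candidates[0][0]           # shape (2, 256, 256, 3), values in [0, 1]
print(new_frames.shape)

The return value is indexed as [sequence][candidate], each entry being an array of the newly generated frames.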
batch_generation.py
ADDED
@@ -0,0 +1,223 @@
"""
Batch generation for sequences of images. This script accepts a jsonl file
as input. Each line of the jsonl file represents a dictionary describing one
example in the evaluation set. The dictionary should have two keys:

    input: a list of paths to the input images used as context for the model.
    output: a string giving the path where the generated output should be saved.

This script runs the model to generate the output images, concatenates the
input and output images together, and saves them to the output path.
"""

import os
import json
from PIL import Image
import numpy as np
import mlxu
from tqdm import tqdm, trange
from multiprocessing import Pool
import einops
import torch

from .inference import MultiProcessInferenceModel
from .utils import read_image_to_tensor, MultiProcessImageSaver


FLAGS, _ = mlxu.define_flags_with_default(
    input_file='',
    checkpoint='',
    input_base_dir='',
    output_base_dir='',
    evaluate_mse=False,
    json_input_key='input',
    json_output_key='output',
    json_target_key='target',
    n_new_frames=1,
    n_candidates=2,
    context_frames=16,
    temperature=1.0,
    top_p=1.0,
    n_workers=8,
    dtype='float16',
    torch_devices='',
    batch_size_factor=4,
    max_examples=0,
    resize_output='',
    include_input=False,
)


# Dataset built from the records of the input jsonl file.
class MultiFrameDataset(torch.utils.data.Dataset):
    def __init__(self, input_files, output_files, target_files=None):
        assert len(input_files)
        self.input_files = input_files
        self.output_files = output_files
        self.target_files = target_files

    def __len__(self):
        return len(self.input_files)

    def __getitem__(self, idx):
        original_size = Image.open(self.input_files[idx][-1]).size
        input_images = np.stack(
            [read_image_to_tensor(f) for f in self.input_files[idx]],
            axis=0
        )

        if self.target_files is not None:
            target_images = np.stack(
                [read_image_to_tensor(f) for f in self.target_files[idx]],
                axis=0
            )
        else:
            target_images = None
        return input_images, target_images, self.output_files[idx], np.array(original_size)


def main(_):
    assert FLAGS.checkpoint != ''

    print(f'Loading checkpoint from {FLAGS.checkpoint}')
    print(f'Evaluating input file from {FLAGS.input_file}')

    # Build the model.
    model = MultiProcessInferenceModel(
        checkpoint=FLAGS.checkpoint,
        torch_devices=FLAGS.torch_devices,
        dtype=FLAGS.dtype,
        context_frames=FLAGS.context_frames,
        use_lock=True,
    )

    # Paths read from the input jsonl file.
    input_files = []
    output_files = []

    if FLAGS.evaluate_mse:
        target_files = []
    else:
        target_files = None

    with mlxu.open_file(FLAGS.input_file, 'r') as f:
        for line in f:
            record = json.loads(line)
            input_files.append(record[FLAGS.json_input_key])
            output_files.append(record[FLAGS.json_output_key])
            if FLAGS.evaluate_mse:
                target_files.append(record[FLAGS.json_target_key])

    if FLAGS.max_examples > 0:
        input_files = input_files[:FLAGS.max_examples]
        output_files = output_files[:FLAGS.max_examples]
        if FLAGS.evaluate_mse:
            target_files = target_files[:FLAGS.max_examples]

    if FLAGS.input_base_dir != '':
        input_files = [
            [os.path.join(FLAGS.input_base_dir, x) for x in y]
            for y in input_files
        ]
        if FLAGS.evaluate_mse:
            target_files = [
                [os.path.join(FLAGS.input_base_dir, x) for x in y]
                for y in target_files
            ]

    if FLAGS.output_base_dir != '':
        os.makedirs(FLAGS.output_base_dir, exist_ok=True)
        output_files = [
            os.path.join(FLAGS.output_base_dir, x)
            for x in output_files
        ]

    dataset = MultiFrameDataset(input_files, output_files, target_files)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=FLAGS.batch_size_factor * model.n_processes,
        shuffle=False,
        num_workers=FLAGS.n_workers,
    )

    image_saver = MultiProcessImageSaver(FLAGS.n_workers)

    mses = []

    for batch_images, batch_targets, batch_output_files, batch_sizes in tqdm(data_loader, ncols=0):

        # batch_images is the input context.
        batch_images = batch_images.numpy()

        context_length = batch_images.shape[1]

        generated_images = model(
            batch_images,
            FLAGS.n_new_frames,
            FLAGS.n_candidates,
            temperature=FLAGS.temperature,
            top_p=FLAGS.top_p
        )

        repeated_batch = einops.repeat(
            batch_images,
            'b s h w c -> b n s h w c',
            n=FLAGS.n_candidates,
        )
        generated_images = np.array(generated_images)

        if FLAGS.evaluate_mse:
            batch_targets = einops.repeat(
                batch_targets.numpy(),
                'b s h w c -> b n s h w c',  # batch, candidate, sequence, ...
                n=FLAGS.n_candidates,
            )
            channels = batch_targets.shape[-1]
            # Compute the MSE loss against the targets.
            mse = np.mean((generated_images - batch_targets) ** 2, axis=(1, 2, 3, 4, 5))

            mses.append(mse * channels)

        if FLAGS.include_input:
            combined = einops.rearrange(
                np.concatenate([repeated_batch, generated_images], axis=2),
                'b n s h w c -> b (n h) (s w) c'
            )
        else:
            combined = einops.rearrange(
                generated_images,
                'b n s h w c -> b (n h) (s w) c'
            )
        combined = (combined * 255).astype(np.uint8)

        n_frames = FLAGS.n_new_frames
        if FLAGS.include_input:
            n_frames += context_length

        if FLAGS.resize_output == '':
            resizes = None
        elif FLAGS.resize_output == 'original':
            resizes = batch_sizes.numpy()
            resizes = resizes * np.array([[n_frames, FLAGS.n_candidates]])
        else:
            resize = tuple(int(x) for x in FLAGS.resize_output.split(','))
            resizes = np.array([resize] * len(batch_sizes))
            resizes = resizes * np.array([[n_frames, FLAGS.n_candidates]])

        image_saver(combined, batch_output_files, resizes)

    if FLAGS.evaluate_mse:
        mses = np.concatenate(mses, axis=0)
        print(f'MSE: {np.mean(mses)}')

    image_saver.close()


if __name__ == "__main__":
    mlxu.run(main)
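To make the jsonl format described in the docstring concrete, the sketch below writes one hypothetical record in the shape this script expects. The paths are placeholders, and the target key is only read when --evaluate_mse is set.

# Hypothetical example of the jsonl input expected by batch_generation.py.
import json

record = {
    'input': ['task_a/example1_in.png', 'task_a/example2_in.png'],   # context frames
    'output': 'task_a/example_generated.png',                        # where the result is saved
    'target': ['task_a/example_target.png'],                         # only used with --evaluate_mse
}
with open('eval_set.jsonl', 'w') as f:
    f.write(json.dumps(record) + '\n')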
demo.py
ADDED
@@ -0,0 +1,263 @@
import re
from natsort import natsorted

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]

def load_example_image_groups(directory):
    example_groups = {}
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)
        if os.path.isdir(subdir_path):
            example_groups[subdir] = []
            images = [f for f in os.listdir(subdir_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            images = natsorted(images, key=natural_sort_key)  # Natural sorting
            for filename in images:
                img = Image.open(os.path.join(subdir_path, filename))
                example_groups[subdir].append(img)
    return example_groups


from io import BytesIO
import gradio as gr
import uvicorn
from fastapi import FastAPI
from PIL import Image
import numpy as np
import mlxu
import os
import re
from natsort import natsorted

from .inference import MultiProcessInferenceModel

FLAGS, _ = mlxu.define_flags_with_default(
    host='0.0.0.0',
    port=5007,
    dtype='float16',
    checkpoint='',
    torch_devices='',
    context_frames=16,
)

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]

def load_example_image_groups(directory):
    example_groups = {}
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)
        if os.path.isdir(subdir_path):
            example_groups[subdir] = []
            images = [f for f in os.listdir(subdir_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            images = natsorted(images, key=natural_sort_key)  # Natural sorting
            for filename in images:
                img = Image.open(os.path.join(subdir_path, filename))
                example_groups[subdir].append(img)
    return example_groups

def main(_):
    assert FLAGS.checkpoint != ''

    model = MultiProcessInferenceModel(
        checkpoint=FLAGS.checkpoint,
        torch_devices=FLAGS.torch_devices,
        dtype=FLAGS.dtype,
        context_frames=FLAGS.context_frames,
        use_lock=True,
    )

    checkerboard_r1 = np.concatenate([np.zeros((8, 8, 3)), np.ones((8, 8, 3)), np.zeros((8, 8, 3))], axis=1)
    checkerboard_r2 = np.concatenate([np.ones((8, 8, 3)), np.zeros((8, 8, 3)), np.ones((8, 8, 3))], axis=1)
    checkerboard = np.concatenate([checkerboard_r1, checkerboard_r2] * 16, axis=0).astype(np.float32)

    def generate_images(input_images, n_new_frames, n_candidates, temperature=1.0, top_p=0.9):
        assert len(input_images) > 0
        input_images = [
            np.array(img.convert('RGB').resize((256, 256)), dtype=np.float32) / 255.0
            for img in input_images
        ]
        input_images = np.stack(input_images, axis=0)
        output_images = model([input_images], n_new_frames, n_candidates, temperature, top_p)[0]

        generated_images = []
        for candidate in output_images:
            concatenated_image = []
            for i, img in enumerate(candidate):
                concatenated_image.append(img)
                if i < len(candidate) - 1:
                    concatenated_image.append(checkerboard)
            generated_images.append(
                Image.fromarray(
                    (np.concatenate(concatenated_image, axis=1) * 255).astype(np.uint8)
                )
            )

        return generated_images

    with gr.Blocks(css="""
        .small-button {
            padding: 5px 10px;
            min-width: 80px;
        }
        .large-gallery img {
            width: 100%;
            height: auto;
            max-height: 150px;
        }
    """) as demo:
        with gr.Column():
            image_list = gr.State([])
            gr.Markdown('# LVM Demo')
            gr.Markdown(f'Serving model: {FLAGS.checkpoint}')
            gr.Markdown('## Inputs')
            with gr.Row():
                upload_drag = gr.File(
                    type='binary',
                    file_types=['image'],
                    file_count='multiple',
                )
                with gr.Column():
                    gen_length_slider = gr.Slider(
                        label='Generation length',
                        minimum=1,
                        maximum=32,
                        value=1,
                        step=1,
                        interactive=True,
                    )
                    n_candidates_slider = gr.Slider(
                        label='Number of candidates',
                        minimum=1,
                        maximum=10,
                        value=1,
                        step=1,
                        interactive=True,
                    )
                    temp_slider = gr.Slider(
                        label='Temperature',
                        minimum=0,
                        maximum=2.0,
                        value=1.0,
                        interactive=True,
                    )
                    top_p_slider = gr.Slider(
                        label='Top p',
                        minimum=0,
                        maximum=1.0,
                        value=0.9,
                        interactive=True,
                    )
                    clear_btn = gr.Button(
                        value='Clear',
                        elem_classes=['small-button'],
                    )
                    generate_btn = gr.Button(
                        value='Generate',
                        interactive=False,
                        elem_classes=['small-button'],
                    )
            input_gallery = gr.Gallery(
                columns=7,
                rows=1,
                object_fit='scale-down',
            )
            gr.Markdown('## Outputs')
            output_gallery = gr.Gallery(
                columns=4,
                object_fit='scale-down',
            )

        def upload_image_fn(files, images):
            for file in files:
                images.append(Image.open(BytesIO(file)))

            return {
                upload_drag: None,
                image_list: images,
                input_gallery: images,
                generate_btn: gr.update(interactive=True),
            }

        def clear_fn():
            return {
                image_list: [],
                input_gallery: [],
                generate_btn: gr.update(interactive=False),
                output_gallery: [],
            }

        def disable_generate_btn():
            return {
                generate_btn: gr.update(interactive=False),
            }

        def generate_fn(images, n_candidates, gen_length, temperature, top_p):
            new_images = generate_images(
                images,
                gen_length,
                n_candidates=n_candidates,
                temperature=temperature,
                top_p=top_p,
            )
            return {
                output_gallery: new_images,
                generate_btn: gr.update(interactive=True),
            }

        upload_drag.upload(
            upload_image_fn,
            inputs=[upload_drag, image_list],
            outputs=[upload_drag, image_list, input_gallery, generate_btn],
        )
        clear_btn.click(
            clear_fn,
            inputs=None,
            outputs=[image_list, input_gallery, generate_btn, output_gallery],
        )
        generate_btn.click(
            disable_generate_btn,
            inputs=None,
            outputs=[generate_btn],
        ).then(
            generate_fn,
            inputs=[image_list, n_candidates_slider, gen_length_slider, temp_slider, top_p_slider],
            outputs=[output_gallery, generate_btn],
        )

        example_groups = load_example_image_groups('/home/yutongbai/demo_images')

        def add_image_group_fn(group_name, images):
            new_images = images + example_groups[group_name]
            return {
                image_list: new_images,
                input_gallery: new_images,
                generate_btn: gr.update(interactive=True),
            }

        for group_name, group_images in example_groups.items():
            with gr.Row():
                with gr.Column(scale=3):
                    add_button = gr.Button(value=f'Add {group_name}', elem_classes=['small-button'])
                with gr.Column(scale=7):
                    group_gallery = gr.Gallery(
                        value=[Image.fromarray(np.array(img)) for img in group_images],
                        columns=5,
                        rows=1,
                        object_fit='scale-down',
                        label=group_name,
                        elem_classes=['large-gallery'],
                    )

            add_button.click(
                add_image_group_fn,
                inputs=[gr.State(group_name), image_list],
                outputs=[image_list, input_gallery, generate_btn],
            )

    app = FastAPI()
    app = gr.mount_gradio_app(app, demo, '/')
    uvicorn.run(app, host=FLAGS.host, port=FLAGS.port)


if __name__ == "__main__":
    mlxu.run(main)
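demo.py differs from app.py mainly in how it serves the UI: instead of calling demo.launch(), it mounts the Blocks app onto a FastAPI application and runs it under uvicorn. The sketch below strips that pattern down to its essentials, with a placeholder UI and a hypothetical /health endpoint; it assumes the same gradio, fastapi, and uvicorn stack.

# Minimal sketch of the FastAPI + Gradio mounting pattern used by demo.py.
import gradio as gr
import uvicorn
from fastapi import FastAPI

with gr.Blocks() as ui:
    gr.Markdown('# Placeholder UI')

app = FastAPI()

@app.get('/health')                       # hypothetical extra endpoint beside the UI
def health():
    return {'status': 'ok'}

app = gr.mount_gradio_app(app, ui, '/')   # serve the Blocks UI at the root path

if __name__ == '__main__':
    uvicorn.run(app, host='0.0.0.0', port=5007)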
eval_perplexity.py
ADDED
@@ -0,0 +1,127 @@
"""
Evaluating the perplexity on few-shot tasks. This script accepts a jsonl file
as input. Each line of the jsonl file represents a dictionary describing one
example in the evaluation set. The dictionary should have two keys:

    input: a list of paths to the input images used as context for the model.
        This list should include the few-shot examples.
    target: a list of paths to the target images to evaluate perplexity on.

This script runs the model and computes the average perplexity on the
evaluation set.
"""

import os
import json
from PIL import Image
import numpy as np
import mlxu
from tqdm import tqdm, trange
import torch
import torch.nn as nn
import torch.nn.functional as F
import einops

from .inference import MultiProcessInferenceModel


FLAGS, _ = mlxu.define_flags_with_default(
    input_file='',
    checkpoint='',
    input_base_dir='',
    batch_size=2,
    json_input_key='input',
    json_target_key='target',
    dtype='float16',
    torch_devices='',
    n_workers=4,
    max_examples=0,
)


def read_image_to_tensor(path):
    pil_im = Image.open(path).convert('RGB')
    input_img = pil_im.resize((256, 256))
    input_img = np.array(input_img) / 255.0
    input_img = input_img.astype(np.float32)
    return input_img


class MultiFrameDataset(torch.utils.data.Dataset):
    def __init__(self, input_files, target_files):
        assert len(input_files) == len(target_files)
        self.input_files = input_files
        self.target_files = target_files

    def __len__(self):
        return len(self.input_files)

    def __getitem__(self, idx):
        input_list = np.stack(
            [read_image_to_tensor(f) for f in self.input_files[idx]],
            axis=0
        )
        target_list = np.stack(
            [read_image_to_tensor(f) for f in self.target_files[idx]],
            axis=0
        )
        return input_list, target_list


def main(_):
    assert FLAGS.checkpoint != ''

    print(f'Loading checkpoint from {FLAGS.checkpoint}')
    print(f'Evaluating input file from {FLAGS.input_file}')

    model = MultiProcessInferenceModel(
        checkpoint=FLAGS.checkpoint,
        torch_devices=FLAGS.torch_devices,
        dtype=FLAGS.dtype,
        use_lock=True,
        perplexity_batch_size=FLAGS.batch_size,
    )

    input_files = []
    target_files = []

    with mlxu.open_file(FLAGS.input_file, 'r') as f:
        for line in f:
            record = json.loads(line)
            input_files.append(record[FLAGS.json_input_key])
            target_files.append(record[FLAGS.json_target_key])

    if FLAGS.input_base_dir != '':
        input_files = [
            [os.path.join(FLAGS.input_base_dir, x) for x in y]
            for y in input_files
        ]
        target_files = [
            [os.path.join(FLAGS.input_base_dir, x) for x in y]
            for y in target_files
        ]

    if FLAGS.max_examples > 0:
        input_files = input_files[:FLAGS.max_examples]
        target_files = target_files[:FLAGS.max_examples]

    dataset = MultiFrameDataset(input_files, target_files)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=FLAGS.batch_size * model.n_processes,
        shuffle=False,
        num_workers=FLAGS.n_workers
    )

    perplexities = []

    for input_images, target_images in tqdm(data_loader, ncols=0):
        perplexity = model.compute_perplexity(input_images, target_images)
        perplexities.append(perplexity)

    perplexities = np.concatenate(perplexities, axis=0)
    print(f'Perplexity: {np.mean(perplexities)}')


if __name__ == "__main__":
    mlxu.run(main)
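A concrete (hypothetical) record for this script is shown below. Unlike batch_generation.py there is no output path, and target is required; the input list carries the few-shot context images followed by the query, and the target list carries the images whose likelihood is scored. All paths are placeholders.

# Hypothetical record for eval_perplexity.py.
import json

record = {
    'input': ['shot1_in.png', 'shot1_out.png', 'shot2_in.png', 'shot2_out.png', 'query_in.png'],
    'target': ['query_out.png'],
}
with open('perplexity_eval.jsonl', 'w') as f:
    f.write(json.dumps(record) + '\n')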
eval_video_perplexity.py
ADDED
@@ -0,0 +1,134 @@
import os
import glob
from functools import partial
from tqdm import tqdm, trange
from multiprocessing import Pool
from PIL import Image
import cv2
import mlxu
from natsort import natsorted
import numpy as np
import einops
import torch

from vqlm_demo.inference import MultiProcessInferenceModel
from vqlm_demo.utils import (
    is_video, random_square_crop,
    read_frames_from_dir, read_frames_from_video
)


FLAGS, _ = mlxu.define_flags_with_default(
    checkpoint='',
    input_files='',
    frame_input=False,
    read_file_list='',
    center_crop=1.0,
    n_context_frames=15,
    n_target_frames=1,
    n_workers=8,
    stride=8,
    batch_size=2,
    torch_devices='',
    shuffle=False,
    random_start=True,
    max_examples=0,
)


class VideoDataset(torch.utils.data.Dataset):

    def __init__(self, videos, frame_input=False, n_context_frames=15,
                 n_target_frames=1, stride=1):
        self.videos = videos
        self.frame_input = frame_input
        self.n_context_frames = n_context_frames
        self.n_target_frames = n_target_frames
        self.stride = stride

    def __getitem__(self, index):
        if self.frame_input:
            frames = read_frames_from_dir(
                self.videos[index],
                self.n_context_frames + self.n_target_frames,
                self.stride,
                center_crop=FLAGS.center_crop,
                random_start=FLAGS.random_start,
            )
        else:
            frames = read_frames_from_video(
                self.videos[index],
                self.n_context_frames + self.n_target_frames,
                self.stride,
                center_crop=FLAGS.center_crop,
                random_start=FLAGS.random_start,
            )
        if frames is None:
            return self[np.random.randint(0, len(self))]
        return frames[:self.n_context_frames], frames[self.n_context_frames:]

    def __len__(self):
        return len(self.videos)


def main(_):
    assert FLAGS.checkpoint != ''
    assert FLAGS.read_file_list != '' or FLAGS.input_files != ''

    model = MultiProcessInferenceModel(
        checkpoint=FLAGS.checkpoint,
        torch_devices=FLAGS.torch_devices,
        perplexity_batch_size=FLAGS.batch_size,
    )

    if FLAGS.read_file_list != '':
        with open(FLAGS.read_file_list, 'r') as f:
            videos = [x.strip() for x in f.readlines()]
    else:
        videos = glob.glob(FLAGS.input_files)

    if FLAGS.frame_input:
        videos = [x for x in videos if os.path.isdir(x)]
    else:
        videos = [x for x in videos if is_video(x)]

    if FLAGS.shuffle:
        np.random.shuffle(videos)

    if FLAGS.max_examples > 0:
        videos = videos[:FLAGS.max_examples]

    dataset = VideoDataset(
        videos,
        frame_input=FLAGS.frame_input,
        n_context_frames=FLAGS.n_context_frames,
        n_target_frames=FLAGS.n_target_frames,
        stride=FLAGS.stride
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=FLAGS.batch_size * model.n_processes * 4,
        shuffle=False,
        num_workers=FLAGS.n_workers,
        prefetch_factor=4,
        drop_last=True,
    )

    perplexities = []

    for batch_context_frames, batch_target_frames in tqdm(dataloader, ncols=0):
        batch_context_frames = batch_context_frames.numpy()
        batch_target_frames = batch_target_frames.numpy()
        perplexity = model.compute_perplexity(
            batch_context_frames, batch_target_frames
        )
        perplexities.append(perplexity)

    perplexities = np.concatenate(perplexities, axis=0)
    print(f'Perplexity: {np.mean(perplexities)}')


if __name__ == '__main__':
    mlxu.run(main)
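This script, like eval_videos.py and generate_videos.py below, takes its videos either from a glob pattern via --input_files or from a plain text file via --read_file_list with one video path per line. A small sketch for producing such a list; the directory and extension are assumptions:

# Sketch: build a file list usable with --read_file_list (one video path per line).
import glob

paths = sorted(glob.glob('videos/**/*.mp4', recursive=True))   # hypothetical location
with open('video_list.txt', 'w') as f:
    f.write('\n'.join(paths) + '\n')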
eval_videos.py
ADDED
@@ -0,0 +1,160 @@
import os
import glob
from functools import partial
from tqdm import tqdm, trange
from multiprocessing import Pool
from PIL import Image
import cv2
import mlxu
from natsort import natsorted
import numpy as np
import einops
import torch

from vqlm_demo.inference import MultiProcessInferenceModel
from vqlm_demo.utils import (
    is_video, random_square_crop,
    read_frames_from_dir, read_frames_from_video
)


FLAGS, _ = mlxu.define_flags_with_default(
    checkpoint='',
    input_files='',
    frame_input=False,
    read_file_list='',
    output_dir='',
    center_crop=1.0,
    n_context_frames=12,
    n_new_frames=4,
    n_candidates=8,
    temperature=1.0,
    top_p=1.0,
    n_workers=8,
    stride=8,
    batch_size=32,
    torch_devices='',
    shuffle=False,
    max_examples=0,
)


def save_image(args):
    image, filename = args
    base = FLAGS.input_files.split('*')[0]
    filename = filename[len(base):].replace('/', '_') + '.png'
    Image.fromarray(image).save(os.path.join(FLAGS.output_dir, filename))


class VideoDataset(torch.utils.data.Dataset):

    def __init__(self, videos, frame_input=False, n_frames=8, stride=1, new_frames=1):
        self.videos = videos
        self.frame_input = frame_input
        self.n_frames = n_frames
        self.stride = stride
        self.new_frames = new_frames

    def __getitem__(self, index):
        if self.frame_input:
            frames = read_frames_from_dir(
                self.videos[index], self.n_frames, self.stride,
                center_crop=FLAGS.center_crop,
            )
        else:
            # frames has layout 's h w c'
            frames = read_frames_from_video(
                self.videos[index], self.n_frames, self.stride,
                center_crop=FLAGS.center_crop,
            )

        if frames is None:
            return self[np.random.randint(0, len(self))]

        # The last new_frames frames are the generation targets.
        target_frames = frames[self.n_frames - self.new_frames:self.n_frames, :, :, :]

        return frames, target_frames, self.videos[index]

    def __len__(self):
        return len(self.videos)


def main(_):
    assert FLAGS.checkpoint != '' and FLAGS.output_dir != ''
    assert FLAGS.read_file_list != '' or FLAGS.input_files != ''
    os.makedirs(FLAGS.output_dir, exist_ok=True)

    if FLAGS.read_file_list != '':
        with open(FLAGS.read_file_list, 'r') as f:
            videos = [x.strip() for x in f.readlines()]
    else:
        videos = glob.glob(FLAGS.input_files)

    if FLAGS.frame_input:
        videos = [x for x in videos if os.path.isdir(x)]
    else:
        videos = [x for x in videos if is_video(x)]

    if FLAGS.shuffle:
        np.random.shuffle(videos)

    if FLAGS.max_examples > 0:
        videos = videos[:FLAGS.max_examples]

    dataset = VideoDataset(
        videos,
        frame_input=FLAGS.frame_input,
        n_frames=FLAGS.n_context_frames,
        stride=FLAGS.stride
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=FLAGS.batch_size,
        shuffle=False,
        num_workers=FLAGS.n_workers,
        prefetch_factor=4,
        drop_last=True,
    )

    if FLAGS.torch_devices == '':
        torch_devices = None
    else:
        torch_devices = [f'cuda:{x}' for x in FLAGS.torch_devices.split(',')]

    model = MultiProcessInferenceModel(
        checkpoint=FLAGS.checkpoint, torch_devices=torch_devices,
    )

    save_img_pool = Pool(FLAGS.n_workers)

    for batch, batch_targets, filenames in tqdm(dataloader, ncols=0):

        batch = batch.numpy()  # layout 'b s h w c'

        generated = model(
            batch,
            n_new_frames=FLAGS.n_new_frames,
            n_candidates=FLAGS.n_candidates,
            temperature=FLAGS.temperature,
            top_p=FLAGS.top_p,
        )

        generated = np.array(generated)

        batch_targets = einops.repeat(
            batch_targets.numpy(),
            'b s h w c -> b n s h w c',  # batch, candidate, sequence, h, w, c
            n=FLAGS.n_candidates,
        )


if __name__ == '__main__':
    mlxu.run(main)
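As committed, the loop above stops after aligning generated with the repeated batch_targets and computes no metric. If one is wanted at that point, one option is the per-example MSE that batch_generation.py already uses; the sketch below restates it under the assumption that both arrays share the 'b n s h w c' layout and the [0, 1] value range.

# Sketch only: per-example MSE across candidates and frames,
# mirroring the reduction used in batch_generation.py.
import numpy as np

def per_example_mse(generated, targets):
    # generated, targets: float arrays of shape (b, n, s, h, w, c) in [0, 1]
    assert generated.shape == targets.shape
    return np.mean((generated - targets) ** 2, axis=(1, 2, 3, 4, 5))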
generate_videos.py
ADDED
@@ -0,0 +1,168 @@
import os
import glob
from functools import partial
from tqdm import tqdm, trange
from multiprocessing import Pool
from PIL import Image
import cv2
import mlxu
from natsort import natsorted
import numpy as np
import einops
import torch

from vqlm_demo.inference import MultiProcessInferenceModel
from vqlm_demo.utils import (
    is_video, random_square_crop,
    read_frames_from_dir, read_frames_from_video
)


FLAGS, _ = mlxu.define_flags_with_default(
    checkpoint='',
    input_files='',
    frame_input=False,
    read_file_list='',
    output_dir='',
    center_crop=1.0,
    n_context_frames=12,
    n_new_frames=4,
    n_candidates=8,
    temperature=1.0,
    top_p=1.0,
    n_workers=8,
    stride=8,
    batch_size=32,
    torch_devices='',
    shuffle=False,
    max_examples=0,
)


def save_image(args):
    image, filename = args
    base = FLAGS.input_files.split('*')[0]
    filename = filename[len(base):].replace('/', '_') + '.png'
    Image.fromarray(image).save(os.path.join(FLAGS.output_dir, filename))


class VideoDataset(torch.utils.data.Dataset):

    def __init__(self, videos, frame_input=False, n_frames=8, stride=1):
        self.videos = videos
        self.frame_input = frame_input
        self.n_frames = n_frames
        self.stride = stride

    def __getitem__(self, index):
        if self.frame_input:
            frames = read_frames_from_dir(
                self.videos[index], self.n_frames, self.stride,
                center_crop=FLAGS.center_crop,
            )
        else:
            frames = read_frames_from_video(
                self.videos[index], self.n_frames, self.stride,
                center_crop=FLAGS.center_crop,
            )
        if frames is None:
            return self[np.random.randint(0, len(self))]
        return frames, self.videos[index]

    def __len__(self):
        return len(self.videos)


def main(_):
    assert FLAGS.checkpoint != '' and FLAGS.output_dir != ''
    assert FLAGS.read_file_list != '' or FLAGS.input_files != ''
    os.makedirs(FLAGS.output_dir, exist_ok=True)

    if FLAGS.read_file_list != '':
        with open(FLAGS.read_file_list, 'r') as f:
            videos = [x.strip() for x in f.readlines()]
    else:
        videos = glob.glob(FLAGS.input_files)

    if FLAGS.frame_input:
        videos = [x for x in videos if os.path.isdir(x)]
    else:
        videos = [x for x in videos if is_video(x)]

    if FLAGS.shuffle:
        np.random.shuffle(videos)

    if FLAGS.max_examples > 0:
        videos = videos[:FLAGS.max_examples]

    dataset = VideoDataset(
        videos,
        frame_input=FLAGS.frame_input,
        n_frames=FLAGS.n_context_frames,
        stride=FLAGS.stride
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=FLAGS.batch_size,
        shuffle=False,
        num_workers=FLAGS.n_workers,
        prefetch_factor=4,
        drop_last=True,
    )

    if FLAGS.torch_devices == '':
        torch_devices = None
    else:
        torch_devices = [f'cuda:{x}' for x in FLAGS.torch_devices.split(',')]

    model = MultiProcessInferenceModel(
        checkpoint=FLAGS.checkpoint, torch_devices=torch_devices,
    )

    save_img_pool = Pool(FLAGS.n_workers)

    for batch, filenames in tqdm(dataloader, ncols=0):

        batch = batch.numpy()

        generated = model(
            batch,
            n_new_frames=FLAGS.n_new_frames,
            n_candidates=FLAGS.n_candidates,
            temperature=FLAGS.temperature,
            top_p=FLAGS.top_p,
        )

        generated = np.array(generated)

        output_batch = einops.repeat(
            batch,
            'b s h w c -> b n s h w c',
            n=FLAGS.n_candidates,
        )

        combined = einops.rearrange(
            np.concatenate([output_batch, generated], axis=2),
            'b n s h w c -> b (n h) (s w) c'
        )

        combined = (np.clip(combined, 0, 1) * 255).astype(np.uint8)
        save_img_pool.imap(save_image, zip(combined, filenames))


if __name__ == '__main__':
    mlxu.run(main)
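The rearrange pattern 'b n s h w c -> b (n h) (s w) c' used above tiles each sample into a grid with one candidate per row and one frame per column. A small self-contained check of that layout on dummy data:

# Demonstrates the grid layout produced by the rearrange in generate_videos.py.
import numpy as np
import einops

b, n, s, h, w, c = 2, 3, 4, 8, 8, 3      # batch, candidates, frames, height, width, channels
frames = np.random.rand(b, n, s, h, w, c)

grid = einops.rearrange(frames, 'b n s h w c -> b (n h) (s w) c')
print(grid.shape)                         # (2, 24, 32, 3): rows = candidates, columns = frames

# Row block i, column block j of the grid is candidate i, frame j:
assert np.allclose(grid[0, 0:h, w:2 * w], frames[0, 0, 1])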
inference.py
ADDED
@@ -0,0 +1,240 @@
from abc import ABC, abstractmethod
from contextlib import nullcontext
import time
import os
from functools import partial
from copy import deepcopy
from multiprocessing import Pool
from threading import Lock
from PIL import Image
import numpy as np
import torch
import torch.nn.functional as F
import einops
from transformers import LlamaForCausalLM
import spaces

from vqvae_muse import VQGANModel, get_tokenizer_muse
from torch_vqvae_model import get_tokenizer


def get_torch_float_dtype(dtype):
    if dtype in (torch.float16, torch.bfloat16, torch.float32):
        return dtype
    return {
        'float16': torch.float16,
        'fp16': torch.float16,
        'f16': torch.float16,
        'bfloat16': torch.bfloat16,
        'bf16': torch.bfloat16,
        'float32': torch.float32,
        'fp32': torch.float32,
        'f32': torch.float32,
    }[dtype]


def get_pid():
    time.sleep(1)
    return os.getpid()


class InferenceModel(ABC):

    @abstractmethod
    def __call__(self, input_images, n_new_frames, n_candidates, temperature=1.0, top_p=1.0):
        raise NotImplementedError()


class LocalInferenceModel(InferenceModel):

    def __init__(self, checkpoint, dtype='float16', torch_device='cuda',
                 context_frames=16, use_lock=False):
        self.checkpoint = checkpoint
        self.dtype = dtype
        self.torch_device = torch_device
        self.context_frames = context_frames

        # new tokenizer
        self.tokenizer = get_tokenizer_muse()
        self.tokenizer.to(self.torch_device)

        self.model = LlamaForCausalLM.from_pretrained(
            self.checkpoint, torch_dtype=get_torch_float_dtype(self.dtype)
        ).to(self.torch_device)
        print("torch device", self.torch_device)
        print("init device", self.model.device)

        if use_lock:
            self.lock = Lock()
        else:
            self.lock = nullcontext()

    @torch.no_grad()
    def compute_perplexity(self, input_images, target_images):
        input_images = np.array(input_images)
        target_images = np.array(target_images)
        assert len(input_images.shape) == 5 and len(target_images.shape) == 5  # [B, S, H, W, C]
        assert input_images.shape[0] == target_images.shape[0]
        batch_size = input_images.shape[0]
        with self.lock:
            input_images = torch.tensor(
                einops.rearrange(input_images, 'b s h w c -> b s c h w')
            ).to(self.torch_device)
            target_images = torch.tensor(
                einops.rearrange(target_images, 'b s h w c -> b s c h w')
            ).to(self.torch_device)
            input_ids = self.tokenizer.tokenize(input_images).view(batch_size, -1)
            target_ids = self.tokenizer.tokenize(target_images).view(batch_size, -1)
            all_ids = torch.cat([input_ids, target_ids], dim=1)
            logits = self.model(all_ids).logits
            log_probs = F.log_softmax(logits, dim=-1)
            target_ids_onehot = F.one_hot(target_ids, num_classes=logits.shape[-1])
            target_log_probs = log_probs[:, input_ids.shape[1] - 1 : -1]
            perplexity = torch.exp(
                -torch.mean(
                    torch.sum(target_log_probs * target_ids_onehot, dim=-1),
                    dim=-1
                )
            )
            return perplexity.detach().cpu().numpy()

    @torch.no_grad()
    def generate_once(self, input_images, n_new_frames, temperature=1.0, top_p=1.0):
        assert type(input_images) == np.ndarray
        with self.lock:
            input_images = np.array(input_images, dtype=np.float32)
            input_images = torch.tensor(
                einops.rearrange(input_images, 'b h w c -> b c h w')
            ).to(self.torch_device)

            # not quite sure why i need to redo it here
            self.model.to(self.torch_device)
            self.tokenizer.to(self.torch_device)

            # new tokenizer
            _, input_ids = self.tokenizer.encode(input_images)
            input_ids = input_ids.view(1, -1)

            input_ids = input_ids[:, -(self.context_frames - 1) * 256:]

            new_tokens = []
            current_context_frames = input_ids.shape[1] // 256
            first_generation_left = self.context_frames - current_context_frames
            first_new_frames = min(first_generation_left, n_new_frames)
            input_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=torch.ones_like(input_ids),
                pad_token_id=8192,
                max_new_tokens=256 * first_new_frames,
                do_sample=True,
                top_p=top_p,
                temperature=temperature,
                suppress_tokens=list(range(8192, self.model.vocab_size)),
            )
            new_tokens.append(input_ids[:, -256 * first_new_frames:])
            input_ids = input_ids[:, -(self.context_frames - 1) * 256:]

            for _ in range(max(0, n_new_frames - first_new_frames)):
                input_ids = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=torch.ones_like(input_ids),
                    pad_token_id=8192,
                    max_new_tokens=256,
                    do_sample=True,
                    top_p=top_p,
                    temperature=temperature,
                    suppress_tokens=list(range(8192, self.model.vocab_size)),
                )
                new_tokens.append(input_ids[:, -256:])
                input_ids = input_ids[:, -(self.context_frames - 1) * 256:]

            new_tokens = torch.cat(new_tokens, dim=1).view(-1, 256)
            new_images = einops.rearrange(
                torch.clamp(self.tokenizer.decode_code(new_tokens), 0.0, 1.0),
                'b c h w -> b h w c'
            ).detach().cpu().numpy()
        return new_images

    @spaces.GPU(duration=180)
    def __call__(self, input_images, n_new_frames, n_candidates, temperature=1.0, top_p=1.0):
        output = []
        for seq in input_images:
            output.append(
                [self.generate_once(seq, n_new_frames, temperature, top_p)
                 for _ in range(n_candidates)]
            )
        return output


class MultiProcessInferenceModel(InferenceModel):

    def __init__(self, checkpoint, torch_devices=None, dtype='float16',
                 context_frames=16, use_lock=False, perplexity_batch_size=2):
        if torch_devices is None or torch_devices == '':
            torch_devices = [f'cuda:{i}' for i in range(torch.cuda.device_count())]

        self.torch_devices = torch_devices
        self.n_processes = len(torch_devices)
        print(f'Using {self.n_processes} processes for inference')
        self.worker_pool = Pool(self.n_processes)
        self.worker_pids = self.worker_pool.starmap(get_pid, [tuple() for _ in range(self.n_processes)])
        self.device_map = {
            pid: torch_device
            for pid, torch_device in zip(self.worker_pids, self.torch_devices)
        }
        self.worker_pool.starmap(
            self.initialize_worker,
            [(self.device_map, checkpoint, dtype, context_frames) for _ in range(self.n_processes)]
        )
        self.perplexity_batch_size = perplexity_batch_size
        if use_lock:
            self.lock = Lock()
        else:
            self.lock = nullcontext()

    @staticmethod
    def initialize_worker(device_map, checkpoint, dtype, context_frames):
        global _current_process_backend
        torch_device = device_map[os.getpid()]
        _current_process_backend = LocalInferenceModel(
            checkpoint, dtype, torch_device, context_frames
        )

    @staticmethod
    def generate_once(input_images, n_new_frames, temperature=1.0, top_p=1.0):
        return _current_process_backend.generate_once(input_images, n_new_frames, temperature, top_p)

    @staticmethod
    def compute_perplexity_once(input_images, target_images):
        return _current_process_backend.compute_perplexity(input_images, target_images)

    def compute_perplexity(self, input_images, target_images):
        with self.lock:
            map_args = []
            for i in range(0, len(input_images), self.perplexity_batch_size):
                map_args.append((
                    input_images[i : i + self.perplexity_batch_size],
                    target_images[i : i + self.perplexity_batch_size]
                ))
            outputs = self.worker_pool.starmap(self.compute_perplexity_once, map_args)
            return np.concatenate(outputs, axis=0)

    def __call__(self, input_images, n_new_frames, n_candidates, temperature=1.0, top_p=1.0):
        with self.lock:
            map_args = []
            for seq in input_images:
                for _ in range(n_candidates):
                    map_args.append((seq, n_new_frames, temperature, top_p))

            outputs = self.worker_pool.starmap(self.generate_once, map_args)
            reshaped_output = []
            index = 0
            for _ in range(len(input_images)):
                candidates = []
                for _ in range(n_candidates):
                    candidates.append(outputs[index])
                    index += 1
                reshaped_output.append(candidates)
        return reshaped_output
prompts/.DS_Store  ADDED  Binary file (8.2 kB)
prompts/Composition/Slide1.png  ADDED  Git LFS
prompts/Composition/Slide10.png  ADDED  Git LFS
prompts/Composition/Slide11.png  ADDED  Git LFS
prompts/Composition/Slide12.png  ADDED  Git LFS
prompts/Composition/Slide13.png  ADDED  Git LFS
prompts/Composition/Slide14.png  ADDED  Git LFS
prompts/Composition/Slide15.png  ADDED  Git LFS
prompts/Composition/Slide2.png  ADDED  Git LFS
prompts/Composition/Slide3.png  ADDED  Git LFS
prompts/Composition/Slide4.png  ADDED  Git LFS
prompts/Composition/Slide5.png  ADDED  Git LFS
prompts/Composition/Slide6.png  ADDED  Git LFS
prompts/Composition/Slide7.png  ADDED  Git LFS
prompts/Composition/Slide8.png  ADDED  Git LFS
prompts/Composition/Slide9.png  ADDED  Git LFS
prompts/Depth Estimation/1.png  ADDED  Git LFS
prompts/Depth Estimation/1_depth.png  ADDED  Git LFS
prompts/Depth Estimation/2.png  ADDED  Git LFS
prompts/Depth Estimation/2_depth.png  ADDED  Git LFS
prompts/Depth Estimation/3.png  ADDED  Git LFS
prompts/Depth Estimation/3_depth.png  ADDED  Git LFS
prompts/Depth Estimation/4.png  ADDED  Git LFS
prompts/Depth Estimation/4_depth.png  ADDED  Git LFS
prompts/Depth Estimation/5.png  ADDED  Git LFS
prompts/Depth Estimation/5_depth.png  ADDED  Git LFS
prompts/Depth Estimation/6.png  ADDED  Git LFS
prompts/Depth Estimation/6_depth.png  ADDED  Git LFS
prompts/Depth Estimation/7.png  ADDED  Git LFS
prompts/Depth Estimation/7_depth.png  ADDED  Git LFS
prompts/Depth Estimation/8.png  ADDED  Git LFS
prompts/Eaten Apples/1.png  ADDED  Git LFS
prompts/Eaten Apples/10.png  ADDED  Git LFS
prompts/Eaten Apples/2.png  ADDED  Git LFS
prompts/Eaten Apples/3.png  ADDED  Git LFS
prompts/Eaten Apples/4.png  ADDED  Git LFS
prompts/Eaten Apples/5.png  ADDED  Git LFS
prompts/Eaten Apples/6.png  ADDED  Git LFS
prompts/Eaten Apples/7.png  ADDED  Git LFS