aidealab
/

AIdeaLab-VideoJP

Model card Files Files and versions Community

alfredplpl committed on Jan 7

Commit

b83c682

·

verified ·

1 Parent(s): f6993e3

Update README.md

Files changed (1) hide show

README.md +85 -1

README.md CHANGED Viewed

@@ -59,7 +59,91 @@ pip install transformers diffusers
 2. Run the following script
 ```python
-TBA
 ```
 ## Uses

 2. Run the following script
 ```python
+from diffusers.utils import export_to_video
+import tqdm
+from torchvision.transforms import ToPILImage
+device="cuda"
+shape=(1,48//4,16,256//8,256//8)
+sample_N=25
+torch_dtype=torch.bfloat16
+eps=1
+cfg=2.5
+tokenizer = AutoTokenizer.from_pretrained(
+    "llm-jp/llm-jp-3-1.8b"
+)
+text_encoder = AutoModelForCausalLM.from_pretrained(
+    "llm-jp/llm-jp-3-1.8b",
+    torch_dtype=torch_dtype
+)
+text_encoder=text_encoder.to(device)
+transformer = CogVideoXTransformer3DModel.from_pretrained(
+    "aidealab/commonvideo",
+    torch_dtype=torch_dtype
+)
+transformer=transformer.to(device)
+vae = AutoencoderKLCogVideoX.from_pretrained(
+    "THUDM/CogVideoX-2b",
+    subfolder="vae"
+)
+vae=vae.to(dtype=torch_dtype, device=device)
+vae.enable_slicing()
+vae.enable_tiling()
+text_inputs = tokenizer(
+    prompt,
+    padding="max_length",
+    max_length=512,
+    truncation=True,
+    add_special_tokens=True,
+    return_tensors="pt",
+)
+text_input_ids = text_inputs.input_ids
+prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True, attention_mask=text_inputs.attention_mask.to(device)).hidden_states[-1]
+prompt_embeds = prompt_embeds.to(dtype=torch_dtype, device=device)
+null_text_inputs = tokenizer(
+    "",
+    padding="max_length",
+    max_length=512,
+    truncation=True,
+    add_special_tokens=True,
+    return_tensors="pt",
+)
+null_text_input_ids = null_text_inputs.input_ids
+null_prompt_embeds = text_encoder(null_text_input_ids.to(device), output_hidden_states=True, attention_mask=null_text_inputs.attention_mask.to(device)).hidden_states[-1]
+null_prompt_embeds = null_prompt_embeds.to(dtype=torch_dtype, device=device)
+# euler discreate sampler with cfg
+z0 = torch.randn(shape, device=device)
+latents = z0.detach().clone().to(torch_dtype)
+dt = 1.0 / sample_N
+with torch.no_grad():
+    for i in tqdm.tqdm(range(sample_N)):
+        num_t = i / sample_N
+        t = torch.ones(shape[0], device=device) * num_t
+        psudo_t=(1000-eps)*(1-t)+eps
+        positive_conditional = transformer(hidden_states=latents, timestep=psudo_t, encoder_hidden_states=prompt_embeds, image_rotary_emb=None)
+        null_conditional = transformer(hidden_states=latents, timestep=psudo_t, encoder_hidden_states=null_prompt_embeds, image_rotary_emb=None)
+        pred = null_conditional.sample+cfg*(positive_conditional.sample-null_conditional.sample)
+        latents = latents.detach().clone() + dt * pred.detach().clone()
+    # Free vram
+    latents = latents / vae.config.scaling_factor
+    latents = latents.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
+    x=vae.decode(latents).sample
+    x = x / 2 + 0.5
+    x = x.clamp(0,1)
+    x=x.permute(0, 2, 1, 3, 4).to(torch.float32)# [B, F, C, H, W]
+    print(x.shape)
+    x=[ToPILImage()(frame) for frame in x[0]]
+export_to_video(x,"output.mp4",fps=24)
 ```
 ## Uses