Tags: Text-to-Video · Diffusers · Safetensors · Japanese · English · art
alfredplpl committed · Commit b83c682 · verified · 1 Parent(s): f6993e3

Update README.md

Files changed (1)
  1. README.md +85 -1
README.md CHANGED
@@ -59,7 +59,91 @@ pip install transformers diffusers
  2. Run the following script

  ```python
- TBA
+ import torch
+ import tqdm
+ from torchvision.transforms import ToPILImage
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
+ from diffusers.utils import export_to_video
+
+ # Sampling settings
+ device = "cuda"
+ # Latent shape: [batch, frames, channels, height, width]; the VAE compresses
+ # time by 4x and space by 8x, so this corresponds to a 48-frame 256x256 video.
+ shape = (1, 48 // 4, 16, 256 // 8, 256 // 8)
+ sample_N = 25  # number of Euler steps
+ torch_dtype = torch.bfloat16
+ eps = 1
+ cfg = 2.5  # classifier-free guidance scale
+ prompt = "A cat walking in a garden."  # example prompt; replace with your own
+
+ # Text encoder (llm-jp-3 1.8B) and its tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(
+     "llm-jp/llm-jp-3-1.8b"
+ )
+ text_encoder = AutoModelForCausalLM.from_pretrained(
+     "llm-jp/llm-jp-3-1.8b",
+     torch_dtype=torch_dtype
+ )
+ text_encoder = text_encoder.to(device)
+
+ # Video transformer
+ transformer = CogVideoXTransformer3DModel.from_pretrained(
+     "aidealab/commonvideo",
+     torch_dtype=torch_dtype
+ )
+ transformer = transformer.to(device)
+
+ # CogVideoX VAE for decoding the latents
+ vae = AutoencoderKLCogVideoX.from_pretrained(
+     "THUDM/CogVideoX-2b",
+     subfolder="vae"
+ )
+ vae = vae.to(dtype=torch_dtype, device=device)
+ vae.enable_slicing()
+ vae.enable_tiling()
+
+ # Encode the prompt with the last hidden state of the language model
+ text_inputs = tokenizer(
+     prompt,
+     padding="max_length",
+     max_length=512,
+     truncation=True,
+     add_special_tokens=True,
+     return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ prompt_embeds = text_encoder(
+     text_input_ids.to(device),
+     output_hidden_states=True,
+     attention_mask=text_inputs.attention_mask.to(device)
+ ).hidden_states[-1]
+ prompt_embeds = prompt_embeds.to(dtype=torch_dtype, device=device)
+
+ # Encode the empty prompt for classifier-free guidance
+ null_text_inputs = tokenizer(
+     "",
+     padding="max_length",
+     max_length=512,
+     truncation=True,
+     add_special_tokens=True,
+     return_tensors="pt",
+ )
+ null_text_input_ids = null_text_inputs.input_ids
+ null_prompt_embeds = text_encoder(
+     null_text_input_ids.to(device),
+     output_hidden_states=True,
+     attention_mask=null_text_inputs.attention_mask.to(device)
+ ).hidden_states[-1]
+ null_prompt_embeds = null_prompt_embeds.to(dtype=torch_dtype, device=device)
+
+ # Euler discrete sampler with classifier-free guidance (CFG)
+ z0 = torch.randn(shape, device=device)
+ latents = z0.detach().clone().to(torch_dtype)
+
+ dt = 1.0 / sample_N
+ with torch.no_grad():
+     for i in tqdm.tqdm(range(sample_N)):
+         num_t = i / sample_N
+         t = torch.ones(shape[0], device=device) * num_t
+         pseudo_t = (1000 - eps) * (1 - t) + eps
+         positive_conditional = transformer(hidden_states=latents, timestep=pseudo_t, encoder_hidden_states=prompt_embeds, image_rotary_emb=None)
+         null_conditional = transformer(hidden_states=latents, timestep=pseudo_t, encoder_hidden_states=null_prompt_embeds, image_rotary_emb=None)
+         pred = null_conditional.sample + cfg * (positive_conditional.sample - null_conditional.sample)
+         latents = latents.detach().clone() + dt * pred.detach().clone()
+
+ # Free VRAM used by the transformer and text encoder before decoding
+ del transformer, text_encoder
+ torch.cuda.empty_cache()
+
+ # Decode the latents into frames and export them as a video
+ with torch.no_grad():
+     latents = latents / vae.config.scaling_factor
+     latents = latents.permute(0, 2, 1, 3, 4)  # [B, C, F, H, W] for the VAE
+     x = vae.decode(latents).sample
+ x = x / 2 + 0.5
+ x = x.clamp(0, 1)
+ x = x.permute(0, 2, 1, 3, 4).to(torch.float32)  # [B, F, C, H, W]
+ x = [ToPILImage()(frame) for frame in x[0]]
+
+ export_to_video(x, "output.mp4", fps=24)
  ```
 
  ## Uses