| """ | |
| stable diffusion dreaming | |
| creates hypnotic moving videos by smoothly walking randomly through the sample space | |
| example way to run this script: | |
| $ python stablediffusionwalk.py --prompt "blueberry spaghetti" --name blueberry | |
| to stitch together the images, e.g.: | |
| $ ffmpeg -r 10 -f image2 -s 512x512 -i blueberry/frame%06d.jpg -vcodec libx264 -crf 10 -pix_fmt yuv420p blueberry.mp4 | |
| nice slerp def from @xsteenbrugge ty | |
| you have to have access to stablediffusion checkpoints from https://huggingface.co/CompVis | |
| and install all the other dependencies (e.g. diffusers library) | |
| """ | |
| import os | |
| import inspect | |
| import fire | |
| from diffusers import StableDiffusionPipeline | |
| from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler | |
| from time import time | |
| from PIL import Image | |
| from einops import rearrange | |
| import numpy as np | |
| import torch | |
| from torch import autocast | |
| from torchvision.utils import make_grid | |
| # ----------------------------------------------------------------------------- | |
| @torch.no_grad() | |
| def diffuse( | |
| pipe, | |
| cond_embeddings, # text conditioning, should be (1, 77, 768) | |
| cond_latents, # image conditioning, should be (1, 4, 64, 64) | |
| num_inference_steps, | |
| guidance_scale, | |
| eta, | |
| ): | |
| torch_device = cond_latents.get_device() | |
| # classifier guidance: add the unconditional embedding | |
| max_length = cond_embeddings.shape[1] # 77 | |
| uncond_input = pipe.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt") | |
| uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(torch_device))[0] | |
| text_embeddings = torch.cat([uncond_embeddings, cond_embeddings]) | |
| # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas | |
| if isinstance(pipe.scheduler, LMSDiscreteScheduler): | |
| cond_latents = cond_latents * pipe.scheduler.sigmas[0] | |
| # init the scheduler | |
| accepts_offset = "offset" in set(inspect.signature(pipe.scheduler.set_timesteps).parameters.keys()) | |
| extra_set_kwargs = {} | |
| if accepts_offset: | |
| extra_set_kwargs["offset"] = 1 | |
| pipe.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) | |
| # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature | |
| # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. | |
| # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 | |
| # and should be between [0, 1] | |
| accepts_eta = "eta" in set(inspect.signature(pipe.scheduler.step).parameters.keys()) | |
| extra_step_kwargs = {} | |
| if accepts_eta: | |
| extra_step_kwargs["eta"] = eta | |
| # diffuse! | |
| for i, t in enumerate(pipe.scheduler.timesteps): | |
| # expand the latents for classifier free guidance | |
| latent_model_input = torch.cat([cond_latents] * 2) | |
| if isinstance(pipe.scheduler, LMSDiscreteScheduler): | |
| sigma = pipe.scheduler.sigmas[i] | |
| latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5) | |
| # predict the noise residual | |
| noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"] | |
| # cfg | |
| noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) | |
| noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) | |
| # compute the previous noisy sample x_t -> x_t-1 | |
| if isinstance(pipe.scheduler, LMSDiscreteScheduler): | |
| cond_latents = pipe.scheduler.step(noise_pred, i, cond_latents, **extra_step_kwargs)["prev_sample"] | |
| else: | |
| cond_latents = pipe.scheduler.step(noise_pred, t, cond_latents, **extra_step_kwargs)["prev_sample"] | |
| # scale and decode the image latents with vae | |
| cond_latents = 1 / 0.18215 * cond_latents | |
| image = pipe.vae.decode(cond_latents) | |
| # generate output numpy image as uint8 | |
| image = (image / 2 + 0.5).clamp(0, 1) | |
| image = image.cpu().permute(0, 2, 3, 1).numpy() | |
| image = (image[0] * 255).astype(np.uint8) | |
| return image | |
| def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): | |
| """ helper function to spherically interpolate two arrays v1 v2 """ | |
| if not isinstance(v0, np.ndarray): | |
| inputs_are_torch = True | |
| input_device = v0.device | |
| v0 = v0.cpu().numpy() | |
| v1 = v1.cpu().numpy() | |
| dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) | |
| if np.abs(dot) > DOT_THRESHOLD: | |
| v2 = (1 - t) * v0 + t * v1 | |
| else: | |
| theta_0 = np.arccos(dot) | |
| sin_theta_0 = np.sin(theta_0) | |
| theta_t = theta_0 * t | |
| sin_theta_t = np.sin(theta_t) | |
| s0 = np.sin(theta_0 - theta_t) / sin_theta_0 | |
| s1 = sin_theta_t / sin_theta_0 | |
| v2 = s0 * v0 + s1 * v1 | |
| if inputs_are_torch: | |
| v2 = torch.from_numpy(v2).to(input_device) | |
| return v2 | |
| def run( | |
| # -------------------------------------- | |
| # args you probably want to change | |
| prompt = "blueberry spaghetti", # prompt to dream about | |
| gpu = 0, # id of the gpu to run on | |
| name = 'blueberry', # name of this project, for the output directory | |
| rootdir = '/home/ubuntu/dreams', | |
| num_steps = 200, # number of steps between each pair of sampled points | |
| max_frames = 10000, # number of frames to write and then exit the script | |
| num_inference_steps = 50, # more (e.g. 100, 200 etc) can create slightly better images | |
| guidance_scale = 7.5, # can depend on the prompt. usually somewhere between 3-10 is good | |
| seed = 1337, | |
| # -------------------------------------- | |
| # args you probably don't want to change | |
| quality = 90, # for jpeg compression of the output images | |
| eta = 0.0, | |
| width = 512, | |
| height = 512, | |
| weights_path = "/home/ubuntu/stable-diffusion-v1-3-diffusers", | |
| # -------------------------------------- | |
| ): | |
| assert torch.cuda.is_available() | |
| assert height % 8 == 0 and width % 8 == 0 | |
| torch.manual_seed(seed) | |
| torch_device = f"cuda:{gpu}" | |
| # init the output dir | |
| outdir = os.path.join(rootdir, name) | |
| os.makedirs(outdir, exist_ok=True) | |
| # init all of the models and move them to a given GPU | |
| lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") | |
| pipe = StableDiffusionPipeline.from_pretrained(weights_path, scheduler=lms, use_auth_token=True) | |
| pipe.unet.to(torch_device) | |
| pipe.vae.to(torch_device) | |
| pipe.text_encoder.to(torch_device) | |
| # get the conditional text embeddings based on the prompt | |
| text_input = pipe.tokenizer(prompt, padding="max_length", max_length=pipe.tokenizer.model_max_length, truncation=True, return_tensors="pt") | |
| cond_embeddings = pipe.text_encoder(text_input.input_ids.to(torch_device))[0] # shape [1, 77, 768] | |
| # sample a source | |
| init1 = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=torch_device) | |
| # iterate the loop | |
| frame_index = 0 | |
| while frame_index < max_frames: | |
| # sample the destination | |
| init2 = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=torch_device) | |
| for i, t in enumerate(np.linspace(0, 1, num_steps)): | |
| init = slerp(float(t), init1, init2) | |
| print("dreaming... ", frame_index) | |
| with autocast("cuda"): | |
| image = diffuse(pipe, cond_embeddings, init, num_inference_steps, guidance_scale, eta) | |
| im = Image.fromarray(image) | |
| outpath = os.path.join(outdir, 'frame%06d.jpg' % frame_index) | |
| im.save(outpath, quality=quality) | |
| frame_index += 1 | |
| init1 = init2 | |
| if __name__ == '__main__': | |
| fire.Fire(run) |
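A quick way to convince yourself that the `slerp` above behaves sensibly is the small check below. This is my own test snippet, not part of the gist: it verifies that the endpoints come back unchanged at `t=0` and `t=1` and that torch tensors survive the round trip through numpy.

```python
# sanity check for slerp (illustrative only, not part of the original gist)
v0 = torch.randn(1, 4, 64, 64)
v1 = torch.randn(1, 4, 64, 64)

assert torch.allclose(slerp(0.0, v0, v1), v0)  # t=0 returns the source latent
assert torch.allclose(slerp(1.0, v0, v1), v1)  # t=1 returns the destination latent
mid = slerp(0.5, v0, v1)                       # halfway point along the spherical path
print(mid.shape)                               # torch.Size([1, 4, 64, 64])
```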
I've got CUDA installed now and I'm past all the previous errors I was getting with the script, but now I'm getting a "CUDA out of memory" error. I only have 6GB of VRAM. I tried lowering the height and width parameters but it didn't help. Any suggestions for getting this to run?

Edit: running a simple script to generate a single image, I also get a memory error unless I add `torch_dtype=torch.float16`.

Adding `torch_dtype=torch.float16` to this script, I now get a new error on this line:

`cond_latents = pipe.scheduler.step(noise_pred, i, cond_latents, **extra_step_kwargs)["prev_sample"]`

`only one element tensors can be converted to Python scalars`

Edit 2: Got it working by using gordicaleksa's recommendation for a simplified version above, thanks! One minor change for it to work for me: `["sample"][0]` was invalid, so I renamed the `image` variable to `pipelineOutput` and changed the save line to `pipelineOutput.images[0].save(outpath)`.
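For other low-VRAM readers, here is a minimal sketch of the kind of change described above, assuming a recent diffusers release where the pipeline call returns an output object with an `.images` list of PIL images; the checkpoint id and filenames are just placeholders, not the gist's exact code:

```python
# low-VRAM sketch: load the pipeline in half precision and use the pipeline output
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",   # assumption: whichever SD checkpoint you have access to
    torch_dtype=torch.float16,          # fp16 weights roughly halve VRAM use vs fp32
)
pipe = pipe.to("cuda")

out = pipe("blueberry spaghetti", height=512, width=512, guidance_scale=7.5)
out.images[0].save("frame000000.jpg", quality=90)  # pipeline output exposes PIL images via .images
```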
For faster inference, we can wrap the call to `diffuse` in `torch.autocast` so the inference will run in half-precision. For example:

```python
from torch import autocast

with autocast("cuda"):
    image = diffuse(text_embeddings, init, guidance_scale=10.0)
```

Yes, I dropped this accidentally; added, ty.
Interestingly enough, huggingface discourages the use of autocast!
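If I read the current Hugging Face docs correctly, the preferred route is to load the weights themselves in fp16 rather than rely on autocast. A hedged sketch of what that could look like for this script, reusing the names from `run()` above (`weights_path`, `lms`, `torch_device`, `height`, `width`); treat it as an adaptation to verify, not the gist's code:

```python
# sketch: fp16 weights instead of the autocast("cuda") wrapper
pipe = StableDiffusionPipeline.from_pretrained(
    weights_path,
    scheduler=lms,
    torch_dtype=torch.float16,  # half-precision model weights
    use_auth_token=True,
)
pipe.unet.to(torch_device)
pipe.vae.to(torch_device)
pipe.text_encoder.to(torch_device)

# the latents fed to diffuse() would then need to be fp16 as well, e.g.
init1 = torch.randn(
    (1, pipe.unet.in_channels, height // 8, width // 8),
    device=torch_device, dtype=torch.float16,
)
```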
Somehow got this error: "ValueError: only one element tensors can be converted to Python scalars"
Same. Did you manage to solve it?
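Not from the thread, but my best guess at the cause: newer diffusers releases changed the scheduler API, so `LMSDiscreteScheduler.step` expects the actual timestep `t` rather than the loop index `i`, and the manual sigma scaling is replaced by `scale_model_input`. A hedged sketch of how the inner loop of `diffuse` might look against a newer API, reusing the gist's variable names; check the calls against your installed version before relying on this:

```python
# hypothetical adaptation of the denoising loop for newer diffusers versions
for t in pipe.scheduler.timesteps:
    latent_model_input = torch.cat([cond_latents] * 2)
    # newer schedulers expose scale_model_input instead of dividing by sigma manually
    latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)

    noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # pass the timestep t itself (not the loop index i) to scheduler.step
    cond_latents = pipe.scheduler.step(noise_pred, t, cond_latents, **extra_step_kwargs).prev_sample
```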
I am new to this project, would you be able to help me? I am facing an error on this line:

`image = diffuse(pipe, cond_embeddings, init, num_inference_steps, guidance_scale, eta)`

It mentions `cond_embeddings` out of index at 51, and the `eta` variable. What do I need to do to overcome this issue?
would love to see some blueberry spaghetti prompts 🫐 on God Tier Prompts
