Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import argparse | |
| from huggingface_hub import hf_hub_download | |
| import safetensors.torch | |
| from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection | |
| from diffusers import ( | |
| AutoencoderKL, | |
| # AutoencoderKLTemporalDecoder, | |
| EulerDiscreteScheduler, | |
| ) | |
| from convert.convert_svd_to_diffusers import ( | |
| convert_ldm_unet_checkpoint, | |
| # convert_ldm_vae_checkpoint, | |
| create_unet_diffusers_config, | |
| ) | |
| from diffusers_sv3d import SV3DUNetSpatioTemporalConditionModel, StableVideo3DDiffusionPipeline | |
# Hub repositories that provide the pipeline components loaded with
# `from_pretrained` further down (scheduler, image encoder, feature extractor
# from SVD; VAE from the SD v1.5 mirror).
SVD_V1_CKPT = "stabilityai/stable-video-diffusion-img2vid-xt"
SD_V15_CKPT = "chenguolin/stable-diffusion-v1-5"

# Respect values already exported in the environment instead of clobbering
# them; fall back to the script's original defaults otherwise. The default
# checkpoint path of the CLI is derived from HF_HOME, so it has to agree with
# whatever cache directory the user configured.
HF_HOME = os.environ.get("HF_HOME") or "~/.cache/huggingface"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_USERNAME = os.environ.get("HF_USERNAME", "")

# Uncomment to route Hub traffic through a mirror.
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# NOTE(review): these assignments run after `huggingface_hub` has already been
# imported at the top of the file — confirm the library still picks them up.
os.environ["HF_HOME"] = HF_HOME
os.environ["HF_USERNAME"] = HF_USERNAME
def main() -> None:
    """Convert the original SV3D checkpoint into a diffusers pipeline.

    Steps:
      1. Parse CLI arguments.
      2. Make sure the original ``sv3d_p.safetensors`` checkpoint is available,
         downloading it from ``stabilityai/sv3d`` when the given path is missing.
      3. Build the UNet from the YAML config and load the converted weights.
      4. Load the remaining components (VAE, scheduler, image encoder, feature
         extractor) from their Hub repositories and assemble the pipeline.
      5. Optionally push the assembled pipeline to the Hub.
    """
    default_ckpt_path = os.path.expanduser(
        f"{HF_HOME}/hub/models--stabilityai--sv3d/snapshots/31213729b4314a44b574ce7cc2d0c28356f097ed/sv3d_p.safetensors"
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--original_ckpt_path", default=default_ckpt_path, type=str, help="Path to the checkpoint to convert.")
    parser.add_argument("--hf_token", default=HF_TOKEN, type=str, help="your HuggingFace token")
    parser.add_argument("--config_path", default="convert/sv3d_p.yaml", type=str, help="Config filepath.")
    parser.add_argument("--repo_name", default="sv3d-diffusers", type=str)
    parser.add_argument("--push_to_hub", action="store_true")
    args = parser.parse_args()

    # Use the token given on the command line (it was previously parsed but
    # ignored). An empty value becomes None so the Hub client can fall back to
    # its own credential lookup instead of sending an empty token.
    token = args.hf_token or None

    ckpt_path = args.original_ckpt_path
    if not os.path.exists(ckpt_path):
        # Load from wherever the download actually landed: the cache snapshot
        # directory need not match the path that was asked for.
        ckpt_path = hf_hub_download("stabilityai/sv3d", filename="sv3d_p.safetensors", token=token)
    original_ckpt = safetensors.torch.load_file(ckpt_path, device="cpu")

    from omegaconf import OmegaConf

    config = OmegaConf.load(args.config_path)
    unet_config = create_unet_diffusers_config(config, image_size=576)

    # The converter receives the untouched config; the model constructor gets a
    # version with these keys removed. A missing key raises KeyError, as before.
    ori_config = unet_config.copy()
    for key in ("attention_head_dim", "use_linear_projection", "class_embed_type", "addition_embed_type"):
        unet_config.pop(key)

    unet = SV3DUNetSpatioTemporalConditionModel(**unet_config)
    unet_state_dict = convert_ldm_unet_checkpoint(original_ckpt, ori_config)
    # strict=True: any missing or unexpected key in the converted weights is an error.
    unet.load_state_dict(unet_state_dict, strict=True)

    # Components that are not converted here are pulled from existing Hub repos.
    vae = AutoencoderKL.from_pretrained(SD_V15_CKPT, subfolder="vae")
    scheduler = EulerDiscreteScheduler.from_pretrained(SVD_V1_CKPT, subfolder="scheduler")
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(SVD_V1_CKPT, subfolder="image_encoder")
    feature_extractor = CLIPImageProcessor.from_pretrained(SVD_V1_CKPT, subfolder="feature_extractor")

    pipeline = StableVideo3DDiffusionPipeline(
        image_encoder=image_encoder,
        feature_extractor=feature_extractor,
        unet=unet,
        vae=vae,
        scheduler=scheduler,
    )

    if args.push_to_hub:
        # NOTE(review): assumes the pipeline exposes the diffusers
        # `push_to_hub(repo_id, token=...)` signature — confirm.
        pipeline.push_to_hub(args.repo_name, token=token)


if __name__ == "__main__":
    main()