alexnasa committed on
Commit 35abee1 · verified · 1 Parent(s): 7babf45

Update args_config.yaml

Files changed (1):
  1. args_config.yaml +76 -76
args_config.yaml CHANGED
@@ -1,77 +1,77 @@
- config: configs/inference.yaml
-
- input_file: examples/infer_samples.txt
- debug: null
- infer: false
- hparams: ''
- dtype: bf16
-
- exp_path: pretrained_models/OmniAvatar-14B
- text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
- image_encoder_path: None
- dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
- vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth
-
- # exp_path: pretrained_models/OmniAvatar-1.3B
- # text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
- # image_encoder_path: None
- # dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
- # vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
-
- wav2vec_path: pretrained_models/wav2vec2-base-960h
- num_persistent_param_in_dit:
- reload_cfg: true
- sp_size: 1
- seed: 42
- image_sizes_720:
- - - 400
-   - 720
- # - - 720 commented out due duration needed on HF
- # - 720
- - - 720
-   - 400
- image_sizes_1280:
- - - 720
-   - 720
- - - 528
-   - 960
- - - 960
-   - 528
- - - 720
-   - 1280
- - - 1280
-   - 720
- max_hw: 720
- max_tokens: 40000
- seq_len: 200
- overlap_frame: 13
- guidance_scale: 4.5
- audio_scale: null
- num_steps: 8
- fps: 24
- sample_rate: 16000
- negative_prompt: Vivid color tones, background/camera moving quickly, screen switching,
-   subtitles and special effects, mutation, overexposed, static, blurred details, subtitles,
-   style, work, painting, image, still, overall grayish, worst quality, low quality,
-   JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly
-   drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image,
-   chaotic background, three legs, crowded background with many people, walking backward
- silence_duration_s: 0.0
- use_fsdp: false
- tea_cache_l1_thresh: 0
- rank: 0
- world_size: 1
- local_rank: 0
- device: cuda
- num_nodes: 1
- i2v: true
- use_audio: true
- random_prefix_frames: true
- model_config:
-   in_dim: 33
-   audio_hidden_size: 32
- train_architecture: lora
- lora_target_modules: q,k,v,o,ffn.0,ffn.2
- init_lora_weights: kaiming
- lora_rank: 128
+ config: configs/inference.yaml
+
+ input_file: examples/infer_samples.txt
+ debug: null
+ infer: false
+ hparams: ''
+ dtype: bf16
+
+ exp_path: pretrained_models/OmniAvatar-14B
+ text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
+ image_encoder_path: None
+ dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
+ vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth
+
+ # exp_path: pretrained_models/OmniAvatar-1.3B
+ # text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
+ # image_encoder_path: None
+ # dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
+ # vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
+
+ wav2vec_path: pretrained_models/wav2vec2-base-960h
+ num_persistent_param_in_dit:
+ reload_cfg: true
+ sp_size: 1
+ seed: 42
+ image_sizes_720:
+ - - 400
+   - 720
+ # - - 720 commented out due duration needed on HF
+ # - 720
+ # - - 720
+ # - 400
+ image_sizes_1280:
+ - - 720
+   - 720
+ - - 528
+   - 960
+ - - 960
+   - 528
+ - - 720
+   - 1280
+ - - 1280
+   - 720
+ max_hw: 720
+ max_tokens: 40000
+ seq_len: 200
+ overlap_frame: 13
+ guidance_scale: 4.5
+ audio_scale: null
+ num_steps: 8
+ fps: 24
+ sample_rate: 16000
+ negative_prompt: Vivid color tones, background/camera moving quickly, screen switching,
+   subtitles and special effects, mutation, overexposed, static, blurred details, subtitles,
+   style, work, painting, image, still, overall grayish, worst quality, low quality,
+   JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly
+   drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image,
+   chaotic background, three legs, crowded background with many people, walking backward
+ silence_duration_s: 0.0
+ use_fsdp: false
+ tea_cache_l1_thresh: 0
+ rank: 0
+ world_size: 1
+ local_rank: 0
+ device: cuda
+ num_nodes: 1
+ i2v: true
+ use_audio: true
+ random_prefix_frames: true
+ model_config:
+   in_dim: 33
+   audio_hidden_size: 32
+ train_architecture: lora
+ lora_target_modules: q,k,v,o,ffn.0,ffn.2
+ init_lora_weights: kaiming
+ lora_rank: 128
  lora_alpha: 64.0
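
For context, a minimal sketch (not part of this commit) of how args_config.yaml could be read into an attribute-style args object. The loader name `load_args`, the use of PyYAML, and the `SimpleNamespace` wrapper are assumptions for illustration only, not the repository's actual loading code; only the key names and values come from the config above.

```python
# Hypothetical loader sketch: PyYAML + SimpleNamespace are assumptions,
# not the repository's own argument-parsing code.
from types import SimpleNamespace

import yaml  # PyYAML


def load_args(path: str = "args_config.yaml") -> SimpleNamespace:
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # dit_path is a single comma-separated string of safetensors shards;
    # split it so each shard path can be handled individually.
    if isinstance(cfg.get("dit_path"), str):
        cfg["dit_path"] = cfg["dit_path"].split(",")
    return SimpleNamespace(**cfg)


if __name__ == "__main__":
    args = load_args()
    print(args.exp_path)         # pretrained_models/OmniAvatar-14B
    print(len(args.dit_path))    # 6 shards for the 14B DiT
    print(args.image_sizes_720)  # [[400, 720]] after this commit
```

With the second resolution pair commented out in this commit, `image_sizes_720` resolves to the single portrait size [[400, 720]].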