o _gXC@sddlZddlZddlZddlZddlmZddlmZddlm Z ddl m Z ddl m Z ddlmZddlmZdd lmZdd lmZdd lmZmZdd lmZdd lmZddlmZddlmZGdddZ dS)N) make_grid) to_pil_image)Image) TextModel) AutoencoderKL)UNet2DConditionModel) DDIMScheduler)DPMSolverSingleStepScheduler) get_betas)find_phrase_positions_in_text-classifier_free_guidance_image_prompt_cascade)mask_generation)instantiate_from_config)tqdm) rearrangec @seZdZ    d#ddZddZd d Zd d Zd dZddZddZ dddddddddgddf ddZ ddZ d d!Z d"S)$RealCustomInferencePipelineckpts/sdxl/vae/sdxl.jsonckpts/sdxl/vae/sdxl-vae.pthbf16cudac Cs|dkr tj|_ntj|_tjds&ddlm}t d|dddd d d ||_ ||_ ||_ | |||||||||dS) Nrzckpts/r)snapshot_downloadzDownloading RealCustom ...zbytedance-research/RealCustommodelZckptszckpts/**F)repo_id repo_type local_dirallow_patternslocal_dir_use_symlinks)torchZbfloat16 torch_dtypefloat32ospathexistshuggingface_hubrprintdeviceunet_checkpointrealcustom_checkpoint_load_unet_checkpoint_load_vae_checkpoint_load_encoder_checkpoint_init_scheduler_load_negative_prompt) self unet_configr&r' vae_configvae_checkpoint model_typer%rr2A/mnt/bn/huangmengqi-lf-nas-25bad429/Release/inference/pipeline.py__init__'s*     z$RealCustomInferencePipeline.__init__cCst| }t|}Wdn1swYd|_|dd}|dd|_tdi||_|j |j  |j |jj t j||j ddd|jj t j||j dddtddS)Nepsilonvision_model_configZ map_locationFstrictzloading unet model finished.r2)openjsonloadunet_predictionpopr6UNet2DConditionModelDiffusers unet_modelevaltor%rload_state_dictrr$)r-r.r&r'Zunet_config_filer6r2r2r3r(Ks    z1RealCustomInferencePipeline._load_unet_checkpointcCsD|jjtj||jddd|jjtj||jdddtddS)Nr7Fr8zreloading unet model finished.)r@rCrr<r%r$)r-r&r'r2r2r3_reload_unet_checkpoint\s z3RealCustomInferencePipeline._reload_unet_checkpointcst| }t|}Wdn1swY|d|_dt|dd|_td i||j |j  t j||j dt j fdddd |_t j fd ddd |_td dS) Nlatent_channelsZblock_out_channelsr7cs|jjddS)NrG)decodescaling_factorsampleclipxZ vae_modelr2r3kszBRealCustomInferencePipeline._load_vae_checkpoint..T)disablecs|jjS)N)encodeZ latent_distmodeZmul_rJrMrOr2r3rPlszloading vae finished.r2)r:r;r<rElenvae_downsample_factorrrArBr%rrCrcompile vae_decoderZ vae_encoderr$)r-r/r0Zvae_config_filer2rOr3r)as    z0RealCustomInferencePipeline._load_vae_checkpointcCsnddg}dg}t|||_|j|j|jtdt|j|_ |j |j|jtddS)Nzckpts/sdxl/clip-sdxl-1zckpts/sdxl/clip-sdxl-2Zpenultimate_nonormzloading text model finished.zloading image model finished.) r text_modelrArBr%rr$rr6 vision_model)r-Ztext_encoder_variantZtext_encoder_moder2r2r3r*ps   z4RealCustomInferencePipeline._load_encoder_checkpointcCsZd}d}d}d}d|_t|||dd}|dkrtnt}||||j|jd|_|jj|_dS) NiZsquared_linearZdpmrGF)nameZ num_stepsZ shift_snrZterminal_pure_noise)ZbetasZnum_train_timestepsZnum_inference_timestepsr%) sample_stepsr r rr% schedulerZ timestepsinfer_timesteps)r-Zddim_train_stepsZ schedule_typeZscheduler_typeZschedule_shift_snrZ ddim_betasZscheduler_classr2r2r3r+}sz+RealCustomInferencePipeline._init_schedulercCsHtd}||_Wdn1swY||j|_dS)Nzprompts/validation_negative.txt)r:readstripnegative_promptrXtext_negative_output)r-fr2r2r3r,s z1RealCustomInferencePipeline._load_negative_promptig @ig?Zmin_max_per_channel Fc1Cs| dkr!| |jkr!| |_|jjtj| |jdddtd| | dkrB| |jkrB| |_|jjtj| |jdddtd| t |}| |||}|dkrZt dd d  }t |}t t|j|jn|||}||}|jj|dd }|jdj|dd }|d kr|jjj|dd }|jjdj|dd }|}tj|}|dj|dd }tjjj|d ddd}t|}||j|j}||j|j}d|i}|j||jd}d|i}|j||jd}tj ||j!||j"||j"g|jt#|j$|d|j}||j"d}||j"d}g}d} |dd}!t%|j&d|!dddD]}"| | krb|j||"|dt'||dddddd \}#}$t(|$d|$)dg|| ||| d}%n|d*d }%|%+dkr}|,|%j-dd d n|,|%d |j||"|dt'||d||%dddd \}#}$|j||"|dt'||d||%dddd \}&}'t.d|#|&||dd d!}(|j/j0|(|j1|"|d"})|)j2}| d 7} q5|3|)j4}*Wdn 1swYWdn 1swYg}+t5|*6dD]},tj7|*|,d#d#dd d$8}-|+,t9|-qt:|*dd%t |d#d&8}*tj;|d d }.t<|.d'}.|.6d }/t<|.d(}.t:|.dd%t |j=|/d&}0|rU|+t9|*t9|0fSt9|*t9|0fS))Nrer7Fr8zReloading Unet {} finised.z Reloading RealCustom {} finised.rHri@B)rG)dimrG)rhbilinear)sizerSZ align_corners image_ref)r%)rjr% generatorrF([]T)iterabledesc dynamic_ncols)Z text_embedsZtime_ids) rKtimestepencoder_hidden_statesencoder_attention_maskadded_cond_kwargsvision_input_dictvision_guided_maskreturn_as_originreturn_text2image_maskZtext2image_crossmap_2dZself_attention_map)Zcrossmap_2d_listZselfmap_2d_list target_token mask_scopeZ mask_target_hZ mask_target_wZ mask_mode) rKrsrtrurvrwrxryrzZmultiple_reference_imageZnaive_global_direct)Z pred_t_condZ pred_ti_condZ pred_uncondZguidance_weight_tZguidance_weight_iZguidance_stdev_rescale_factorZcfg_rescale_mode)Z model_outputZmodel_output_typersrKg?)minmax)rHrG) normalizeZ value_rangenrowzb t c h w -> (b t) c h wzB (c 1) h w -> (B c) 1 h w)>r&r@rCrr<r%r$formatr'int _get_metadatarandintitemZno_gradZautocastr_find_phrase_positions_in_textrXZ embeddingsZrepeat_interleaveZpooledrb torchvision transformsZToTensorZ unsqueezenn functional interpolate zeros_likerBrYrandnrErU GeneratorZ manual_seedrr^dictr getsqueezergappendmeanr r]stepr=Z prev_samplerWZpred_original_samplerangerjclampfloatrrcatrr\)1r-textZ image_pil target_phraseheightwidthguidance_scaleseedsamples_per_promptr|Znew_unet_checkpointZnew_realcustom_checkpointZ mask_strategyZmask_reused_stepZreturn_each_imageimage_metadata_validater{Ztext_positive_outputZtext_positive_embeddingsZtext_positive_pooledZtext_negative_embeddingsZtext_negative_pooledZpositive_imageZnegative_imageZpositive_image_dictZpositive_image_outputZnegative_image_dictZnegative_image_outputZlatentZtarget_hZtarget_wZ)text2image_crossmap_2d_all_timesteps_listZ current_stepZ pbar_textrsZ pred_condZpred_cond_dictZcrossmap_2d_avgZ pred_negativeZpred_negative_dictpredrrKZimages_pil_listsample_iZsample_i_imageZ$text2image_crossmap_2d_all_timestepscZ sample_maskr2r2r3 generations               z&RealCustomInferencePipeline.generationcCs4tj||dd||g|j|jddd|d}|S)Nr)datar%dtyperGrH)rtensorr%rviewrepeat)r-rrrrr2r2r3rJs  z)RealCustomInferencePipeline._get_metadatac Cstdd|j}t||}|D]:}|d|}|d|t|}td|d||j|d}|j|d} td|d| d|dd|| f<q|S)NrGMzprompt before: z, prompt_include: zprompt_before_length: z, prompt_include_length: ) rzerosrBr%r rTr$rXZget_vaild_token_length) r-rrr{ positionspositionZ prompt_beforeZprompt_includeZprompt_before_lengthZprompt_include_lengthr2r2r3rZs  z:RealCustomInferencePipeline._find_phrase_positions_in_textN)rrrr) __name__ __module__ __qualname__r4r(rDr)r*r+r,rrrr2r2r2r3r&s6 $    < r)!r r;rrZtorchvision.utilsrZ!torchvision.transforms.functionalrPILrZ models.textrZ models.vaerZmodels.unet_2d_condition_customrr?Zschedulers.ddimrZschedulers.dpm_sr Zschedulers.utilsr Zinference_utilsr r r utilsrrZeinopsrrr2r2r2r3s&