import sys
import contextlib
from functools import lru_cache

import torch
from modules import errors, shared, npu_specific

if sys.platform == "darwin":
    from modules import mac_specific

if shared.cmd_opts.use_ipex:
    from modules import xpu_specific


def has_xpu() -> bool:
    return shared.cmd_opts.use_ipex and xpu_specific.has_xpu


def has_mps() -> bool:
    if sys.platform != "darwin":
        return False
    else:
        return mac_specific.has_mps


def cuda_no_autocast(device_id=None) -> bool:
    if device_id is None:
        device_id = get_cuda_device_id()
    return (
        torch.cuda.get_device_capability(device_id) == (7, 5)
        and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
    )


def get_cuda_device_id():
    return (
        int(shared.cmd_opts.device_id)
        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit()
        else 0
    ) or torch.cuda.current_device()


def get_cuda_device_string():
    if shared.cmd_opts.device_id is not None:
        return f"cuda:{shared.cmd_opts.device_id}"

    return "cuda"


def get_optimal_device_name():
    if torch.cuda.is_available():
        return get_cuda_device_string()

    if has_mps():
        return "mps"

    if has_xpu():
        return xpu_specific.get_xpu_device_string()

    if npu_specific.has_npu:
        return npu_specific.get_npu_device_string()

    return "cpu"


def get_optimal_device():
    return torch.device(get_optimal_device_name())


def get_device_for(task):
    if task in shared.cmd_opts.use_cpu or "all" in shared.cmd_opts.use_cpu:
        return cpu

    return get_optimal_device()


def torch_gc():
    if torch.cuda.is_available():
        with torch.cuda.device(get_cuda_device_string()):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

    if has_mps():
        mac_specific.torch_mps_gc()

    if has_xpu():
        xpu_specific.torch_xpu_gc()

    if npu_specific.has_npu:
        torch_npu_set_device()
        npu_specific.torch_npu_gc()


def torch_npu_set_device():
    # workaround for a torch_npu issue: explicitly re-select the NPU device before collecting
    if npu_specific.has_npu:
        torch.npu.set_device(0)


def enable_tf32():
    if torch.cuda.is_available():

        # enabling the benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
        if cuda_no_autocast():
            torch.backends.cudnn.benchmark = True

        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True


errors.run(enable_tf32, "Enabling TF32")

cpu: torch.device = torch.device("cpu")
fp8: bool = False
device: torch.device = None
device_interrogate: torch.device = None
device_gfpgan: torch.device = None
device_esrgan: torch.device = None
device_codeformer: torch.device = None
dtype: torch.dtype = torch.float16
dtype_vae: torch.dtype = torch.float16
dtype_unet: torch.dtype = torch.float16
dtype_inference: torch.dtype = torch.float16
unet_needs_upcast = False


def cond_cast_unet(input):
    return input.to(dtype_unet) if unet_needs_upcast else input


def cond_cast_float(input):
    return input.float() if unet_needs_upcast else input


nv_rng = None
patch_module_list = [
    torch.nn.Linear,
    torch.nn.Conv2d,
    torch.nn.MultiheadAttention,
    torch.nn.GroupNorm,
    torch.nn.LayerNorm,
]


def manual_cast_forward(target_dtype):
    def forward_wrapper(self, *args, **kwargs):
        if any(
            isinstance(arg, torch.Tensor) and arg.dtype != target_dtype
            for arg in args
        ):
            args = [arg.to(target_dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
            kwargs = {k: v.to(target_dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}

        org_dtype = target_dtype
        for param in self.parameters():
            if param.dtype != target_dtype:
                org_dtype = param.dtype
                break

        if org_dtype != target_dtype:
            self.to(target_dtype)
        result = self.org_forward(*args, **kwargs)
        if org_dtype != target_dtype:
            self.to(org_dtype)

        if target_dtype != dtype_inference:
            if isinstance(result, tuple):
                result = tuple(
                    i.to(dtype_inference)
                    if isinstance(i, torch.Tensor)
                    else i
                    for i in result
                )
            elif isinstance(result, torch.Tensor):
                result = result.to(dtype_inference)
        return result
    return forward_wrapper


@contextlib.contextmanager
def manual_cast(target_dtype):
    applied = False
    for module_type in patch_module_list:
        if hasattr(module_type, "org_forward"):
            continue
        applied = True
        org_forward = module_type.forward
        if module_type == torch.nn.MultiheadAttention:
            module_type.forward = manual_cast_forward(torch.float32)
        else:
            module_type.forward = manual_cast_forward(target_dtype)
        module_type.org_forward = org_forward
    try:
        yield None
    finally:
        if applied:
            for module_type in patch_module_list:
                if hasattr(module_type, "org_forward"):
                    module_type.forward = module_type.org_forward
                    delattr(module_type, "org_forward")


def autocast(disable=False):
    if disable:
        return contextlib.nullcontext()

    if fp8 and device == cpu:
        return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)

    if fp8 and dtype_inference == torch.float32:
        return manual_cast(dtype)

    if dtype == torch.float32 or dtype_inference == torch.float32:
        return contextlib.nullcontext()

    if has_xpu() or has_mps() or cuda_no_autocast():
        return manual_cast(dtype)

    return torch.autocast("cuda")


def without_autocast(disable=False):
    return torch.autocast("cuda", enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()


class NansException(Exception):
    pass
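
# Illustrative sketch, not part of the original module: manual_cast() works by
# swapping the class-level .forward of every layer type in patch_module_list for
# a wrapper that casts tensor inputs (and, temporarily, the module's weights) to
# the target dtype, then restores the original forwards on context exit;
# MultiheadAttention is pinned to float32. The layer and input below are
# hypothetical stand-ins, and fp16 math must be supported on the current device.
def _manual_cast_example():
    layer = torch.nn.Linear(4, 4)   # created with float32 weights
    x = torch.ones(1, 4)            # float32 input
    with manual_cast(torch.float16):
        y = layer(x)                # wrapper casts x and the weights to float16
    assert not hasattr(torch.nn.Linear, "org_forward")  # patch is undone on exit
    return y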
def test_for_nans(x, where):
    if shared.cmd_opts.disable_nan_check:
        return

    if not torch.all(torch.isnan(x)).item():
        return

    if where == "unet":
        message = "A tensor with all NaNs was produced in Unet."

        if not shared.cmd_opts.no_half:
            message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."

    elif where == "vae":
        message = "A tensor with all NaNs was produced in VAE."

        if not shared.cmd_opts.no_half and not shared.cmd_opts.no_half_vae:
            message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."
    else:
        message = "A tensor with all NaNs was produced."

    message += " Use --disable-nan-check commandline argument to disable this check."

    raise NansException(message)


@lru_cache
def first_time_calculation():
    """
    just do any calculation with pytorch layers - the first time this is done it
    allocates about 700MB of memory and spends about 2.7 seconds doing that,
    at least with NVidia.
    """

    x = torch.zeros((1, 1)).to(device, dtype)
    linear = torch.nn.Linear(1, 1).to(device, dtype)
    linear(x)

    x = torch.zeros((1, 1, 3, 3)).to(device, dtype)
    conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
    conv2d(x)
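
# Illustrative sketch, not part of the original module: how a caller would
# typically combine these helpers - warm up once, run the model under
# autocast(), then check the output for NaNs. `model` and its call signature
# are hypothetical placeholders, and this assumes the webui startup code has
# already set the module-level `device` and `dtype`.
def _inference_example(model, x):
    first_time_calculation()
    with autocast():
        out = model(x)
    test_for_nans(out, "unet")
    torch_gc()
    return out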