import torch
import math
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops


class T5LayerNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None, operations=None):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, dtype=dtype, device=device))
        self.variance_epsilon = eps

    def forward(self, x):
        # T5 uses RMSNorm: no mean subtraction and no bias, only variance scaling.
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        return comfy.ops.cast_to_input(self.weight, x) * x


activations = {
    "gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"),
    "relu": torch.nn.functional.relu,
}


class T5DenseActDense(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations):
        super().__init__()
        self.wi = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
        self.act = activations[ff_activation]

    def forward(self, x):
        x = self.act(self.wi(x))
        x = self.wo(x)
        return x


class T5DenseGatedActDense(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations):
        super().__init__()
        self.wi_0 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wi_1 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
        self.act = activations[ff_activation]

    def forward(self, x):
        # Gated feed-forward: activation branch multiplied by a linear branch.
        hidden_gelu = self.act(self.wi_0(x))
        hidden_linear = self.wi_1(x)
        x = hidden_gelu * hidden_linear
        x = self.wo(x)
        return x


class T5LayerFF(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, ff_activation, gated_act, dtype, device, operations):
        super().__init__()
        if gated_act:
            self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, ff_activation, dtype, device, operations)
        else:
            self.DenseReluDense = T5DenseActDense(model_dim, ff_dim, ff_activation, dtype, device, operations)
        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)

    def forward(self, x):
        # Pre-norm residual block.
        forwarded_states = self.layer_norm(x)
        forwarded_states = self.DenseReluDense(forwarded_states)
        x += forwarded_states
        return x


class T5Attention(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        # Mesh TensorFlow initialization to avoid scaling before softmax.
        self.q = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.k = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.v = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.o = operations.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device)
        self.num_heads = num_heads

        self.relative_attention_bias = None
        if relative_attention_bias:
            self.relative_attention_num_buckets = 32
            self.relative_attention_max_distance = 128
            self.relative_attention_bias = operations.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device, dtype=dtype)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same
        bucket. This should allow for more graceful generalization to longer sequences than the model has been
        trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device, dtype):
        """Compute binned relative position bias"""
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=True,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket, out_dtype=dtype)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        if self.relative_attention_bias is not None:
            past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device, x.dtype)

        if past_bias is not None:
            if mask is not None:
                mask = mask + past_bias
            else:
                mask = past_bias

        # T5 does not scale attention scores by 1/sqrt(head_dim); pre-scaling k
        # by sqrt(head_dim) cancels the scaling applied inside the shared kernel.
        out = optimized_attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask)
        return self.o(out), past_bias


class T5LayerSelfAttention(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations)
        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        normed_hidden_states = self.layer_norm(x)
        output, past_bias = self.SelfAttention(normed_hidden_states, mask=mask, past_bias=past_bias, optimized_attention=optimized_attention)
        x += output
        return x, past_bias


class T5Block(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        self.layer = torch.nn.ModuleList()
        self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations))
        self.layer.append(T5LayerFF(model_dim, ff_dim, ff_activation, gated_act, dtype, device, operations))

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        x, past_bias = self.layer[0](x, mask, past_bias, optimized_attention)
        x = self.layer[-1](x)
        return x, past_bias


class T5Stack(torch.nn.Module):
    def __init__(self, num_layers, model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention, dtype, device, operations):
        super().__init__()

        # When relative_attention is True (regular T5) only the first block owns a
        # relative attention bias table; later blocks reuse its bias via past_bias.
        # When it is False (umt5) every block gets its own table.
        self.block = torch.nn.ModuleList(
            [T5Block(model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention_bias=((not relative_attention) or (i == 0)), dtype=dtype, device=device, operations=operations) for i in range(num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)

    def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
        mask = None
        if attention_mask is not None:
            # Convert the 0/1 attention mask into an additive -inf mask broadcast
            # over the sequence dimension.
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

        intermediate = None
        optimized_attention = optimized_attention_for_device(x.device, mask=attention_mask is not None, small_input=True)
        past_bias = None
        for i, l in enumerate(self.block):
            x, past_bias = l(x, mask, past_bias, optimized_attention)
            if i == intermediate_output:
                intermediate = x.clone()
        x = self.final_layer_norm(x)
        if intermediate is not None and final_layer_norm_intermediate:
            intermediate = self.final_layer_norm(intermediate)
        return x, intermediate


class T5(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.num_layers = config_dict["num_layers"]
        model_dim = config_dict["d_model"]

        self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["is_gated_act"], config_dict["num_heads"], config_dict["model_type"] != "umt5", dtype, device, operations)
        self.dtype = dtype
        self.shared = operations.Embedding(config_dict["vocab_size"], model_dim, device=device, dtype=dtype)

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, embeddings):
        self.shared = embeddings

    def forward(self, input_ids, *args, **kwargs):
        x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
        if self.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
            x = torch.nan_to_num(x)  # Fix for fp8 T5 base
        return self.encoder(x, *args, **kwargs)
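

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch, not part of the upstream module. It assumes
# ComfyUI's comfy.ops.manual_cast as the `operations` namespace; the config
# values below are hypothetical toy numbers, not a real T5 checkpoint config
# (real configs come from the text-encoder loaders).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Relative-position bucketing sanity check: small offsets get their own
    # buckets, and positive offsets land in the upper half of the bucket range.
    offsets = torch.arange(-4, 5)[None, :]
    print(T5Attention._relative_position_bucket(offsets, bidirectional=True))

    tiny_config = {  # hypothetical toy values
        "num_layers": 2,
        "d_model": 64,
        "d_ff": 128,
        "dense_act_fn": "gelu_pytorch_tanh",
        "is_gated_act": True,
        "num_heads": 4,
        "model_type": "t5",
        "vocab_size": 32128,
    }
    model = T5(tiny_config, dtype=torch.float32, device="cpu", operations=comfy.ops.manual_cast)
    tokens = torch.randint(0, tiny_config["vocab_size"], (1, 8))
    out, intermediate = model(tokens, attention_mask=torch.ones(1, 8, dtype=torch.long))
    print(out.shape)  # expected: torch.Size([1, 8, 64])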