U Od;@sddlZddlmZddlZddlmZddlmZmZm Z m Z m Z ddl Z ddl Z ddlmZmZGdddejZGdddeZGd d d ejZGd d d eZGd ddeZGdddeZGdddeZe de jZddZGdddeZedkreZeedddS)N) checkpoint) T5TokenizerT5EncoderModel CLIPTokenizer CLIPTextModel CLIPModel)default count_paramscs$eZdZfddZddZZS)AbstractEncodercstdSN)super__init__self __class__r/group/30042/chongmou/ft_local/Diffusion/iccv23/ft_local/hug_coadapter/T2I-Adapter/ldm/modules/encoders/modules.pyr szAbstractEncoder.__init__cOstdSr )NotImplementedError)rargskwargsrrrencodeszAbstractEncoder.encode)__name__ __module__ __qualname__r r __classcell__rrrrr s r c@seZdZddZdS)IdentityEncodercCs|Sr r)rxrrrrszIdentityEncoder.encodeN)rrrrrrrrrsrcs(eZdZdfdd Zd ddZZS) ClassEmbedderclasscs"t||_t|||_dSr )r r keynn Embedding embedding)r embed_dim n_classesr!rrrr s zClassEmbedder.__init__NcCs0|dkr|j}||dddf}||}|Sr )r!r$)rbatchr!crrrforward!s  zClassEmbedder.forward)rr )N)rrrr r)rrrrrrsrcs:eZdZdZdfdd Zdd Zd d Zd d ZZS)FrozenT5Embedderz(Uses the T5 transformer encoder for textgoogle/t5-v1_1-largecudaMTcs>tt||_t||_||_||_|r:| dSr ) r r rfrom_pretrained tokenizerr transformerdevice max_lengthfreeze)rversionr1r2r3rrrr ,s   zFrozenT5Embedder.__init__cCs$|j|_|D] }d|_qdSNFr0eval parameters requires_gradrparamrrrr35s  zFrozenT5Embedder.freezec Cs@|j|d|jddddd}|d|j}|j|d}|j}|S)NTFr2pt truncationr2 return_lengthreturn_overflowing_tokenspaddingreturn_tensors input_ids)rC)r/r2tor1r0last_hidden_statertextZbatch_encodingtokensoutputszrrrr);s zFrozenT5Embedder.forwardcCs||Sr rrrGrrrrDszFrozenT5Embedder.encode)r+r,r-T rrr__doc__r r3r)rrrrrrr**s   r*cs:eZdZdZdfdd Zd d Zd d Zd dZZS)FrozenCLIPEmbedderz=Uses the CLIP transformer encoder for text (from huggingface)openai/clip-vit-large-patch14r,r-TlastcsFtt||_t|j|_||_||_ |r<| ||_ dSr ) r r rr.r/r text_modelr0r1r2r3layer)rr4r1r2r3rRrrrr Js  zFrozenCLIPEmbedder.__init__cCs$|j|_|D] }d|_qdSr5r6r:rrrr3Us  zFrozenCLIPEmbedder.freezec Csj|j|d|jddddd}|d|j}|j||jdkd}|jd kr`|jd }|j|}n|j}|S) NTFr2r<r=rCrPrCoutput_hidden_states penultimate) r/r2rDr1r0rR hidden_statesfinal_layer_normrErFrrrr)Zs  zFrozenCLIPEmbedder.forwardcCs||Sr rrKrrrrgszFrozenCLIPEmbedder.encode)rOr,r-TrPrLrrrrrNHs  rNcs\eZdZdZddgZdfd d Zd d Zd dZddZde j dddZ ddZ Z S)FrozenOpenCLIPEmbedderz8 Uses the OpenCLIP transformer encoder for text rPrUViT-H-14laion2b_s32b_b79kr,r-Tc st||jksttj|td|d\}}}|`||_ ||_||_ |rV| ||_ |j dkrnd|_ n|j dkrd|_ ntdS)Ncpu)r1 pretrainedrPrrU)r r LAYERSAssertionError open_clipZcreate_model_and_transformstorchr1Zvisualmodelr2r3rR layer_idxr) rarchr4r1r2r3rRrc_rrrr ts   zFrozenOpenCLIPEmbedder.__init__cCs$|j|_|D] }d|_qdSr5)rcr7r8r9r:rrrr3s  zFrozenOpenCLIPEmbedder.freezecCs t|}|||j}|Sr )ratokenizeencode_with_transformerrDr1)rrGrHrJrrrr)s zFrozenOpenCLIPEmbedder.forwardcCsV|j|}||jj}|ddd}|j||jjd}|ddd}|j|}|S)Nr^r attn_mask)rcZtoken_embeddingpositional_embeddingpermutetext_transformer_forwardrkZln_final)rrGrrrrrhs   z.FrozenOpenCLIPEmbedder.encode_with_transformerN)rcCsft|jjjD]R\}}|t|jjj|jkr2qb|jjjrTtj sTt |||}q|||d}q|S)Nrj) enumeratercr0Z resblockslenrdZgrad_checkpointingrbjit is_scriptingr)rrrkirrrrrnsz/FrozenOpenCLIPEmbedder.text_transformer_forwardcCs||Sr rrKrrrrszFrozenOpenCLIPEmbedder.encode)rZr[r,r-TrP)N)rrrrMr_r r3r)rhrbTensorrnrrrrrrrYks  rYcs.eZdZd fdd ZddZd d ZZS) FrozenCLIPT5EncoderrOgoogle/t5-v1_1-xlr,r-c srtt|||d|_t|||d|_t|jjjdt |jddd|jjjdt |jddddS)N)r2z has gư>z.2fz M parameters, z comes with z M params.) r r rN clip_encoderr* t5_encoderprintrrr )rZ clip_versionZ t5_versionr1Zclip_max_lengthZ t5_max_lengthrrrr s zFrozenCLIPT5Encoder.__init__cCs||Sr rrKrrrrszFrozenCLIPT5Encoder.encodecCs |j|}|j|}||gSr )rxrry)rrGZclip_zZt5_zrrrr)s  zFrozenCLIPT5Encoder.forward)rOrwr,r-r-)rrrr rr)rrrrrrvs rvzT \\\(| \\\)| \\\[| \\]| \\\\| \\| \(| \[| :([+-]?[.\d]+)\)| \)| ]| [^\\()\[\]:]+| : c sgg}g}d}d}fdd}t|D]}|d}|d}|drd|ddd gq*|d kr||tq*|d kr|tq*|dk rt|dkr||t|q*|d krt|dkr|||q*|d krt|dkr|||q*|d gq*|D]}|||q|D]}|||q2tdkr\dd ggd} | dtkr̈| d| ddkr| d| dd7<| dn| d7} q`S)a Parses a string with attention tokens and returns a list of pairs: text and its associated weight. Accepted tokens are: (abc) - increases attention to abc by a multiplier of 1.1 (abc:3.12) - increases attention to abc by a multiplier of 3.12 [abc] - decreases attention to abc by a multiplier of 1.1 \( - literal character '(' \[ - literal character '[' \) - literal character ')' \] - literal character ']' \ - literal character '' anything else - just text >>> parse_prompt_attention('normal text') [['normal text', 1.0]] >>> parse_prompt_attention('an (important) word') [['an ', 1.0], ['important', 1.1], [' word', 1.0]] >>> parse_prompt_attention('(unbalanced') [['unbalanced', 1.1]] >>> parse_prompt_attention('\(literal\]') [['(literal]', 1.0]] >>> parse_prompt_attention('(unnecessary)(parens)') [['unnecessaryparens', 1.1]] >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') [['a ', 1.0], ['house', 1.5730000000000004], [' ', 1.1], ['on', 1.0], [' a ', 1.1], ['hill', 0.55], [', sun, ', 1.1], ['sky', 1.4641000000000006], ['.', 1.1]] g?g]tE?cs,t|tD]}|d|9<qdS)Nr^)rangerp)start_position multiplierpresrrmultiply_rangesz.parse_prompt_attention..multiply_rangerr^\N?([)]) re_attentionfinditergroup startswithappendrppopfloat) rGZround_bracketsZsquare_bracketsZround_bracket_multiplierZsquare_bracket_multiplierrmweightposrsrrrparse_prompt_attentionsF$       rcs^eZdZdfdd ZddZd d Zd d Zd dZddZddZ ddZ ddZ Z S)WebUIFrozenCLIPEmebedderrOr,TrUcshtt|t||_t|j|_||_ ||_ |r@| dd|j Dd|_d|_dS)NcSsg|]\}}|dkr|qS)z,r).0kvrrr -sz5WebUIFrozenCLIPEmebedder.__init__..r)r rr rr.r/rrQr0r1rRr3 get_vocabitems comma_tokencomma_padding_backtrack)rr4r1r3rRrrrr $s z!WebUIFrozenCLIPEmebedder.__init__cCs$|j|_|D] }d|_qdSr5r6r:rrrr30s  zWebUIFrozenCLIPEmebedder.freezecCs|j|dddd}|S)NF)r>add_special_tokensrC)r/)rtexts tokenizedrrrrg5sz!WebUIFrozenCLIPEmebedder.tokenizecCs@|j||jdkd}|jdkr6|jd}|j|}n|j}|S)NrPrSrUrV)r0rRrWrXrE)rrHrIrJrrrencode_with_transformers9s   z1WebUIFrozenCLIPEmebedder.encode_with_transformerscCst|}|dd|D}g}g}d}t||D]\}\}} d} | t|kr2|| } | |jkrlt|}n|jdkr,tt|dddkr,|dkr,t|||jkr,|d7}||d} ||d} |d|}t|}tt |dd|}||j j g|| 7}|d|dg|| }| | | | | d7} qDq2t|}t t|ddd}|t|}||j j g|}|dg|}|||fS)NcSsg|] \}}|qSrr)rrGrfrrrrHsz:WebUIFrozenCLIPEmebedder.tokenize_line..rr^Kr) rrgziprprrmaxintmathceilr/ eos_token_idr)rlineparsedr remade_tokens multipliers last_commarHrGrrstokenZ reloc_tokensZ reloc_multslengthrem token_countZprompt_target_length tokens_to_addrrr tokenize_lineDsZ          z&WebUIFrozenCLIPEmebedder.tokenize_linec Cstg}d}i}g}|D]T}||kr.||\}}n&||\}}} t| |}||f||<||||q|||fS)Nr)rrr) rrremade_batch_tokensrcachebatch_multipliersrrrZcurrent_token_countrrr process_textps    z%WebUIFrozenCLIPEmebedder.process_textcsfdd|D}dd|D}t|j}|}dd|D}t|j}|}|||jd|j9}|}|||9}|S)Ncs,g|]$}jjg|ddjjgqS)Nr)r/ bos_token_idrrrrrrrsz;WebUIFrozenCLIPEmebedder.process_tokens..cSs$g|]}dg|dddgqS)rNrrrrrrrscSs"g|]}|dgdt|qS)rr)rprrrrrs)r^) rbasarrayrDr1rmeanreshapeshapeexpand)rrrrHrJZ batch_multipliers_of_same_lengthZ original_meanZnew_meanrrrprocess_tokenss  z'WebUIFrozenCLIPEmebedder.process_tokensc Cs||\}}}d}d}ttt|dkrdd|D}dd|D}g} g} tt|D]f} t|| dkr| || dd| || ddqZ| |jjgd| dgdqZ|| | } |dkr| nt j || fdd}|}|}|d 7}q|S) NrcSsg|]}|ddqSrNrrrrrrsz4WebUIFrozenCLIPEmebedder.forward..cSsg|]}|ddqSrrrrrrrsrrrV)axisr^) rrmaprpr{rr/rrrbcat) rrGrrrrJrsZ rem_tokensZrem_multipliersrHrjz1rrrr)s(  z WebUIFrozenCLIPEmebedder.forwardcCs||Sr rrKrrrrszWebUIFrozenCLIPEmebedder.encode)rOr,TrU) rrrr r3rgrrrrr)rrrrrrr#s  ,r__main__T)verbose) rbtorch.nnr"rtorch.utils.checkpointr transformersrrrrrrareldm.utilrr Moduler rrr*rNrYrvcompileXrrrrrcrrrrs.  #>V