a ^ftC@sxddlZddlZddlZddlmZddlmZddlmZddl m Z ddl m Z m Z ddlmZGddde ZdS) N)autocast)registry) BaseModel)StoppingCriteriaStoppingCriteriaList)StoppingCriteriaSubcseZdZdZdddddddd d dd dddd d gddffdd ZddZddZd,ddZddZddZ ddZ d-d d!Z d"d#Z e d$d%d$d&d$d$d$dd'gf d(d)Ze d.d*d+ZZS)/ MiniGPTBasez1 Base class for MiniGPT-4 and MiniGPT-v2 eva_clip_grFfp16T i q_projv_projg?c sht|j|| | ||||d\|_|_|||||||\|_|_||_| |_ | |_ | |_ g|_ dS)N)llama_model_path low_resourcelow_res_devicelora_rlora_target_modules lora_alpha lora_dropout) super__init__init_llm llama_modelllama_tokenizerinit_vision_encodervisual_encoder ln_vision max_txt_lenmax_context_lenend_symprompt_template prompt_list)selfZ vit_modelimg_sizedrop_path_rateuse_grad_checkpointZ vit_precisionZ freeze_vitrr!r"r$r#r device_8bitrrrr __class__s   zMiniGPTBase.vit_to_cpucs|dj|d}t|t|dks0Jdfddt|D}fdd|D}ddt|dd |D|d g}tj|dd }|S) Nr z3Unmatched numbers of image placeholders and images.cs,g|]$\}}j|d|dkdjqS)ptrreturn_tensorsadd_special_tokens)rr0 input_ids).0isegdevicer&r-r. Hs  z/MiniGPTBase.get_context_emb..csg|]}|qSr-) embed_tokens)r;Zseg_tr2r-r.r@McSsg|]}|D]}|q qSr-r-)r;pairembr-r-r.r@OrBdim)r?splitlen enumerateziptorchcat)r&promptimg_listZ prompt_segsZ seg_tokensZseg_embsZ mixed_embsr-r>r.get_context_embDs   &zMiniGPTBase.get_context_embNc Cs|dust|dkr||fS|dur`d|j_|j|dddd|j}||j}|j}||fSg}t|t r||gt|}t t ||D]\} \} } | j d} |dur| d| j d} | d|| | } | d } g}t | ddD]d\} }|j|ddd |j}||j}|tj|| ddd| | | d | fgd d qtj|d d }|j| dddd |j}||j}tj||gd d }||qd d|D}|tj|jj|jd}t||jkrt|n|j}|t||d}tjt||gtj|jd}t |D]X\}}|||jkrH||n|j}|ddd|f||d|f<d ||d|f<q(||fSdS)Nrrightr6longestF)r8paddingr9rEr4r7r5rFcSsg|]}|jdqSr5shaper;rDr-r-r.r@|rBz+MiniGPTBase.prompt_wrap..)r?dtyper?)rIr padding_sider0r?rAr:attention_mask isinstancestrrJrKrWreshaperHappendrLrMtensor pad_token_idmaxr"expandclonezerosint)r& img_embedsatts_imgpromptslengths prompt_tokensZ prompt_embedsZ atts_promptZ emb_listsidxZeach_img_embedZ each_promptpnZp_segsZinterleave_embr=Zp_tokensZp_embedZ wrapped_embZemb_lensZpad_emb max_lengthZ wrapped_embsZ wrapped_attsr<rDlengthr-r-r. prompt_wrapSsh     :    zMiniGPTBase.prompt_wrapc Csg}g}g}t|dD]~}||} || |t||d| ||||| dg|t||d| ||||| dgqt|}t|}|||fS)z Concatenate the batched input embedding and batched output embedding together. Both the input and the output embedding should be right padded. rN)rangesizesumr`rLrMstack) r&Z input_embsZ input_attsZ output_embsZ output_atts input_lensZcat_embsZcat_attsr<Z input_lenr-r-r.concat_emb_input_outputs.    z#MiniGPTBase.concat_emb_input_outputcsg}g}t|}t|D]}||||}}fdd|ddD}fdd|D}g} g} tt|D]N} | || j| || j| || j| t|| jdqn| |dj| |djtj| dd} tj| dd} || || qttd d|Dj } tj || g| j j d j j} tj || g| j j d d}t|D]T}||jd}||d d| f| |d|f<||d d| f||d|f<qn| j jktj}| ||fS) zVconcatenate conversation and make sure the model is only trained to regress the answercs,g|]$}jjj|dddjqSr6Fr7)r bos_tokenr0r?r;qr2r-r.r@sz5MiniGPTBase.tokenize_conversation..r5Ncs*g|]"}j|jdddjqSrx)rr#r0r?r;ar2r-r.r@s rErFcSsg|]}|jdqSrUrV)r;targetr-r-r.r@rBrYr)rIrrr`r:rL ones_likerMminrcr!onesrZr?rrbrWr0rg)r&conv_qconv_aZto_regress_token_ids_listZ targets_list batch_size batch_idx questionsanswersZcur_idZ cur_targetr<max_lenZto_regress_token_idstargetscur_lenZto_regress_token_attnr-r2r.tokenize_conversationsR          $z!MiniGPTBase.tokenize_conversationcsd|vr|d\}}nd}}d|vr|d|d}}|ddfdd|D}fdd|D}fd d|D}||d d|D\}}||\}} } nd |vr|d } njrtj} nd} td rjrfd d| D} d|vrJ|j\} } }| t |dd| |}||| |d\}}n||| \}}dj _ fdd|dD}j |dddj ddj}|j}|j} ||j jkd} |}|||| | fS)Nimagerr connect_symrcsg|]}|qSr-rHrzrr-r.r@rBz3MiniGPTBase.preparing_embedding..csg|]}|qSr-rr|rr-r.r@rBcsg|]}fdd|DqS)csg|]}j|qSr-r$format)r;itemr2r-r.r@rBz>MiniGPTBase.preparing_embedding...r-)r;itemsr2r-r.r@rBcSsg|] }|dqS)rr-rzr-r-r.r@rBinstruction_input chat_templatecsg|]}j|qSr-r)r;Zinstructr2r-r.r@rBrprErQcsg|]}|jqSr-)r#)r;tr2r-r.r@rBanswerr6rRTF)r8rS truncationror9r~) encode_imgrqrr%randomchoicehasattrrrWr_rIrr[r!r0r?r:r\ masked_fillrbrA)r&samplesrhZimg_attsrr cond_embeds cond_attsZregress_token_ids regress_atts part_targets instructionbszrnhstextZregress_tokensregress_embedsr-)rr&r.preparing_embeddingsT       zMiniGPTBase.preparing_embeddingmeancCsB||\}}}}}|||||\}} } t|ddddf|jj} || } |ddddf} tj| |gdd}tj| | gdd} tj|j d|j dgtj d |j  d}t|D]0\}}|||| |d| |t|df<q|$|j|| d||d}Wdn1s*0Y|j}d|iS) Nr5rFr)rZr~T) inputs_embedsr\ return_dictlabels reductionloss)rrwrLrr bos_token_idrArMrrWlongr0r?fill_rJrImaybe_autocastrr)r&rrrrrrrrr\rvbosZ bos_embedsZbos_attsrr<routputsrr-r-r.forwards8 " * &zMiniGPTBase.forwardcCs4t|jjdr"|jjjj|}n|jj|}|S)Nmodel)rr base_modelrrA)r& token_idsZembedsr-r-r.rA6szMiniGPTBase.embed_tokensr5g?c  sttfdd| Ddg} |j\} }dd| D}fddt||D}t|}tdd|D}|djd}|dj }|dj}t j |||g||d }t j ||gt j |d }t |D]:\}}|jd }|d||| d f<d ||| d f<qʈ0jj|||||| | |||d }Wd n1sF0Yg}|D]f}|ddkrv|d d }jj|d d}|dd}|dd}|dd}||qX|S)z4 function for generate test use cs g|]}t|gjqSr-)rLrar0r?)r;r<r2r-r.r@QrBz(MiniGPTBase.generate..)stopscSsg|]}|dgqS)Nr-)r;Z image_embr-r-r.r@TrBcsg|]\}}||qSr-)rP)r;rrOr2r-r.r@VrBcSsg|]}|jdqSrUrVrXr-r-r.r@YrBrrrYr5N) rr\max_new_tokens num_beamslength_penalty temperature do_sample min_lengthtop_prepetition_penaltyT)skip_special_tokenszzr z[/INST]rE)rrrr0r?rKrIrcrWrZrLrfrgrJrrgeneraterdecoderHreplacestripr`)r&imagestextsrrrrrrrrstop_words_idsstopping_criteriarhriZ image_listsZ batch_embsrrZemb_dimrZr?Zembs attn_maskr<rDZemb_lenrrZ output_tokenZ output_textsr-r2r.r=sP     &  zMiniGPTBase.generatec Csg}|D]>}|||d}|j|ddddd}||tjqtj|dd}|durt|jdD]} d || || df<qltj |dd} | S) N)rrrnone)rrrEr5rFri') rr_r`rLcuda empty_cacherMrrrWargsorttolist) r&rrrZnum_candZ all_lossesrZchoice_samplesrr<Zoutput_class_ranksr-r-r. multi_selects  zMiniGPTBase.multi_select)N)r)N)__name__ __module__ __qualname____doc__rr3rPrqrwrrrrArLno_gradrr __classcell__r-r-r+r.rsP+ 6-> % Kr)loggingrrLZtorch.cuda.amprtorch.nnnnminigpt4.common.registryrminigpt4.models.base_modelr transformersrr"minigpt4.conversation.conversationrrr-r-r-r.s