o HQg@sddlZddlZddlmmZddlmZddlm Z ddl m Z ddl m Z ddlmZmZmZddlZddlmZdZGd d d ejjZdS) N) trunc_normal_) MaskDecoder) PromptEncoder)TwoWayTransformer)get_1d_sine_peMLPselect_closest_cond_frames)FlopCountAnalysisgcs&eZdZ                              d4d ed ed edededededeffdd ZeddZddZddZ d5ddZ ddZ de j fd d!Z d"d#Z  $ $ %d6d&d'Zd(d)Z $ $ %d7d*d+Zd,d-Z  $ $ %d8d.d/Zd0d1Zd2d3ZZS)9SAM2Base?FTNuse_multimask_token_for_obj_ptrpred_obj_scorespred_obj_scores_mlpfixed_no_obj_ptrsoft_no_obj_ptruse_mlp_for_obj_ptr_projno_obj_embed_spatialcompile_image_encoderc$$srt||_| |_| rdnd|_||_||_|r%tjj ddddd|_ ||_ |r.|s.J||_ ||_ ||_||_|jj|_||_|j|_t|jdr\t|jjdr\|jjjjd|_||_tjt|dd|j|_t|jdd tjtdd|j|_tjtdd|j|_t|jdd t|jdd | |_ ||_!||_"| |_#||_$||_%| |_&||_'||_(||_)||_*||_+||_,||_-||_.|"|_/||_0||_1||_2||_3|j2r|j0sJ|jsJ|j0r|jrtjtd|j|_4t|j4dd | |_5d|_6|!rtjtd|j|_6t|j6dd |7| |_8|#r7t9d tj:|jj;d d d d|j_;dSdS)Nr) kernel_sizestrideout_projweightrg{Gz?)stdzFImage encoder compilation is enabled. First forward pass will be slow.z max-autotuneTF)mode fullgraphdynamic)<super__init__ image_encoderuse_high_res_features_in_samnum_feature_levelsuse_obj_ptrs_in_encodermax_obj_ptrs_in_encodertorchnnConv2dmask_downsampleadd_tpos_enc_to_obj_ptrsproj_tpos_enc_in_obj_ptrsuse_signed_tpos_enc_to_obj_ptrs"only_obj_ptrs_in_the_past_for_evalmemory_attentionZneckd_model hidden_dimmemory_encodermem_dimhasattrrrshape num_maskmem Parameterzerosmaskmem_tpos_encr no_mem_embedno_mem_pos_encdirectly_add_no_mem_embedsigmoid_scale_for_mem_encsigmoid_bias_for_mem_enc"binarize_mask_from_pts_for_mem_encnon_overlap_masks_for_mem_encmemory_temporal_stride_for_eval$use_mask_input_as_output_without_sammultimask_output_in_sammultimask_min_pt_nummultimask_max_pt_nummultimask_output_for_trackingriou_prediction_use_sigmoid image_sizebackbone_stridesam_mask_decoder_extra_argsrrrr no_obj_ptrrr_build_sam_headsmax_cond_frames_in_attnprintcompileforward)$selfr&r3r6r:rLrMrArBrCrFrQr@r'rGrHrIrJrrKrErDr)r*r/r0r1r2rrrrrrrNr __class__K/mnt/petrelfs/dingshuangrui/SAM2-Video-Predictor/sam2/modeling/sam2_base.pyr%s J    zSAM2Base.__init__cCst|jS)N)next parametersdevicerUrXrXrYr\szSAM2Base.devicecOstd)NzPlease use the corresponding methods in SAM2VideoPredictor for inference or SAM2Train for training/fine-tuningSee notebooks/video_predictor_example.ipynb for an inference example.)NotImplementedError)rUargskwargsrXrXrYrTszSAM2Base.forwardc Cs|j|_|j|j|_t|j|j|jf|j|jfdd|_td dtd|jddd|jdd|j |j |j |j |j d |jp z7SAM2Base._prepare_backbone_features..cS g|] }|ddddqSrbrrrpermuterrXrXrYrrcSrrrrrXrXrYrr)copyrr()rUrZ feature_mapsZvision_pos_embeds feat_sizesZ vision_featsrXrXrY_prepare_backbone_featuress z#SAM2Base._prepare_backbone_featuresr皙?c 8 s|dd} |j} |d\}}|dj}|jdkr+|dddd| | ||}|Sd}r1dnd}|sgg}}t|ddksEJ|d}t||j\}}dd| D}|j r`dn|j }t ||j }t|| djd}| dks~| dkrg}nRg}td| dD]:}|d |d d | |f}|d |d d | |f} | | kr|dkr|d|t||dkrnqd|vr|d| g}!td|jD]4}"|"|j}#|#t| krq|d ||#d }$|$d ur|||#d }$||"|$f|!||#qt||| djtjdg}%t||!D]r\\}"}&}'|&d ur5q(|"dkr_| dkr_|%|&d d | |'fd|&dd | |'fj|dd}(n |&dj|dd}(||(dddd|&dd|})|)dddd})|)|j|j|"d})||)q(|jrt ||j }|j s|jrfdd|D}*n|}*fdd|*D}+t||| djtjdg},td|D]G}-|- t| krn;|d ||- |||- d }$|$d ur$| ||- }.|,|$d d |.fd|+|-|$dd |.ffqt|+dkrt|+\}/}0tj|0dd}1|j rk|d}2|j!rH| n|j"}3tj#|/|d}4t$|4|2|3d}4|%|4}4|4&d'd| |j"}4n |1(t|/| |j"}4|j"| kr|1)d| | |j"|j"}1|1dddddd}1|4j*| |j"dd}4||1||4|1jd}n1d}n.|j+r|d|j,}5|5ddd| | ||}5|5S|j,'d| |j"g}|j-'d| |j"g}tj.|dd}6tj.|dd}7|j/|||6|7||%|,d}5|5ddd| | ||}5|5S)zAFuse the current frame's visual feature map with previous memory.rrrrbcond_frame_outputscSsg|]}d|fqS)rrX)routrXrXrYr%szASAM2Base._prepare_memory_conditioned_features..rnon_cond_frame_outputsr.rN maskmem_featuresT) non_blockingmaskmem_pos_enccs,i|]\}}r |krn|kr||qSrXrXrtr frame_idxtrack_in_reverserXrY eszASAM2Base._prepare_memory_conditioned_features..cs$g|]\}}t||dfqS)r)absr)rrXrYrlsrrrr)currcurr_posmemoryZ memory_posnum_obj_ptr_tokensZobject_frame_scoresZobject_ptr_scores)0rwr5r\r:rviewrrrQvaluestrainingrEminr*intr9rangeiteminsertappendgetr+rtobfloat16ziprr=r)r2itemsstackr/r0r7tensorrrprexpand new_zerosreshaperepeat_interleaver@r>r?catr3)8rUris_init_cond_framecurrent_vision_featscurrent_vision_pos_embedsr output_dict num_framesrmem_pick_indexstart_frame_idxiou_threrCHWr\pix_featrZ tpos_sign_mulZ to_cat_memoryZto_cat_memory_pos_embedZ cond_outputsZselected_cond_outputsZunselected_cond_outputsZt_pos_and_prevsrr*Z num_objectZ valid_indicesi object_scoreZiouZ prev_idxsZt_posidxrZobject_frame_scoreprevprev_idxZfeatsZ maskmem_encZptr_cond_outputsZ pos_and_ptrsZobject_ptr_scoreZt_diffZmem_idxZpos_listZ ptrs_listrZ t_diff_maxZtpos_dimZobj_posZpix_feat_with_memrZmemory_pos_embedrXrrY$_prepare_memory_conditioned_featuress          $      $        z-SAM2Base._prepare_memory_conditioned_featurescCs|dd}|j}|d\}} |dddd|||| } |jr*|js*||}|jo.|} | r;|js;|dk} nt |} |j dkrJ| |j } |j dkrT| |j } |j | | dd} | d }| d }|jd ur|dk}|d|d |jd j|j7}||fS) zBEncode the current image and its prediction into a memory feature.rrrbrrrT)Zskip_mask_sigmoidZvision_featuresrN).NN)rwr5rrrDr"_apply_non_overlapping_constraintsrCrr+rrArBr6rrr9)rUrrpred_masks_high_resris_mask_from_ptsrrrrrZbinarizeZ mask_for_memZ maskmem_outrrrrXrXrY_encode_new_memorys<              zSAM2Base._encode_new_memoryc Cs||d}t|dkrddt|dd|ddD}nd}|durE|jrE|dddd}|jd|jg|dR}||||}n<|j|||dd|dd|dd|| | | | |d }| durq|durm|dusoJ| }|||}|j |||||d }||||fS) N)rrrcSs:g|]\}}|dddj|d|dg|RqS)rrbr)rrrw)rrsrXrXrYrs(z(SAM2Base._track_step..rrbr) rrrrrrrrrrr)rrrr~r}) rrrFrrr5rr_use_multimaskr)rUrrrrrrrrrrprev_sam_mask_logitsrrr current_outr~r sam_outputsr}rXrXrY _track_stepsJ       zSAM2Base._track_stepc CsV|r!|jdkr!|}|j|||||dud\} } | |d<| |d<dSd|d<d|d<dS)Nr)rrrrrrr)r:r) rUrrrrun_mem_encoderrrrZhigh_res_masks_for_mem_encrrrXrXrY_encode_memory_in_output-s    z!SAM2Base._encode_memory_in_outputcCs|||||||||| | | | ||\}}}}|\}}}}}}}}| dkrD||d<|dd|d<|dddf|d<||d<||d<n||d<||d<|dddf|d<||d<||d<|jse||d<|S) Nr pred_masksrrrrrr)rmaxr)rUrrrrrrrrrrrrrrrrrrrrrrrrrrrXrXrY track_stepFsP  zSAM2Base.track_stepcCsN|durdn|dd}|jo$|p|jo$|j|ko"|jk}|S}|S)z0Whether to use multimask output in the SAM head.Nrrqr)rwrGrJrHrI)rUrrZnum_ptsr}rXrXrYrszSAM2Base._use_multimaskcCsn|d}|dkr |S|j}tj|ddd}tj||ddddddf}||k}t||tj|dd}|S) z Apply non-overlapping constraints to the object scores in pred_masks. Here we keep only the highest scoring object at each spatial location in pred_masks. rrT)rkeepdimrrNr)r)rwr\r+rrrclamp)rUr batch_sizer\Z max_obj_indsZbatch_obj_indskeeprXrXrYrs z+SAM2Base._apply_non_overlapping_constraints) r r r rrFFrFFFrrFFFrFFr TFFFFFFFFFNF)NNNF)Frrr)rrr)FTNrrr)__name__ __module__ __qualname__boolr%propertyr\rTrPrrr+Tensorrrrrrrrrr __classcell__rXrXrVrYr s 9;?BCEH. 5 +5 :@ C$ M r )r+torch.distributedZtorch.nn.functionalr, functionalrZ torch.nn.initrZsam2.modeling.sam.mask_decoderrZ sam2.modeling.sam.prompt_encoderrZsam2.modeling.sam.transformerrZsam2.modeling.sam2_utilsrrrpdbZ fvcore.nnr rModuler rXrXrXrYs