o 8Qg1@sddlZddlZddlZddlmZddlmZmZddlZddl Z ddl Z ddl m mZddlmZmZmZddlmZerGddlmZde jd efd d Zejd d de jfddZejd d dde jfddZd7ddZde jde jfddZ eGdddZ!ddddd d!ed"eed#e jd$ed%ed&e"dee!fd'd(Z#d)ee!d*e$d+e$fd,d-Z%d.d/d0d1ee&dd d!ed#e jd$ed2e$d3e$d4e"fd5d6Z'dS)8N) dataclass) TYPE_CHECKINGList) HOP_LENGTH SAMPLE_RATETOKENS_PER_SECOND) Tokenizer)Whisperx filter_widthc Cs|d}|jd|kr |S|j}dkr|ddddf}|dkr'|ddks+Jdd}tj||d|dddfdd}|jr_z dd lm}|||}Wnttj fy^t d Ynw|durt| d|d dd |df}|dkr||d }|S) zMApply a median filter of width `filter_width` along the last dimension of `x`Nrrz&`filter_width` should be an odd numberreflect)mode)median_filter_cudaz}Failed to launch Triton kernels, likely due to missing CUDA toolkit; falling back to a slower median kernel implementation....rr)shapendimFpadis_cuda triton_opsr RuntimeError subprocessCalledProcessErrorwarningswarnunfoldsort)r r pad_widthrresultrr"C/mnt/sfs-common/jkyang/EgoGPT/egogpt/model/speech_encoder/timing.py median_filters0  "r$T)nopythontracecCs|jdd}|jdd}d|dddf<d|dddf<g}|dks(|dkrj||d|df|||fdkrD|d8}|d8}n|||fdkrQ|d8}n|||fdkr^|d8}ntd|dks(|dks(t|}|dddddfjS)Nrrr zUnexpected trace[i, j]r)rappend ValueErrornparrayT)r&ijr!r"r"r# backtrace9s$    r.)r%parallelc Cs$|j\}}tj|d|dftjdtj}tj|d|dftjd }d|d<td|dD]]}td|dD]S}||d|df}||d|f}|||df} ||krc|| krc|d} } n||krq|| krq|d} } n| d} } ||d|df| |||f<| |||f<q9q0t|S)Nrdtyperrr )rr)onesfloat32infranger.) r NMcostr&r-r,c0c1c2ctr"r"r#dtw_cpuRs$ "    r>c Cs8ddlm}|j\}}||ksJd|tj|d|dftjdd||||||}|j }t ||d|dtj}d|d<| }t j|t jd}|d ||||d|d|d|||d |j d|d||d |d||d ddd|df}t|S) Nr) dtw_kernelz$M should be smaller than BLOCK_SIZE=r)valuer rr0)r) BLOCK_SIZE)rr@rrrr)r4flattenreshaper+ contiguoustorchr2cuda zeros_likeint32strider.cpunumpy)r rBr@r7r6x_skewr8r&r"r"r#dtw_cudals2  8 4 rOreturnc CsH|jrzt|WSttjfytdYnwt| S)NzsFailed to launch Triton kernels, likely due to missing CUDA toolkit; falling back to a slower DTW implementation...) rrOrrrrrr>doublerLrM)r r"r"r#dtws rRc@s:eZdZUeed<eeed<eed<eed<eed<dS) WordTimingwordtokensstartend probabilityN)__name__ __module__ __qualname__str__annotations__rintfloatr"r"r"r#rSs   rSg?) medfilt_widthqk_scalemodelr tokenizer text_tokensmel num_framesrarbc st|dkrgStg|j|j||j|j}dg|jj fddt |j j D}ddl m} tK| 8||d|dd} | t|jdd|jf} | jdd} | tt||fWdn1sywYWdn1swY|D]} | qtfdd|jjD}|ddddd|d f}||jdd}tj|d d d d \}}|||}t||}|jdd}|t|jd}t| \}}|||jg\}}t|dkrgStt dd|ddDd}tjt!|ddd"t#}||t$}||dd}||dd}fddt%|dd|ddD}ddt%|||||DS)Nrcs(g|]\}}|j|ffdd qS)cs||ddS)Nrr) __setitem__)_insoutsindexQKsr"r#z+find_alignment...) cross_attnregister_forward_hook).0r,blockrmr"r# s z"find_alignment..r) disable_sdpar)dimcsg|] \}}||qSr"r")rs_l_hrmr"r#rusr TF)rwkeepdimunbiased)axiscSsg|]}t|qSr")lenrsr=r"r"r#rurp)rr)constant_valuescs"g|] \}}t||qSr")r)mean)rsr,r-)text_token_probsr"r#ruscSs&g|]\}}}}}t|||||qSr")rS)rsrTrUrVrWrXr"r"r#rus )&r~rGtensor sot_sequence no_timestampseottodevicedims n_text_layer enumeratedecoderblocksrcrvno_grad unsqueezesoftmaxr)arangetolistremovestackalignment_headsindicesr+std_meanr$rrRsplit_to_word_tokensrcumsumdiffastypeboolrzip)rcrdrerfrgrarbrUhooksrvlogitssampled_logits token_probshookweightsstdrmatrix text_indices time_indiceswords word_tokensword_boundariesjumps jump_times start_times end_timesword_probabilitiesr")rnrr#find_alignmentsp            $   r alignment prependedappendedcCs t|d}t|d}|dkrD||}||}|jdr:|j|vr:|j|j|_|j|j|_d|_g|_n|}|d8}|dksd}d}|t|kr||}||}|jdsv|j|vrv|j|j|_|j|j|_d|_g|_n|}|d7}|t|ksNdSdS)Nr rr )r~rT startswithstriprUendswith)rrrr,r-previous followingr"r"r#merge_punctuationss4   ru "'“¿([{-u"'.。,,!!??::”)]}、)prepend_punctuationsappend_punctuationssegmentsrrlast_speech_timestampc st|dkrdSfdd|D} ttj| } t|| ||fi|} tdd| D} | | } t| dkr@t | nd} t dt | } | d}t| dkrd}t d t| D]3}| |j | |j|kr| |j|vr|| |j|| |_ q\| |d j|vr| |j || |_q\t| |||dd tt}d}t|| D]7\}} d}g}|t| kr|t| kr| |}|jr|t|jt||jdt||j d|jd |t|j7}|d 7}|t| kr|t| kst|dkr|dd || d krp|dd |dd|ks.t|d krp|d d |dd|dkrpt|d kra|d d |d d|krat|d d d|d d |}||dd <|d d<td|dd ||dd<|d|dd kr|dd|ddkrtdt |dd | |d|dd<n|dd|d<|d |ddkr|d d|dd krt|dd| |d |dd <n|dd |d <|d }||d<qdS)Nrcs"g|] }fdd|dDqS)csg|] }|jkr|qSr")r)rstokenrdr"r#ru'sz2add_word_timestamps...rUr")rssegmentrr"r#ru&sz'add_word_timestamps..cSsg|]}|j|jqSr")rWrVrr"r"r#ru-sggffffff?r u .。!!??rseek)rTrVrWrXrWrVg?rr)r~list itertoolschain from_iterablerr)r*nonzeromedianminr_r5rWrVrTrrrrr'dictroundrXrUmax)rrcrdrfrgrrrkwargstext_tokens_per_segmentrerword_durationsmedian_duration max_durationsentence_end_marksr, time_offset word_indexr saved_tokensrtimingboundaryr"rr#add_word_timestampss      "" r)r?)(rrr dataclassesrtypingrrnumbarMr)rGtorch.nn.functionalnn functionalraudiorrrrdr rcr Tensorr^r$jitndarrayr.r>rOrRrSr_rr\rrrr"r"r"r#sz    &  !  R)