a h@s~ddlZddlZddlZddlZddlZddlZddlmZddl m Z ddl m Z ddl mZmZddlZejddejdeeZGd d d eZGd d d eZGd dde ZGdddeZGdddeZGddde ZGdddeZGdddeZGdddeZGdddeZ GdddeZ!Gdd d eZ"Gd!d"d"eZ#Gd#d$d$eZ$d*d&d'Z%d(d)Z&dS)+N)deepcopy)Dataset)ByteLevelBPETokenizer) T5TokenizerRobertaTokenizerz6%(asctime)s - %(levelname)s - %(name)s - %(message)sz%m/%d/%Y %H:%M:%S)formatdatefmtlevelc@s^eZdZdZdddZeddZddZd d Zd d Z d dZ ddZ ddZ ddZ dS) MyTokenizerz+ Wrapper for ByteLevelBPETokenizer NcKs t||fi||_|dSN)r tokenizerupdate_id2token)selfvocabZmergeskwargsr1/Users/akhilshekkari/Desktop/github-test/utils.py__init__szMyTokenizer.__init__cCs*tj|d}tj|d}t||}|S)Nz vocab.jsonz merges.txt)ospathjoinr )rZvocabpZmergespZmytokenrrrfrom_pretraineds zMyTokenizer.from_pretrainedcs"|jfddD|_dS)Ncsi|]}||qSrr).0tokenrrr $z/MyTokenizer.update_id2token..)r get_vocabid2tokenrrrrr "s zMyTokenizer.update_id2tokencCs&|D]}|j|q|dSr )valuesr add_special_tokensr )rdicr rrrr!&s zMyTokenizer.add_special_tokenscs|jfdd|DS)Ncsg|] }|qSrr)rirrr -rz5MyTokenizer.convert_ids_to_tokens..)r)ridsrrrconvert_ids_to_tokens+sz!MyTokenizer.convert_ids_to_tokenscKs||}d|SN )r&r)rr%rtokensrrrdecode/s zMyTokenizer.decodecKs"|jdddd}|j|jS)Nasciiignore)errors)encoder*r r%)rtextrrrrr.3szMyTokenizer.encodecCs |jSr )r rrrrrr7szMyTokenizer.get_vocabcCst|jSr )lenr rrrrr__len__:szMyTokenizer.__len__)NN)__name__ __module__ __qualname____doc__r staticmethodrr r!r&r*r.rr1rrrrr s  r c@seZdZddZdS)RefineFeaturescCs||_||_||_dSr ) example_id source_ids target_ids)rr8r9r:rrrr?szRefineFeatures.__init__Nr2r3r4rrrrrr7>sr7c@sJeZdZdddZddZeddZdd Zd d Zd d Z ddZ dS) RefineDatasetcs|_|_td|ddt|D}tt|D]}d||vr:|||d<q:|dkrl|d|}td|||j fdd|D|_ dS)NReading examples from {}cSsg|]}t|qSr)jsonloadsrlinerrrr$Irz*RefineDataset.__init__..idrTokenize examples: csg|]}|fqSrrrexampleargsr rrr$Qr) r rHloggerinforopenranger0maptokenizefeats)rr poolrH file_path samplenumexamplesr#rrGrrEs  zRefineDataset.__init__c Cs|\}}}|dd}|dd}dd|D}dd|D}d|}d|}d|dd}d|dd}|d}||||}||jg||||7}||||} ||| ||\}} t|d || S) Nold newcSsg|]}|ddqSNstriprArrrr$Wrz*RefineDataset.tokenize..cSsg|]}|ddqSrWrYrArrrr$XrcommentrC)splitrreplace encode_removemsg_id pad_assertr7 ritemrFr rHZoldlinesnewlinesr\srcidstgtidsrrrrNSs   zRefineDataset.tokenizecCsD|d}dd|D}d|}d|}|dd}||fS)NrUcSsg|]}|ddqSrWrYrArrrr$grz3RefineDataset.process_pred_gold..r(z )r]rr^predgoldrrrprocess_pred_goldds    zRefineDataset.process_pred_goldcCs|d|jd}|jg||jg}|jt|}||jg|7}|d|jd}|jg||jg}|jt|}||jg|7}t||jksJdt||jksJd||fS)NNot equal length.max_source_lengthbos_ideos_idr0pad_idmax_target_lengthrr9r:rHr pad_lenrrrramszRefineDataset.pad_assertcCsZ|j||jdd}t|tkr*|ddSt|tkrB|ddSt|tkrR|StdSNT) max_length truncationr=rXr.rotyperrr NotImplementedErrorrr r/rHrrrr_zs     zRefineDataset.encode_removecCs t|jSr r0rOrrrrr1szRefineDataset.__len__cCs |j|Sr rOrr#rrr __getitem__szRefineDataset.__getitem__N)r=) r2r3r4rrNr6rkrar_r1rrrrrr<Ds    r<c@s eZdZddZeddZdS)SimpleRefineDatasetc Cs|\}}}|dd}|dd}dd|D}dd|D}d|}d|}|d}||||}||jg||||7}||||} ||| ||\}} t|d || S) NrTrUrVcSsg|]}|ddqSrWrYrArrrr$rz0SimpleRefineDataset.tokenize..cSsg|]}|ddqSrWrYrArrrr$rr(r\rC)r]rr_r`rar7rbrrrrNs   zSimpleRefineDataset.tokenizecCs8|d}dd|D}d|}d|}||fS)NrUcSsg|]}|ddqSrWrYrArrrr$rz9SimpleRefineDataset.process_pred_gold..r()r]rrhrrrrks   z%SimpleRefineDataset.process_pred_goldNr2r3r4rNr6rkrrrrrsrc@s eZdZddZeddZdS)Seq2SeqDatasetc Csx|\}}}|d|d}}d|}d|}||||}||||}|||||\}}t|d||S)NrTrVr(rC)rr]r_rar7) rrcrFr rHinputsoutputsrerfrrrrNs zSeq2SeqDataset.tokenizecCs$d|}d|}||fSr')rr]rhrrrrksz Seq2SeqDataset.process_pred_goldNrrrrrrs rc@sveZdZdddZddZddZdd Zd d Zd d ZddZ ddZ ddZ ddZ ddZ ddZddZdS) TextDatasetr=c sd|_|_|_ttr"d}n$ttr2d}nttrBd}nd}|d|d}tj |r~t d |t|}nVt d |t||}t d |||jfd d |D}t||t d ||||jfdd |D|_dd |jD|_dS)Nrmytokrgrbunk.jsonl.expsLoading examples from {}r>rDcsg|]}|fqSrrrErGrrr$rz(TextDataset.__init__..Convert examples to features...csg|]}|fqSrrrErGrrr$rcSsg|]}|D]}|q qSrr)rrOfeatrrrr$r)cntr rH isinstancer rrr^rrexistsrIrJrtorchloadread_review_examplesrMrNsaveset_start_end_idsconvert_examples_to_featuresZfeatssrO rr rPrHrQrRtokenizer_typesaveprSrrGrrs6         zTextDataset.__init__cCs t|jSr r}rrrrr1szTextDataset.__len__cCs |j|Sr r~rrrrrszTextDataset.__getitem__cCs&t|j|ksJ|jd||_dSr r})rdata_lenrrr reset_lenszTextDataset.reset_lencCs|D]z}|j}d}t|d}t|D]\}}|dkr&|}q@q&tt|dddD]}||}|dkrT|}qrqT||_||_qdS)NrrXr=)labelsr0 enumeraterLstart_idend_id)rrSrFrrrr#labelrrrrs zTextDataset.set_start_end_idsc Cs|\}}}|||j||_|jd}ddd|jD}|dt|d}dd|D}dd|D}ttt|}t|t |} dt|} } | |j d kr| d dkr| d t|| 8} | d 7} q| d 8} | d t|| 8} q|| | }|j | | } t|t tt||j d ks4Jd t|t| krpt d |dt| }| dt|} ||_| |_ |||j||_|S) Nr(css|]}t|VqdSr )str)rrCrrr rz'TextDataset.tokenize..cSs g|]}dd|dDqS)cSs g|]}t|dkrt|qSr)r0int)rvrrrr$rz3TextDataset.tokenize...r(r]rArrrr$sz(TextDataset.tokenize..cSsg|] }t|qSrr0rArrrr$rrrlrXz(Too long inputs in TextDataset.tokenize.z)Not equal length in TextDataset.tokenize.)r_input special_dictrr]rlistrMr0sumrorrIrJlinesmsg) rrcrFr rHZe0idrrlensZcurlenleftrightrrrrrNs:     * zTextDataset.tokenizecCs|\}}}t|jdkr\g}tdD]2}tdkrF|||q$|||q$|Stdkrt||gS||gS)Nrg?) r0rrLrandomappendgenmsg_exampledaemsg_exampleencoder_exampledecoder_example)rrcrF_Zexsrrrrs     z(TextDataset.convert_examples_to_featurescCs|\}}}|j}|j}|jg|j}gg}} tt||D]\} \} } | |jkrh||j| d| dkr||j| | | | | dgt | | |j kr<||j | dqcrXrBr)rrrorchoicesrLr0rZ mask_raterrrrrrrrrrrarr)rrcrFr rHrrrr9r: SPECIAL_IDZ mask_idxsid_dictr#rBrrrrrHs0   $        zTextDataset.decoder_examplecCs|\}}}|j}|j}dg|j}gg}} |j|j|jd} tt||D]X\} \} } | |jkrn| |j| dkr| | | | | | |j krL| |j qL| |j | |j t||jksJdt|d||| ||\}} t|j||| ddS)Nrrrrgenmsgr)rrrorrrrrrrrrr`rr0rarr)rrcrFr rHrrrr9r:rr#rBrrrrrds&         "zTextDataset.genmsg_examplecCsf|\}}}dg|j}gg}}t|j}ddtt|D} t| dkrfttt|} d| | <gg}}d} d} | t| kr>| } | t| kr| | s||| | d7} q| t| krƐq>||j d| d||j d| d| t| kr&| | r&||| | d7} q| d kr8| d7} | } qx| ||||\}}t |j |||d d S) NrcSsg|]}tdkqS)g?)r)rrrrrr$~rz.TextDataset.daemsg_example..rTrXrrrdaemsgr) rocprrLr0rrchoicerrrarr)rrcrFr rHrr9r:Zmsg_idsmasksrr#rjrrrrys8          zTextDataset.daemsg_examplecCs|d|jd}|jg||jg}|jt|}||jg|7}|d|jd}||jg}|jt|}||jg|7}t||jksJdt||jksJd||fS)NrlrXrmrnrtrrrras zTextDataset.pad_assertcCsZ|j||jdd}t|tkr*|ddSt|tkrB|ddSt|tkrR|StdSrvryr|rrrr_s     zTextDataset.encode_removeN)r=)r2r3r4rr1rrrrNrrrrrrar_rrrrrs "( rc@seZdZdddZddZdS)CommentGenDatasetr=c s|_ttrd}n$ttr&d}nttr6d}nd}|d|d}tj|rrt d |t |}nVt d |t||}t d |||jfd d |D}t ||t d ||||jfd d |D|_dd |jD|_dS)Nrrgrrrrrr>rDcsg|]}|fqSrrrErGrrr$rz.CommentGenDataset.__init__..rcsg|]}|fqSrrrErGrrr$rcSsg|]}|dur|qSr r)rrrrrr$rr rr rrr^rrrrIrJrrrrrMrNrrrrOrrrGrrs2         zCommentGenDataset.__init__cCs&|\}}}t|jdkrdS||S)Nr)r0rr)rrcrFr rHrrrrs z.CommentGenDataset.convert_examples_to_featuresN)r=r2r3r4rrrrrrrs rc@seZdZdddZddZdS)CommentClsDatasetr=c s|_ttrd}n$ttr&d}nttr6d}nd}|d|d}tj|rrt d |t |}nVt d |t||}t d |||jfd d |D}t ||t d ||||jfd d |D|_dS)Nrrgrrrrrr>rDcsg|]}|fqSrrrErGrrr$rz.CommentClsDataset.__init__..rcsg|]}|fqSrrrErGrrr$rrrrrGrrs0         zCommentClsDataset.__init__cCs&|\}}}||}t|j|j|jSr )r ClsFeaturesr8r9y)rrcrFr rHZ tmpfeaturerrrrs  z.CommentClsDataset.convert_examples_to_featuresN)r=rrrrrrs rc@seZdZdddZddZdS)SimpleClsDatasetr=c s|_ttrd}n$ttr&d}nttr6d}nd}|d|d}tj|rtt d |t ||_nZt d |t||}t d |||jfd d |D|_t |j|dS) Nrrgrrrz .simpexpsrr>rDcsg|]}|fqSrrrErGrrr$ rz-SimpleClsDataset.__init__..)r rr rrr^rrrrIrJrrrrOrrMrrrrrGrrs&     zSimpleClsDataset.__init__cCs:|\}}}|jd|_t|j}|jd||_tt|jD]J}|j|dkrhd|j||j|<q@|j|dkr@d|j||j|<q@d|j|_|||j|}t||jd}|dkr|dd} || | }|d|jd} |j g| |j g} |jt| } | |j g| 7} |j } |j } t| | | S)NrrXz+ rz- r(rl)rr]Z input_linesr0rrLrr_rorprqrrrrr)rrcrFr rHZlabels_lr# input_idsZexceed_lZhalfexlr9rur8rrrrr s,   z-SimpleClsDataset.convert_examples_to_featuresN)r=rrrrrrs rc@seZdZdddZddZdS)SimpleGenDatasetr=c s_ttrd}n$ttr&d}nttr6d}nd}|d|d}tj|rtt d |t |_nlt d |t|}tt|D]} | || d <qt d |fd d |D_t j|dS) Nrrgrrrz .simpgenexpsrr>rrDcsg|]}|fqSr)r)rr"rHrr rrr$>rz-SimpleGenDataset.__init__..)r rr rrr^rrrrIrJrrrrO read_jsonlrLr0r) rr rPrHrQrRrrdatar#rrrr's&    zSimpleGenDataset.__init__cs6|\}}}|d|d}}|ddd}dd|D}dddd fd d fd d|D}d d|D}d} t||D]@\} } | dkr| d| 7} q| dkr| d| 7} q| d| 7} q||| |} g} | |j|||d|}| ||| | ||\} } dgt| }t|d| || ddS)NpatchrrUrXcSs g|]}t|dkr|qSr)r0rZrArrrr$ErzASimpleGenDataset.convert_examples_to_features..rrl)-+r(cs|vr|SdSdS)Nrlr)s)map_dicrrfGsz8SimpleGenDataset.convert_examples_to_features..fcsg|]}|dqSrrrA)rrrr$LrcSsg|]}|ddqSrWrYrArrrr$Mrrgr[zzrrrr) r]rr_rr`rrar0r)rrcr"r rHdiffr difflinesrinputstrrrBr9r:rr)rrrrAs.     z-SimpleGenDataset.convert_examples_to_featuresN)r=rrrrrr&s rc@seZdZdZdddZdS) InputFeaturesz.A single training/test features for a example.NcCs||_||_||_||_dSr )r8r9r:url)rr8r9r:rrrrrcszInputFeatures.__init__)N)r2r3r4r5rrrrrr`src@seZdZddZdS)rcCs.||_||_||_||_|dvs$J||_dS)N)rrBrr)r8r9 source_labelsr:rz)rr8r9rr:rzrrrrks  zReviewFeatures.__init__Nr;rrrrrjsrc@seZdZddZdS)rcCs||_||_||_dSr )r8r9r)rr8r9rrrrrtszClsFeatures.__init__Nr;rrrrrssrc@s0eZdZdZddZddZddZdd Zd S) ReviewExamplezA single training/test example.cCsb||_||_||_||_||_||_||_g|_g|_g|_ g|_ d|_ d|_ | |dS)NFrg)roldfrrcmtidmax_lenr prevlines afterlinesrravailralign_and_clean postprocess)rrrrrrrrrrrr|szReviewExample.__init__c Cs|js dSdd|jD}t|}|ttt|7}dt|}}||jkr|ddkrv|t||d8}|d7}qB|d8}|t||d8}qB|||}|j|||_|j|||_|j}|j}t t|t|}d}||jkr||kr|t|krZ|t|d| d} | |jkr2q|j d|d||j dd| }|t|kr|t|| d} | |jkrq|j |||j d| }|d7}q||jksJdt|jt|jksJd d |j|_ggg|_|_|_dS) NcSsg|] }|qSrr)rZ source_strrrrr$rz-ReviewExample.postprocess..rrlrXr=rzToo long inputs.rmr)rrr0rrMrrrrmaxr]insertrrr) rrZinputlrrrrZprev_after_lenr#ZnewlrrrrsN        zReviewExample.postprocesscCshd}t|}d}||kr.|||vr.|d7}q|d}|dkrT|||vrT|d8}q6|||d}|S)z7 Remove start and end empty chars. z rrXr)rrBrepZtotallenr#rrrrremove_space_cleans  z ReviewExample.remove_space_cleanc sjd}jd}|d}|dd}dd|D}d}t||}|rf|\}}}} d_n d_dSt|dt|}}||} |d|_|| d_ |D]v} | d rވj | ddj dq| d r j | ddj dqj | j d qfd djD_fd dj D_ fddj D_ j_ddjD_ddj D_ tj tj ksJdttddtj j D} | gkrd_dS| \_ _ tj _ tj _ dS)NrUrrXcSsg|]}|dkr|qS)z\ No newline at end of filerrArrrr$rz1ReviewExample.align_and_clean..z @@ -(\d+),(\d+) \+(\d+),(\d+) @@TFrrrlcsg|]}|qSrrrArrrr$rcsg|]}|qSrrrArrrr$rcsg|]}|qSrrrArrrr$rcSsg|]}t|dkr|qSrrrArrrr$rcSsg|]}t|dkr|qSrrrArrrr$rzNot equal length in align.cSs$g|]\}}t|dkr||fqSrr)rrBrrrrr$s )rr]rrematchgroupsrrrr startswithrrrrrr0rr) rZ oldflinesr first_lineregexZmatchres startlineZrangelenstartposendposZendlinerBZtopackrrrrsZ           zReviewExample.align_and_cleanN)r2r3r4r5rrrrrrrrrys ,rr=c Csg}d}t|}|D]}zt|}WntdYqYn0d}d|vrZd|d<d|vrzt|ddkrzd|d<t||d|dd|vr|dnd d |vr|d nd ||dd } | jr|| |d7}||krqq|d7}||krqqWd n1s0Y|S) zRead examples from filename.rError during reading json data.rrrXrrrgr)rrrrrrrN) rKr?r@rZprintr0rrr) filenameZdata_numr rSrrrBjsZmaxlrFrrrrs@    &rcCspg}t|P}|D]:}zt|}WntdYqYn0||qWdn1sb0Y|S)Nr)rKr?r@rZrr)rrrrBr rrrr-s  *r)r=N)'rr?rrrloggingcopyrrtorch.utils.datarZ tokenizersr transformersrrZnltk basicConfigINFO getLoggerr2rIobjectr r7r<rrrrrrrrrrrrrrrrrs>    +G{%!0:   '