a }Gc @sRddlmZddlZddlZddlmZddlmZddlm Z GdddZ dS))RunN) print_message) Provenance)get_metadata_onlyc@sNeZdZdddZddZddZdd Zdd d Zd d Ze dddZ dS)ExamplesNcCs0|p |p t|_||_||_|p(|||_dSN)r_Examples__provenancenwaypath _load_filedata)selfr r r provenancer-/home/gupo/~/ColBERT/colbert/data/examples.py__init__ szExamples.__init__cCs|jSr)rr rrrrszExamples.provenancecCs|Sr)rrrrrtoDictszExamples.toDictcCsl|jr|jdn|j}g}t|6}|D] }t|d|}||q(Wdn1s^0Y|S)N)r openujsonloadsappend)r r r examplesflineexamplerrrr s *zExamples._load_filecsJ|s|r@|t|vs J||ffddtdtj|DStjS)a NOTE: For distributed sampling, this isn't equivalent to perfectly uniform sampling. In particular, each subset is perfectly represented in every batch! However, since we never repeat passes over the data, we never repeat any particular triple, and the split across nodes is random (since the underlying file is pre-shuffled), there's no concern here. csg|]}j|qSr)r ).0idxrrr ,z#Examples.tolist..r)rangelenr list)r ranknranksrrrtolist"s zExamples.tolistcCs"d|ddddvs&Jdtdt|jdd|t|d T}|jD]}t||| d qZ|j }td t|jd |j Wdn1s0Yt|d d B}i}t |d<| |d<tj |dd}| |Wdn1s0Y|S)Njson/.zTODO: Support .json[l] too.z #> Writing g.AzM examples to w z#> Saved examples with z lines to z.metametadatar)indent)stripsplitrr"r rrrdumpwritenamerrdumps)r new_pathrr output_pathdrrrrsave0s&   :  *z Examples.savecCsjt|tur|||dSt|tr.|||dSt||urN|dusJJ||SdsfJdt|ddS)N)r r )r r Fz obj has type z$ which is not compatible with cast())typestr isinstancer#)clsobjr rrrcastFs     z Examples.cast)NNNN)NN)N) __name__ __module__ __qualname__rrrr r&r9 classmethodr?rrrrr s  r) colbert.infra.runrosrcolbert.utils.utilsrcolbert.infra.provenancerutility.utils.save_metadatarrrrrrs