o FgL'@sddlZddlmZddlZddlmZddlmZm Z ddl m Z m Z ddl mZmZmZddlmZmZddlmZdd lmZGd d d ejZdS) N)Image)DEFAULT_IMAGE_TOKENIMAGE_TOKEN_INDEX)SeparatorStyleconv_templates)KeywordsStoppingCriteria process_imagetokenizer_image_token)get_model_name_from_pathload_pretrained_model)TextIteratorStreamer)ThreadcsdeZdZfddZddZeddZeddd Zdd d Z d dZ dddZ dddZ Z S)DescribeAnythingModelc  sxt||_||_||_||_||_||_||_t |ddfi|\} } } } | | j _ | |_ | |_ | |_t||_dS)N)super__init__ model_path conv_mode prompt_mode temperaturetop_p num_beamsmax_new_tokensr configimage_processor tokenizermodel context_lenr model_name) selfrrrrrrrkwargsrrrr __class__D/home/l/lo/longlian/describe-anything/dam/describe_anything_model.pyr s zDescribeAnythingModel.__init__cCsRt|vrtdt|j}||jd|||jdd|}||fS)Nzno tag found in input.rr)r ValueErrorrrcopyZappend_messageroles get_prompt)rqsconvpromptr#r#r$r(!sz DescribeAnythingModel.get_promptcCsJt|}|jdd\}}|jddd\}}||}||}||||fS)Nr)axisr)npargwhereminmax)mask_npZ mask_coordsy0x0y1x1hwr#r#r$ mask_to_box,s  z!DescribeAnythingModel.mask_to_box0cCs|dkr t|d}||fS|dkrX||\}}} } t|} | jdd|jks5Jd| jd|j|||| ||| f} | ||| ||| f} t| }ny|dkr||\}}} } t|} | jdd|jksJd| jd|j| jdd\}}|t|| dt|d| |t|| dt|d| |f} | t|| dt|d| |t|| dt|d| |f} t| }n|d krx||\}}} } t|} | jdd|jksJd| jd|j| jdd\}}|| d|| d}}t| |t| |} } t || dt || d}}|t|| dt|d| |t|| dt|d| |f} | t|| dt|d| |t|| dt|d| |f} t| }nY|d kr||\}}} } t|} | jdd|jksJd| jd|j|||| ||| f} | ||| ||| f} | | d } t| }nt d |t| d}||fS) Nfull)r1cropz(image shape mismatches with mask shape: z, Z context_croprZ focal_crop crop_mask).NzUnsupported crop_mode: ) dictr8r-asarrayshaper fromarrayr0r/intr%)clspil_imgr1 crop_modeZ min_box_wZ min_box_hinfor3r2r7r6img_npZcropped_mask_npZcropped_img_npZcropped_pil_imgimg_himg_wxcycr#r#r$ crop_image7sP  , ,DD   ,"DD   .   z DescribeAnythingModel.crop_imageFc Cs`||\}}t|ttfs t|ttfrJd|g}|g}n|}|}|j|||||d} | S)NzGimage_pil and mask_pil must be both list or tuple or not list or tuple. streaming)r( isinstancelisttupleget_description_from_prompt) r image_pilmask_pilqueryrNr+r* image_pils mask_pils descriptionr#r#r$get_descriptionnsz%DescribeAnythingModel.get_descriptionc st|dktjtjjdfddd\}}|djjjt j d}|dt d}t|jjd}|djjjt j d}t j ||ddddd ffdd }durtjjdfd dd\}} |djjjt j d}| d} t | d} t| jjd} | djjjt j d} t j || ddddd ffdd }nd}|durt j ||fdd S|S) NrcsjdSN)r1rErLrD)rErSr1rr#r$~z8DescribeAnythingModel.get_image_tensor..)Zpil_preprocess_fn)dtyper1r.)dimcsj|dSrZr[r\) crop_mode2r1rr#r$r]r^)r-r?astypeuint8r rrtodevicetorchfloat16rrAcat) rrSrTrErbZ images_tensorZ image_infoZ masks_tensorZimages_tensor2Z image_info2Zmask_np2Z mask_pil2Z masks_tensor2r#)rErbrSr1rr$get_image_tensor{s$&$$&z&DescribeAnythingModel.get_image_tensorcCs4|r |j||||ddS|j||||dd}t|S)NTrMF)$get_description_from_prompt_iteratornext)rrVrWr+r*rNoutputr#r#r$rRsz1DescribeAnythingModel.get_description_from_promptc #sjd\dksJdt|t|ks(Jdt|dt|dfddt||D}t|jtd d d }|j t j krM|j n|j }|g} t| j|} |rdtjd d d nd} t||jd krpd ndjjjjd | g| d } |rtjj| d} | d}| D]}||7}||vr|d||}n|Vq| dStjjdi| }Wdn1swYjj|d dd }|}||r|dt| }|}|VdS)N+r:zzCurrent prompt only supports first crop as full (non-cropped). If you need other specifications, please update the prompt.z8image_pils and mask_pils must have the same length. Got z and .cs"g|] \}}j||dqS))rErb)rj).0rSrTrErbrr#r$ s"zNDescribeAnythingModel.get_description_from_prompt_iterator..pt)return_tensorsrT)Z skip_promptskip_special_tokensF) input_idsimages do_samplerrrr use_cachestopping_criteriastreamer)targetr )rur#) rsplitlenzipr rr unsqueezecudaZ sep_stylerTWOsepZsep2rr r>rrrrrrgeneratestartfindjoinrginference_mode batch_decodestripendswith)rrVrWr+r*rNZ image_tensorsrvZstop_strkeywordsrzr{generation_kwargsthreadgenerated_textZnew_textZ output_idsoutputsr#rqr$rksT.    z:DescribeAnythingModel.get_description_from_prompt_iterator)r9r9)F)__name__ __module__ __qualname__rr( staticmethodr8 classmethodrLrYrjrRrk __classcell__r#r#r!r$r s    6 r)rgtorch.nnnnnumpyr-PILrZmodel.constantsrrZmodel.conversationrrZmodel.mm_utilsrr r rr r transformersr threadingrModulerr#r#r#r$s