X;g\ddlZddlZddlZddlZddlmZmZmZddlZddl Z ddlZddl m Z ddlZddl Z ddlZddl Z ddlZddlmZddlZddlmZej,dgZedddddd d d ZddlZddlZddlZddlZddlZdd lmZmZmZdd lm Z ddl!Z!ddl"m#Z#ddl$Z%ddl"m&Z'ddl(m)Z)d,dZ*dZ+ejXd-dZ-dZ.d.dZ/d.dZ0de1deejdejfffdZ4 d/dejjdejfdejfdee1de6dejjf dZ7d Z8d0d!Z9d"Z:dd#ddd$dddgd d%dddd&fdee1ej ffd'Z;d(Zd1dee1ej ffd+Z?y)2N)Image ImageDraw ImageFont) AzureOpenAI)pyplot) PaddleOCRenFiTslow)lang use_angle_clsuse_gpushow_logmax_batch_size use_dilationdet_db_score_mode rec_batch_num)TupleListUnion) box_convert) ToPILImage) BoxAnnotatorcT|s"tjjrdnd}|dk(rsddlm}m}|j d}|dk(r#|j |dtj}n|j |dtjj|}ny|dk(rtdd lm }m }|j d d }|dk(r#|j |tjd }n1|j |tjd j|}j|dS)Ncudacpublip2r)Blip2ProcessorBlip2ForConditionalGenerationSalesforce/blip2-opt-2.7b) device_map torch_dtype florence2) AutoProcessorAutoModelForCausalLMzmicrosoft/Florence-2-baseT)trust_remote_code)r!r%)model processor) torchr is_available transformersrrfrom_pretrainedfloat32float16tor#r$) model_namemodel_name_or_pathdevicerrr'r&r#r$s >/home/yadonglu/sandbox/huggingface/OmniParser-v2/util/utils.pyget_caption_model_processorr3/s&  ::224%WN"223NO U?1AA 4U]]B E2AA 4U]]B "V*  { "D!112Mae1f U?(889KY^YfYfz~8E(889KY^YfYfz~8CCDJKEXXf%I >>c"ddlm}||}|S)Nr)YOLO) ultralyticsr6) model_pathr6r&s r2get_yolo_modelr9Gs  E Lr4ct}|r||d}n|}g}t|D]\} } t| d|jdzt| d|jdz} } t| d|jdzt| d|jdz}} || || | ddf}t j |d}|j |||d|d}}|sd|jjvrd }nd }g}|j}tdt||D] } tj}|| | |z}tj}|jjd k(r9|||gt|zd d j|tj }n(|||gt|zd j|}|j#|d|dddd }|j%|d}|Dcgc]}|j'}}|j)| |S#YxYwcc}w)Nr)@r>r&r'florencez zThe image showsrptF)imagestextreturn_tensors do_resize)r1dtyperArBrC)r1 input_ids pixel_values)rGrHmax_new_tokens num_beams do_sampleT)skip_special_tokens)r enumerateintshapecv2resizeappendconfig name_or_pathr1rangelentimetyper.r(r-generate batch_decodestripextend)filtered_boxes starting_idx image_sourcecaption_model_processorprompt batch_sizeto_pil non_ocr_boxescroped_pil_imageicoordxminxmaxyminymax cropped_imager&r'generated_textsr1startbatcht1inputs generated_idsgenerated_textgens r2get_parsed_content_iconrvNs\F&|}5 & m,5 U1Xl&8&8&;;a$DU1Xl&8&8&;;a$D(dDIq)@AMJJ}h?M  # #F=$9 : /w79PQ\9]9E  22 2 F&FO \\F 1c*+Z 8/  1Z<0 YY[ <<   &e6(3u:2EVZfkloow}FKFSFSoTFe6(3u:2EVZ[^^fl^mF 1DRXYgRhxzFGSXY #// SW/X1?@##))+@@~./ ;  2AsB3H; I;Ic t}|r|t|d}n|}g}t|D]\}}t|d|jdzt|d|jdz} } t|d|jdzt|d|jdz} } || | | | ddf} |j || |d|d}}|j }ddd g}|jj|d d }d }g}tdt||D]}||||z}|Dcgc]}|j|d}}ggggd}|gt|z}t|D]x\}}|j|||d}|dj |d|dj |d|dj |d|dj |dzt|dDcgc]}|jdc}}t|dD]\}}tj|jjtj d||jdz tj"z|gd|d|<tjtj$d||jdz tj"|d|gd|d|<|j'Dcic]*\}}|tj(|j+|,}}}ddd d} |j,di|d|jj.i| }!|!dd|djddf}!|j1|!d d }"|"D#cgc]!}#|#j3dj3#}"}#|j5|"|Scc}wcc}wcc}}wcc}#w)Nrr;r<r=r&r'userz-<|image_1|> describe the icon in one sentence)rolecontentFT)tokenizeadd_generation_promptr@)rC)rGattention_maskrH image_sizesrGr~rHr)rE)dim{Gz?)rJ temperaturerL eos_token_id)rMclean_up_tokenization_spaces )rrWrNrOrPrSr1 tokenizerapply_chat_templaterVimage_processor_convert_images_texts_to_inputsmaxr(cat pad_token_idoneslongzerositems concatenater.rZrr[r\r])$r^ocr_bboxr`rardrerfrgrhrirjrkrlrmr&r'r1messagesrbrcrnrAx image_inputsrrtextstxtinputmax_lenvk inputs_catgeneration_args generate_idsresponseress$ r2get_parsed_content_icon_phi3vrsp \F&s8}~6 & m,75q,"4"4Q"778#eAh|GYGYZ[G\>\:]dq,"4"4Q"778#eAh|GYGYZ[G\>\:]d$T$YT 1%<= } 56 7 /w79PQ\9]9E \\F,\]^H  4 4Xei 4 jFJO 1c*+Z 8)!!AjL1SYZa 11!D1IZ Z B[]^3v;&& ?FAs==l1oscg=hE ;  & &u['9 : # $ + +E2B,C D > " ) )%*? @ = ! ( (})= >  ? 6++>?aqwwqz?@f[12 RDAq%*YY 0C0C0P0PSXS]S]^_ahklkrkrstkuau~C~H~HTI1IKL0MST&UF;  "*/))U[[GaggVWjDX`e`j`j5kmsuEnFGHnI5JPQ+RF# $Q ' RFL\\^TTQa**1-0088T T!  &u~~u uATATAaAauetu #Az+'>'D'DQ'G'H$HI )),Dot)u7?@CIIdO))+@@x(7): 7[@UAs4O8O /O &O&c |t|tsJd d fd fd |j}g}|r|j|t |D]\} d}t |D]-\}}||k7s  |kDs  |kDs+d}n|sF|r3t   fdt |Dri|j {|j tj|S)Nc0|d|dz |d|dz zSNr<rr=r;rboxs r2box_areaz remove_overlap..box_area%AQCFSVO44r4ct|d|d}t|d|d}t|d|d}t|d|d}td||z td||z zSNrr;r<r=rminbox1box2x1y1x2y2s r2intersection_areaz)remove_overlap..intersection_areaq a$q' " a$q' " a$q' " a$q' "1b2gQR00r4c||}||z|z dz}|dkDr"|dkDr||z }||z }nd\}}t||z ||SNgư>r)rrrrr intersectionunionratio1ratio2rrs r2IoUzremove_overlap..IoU{(t4 $/,>E D>A (4.1"4!HTN2F!HTN2F!NFF<%'88r4c6||}||z }|dkDS)Ngffffff?rrrrrrrs r2 is_insidez!remove_overlap..is_inside'(t4 .}r4TFc3TK|]\}}|kDxr | !ywNr).0rbox3rr iou_thresholdrs r2 z!remove_overlap..s6|]d]^`d3tT?]:X9TSWCX?XX|s%() isinstancertolistr]rNanyrSr(tensor) boxesrrr^rg is_valid_boxjrrrrrrs ` @@@@@r2remove_overlaprs  z(D99519 LLNENh'U#,4  ' GAtAv#dD/M9htnxX\~>]$    |hqrzh{||"))$/%%d+, << ''r4cn|t|tsJddfd}fd}g}|r|j|t|D]\}}|d}d} t|D]2\} } | d} || k7s||| |kDs|| kDs0d} n| sP|rd} d }|D]>}| r|d}|||r ||d d zz }|j |2|||rd} n@| r|r|j d |dd|d |j d |dddd |j ||S#YxYw)z ocr_bbox format: [{'type': 'text', 'bbox':[x,y], 'interactivity':False, 'content':str }, ...] boxes format: [{'type': 'icon', 'bbox':[x,y], 'interactivity':True, 'content':None }, ...] Nc0|d|dz |d|dz zSrrrs r2rz$remove_overlap_new..box_arearr4ct|d|d}t|d|d}t|d|d}t|d|d}td||z td||z zSrrrs r2rz-remove_overlap_new..intersection_arearr4c||}||z|z dz}|dkDr"|dkDr||z }||z }nd\}}t||z ||Srrrs r2rzremove_overlap_new..IoUrr4c6||}||z }|dkDS)Ng?rrs r2rz%remove_overlap_new..is_insiderr4bboxTFrz iconrYr interactivityrz)rrr]rNremoverS)rrrrrr^rg box1_elemrrr box2_elemr box_added ocr_labels box3_elemrrrs @@r2remove_overlap_newrs  z(D99519Nh'!%(&, 9  %e, LAyV$DAv#dD/M9htnxX\~>]$    !  !)%I$(0$T40) *i .BS.H H . 5 5i @'tT2(,I!$#%$!!&--vyQWGXko}G/IJ&--vyQWGXko}A/DE%%d+M&,N ) (s 6D00D4 image_pathreturnc Dtjtjdgdtjtjgdgdg}t j |jd}tj|}||d\}}||fS)Ni i5)max_size)g ףp= ?gv/?gCl?)gZd;O?gy&1?g?RGB) TCompose RandomResizeToTensor Normalizeropenconvertnpasarray)r transformr`imageimage_transformed_s r2 load_imager9s NNC54 0 JJL KK-/D E I::j)11%8L JJ| $E$\48q # ##r4r}r`rlogitsphrases text_scalec |j\}} } |tj| || |gz}t|ddj } t|ddj } t j | } t|jdDcgc]}|}}t||||}|j}|j|| || |f}t|| Dcic] \}}|| }}}||fScc}wcc}}w) aH This function annotates an image with bounding boxes and labels. Parameters: image_source (np.ndarray): The source image to be annotated. boxes (torch.Tensor): A tensor containing bounding box coordinates. in cxcywh format, pixel scale logits (torch.Tensor): A tensor containing confidence scores for each bounding box. phrases (List[str]): A list of labels for each bounding box. text_scale (float): The scale of the text to be displayed. 0.8 for mobile/web, 0.3 for desktop # 0.4 for mind2web Returns: np.ndarray: The annotated image. cxcywhxyxyrin_fmtout_fmtxywh)rr)r text_paddingtext_thickness thickness)scene detectionslabels image_size) rPr(Tensorrnumpysv DetectionsrVrcopyannotatezip)r`rrrrrrrhwrrrrphraser box_annotatorannotated_framerlabel_coordinatess r2rrGs  GAq! ELL!Q1. .E U8V D J J LD U8V D J J LDD)J(-ekk!n(= >fk >F > J\ao{DEM"'')O#,,?zbhvwxyuz,{O9>#%jj2&' ?   G%W-wx/@'(BS76E &' !!""s  BB'c&|r|j||||}n|j|||}|djj}|djj}t t |D cgc] } t | } } ||| fScc} w)r)sourceconfimgsziou)r&r'r)r)r$rrr'rVrWstr) r&rrr( scale_imgrresultrr'rgrs r2 predict_yolor-{s        1IOO E !9??  D$SZ01!s1v1G1 $ 2s5Bc|\}}}}t||zt||zt||zt||zg}|d|dz |d|dz z}|SrrO) rrrrrrrint_boxareas r2 int_box_arear2sfNBB2a4y#bd)SAYBqD :G AJ # WQZ(? @D Kr4rg?g?r>c  t|tr$tj|j d}|j \}}|s||f}t ||||| d\}}}|tj||||gj|jz }tj|}tt|Dcgc] }t|}}|r@tj|tj||||gz }|j!}n t#dd}t%|| Dcgc]\}}t'|||dkDsd|d|d }}}|j!Dcgc]}t'|||dkDsd |d dd }}t)|| | }t+|d }t-dt/|Dd}tj|Dcgc]}|d c}}t#dt||t1j0}| r|d}d|j2j4vrt7||||}nt9||||| |}t/| Dcgc] \}}d|d|} }}t| } g}!t/|D]\}}|d |j;d|d<!t/|D](\}}|!j=dt|| zd|*| |!z}"n%t/| Dcgc] \}}d|d|} }}| }"t#dt1j0|z t?|dd}tt|Dcgc]}|}}|rtAd&||||d|\}#}$ntA||||||\}#}$tjB|#}%tEjF}&|%jI|&d !tKjL|&jOjQd"}'|re|$jSD()cic]$\}(})|(|)d|z |)d#|z |)d$|z |)d%|z g&}$}(})||#jTd#k(r||#jTdk(sJ|'|$|fScc}wcc}}wcc}wcc}wcc}}wcc}}wcc}wcc})}(w)'zProcess either an image path or Image object Args: image_source: Either a file path (str) or PIL Image object ... rg?)r&rrr(r+rzno ocr bbox!!!NrrBFrrT)rrrc|dduS)Nrzr)rs r2z%get_som_labeled_img..sq|t?Sr4)keyc32K|]\}}|d |yw)rzNr)rrgrs r2rz&get_som_labeled_img..sbvq#3y>Kabs rrzlen(filtered_boxes):r&phi3_v)rbrcz Text Box ID z: rzz Icon Box ID ztime to get parsed content:rrr)r`rrr)r`rrrrrPNG)formatasciir;r<r=r)+rr*rrrr r-r(r r.r1rrrVrWrrprintrr2rsortednextrNrXrT model_typerrvpoprSrr fromarrayioBytesIOsavebase64 b64encodegetvaluedecoderrP)*r`r& BOX_TRESHOLDoutput_coord_in_ratiorrrdraw_bbox_configraocr_textuse_local_semanticsrrbr+r(rcrrrrrrgrr ocr_bbox_elem xyxy_elemr^filtered_boxes_elemr_time1 caption_modelparsed_content_icon icon_startparsed_content_icon_lsparsed_content_mergedrrpil_imgbuffered encoded_imagerrs* r2get_som_labeled_imgrZs7,$zz,/77>   DAq A(uLXdlq~GWZ[D&' %,,1a|,// < Q- Q2Q77Q= R )Rc|dd|dd|dd|ddz |dd|ddz f\}}}}t|t|t|t|f\}}}}||||fSNrr;r<r/rryrrs r2get_xywhr_sq!eAhqk58A;q!+DeAhqkTYZ[T\]^T_F__JAq!QQQQQ/JAq!Q aA:r4c|dd|dd|dd|ddf\}}}}t|t|t|t|f\}}}}||||fSr\r/)rrr^xpyps r2get_xyxyrcsl8A;a U1Xa[%(1+ELAq"bq63q63r7CG3LAq"b aR<r4c|d|d|d|dz |d|dz f\}}}}t|t|t|t|f\}}}}||||fSrr/r]s r2 get_xywh_yoloresnq58U1Xa%8%(U1X:MMJAq!QQQQQ/JAq!Q aA:r4c8t|trtj|}|jdk(r|j d}t j|}|j\}}|rd|d} n|d} tj|dd} | D cgc]} | dd| kDs| d} } | D cgc]} | dd| kDs| dd} } n>|i}tj|fi|} | D cgc]} | d } } | D cgc]} | d } } |rtj|tj}g}| D]J} t!| \}}}}|j#||||ftj$|||f||z||zfd d Lt'j(tj|tj*n;|d k(r| D cgc] } t!| }} n|d k(r| D cgc] } t-| }} | f|fScc} wcc} wcc} wcc} wcc} wcc} w) NRGBArg?rF)clsrr;)rrr<rr)rr*rrmoderrarrayr  paddle_ocrocrreaderreadtextrQcvtColor COLOR_RGB2BGRr_rS rectanglepltimshow COLOR_BGR2RGBrc)r` display_imgoutput_bb_formatgoal_filtering easyocr_args use_paddleocrimage_nprrrr,itemrhrB opencv_imgbbrr^abs r2 check_ocr_boxrs,$zz,/ F"#++E2 xx %H   DAq   N)*:;Ne4Q7%+KTtAwqzN/JaKK'-Mtan1LQ MM  L:\:%+,Ta,,$*+DQ++\\(C,=,=>  JD!$JAq!Q IIq!Ql # MM*q!fqsAaCj+q I J 3<< C,=,=>? v %-23T(4.3B3  '-23T(4.3B3 ":~ %%-LM -+43s0G>%G>2H H. H H H$H)rN)NNr)r}r<r=)gffffff?)TrNNF)@osrBrErXPILrrrjsonrequestsopenairsysrQr r matplotlibrrseasyocr paddleocrrReaderrnrlastr(typingrrrtorchvision.opsrretorchvision.transformsr supervisionr  transformsrutil.box_annotatorrr3r9inference_modervrrrr*rkr rndarrayfloatrr$r-r2rZr_rcrerrr4r2rs ++   $        %%' -"+?0,,`2h2(jNb $3 $55<<)?#@ $:;.2::.ell.ELL.[_`c[d.rw.?Azz.>"* . FJX\tyEIVYhi|@Z^ikAEUX`dpu}ANPOAeC,<&=OAd   $&c5;;&6 7$&r4