a &fo @sddlZddlZddlZddlZddlmZmZmZddlZddl Z ddlZddl m Z ddlZddl Z ddlZddl Z ddlZddlmZddlZedgZddlZddlZddlZddlZddlZddlmZmZddlmZddlZddlmZddl Z!ddlm"Z#d:d d Z$d d Z%d;ddZ&ddZ'dd/d0Z6d?d2d3Z7d@d5d6Z8dAd8d9Z9dS)BN)Image ImageDraw ImageFont) AzureOpenAI)pyploten)TupleList) box_convert) ToPILImageSalesforce/blip2-opt-2.7bc Cs|stjrdnd}|dkrNddlm}m}|d}|jddtjd}n,|dkrddlm}m}|d}|dkr|jddtjd}n|jddtjd}n|d krdd lm }m }|jd d d }|dkr|jdtjd d}n|jdtjd d |}n~|dkr|dkrzddlm }m }d}|j||d dd}|j|d d }| ||dS)Ncudacpur r)Blip2ProcessorBlip2ForConditionalGeneration) device_map torch_dtypezblip2-opt-2.7b-uiz0/home/yadonglu/sandbox/data/orca/blipv2_ui_mergeflorence) AutoProcessorAutoModelForCausalLMzmicrosoft/Florence-2-baseT)trust_remote_codez?/home/yadonglu/sandbox/data/orca/florence-2-base-ft-fft_ep1_rai)rrzM/home/yadonglu/sandbox/data/orca/florence-2-base-ft-fft_ep1_rai_win_ep5_fixedZphi3v_ui)rrz$microsoft/Phi-3-vision-128k-instructz)/home/yadonglu/sandbox/data/orca/phi3v_uiauto)rrrZphi3v)model processor) torchr is_available transformersrrZfrom_pretrainedfloat16Zfloat32rrto) Z model_namedevicerrrrrrZmodel_idr */home/yadonglu/sandbox/OmniParser/utils.pyget_caption_model_processor$sF     r"cCsddlm}||}|S)Nr)YOLO) ultralyticsr#) model_pathr#rr r r!get_yolo_modelMs r&c Cst}|r|t|d}n|}g}t|D]\}} t| d|jdt| d|jd} } t| d|jdt| d|jd} } || | | | ddf}|||q,|d|d}}|sd|jjvrd}nd }d }g}|j}t dt||D]}||||}|jj d krL|||gt|d d j |t j d}n |||gt|d d j |d}d|jjvr|j|d|ddddd}n |jfi|dddddd}|j|dd}dd|D}||q|S)Nrrrrz zThe image shows r ptimagestextreturn_tensors)rdtyper input_ids pixel_valuesF)r2r3max_new_tokens num_beams do_sampledT) max_lengthr6Zno_repeat_ngram_sizeZearly_stoppingZnum_return_sequences)skip_special_tokenscSsg|] }|qSr strip).0genr r r! yz+get_parsed_content_icon..)r len enumerateintshapeappendconfigZ name_or_pathrrangetyperrrgenerate batch_decodeextend)filtered_boxesocr_bbox image_sourcecaption_model_processorpromptto_pil non_ocr_boxescroped_pil_imageicoordxminxmaxyminymax cropped_imagerr batch_sizegenerated_textsrbatchinputsZ generated_idsZgenerated_textr r r!get_parsed_content_iconTs<.. &  r`c st}|r|t|d}n|}g}t|D]\}}t|d|jdt|d|jd} } t|d|jdt|d|jd} } || | | | ddf} ||| q,|d|d}|jddd g}jj|d d d }d }g}t dt||D]}||||}fdd|D}ggggd}|gt|}t|D]f\}}j |||dd}|d|d|d|d|d|d|d|dqTt dd|dD}t|dD]~\}}t j jjt jd||jdt jd|gdd|d|<t j t jd||jdt jd|d|gdd|d|<qއfdd|D}ddd d}|jfi|djji|}|dd|djddf}j|d d d }d!d|D}||q|S)"Nrr'r(r)rruserz-<|image_1|> describe the icon in one sentenceZrolecontentFT)tokenizeZadd_generation_promptr9csg|]}j|ddqS)r+r/)Zimage_processorr>x)rr r!r@rAz1get_parsed_content_icon_phi3v..)r2attention_maskr3 image_sizesr+rer2rhr3ricSsg|]}|jdqSr')rErfr r r!r@rA)r0)dimcs"i|]\}}|t|qSr )r concatenaterr>kvr1r r! rAz1get_parsed_content_icon_phi3v..{Gz?)r5 temperaturer7 eos_token_id)r;Zclean_up_tokenization_spacescSsg|]}|dqS) r<)r>resr r r!r@rA)r rBrCrDrErFr tokenizerZapply_chat_templaterHZ_convert_images_texts_to_inputsmaxrcatZ pad_token_idoneslongzerositemsrJrtrKrL)rMrNrOrPrRrSrTrUrVrWrXrYrZr[rmessagesrQr\r]r-Z image_inputsr_textstxtinputmax_lenroZ inputs_catZgeneration_argsZ generate_idsresponser )rrr!get_parsed_content_icon_phi3vsT.. :> rcs|dust|tsJddddfdd|}g}|rN||t|D]\}d}t|D]6\}}||krj|krj|krjd}qqj|rV|rtfd d t|Ds|qV|qVt|S) NcSs |d|d|d|dS)Nr(rr)r'r )boxr r r!box_areasz remove_overlap..box_areacSsdt|d|d}t|d|d}t|d|d}t|d|d}td||td||SNrr'r(r))rxmin)box1box2x1y1Zx2y2r r r!intersection_areas z)remove_overlap..intersection_areacsl||}|||d}|dkrT|dkrT||}||}nd\}}t||||S)Ngư>r)rr)rx)rr intersectionunionZratio1Zratio2)rrr r!IoUs  zremove_overlap..IoUTFc3s |]\}}|kVqdS)Nr )r>rnZbox3)rr iou_thresholdr r! rAz!remove_overlap..) isinstancer tolistrLrCanyrFrtensor)boxesrrNrMrUZ is_valid_boxjrr )rrrrrr!remove_overlaps(  &  r) image_pathreturnc Cs`ttjdgddttgdgdg}t|d}t |}||d\}}||fS)N i5)max_size)g ףp= ?gv/?gCl?)gZd;O?gy&1?g?RGB) TZComposeZ RandomResizeZToTensorZ Normalizeropenconvertnpasarray)r transformrOimageZimage_transformed_r r r! load_images rr9r(r))rOrlogitsphrases text_scalercCs|j\}} } |t| || |g}t|ddd} t|ddd} tj| d} ddt|jdD}dd lm }|||||d }| }|j || || |fd }d d t || D}||fS)aH This function annotates an image with bounding boxes and labels. Parameters: image_source (np.ndarray): The source image to be annotated. boxes (torch.Tensor): A tensor containing bounding box coordinates. in cxcywh format, pixel scale logits (torch.Tensor): A tensor containing confidence scores for each bounding box. phrases (List[str]): A list of labels for each bounding box. text_scale (float): The scale of the text to be displayed. 0.8 for mobile/web, 0.3 for desktop # 0.4 for mind2web Returns: np.ndarray: The annotated image. cxcywhxyxyrZin_fmtZout_fmtxywh)rcSsg|] }|qSr r )r>phraser r r!r@rAzannotate..r) BoxAnnotator)r text_paddingtext_thickness thickness)Zscene detectionslabelsZ image_sizecSsi|]\}}||qSr r )r>rror r r!rp rAzannotate..) rErTensorr numpysvZ DetectionsrHZutil.box_annotatorrcopyannotatezip)rOrrrrrrrhwrrrrrrZ box_annotatorannotated_framelabel_coordinatesr r r!rs   rc Cs|d|d}}|j}|||dd|}t|fi|}Wdn1sX0Y|j||j|||jdddgdd} | d | d | d } } } | | | fS) 9 Use huggingface model to replace the original model rrr+r,N) box_thresholdtext_thresholdZ target_sizesrrZscoresr)rrrZno_gradZ&post_process_grounded_object_detectionr2size) rrZcaptionrrrrr_outputsresultsrrrr r r!predicts  ,rcCsF|j||d}|djj}|djj}ddtt|D}|||fS)r)sourceconfrcSsg|] }t|qSr strr>rUr r r!r@/rAz predict_yolo..)rrrrrHrB)rrrresultrrrr r r! predict_yolo#s  rrrFg?Tg?c !sPd} d}t|d}|j\t|||d\}}}|tg|j }t |}ddt t |D}|j\}|rt|tg}|}n tdd}t|| |d }| rZ|d }d |jjvrt||||}nt||||| d }d dt| D} t | }g}t|D](\}}|dt||d|q&| |}nddt| D} | }t|ddd}ddt t |D}|rtf||||d|\}}nt||||||d\}}t|}t}|j|ddt !|"#d} |rFfdd|$D}|jdkrB|jdksFJ| ||fS)z( ocr_bbox: list of xyxy format bbox zclickable buttons on the screenrrr)rrrcSsg|] }t|qSr rrr r r!r@CrAz'get_som_labeled_img..zno ocr bbox!!!N)rrrNrZphi3_v)rQcSs g|]\}}d|d|qSz Text Box ID : r r>rUrr r r!r@VrAz Icon Box ID rcSs g|]\}}d|d|qSrr rr r r!r@]rArrrcSsg|]}|qSr r rr r r!r@brA)rOrrr)rOrrrrrPNG)formatasciics>i|]6\}}||d|d|d|dgqS)rr'r(r)r rmrrr r!rpprAz'get_som_labeled_img..r'r)%rrrrrrrrrrrrrHrBrErrprintrrGZ model_typerr`rCrFrr rZ fromarrayioBytesIOsavebase64 b64encodegetvaluedecoder})!Zimg_pathrZ BOX_TRESHOLDZoutput_coord_in_ratiorNrrZdraw_bbox_configrPZocr_textZuse_local_semanticsrrQZ TEXT_PROMPTZ TEXT_TRESHOLDrOrrrrrMZ caption_modelZparsed_content_iconZ icon_startZparsed_content_icon_lsrUrZparsed_content_mergedrrZpil_imgbuffered encoded_imager rr!get_som_labeled_img4sR     "  $rcCs||dd|dd|dd|dd|dd|ddf\}}}}t|t|t|t|f\}}}}||||fSNrr'r(rDrrgyrrr r r!get_xywhvsL$rcCsd|dd|dd|dd|ddf\}}}}t|t|t|t|f\}}}}||||fSrr)rrgrZxpZypr r r!get_xyxy{s4$rcCsd|d|d|d|d|d|df\}}}}t|t|t|t|f\}}}}||||fSrrrr r r! get_xywh_yolos4$rr4cCsfd}d}||krbz&tjjjt|d|d}|jdjjWStd||d7}t dYq0qdS) zc API call, check https://platform.openai.com/docs/guides/vision for the latest api usage. r)rrrrr~rs max_tokensretry call gptvr'r*) clientchat completionscreate deploymentchoicesmessagercrtimesleep)bodyr max_num_trial num_trialrr r r!run_apis  rc Cs*|rXzBt|d$}t|d}Wdn1s:0YWn|}Yn0|r|ddd|idd|dg}n d|dg}d }d }d }||krzFtjjjt d dd dgdd|dgd|d} | j d j j } WqWqt d||d7}d} tdYq0q||kr"d}| |fS)Nrbr image_urlurldata:image/jpeg;base64,rIrr.rIr.r)rTsystemzpYou are an AI assistant that is good at making plans and analyzing screens, and helping people find information.rbrarrrrr'rr*F)rrrreadrrrrrrrrrcrrr) message_textrrZimg_filerrcrrcall_api_successr ans_1st_passr r r!call_gpt4v_newsN 6        rrcCs|dur i}tj|fi|}d}|rd}d|d|dt|d}ddd d |d g} td t|\} } z4| d d d} t| }td| d}WntdYn0dd|D} dd|D} |rPt |}t |t j }g}| D]H}t |\}}}}|||||ft |||f||||fddqt|n2|dkrjdd| D}n|dkrdd| D}| |f|fS)NFa*Example 1: Based on task and ocr results, ```In summary, the task related bboxes are: [([[3060, 111], [3135, 111], [3135, 141], [3060, 141]], 'Share', 0.949013667261589), ([[3068, 197], [3135, 197], [3135, 227], [3068, 227]], 'Link _', 0.3567054243152049), ([[3006, 321], [3178, 321], [3178, 354], [3006, 354]], 'Manage Access', 0.8800734456437066)] ``` Example 2: Based on task and ocr results, ```In summary, the task related bboxes are: [([[3060, 111], [3135, 111], [3135, 141], [3060, 141]], 'Search Google or type a URL', 0.949013667261589)] ```aaBased on the task and ocr results which contains text+bounding box in a dictionary, please filter it so that it only contains the task related bboxes. Requirement: 1. first give a brief analysis. 2. provide an answer in the format: ```In summary, the task related bboxes are: ..```, you must put it inside ``` ```. Do not include any info after ```. z The task is: z, the ocr results are: .rzaYou are an AI assistant that helps people find the correct way to operate computer or smartphone.rbraz+[Perform OCR filtering by goal] ongoing ...z(In summary, the task related bboxes are:rz```z=[Perform OCR filtering by goal] success!!! Filtered buttons: Tz3[Perform OCR filtering by goal] failed or unused!!!cSsg|] }|dqS)rr r>itemr r r!r@rAz!check_ocr_box..cSsg|] }|dqSrjr rr r r!r@rA)rrr(rcSsg|] }t|qSr )rrr r r!r@rArcSsg|] }t|qSr )rrr r r!r@rA)readerZreadtextrrZ call_gpt4vsplitr=ast literal_evalcv2ZimreadZcvtColorZ COLOR_RGB2BGRrrFZ rectanglepltZimshow)rZ display_imgZoutput_bb_formatZgoal_filteringZ easyocr_argsrZis_goal_filteredZ ocr_filter_fsrrQpredrrVr.Z opencv_imgbbr rgrabr r r! check_ocr_boxsD    $   rClick IDc CsZ|}|s@ddddgddd|dddd |id gdg}n8ddddgd|dddd |id d|dgdg}|d d d d} d} d} d} | | krz.tjjjt|d dd} | jdjj}WqWqtj y}z,t d| | d7} d}t dWYd}~qd}~00q| | kr$d} |r4t d|zt d|t j}|r|d}|dddd}t|}n$|dddd}t|}||vr||}|t|}|d|dd|d|ddg|d <Wn0t d!t d"|d#ddgd$dd%}Yn0d}|rJtd&|d'd(\}}t d)||| |d|gfS)*z This func first 1. call gptv(yolo_labled_img, text bbox+task) -> ans_1st_cal 2. call gpt4(ans_1st_cal, label_coordinates) -> final ans rr.zTYou are an AI assistant that is great at interpreting screenshot and predict action.rrbrarrrrrrgffffff?r)r~rsZtop_prr)rTirrr'rNFzAnswer by GPTV: z ```(.*?)```z.In summary, the next action I will perform is:r\r( click_pointz!gptv action regex extract fail!!!z ans_1st_pass:ZCLICKNone)Z action_typervalueZ is_completedzzSummarize what action you decide to perform in the current step, in one sentence, and do not include any icon box number: )rstep_pred_summary)rrrrrrrrcrequestsZRequestExceptionrrrresearchDOTALLgroupr=r replacer rrr)rZyolo_labled_imgrZsummarize_historyverbosehistoryZid_keyrr~payloadrrrrrematchrrZicon_idZbboxrrr r r! get_pred_gptv sr""   "      0  r))r N)N)N)r9r(r))r4)Nr)TrNN)TTNr):osrrrPILrrrjsonrZopenairsysrrr matplotlibrrZeasyocrReaderr r rtypingrr Ztorchvision.opsr rZtorchvision.transformsr Z supervisionrZ transformsrr"r&r`rrrarrayrrndarrayfloatrrrrrrrrrrr)r r r r!s^       ) ,4 , "B  3 =