Spaces: Running on Zero

Commit: initial

Files changed:
- .gitattributes +1 -0
- README.md +29 -8
- app.py +296 -0
- requirements.txt +4 -0
- test-data/prompt1.jpg +3 -0
- test-data/prompt2.jpg +3 -0
- test-data/prompt3.jpg +3 -0
- test-data/prompt4.jpg +3 -0
- test-data/target1.jpg +3 -0
- test-data/target2.jpg +3 -0
- test-data/target3.jpg +3 -0
- test-data/target4.jpg +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,34 @@
 ---
-title:
-
-
-colorTo: blue
+title: OWLv2 Visual Prompt
+short_description: OWLv2 zero-shot detection with visual prompt
+emoji: π
 sdk: gradio
-sdk_version:
+sdk_version: 4.44.1
 app_file: app.py
-pinned: false
-short_description: OWLv2 zero-shot detection with visual prompt
 ---
 
-
+# OWLv2: Zero-shot detection with visual prompt π
+
+This demo showcases the OWLv2 model's ability to perform zero-shot object detection using visual and text prompts.
+
+You can provide either a text prompt or an image as a visual prompt to detect objects in the target image.
+
+For visual prompting, the following sample code is used, taken from the HF documentation:
+```python
+processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
+model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
+
+target_image = Image.open(...)
+prompt_image = Image.open(...)
+inputs = processor(images=target_image, query_images=prompt_image, return_tensors="pt")
+
+# forward pass
+with torch.no_grad():
+    outputs = model.image_guided_detection(**inputs)
+
+target_sizes = torch.Tensor([target_image.size[::-1]])
+
+results = processor.post_process_image_guided_detection(outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes)
+```
+
+For some reason, the visual prompt works much worse than the text prompt; perhaps it's an issue in the HF implementation.
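For the text-prompt mode, post-processing goes through `post_process_grounded_object_detection` instead. Below is a minimal sketch mirroring the text branch of `app.py` further down; the image path, the "cat,remote" query, and the 0.3 threshold are taken from the bundled text-prompt examples.

```python
import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection

processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

# Sample target bundled with this Space; "cat,remote" is its example text prompt.
target_image = Image.open("test-data/target2.jpg")
texts = ["cat", "remote"]

inputs = processor(images=target_image, text=texts, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

target_sizes = torch.tensor([target_image.size[::-1]])  # (height, width)
results = processor.post_process_grounded_object_detection(
    outputs=outputs, target_sizes=target_sizes, threshold=0.3
)[0]

for box, score, label in zip(results["boxes"], results["scores"], results["labels"]):
    # label indexes into the list of text queries
    print(texts[label.item()], round(score.item(), 2), [round(v, 1) for v in box.tolist()])
```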
app.py
ADDED
@@ -0,0 +1,296 @@
import sys

# Mock audio modules to avoid installing them
sys.modules["audioop"] = type("audioop", (), {"__file__": ""})()
sys.modules["pyaudioop"] = type("pyaudioop", (), {"__file__": ""})()

import torch
import gradio as gr
import supervision as sv
import spaces
from transformers import AutoProcessor, Owlv2ForObjectDetection

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

@spaces.GPU
def init_model(model_id):
    processor = AutoProcessor.from_pretrained(model_id)
    model = Owlv2ForObjectDetection.from_pretrained(model_id)
    model.eval()
    model.to(DEVICE)
    return processor, model

@spaces.GPU
def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
    processor, model = init_model(model_id)

    result = None
    class_names = {}

    if prompt_type == "Text":
        inputs = processor(
            images=target_image,
            text=prompts["texts"],
            return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            outputs = model(**inputs)

        target_sizes = torch.tensor([target_image.size[::-1]])
        result = processor.post_process_grounded_object_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh
        )[0]
        class_names = {k: v for k, v in enumerate(prompts["texts"])}

    elif prompt_type == "Visual":
        inputs = processor(
            images=target_image,
            query_images=prompts["images"],
            return_tensors="pt"
        ).to(DEVICE)
        with torch.no_grad():
            outputs = model.image_guided_detection(**inputs)

        # Post-process results
        target_sizes = torch.tensor([target_image.size[::-1]])
        result = processor.post_process_image_guided_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh,
            nms_threshold=iou_thresh
        )[0]

        # prepare for supervision: add 0 label for all boxes
        result['labels'] = torch.zeros(len(result['boxes']), dtype=torch.int64)
        class_names = {0: "object"}

    detections = sv.Detections.from_transformers(result, class_names)

    resolution_wh = target_image.size
    thickness = sv.calculate_optimal_line_thickness(resolution_wh=resolution_wh)
    text_scale = sv.calculate_optimal_text_scale(resolution_wh=resolution_wh)

    labels = [
        f"{class_name} {confidence:.2f}"
        for class_name, confidence
        in zip(detections['class_name'], detections.confidence)
    ]

    annotated_image = target_image.copy()
    annotated_image = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=thickness).annotate(
        scene=annotated_image, detections=detections)
    annotated_image = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX, text_scale=text_scale, smart_position=True).annotate(
        scene=annotated_image, detections=detections, labels=labels)

    return annotated_image


def app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    target_image = gr.Image(type="pil", label="Target Image", visible=True, interactive=True)

                detect_button = gr.Button(value="Detect Objects")
                prompt_type = gr.State(value='Visual')  # Default prompt type

                with gr.Tab("Visual") as visual_tab:
                    with gr.Row():
                        prompt_image = gr.Image(type="pil", label="Prompt Image", visible=True, interactive=True)

                with gr.Tab("Text") as text_tab:
                    texts = gr.Textbox(label="Input Texts", value='', placeholder='person,bus', visible=True, interactive=True)

                visual_tab.select(
                    fn=lambda: ("Visual", gr.update(visible=True)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )

                text_tab.select(
                    fn=lambda: ("Text", gr.update(value=None, visible=False)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )

                model_id = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/owlv2-base-patch16-ensemble",
                        "google/owlv2-large-patch14"
                    ],
                    value="google/owlv2-base-patch16-ensemble",
                )
                conf_thresh = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
                iou_thresh = gr.Slider(
                    label="IoU Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.70,
                )

            with gr.Column():
                output_image = gr.Image(type="numpy", label="Annotated Image", visible=True)

        def run_inference(prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type):
            # add text/built-in prompts
            if prompt_type == "Text":
                texts = [text.strip() for text in texts.split(',')]
                prompts = {
                    "texts": texts
                }
            # add visual prompt
            elif prompt_type == "Visual":
                prompts = {
                    "images": prompt_image,
                }

            return inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type)

        detect_button.click(
            fn=run_inference,
            inputs=[prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type],
            outputs=[output_image],
        )

        ###################### Examples ##########################
        image_examples_list = [[
            "test-data/target1.jpg",
            "test-data/prompt1.jpg",
            "google/owlv2-base-patch16-ensemble",
            0.9,
            0.3,
        ],
        [
            "test-data/target2.jpg",
            "test-data/prompt2.jpg",
            "google/owlv2-base-patch16-ensemble",
            0.9,
            0.3,
        ],
        [
            "test-data/target3.jpg",
            "test-data/prompt3.jpg",
            "google/owlv2-base-patch16-ensemble",
            0.9,
            0.3,
        ],
        [
            "test-data/target4.jpg",
            "test-data/prompt4.jpg",
            "google/owlv2-base-patch16-ensemble",
            0.9,
            0.3,
        ]
        ]

        text_examples = gr.Examples(
            examples=[[
                "test-data/target1.jpg",
                "logo",
                "google/owlv2-base-patch16-ensemble",
                0.3],
            [
                "test-data/target2.jpg",
                "cat,remote",
                "google/owlv2-base-patch16-ensemble",
                0.3],
            [
                "test-data/target3.jpg",
                "frog,spider,lizard",
                "google/owlv2-base-patch16-ensemble",
                0.3],
            [
                "test-data/target4.jpg",
                "cat",
                "google/owlv2-base-patch16-ensemble",
                0.3]
            ],
            inputs=[target_image, texts, model_id, conf_thresh],
            visible=False, cache_examples=False, label="Text Prompt Examples")

        image_examples = gr.Examples(
            examples=image_examples_list,
            inputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh],
            visible=True, cache_examples=False, label="Box Visual Prompt Examples")

        # Examples update
        def update_text_examples():
            return gr.Dataset(visible=True), gr.Dataset(visible=False), gr.update(visible=False)

        def update_visual_examples():
            return gr.Dataset(visible=False), gr.Dataset(visible=True), gr.update(visible=True)

        text_tab.select(
            fn=update_text_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )

        visual_tab.select(
            fn=update_visual_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )

    return target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list

gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
        <h1 style='text-align: center'>OWLv2: Zero-shot detection with visual prompt π</h1>
        """)
    gr.Markdown("""
This demo showcases the OWLv2 model's ability to perform zero-shot object detection using visual and text prompts.

You can provide either a text prompt or an image as a visual prompt to detect objects in the target image.

For visual prompting, the following sample code is used, taken from the HF documentation:
```python
processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

target_image = Image.open(...)
prompt_image = Image.open(...)
inputs = processor(images=target_image, query_images=prompt_image, return_tensors="pt")

# forward pass
with torch.no_grad():
    outputs = model.image_guided_detection(**inputs)

target_sizes = torch.Tensor([target_image.size[::-1]])

results = processor.post_process_image_guided_detection(outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes)
```

For some reason, the visual prompt works much worse than the text prompt; perhaps it's an issue in the HF implementation.
""")

    with gr.Row():
        with gr.Column():
            # Create a list of all UI components
            ui_components = app()
            # Unpack the components
            target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list = ui_components

    gradio_app.load(
        fn=lambda: image_examples_list[1],
        outputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh]
    )


if __name__ == '__main__':
    gradio_app.launch(allowed_paths=["figures"])
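To exercise the detection path locally without launching the UI, a sketch along the following lines should work. It assumes the pinned requirements plus the `spaces` package are installed and that `@spaces.GPU` behaves as a no-op outside a ZeroGPU Space; the file names and thresholds come from the first visual-prompt example.

```python
# Hypothetical local smoke test for inference() from app.py, using the first
# bundled visual-prompt example (0.9 / 0.3 match image_examples_list).
from PIL import Image

from app import inference  # importing app.py also builds (but does not launch) the Gradio UI

target = Image.open("test-data/target1.jpg")
prompt = Image.open("test-data/prompt1.jpg")

annotated = inference(
    prompts={"images": prompt},
    target_image=target,
    model_id="google/owlv2-base-patch16-ensemble",
    conf_thresh=0.9,
    iou_thresh=0.3,
    prompt_type="Visual",
)
annotated.save("annotated.jpg")  # inference() returns the annotated PIL image
```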
requirements.txt
ADDED
@@ -0,0 +1,4 @@
gradio==4.44.1
gradio_client==1.3.0
supervision==0.26.1
transformers==4.53.2
test-data/prompt1.jpg ADDED (Git LFS)
test-data/prompt2.jpg ADDED (Git LFS)
test-data/prompt3.jpg ADDED (Git LFS)
test-data/prompt4.jpg ADDED (Git LFS)
test-data/target1.jpg ADDED (Git LFS)
test-data/target2.jpg ADDED (Git LFS)
test-data/target3.jpg ADDED (Git LFS)
test-data/target4.jpg ADDED (Git LFS)