Spaces: Running on Zero
interesting findings
- README.md +10 -18
- app.py +193 -70
- test-data/prompt5.jpg +3 -0
- test-data/prompt6.jpg +3 -0
- test-data/target5.jpg +3 -0
- test-data/target6.jpg +3 -0
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: OWLv2 Visual
+title: OWLv2 Visual Prompting
 short_description: OWLv2 zero-shot detection with visual prompt
 emoji: π
 sdk: gradio
@@ -9,28 +9,20 @@ models:
 - google/owlv2-large-patch14-ensemble
 ---
 
-# OWLv2: Zero-
-
-This demo showcases the OWLv2 model's ability to perform zero-shot object detection using
-
-```python
-processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
-model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
-
-prompt_image = Image.open(...)
-inputs = processor(images=target_image, query_images=prompt_image, return_tensors="pt")
-
-with torch.no_grad():
-    outputs = model.image_guided_detection(**inputs)
-
-```
+# OWLv2: Zero-Shot Object Detection with Visual Prompting
+
+This demo showcases the OWLv2 model's ability to perform zero-shot object detection using both text and visual prompts. More importantly, it compares different approaches for selecting a query embedding from a visual prompt. The method used in Hugging Face's `transformers` library often underperforms because of how the visual prompt embedding is selected.
+
+## The Problem with the Default Method
+
+The standard implementation in `transformers` (using `model.image_guided_detection`) selects an embedding from the prompt image by maximizing its box's IoU with the full prompt image area and its distance from the average of other embeddings (`embed_image_query`).
+
+However, this selection heuristic does not account for padding and often selects the largest box, which may also span the padded background. This leads to selecting an irrelevant embedding and, consequently, poor detection performance in the target image.
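As a rough illustration, consider a hypothetical 640×480 prompt image (the numbers and names below are illustrative only and do not appear in the code): OWLv2 preprocessing pads the image to a square canvas, so a quarter of the canvas is padding, and any box scored by IoU against the full canvas is pulled toward covering that padding.

```python
# Hypothetical 640x480 prompt image padded to a 640x640 square canvas.
prompt_w, prompt_h = 640, 480
max_side = max(prompt_w, prompt_h)

# Fraction of the padded canvas occupied by real image content.
content_area = (prompt_w / max_side) * (prompt_h / max_side)   # 0.75

# Best IoU a box can reach against the full canvas [0, 0, 1, 1] while staying
# on the real content: it is capped at the content fraction, so a larger box
# that also spans the padded strip scores higher.
print(content_area)  # 0.75
```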
+## An Alternative Approach: Objectness × IoU
+
+This demo implements and compares an alternative method for selecting the query embedding. This method works by maximizing a combination of the objectness score (predicted by the model) and the box's IoU score with the non-padded area of the prompt image. The selected embedding, therefore, tends to represent the most distinct and largest object on the prompt image while excluding any padded areas.
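For reference, the selection step that `app.py` (further down) implements boils down to roughly the following sketch; `model`, `prompt_image`, `query_image_feats` (prompt-image patch features), and `pred_boxes` (prompt-image box predictions in normalized center format) are assumed to have been computed as in that file:

```python
import torch
from transformers.models.owlv2.modeling_owlv2 import box_iou, center_to_corners_format

# Objectness score and class embedding for every patch of the prompt image.
objectness = torch.sigmoid(model.objectness_predictor(query_image_feats))   # (1, num_patches)
_, class_embeds = model.class_predictor(query_image_feats)                  # (1, num_patches, dim)

# Box covering only the real content of the padded square prompt image.
pw, ph = prompt_image.size
max_side = max(pw, ph)
content_box = torch.tensor(
    [[0.0, 0.0, pw / max_side, ph / max_side]], device=pred_boxes.device
)

# Combine objectness with IoU against the non-padded area and pick the best box.
ious, _ = box_iou(content_box, center_to_corners_format(pred_boxes)[0])     # (1, num_patches)
best_idx = torch.argmax(objectness * ious, dim=-1)
query_embed = class_embeds[0][best_idx]   # used as the visual query embedding
```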
+
+## Results
+
+This space compares the results from both methods. The examples clearly demonstrate that this alternative embedding selection approach provides significantly more accurate and reliable results, often performing on par with text-based prompting.
app.py
CHANGED
@@ -8,7 +8,10 @@ import torch
 import gradio as gr
 import supervision as sv
 import spaces
-from
+from PIL import Image
+from transformers import AutoProcessor, Owlv2ForObjectDetection, Owlv2Processor
+from transformers.models.owlv2.modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, center_to_corners_format, box_iou
+#from transformers.models.owlv2.image_processing_owlv2
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -18,14 +21,23 @@ def init_model(model_id):
     model = Owlv2ForObjectDetection.from_pretrained(model_id)
     model.eval()
     model.to(DEVICE)
+    image_size = tuple(processor.image_processor.size.values())
+    image_mean = torch.tensor(
+        processor.image_processor.image_mean, device=DEVICE
+    ).view(1, 3, 1, 1)
+    image_std = torch.tensor(
+        processor.image_processor.image_std, device=DEVICE
+    ).view(1, 3, 1, 1)
+
+    return processor, model, image_size, image_mean, image_std
 
 @spaces.GPU
 def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
-    processor, model = init_model(model_id)
+    processor, model, image_size, image_mean, image_std = init_model(model_id)
 
+    annotated_image_my = None
+    annotated_image_hf = None
+    annotated_prompt_image = None
 
     if prompt_type == "Text":
         inputs = processor(
@@ -36,40 +48,128 @@ def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
 
         with torch.no_grad():
             outputs = model(**inputs)
+        target_sizes = torch.tensor([target_image.size[::-1]])
+        result = processor.post_process_grounded_object_detection(
+            outputs=outputs,
+            target_sizes=target_sizes,
+            threshold=conf_thresh
+        )[0]
         class_names = {k: v for k, v in enumerate(prompts["texts"])}
+        # annotate the target image
+        annotated_image_hf = annotate_image(result, class_names, target_image)
 
     elif prompt_type == "Visual":
+        prompt_image = prompts["images"]
         inputs = processor(
             images=target_image,
-            query_images=
+            query_images=prompt_image,
            return_tensors="pt"
         ).to(DEVICE)
         with torch.no_grad():
+            query_feature_map = model.image_embedder(pixel_values=inputs.query_pixel_values)[0]
+
+            feature_map = model.image_embedder(pixel_values=inputs.pixel_values)[0]
+            batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
+            image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
+
+            batch_size, num_patches_height, num_patches_width, hidden_dim = query_feature_map.shape
+            query_image_feats = torch.reshape(query_feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
+
+            # Select using hf method
+            query_embeds2, box_indices, pred_boxes = model.embed_image_query(
+                query_image_features=query_image_feats,
+                query_feature_map=query_feature_map
+            )
+
+            # Select top object from prompt image * iou
+            objectnesses = torch.sigmoid(model.objectness_predictor(query_image_feats))
+            _, source_class_embeddings = model.class_predictor(query_image_feats)
+
+            # identify the box that covers only the prompt image area excluding padding
+            pw, ph = prompt_image.size
+            max_side = max(pw, ph)
+            each_query_box = torch.tensor([[0, 0, pw/max_side, ph/max_side]], device=DEVICE)
+
+            pred_boxes_as_corners = center_to_corners_format(pred_boxes)
+            each_query_pred_boxes = pred_boxes_as_corners[0]
+            ious, _ = box_iou(each_query_box, each_query_pred_boxes)
+            comb_score = objectnesses * ious
+            top_obj_idx = torch.argmax(comb_score, dim=-1)
+            query_embeds = source_class_embeddings[0][top_obj_idx]
+
+            # Predict object boxes
+            target_pred_boxes = model.box_predictor(image_feats, feature_map)
+
+            # Predict for prompt: my method
+            (pred_logits, class_embeds) = model.class_predictor(image_feats=image_feats, query_embeds=query_embeds)
+            outputs = Owlv2ImageGuidedObjectDetectionOutput(
+                logits=pred_logits,
+                target_pred_boxes=target_pred_boxes,
+            )
+        # Post-process results
+        target_sizes = torch.tensor([target_image.size[::-1]])
+        result = processor.post_process_image_guided_detection(
+            outputs=outputs,
+            target_sizes=target_sizes,
+            threshold=conf_thresh,
+            nms_threshold=iou_thresh
+        )[0]
+        # prepare for supervision: add 0 label for all boxes
+        result['labels'] = torch.zeros(len(result['boxes']), dtype=torch.int64)
+        class_names = {0: "object"}
+        # annotate the target image
+        annotated_image_my = annotate_image(result, class_names, pad_to_square(target_image))
+
+        # Predict for prompt: hf method
+        (pred_logits, class_embeds) = model.class_predictor(image_feats=image_feats, query_embeds=query_embeds2)
+        # Predict object boxes
+        outputs = Owlv2ImageGuidedObjectDetectionOutput(
+            logits=pred_logits,
+            target_pred_boxes=target_pred_boxes,
+        )
+        # Post-process results
+        target_sizes = torch.tensor([target_image.size[::-1]])
+        result = processor.post_process_image_guided_detection(
+            outputs=outputs,
+            target_sizes=target_sizes,
+            threshold=conf_thresh,
+            nms_threshold=iou_thresh
+        )[0]
+        # prepare for supervision: add 0 label for all boxes
+        result['labels'] = torch.zeros(len(result['boxes']), dtype=torch.int64)
+        class_names = {0: "object"}
+        # annotate the target image
+        annotated_image_hf = annotate_image(result, class_names, pad_to_square(target_image))
+
+        # Render selected prompt embedding
+        query_pred_boxes = pred_boxes[0, [top_obj_idx, box_indices[0]]].unsqueeze(0)
+        query_logits = torch.reshape(objectnesses[0, [top_obj_idx, box_indices[0]]], (1, 2, 1))
+        query_outputs = Owlv2ImageGuidedObjectDetectionOutput(
+            logits=query_logits,
+            target_pred_boxes=query_pred_boxes,
+        )
+        query_result = processor.post_process_image_guided_detection(
+            outputs=query_outputs,
+            target_sizes=torch.tensor([prompt_image.size[::-1]]),
+            threshold=0.0,
+            nms_threshold=1.0
+        )[0]
+        query_result['labels'] = torch.Tensor([0, 1])
+
+        # Annotate the prompt image
+        query_class_names = {0: "my", 1: "hf"}
+        annotated_prompt_image = annotate_image(query_result, query_class_names, pad_to_square(prompt_image))
+
+    return annotated_image_my, annotated_image_hf, annotated_prompt_image
+
 
+def annotate_image(result, class_names, image):
     detections = sv.Detections.from_transformers(result, class_names)
 
-    resolution_wh =
+    resolution_wh = image.size
     thickness = sv.calculate_optimal_line_thickness(resolution_wh=resolution_wh)
     text_scale = sv.calculate_optimal_text_scale(resolution_wh=resolution_wh)
 
@@ -79,7 +179,7 @@ def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
         in zip(detections['class_name'], detections.confidence)
     ]
 
-    annotated_image =
+    annotated_image = image.copy()
     annotated_image = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=thickness).annotate(
         scene=annotated_image, detections=detections)
     annotated_image = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX, text_scale=text_scale, smart_position=True).annotate(
@@ -87,36 +187,28 @@ def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
 
     return annotated_image
 
+def pad_to_square(image, background_color=(128, 128, 128)):
+    width, height = image.size
+    max_side = max(width, height)
+    result = Image.new(image.mode, (max_side, max_side), background_color)
+    result.paste(image, (0, 0))
+    return result
 
 def app():
     with gr.Blocks():
         with gr.Row():
             with gr.Column():
+                target_image = gr.Image(type="pil", label="Target Image", visible=True, interactive=True)
 
                 detect_button = gr.Button(value="Detect Objects")
                 prompt_type = gr.Textbox(value='Visual', visible=False) # Default prompt type
 
                 with gr.Tab("Visual") as visual_tab:
+                    prompt_image = gr.Image(type="pil", label="Prompt Image", visible=True, interactive=True)
 
                 with gr.Tab("Text") as text_tab:
                     texts = gr.Textbox(label="Input Texts", value='', placeholder='person,bus', visible=True, interactive=True)
 
-                visual_tab.select(
-                    fn=lambda: ("Visual", gr.update(visible=True)),
-                    inputs=None,
-                    outputs=[prompt_type, prompt_image]
-                )
-
-                text_tab.select(
-                    fn=lambda: ("Text", gr.update(value=None, visible=False)),
-                    inputs=None,
-                    outputs=[prompt_type, prompt_image]
-                )
-
                 model_id = gr.Dropdown(
                     label="Model",
                     choices=[
@@ -133,7 +225,7 @@ def app():
                     value=0.25,
                 )
                 iou_thresh = gr.Slider(
-                    label="
+                    label="NMS Threshold",
                     minimum=0.0,
                     maximum=1.0,
                     step=0.05,
@@ -141,8 +233,32 @@ def app():
                 )
 
             with gr.Column():
+                output_image_hf_gr = gr.Group()
+                with output_image_hf_gr:
+                    gr.Markdown("### Annotated Image (HF default)")
+                    output_image_hf = gr.Image(type="numpy", visible=True, show_label=False)
+
+                output_image_my_gr = gr.Group()
+                with output_image_my_gr:
+                    gr.Markdown("### Annotated Image (Objectness × IoU variant)")
+                    output_image_my = gr.Image(type="numpy", visible=True, show_label=False)
+
+                annotated_prompt_image_gr = gr.Group()
+                with annotated_prompt_image_gr:
+                    gr.Markdown("### Prompt Image with Selected Embeddings and Objectness Score")
+                    annotated_prompt_image = gr.Image(type="numpy", visible=True, show_label=False)
+
+        visual_tab.select(
+            fn=lambda: ("Visual", gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)),
+            inputs=None,
+            outputs=[prompt_type, prompt_image, output_image_my_gr, annotated_prompt_image_gr]
+        )
+
+        text_tab.select(
+            fn=lambda: ("Text", gr.update(value=None, visible=False), gr.update(visible=False), gr.update(visible=False)),
+            inputs=None,
+            outputs=[prompt_type, prompt_image, output_image_my_gr, annotated_prompt_image_gr]
+        )
 
         def run_inference(prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type):
             # add text/built-in prompts
@@ -162,7 +278,7 @@ def app():
         detect_button.click(
            fn=run_inference,
            inputs=[prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type],
-            outputs=[
+            outputs=[output_image_my, output_image_hf, annotated_prompt_image],
         )
 
         ###################### Examples ##########################
@@ -193,6 +309,20 @@ def app():
                 "google/owlv2-base-patch16-ensemble",
                 0.9,
                 0.3,
+            ],
+            [
+                "test-data/target5.jpg",
+                "test-data/prompt5.jpg",
+                "google/owlv2-base-patch16-ensemble",
+                0.9,
+                0.3,
+            ],
+            [
+                "test-data/target6.jpg",
+                "test-data/prompt6.jpg",
+                "google/owlv2-base-patch16-ensemble",
+                0.9,
+                0.3,
             ]
         ]
 
@@ -216,7 +346,20 @@ def app():
                 "test-data/target4.jpg",
                 "cat",
                 "google/owlv2-base-patch16-ensemble",
-                0.3
+                0.3
+            ],
+            [
+                "test-data/target5.jpg",
+                "lemon,straw",
+                "google/owlv2-base-patch16-ensemble",
+                0.3
+            ],
+            [
+                "test-data/target6.jpg",
+                "beer logo",
+                "google/owlv2-base-patch16-ensemble",
+                0.3
+            ]
             ],
            inputs=[target_image, texts, model_id, conf_thresh],
            visible=False, cache_examples=False, label="Text Prompt Examples")
@@ -255,28 +398,8 @@ with gradio_app:
     """)
     gr.Markdown("""
     This demo showcases the OWLv2 model's ability to perform zero-shot object detection using visual and text prompts.
     You can either provide a text prompt or an image as a visual prompt to detect objects in the target image.
-
-    For visual prompting, following sample code is used, taken from the HF documentation:
-    ```python
-    processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
-    model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
-
-    target_image = Image.open(...)
-    prompt_image = Image.open(...)
-    inputs = processor(images=target_image, query_images=prompt_image, return_tensors="pt")
-
-    # forward pass
-    with torch.no_grad():
-        outputs = model.image_guided_detection(**inputs)
-
-    target_sizes = torch.Tensor([image.size[::-1]])
-
-    results = processor.post_process_image_guided_detection(outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes)
-    ```
-
-    For some reason, visual prompt works much worse than text, perhaps it's HF implementation issue.
+    Additionally, it compares different approaches for selecting a query embedding from a visual prompt. The method used by default in Hugging Face's `transformers` often underperforms because of how the visual prompt embedding is selected (see README.md for more details).
     """)
 
     with gr.Row():
test-data/prompt5.jpg
ADDED (Git LFS)

test-data/prompt6.jpg
ADDED (Git LFS)

test-data/target5.jpg
ADDED (Git LFS)

test-data/target6.jpg
ADDED (Git LFS)