|
[ |
|
{ |
|
"id": 1982, |
|
"image": "images/1982_testset_0000394.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[161, 256, 183, 281]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1091, |
|
"image": "images/1091_motorcycle_000923.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the motorcycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[456, 231, 510, 277]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6498, |
|
"image": "images/6498_thu_cut_fruit_RGB_cut_fruit_cwx_2_22.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[512, 196, 560, 252]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 24, |
|
"image": "images/24_6.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the basket\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[817, 270, 1095, 436]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2420, |
|
"image": "images/2420_ego4d_7aca7552-56d2-4ac0-be30-aed9c5dfc8ea_9000.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the couch\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[813, 442, 930, 558]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1045, |
|
"image": "images/1045_bench_002635.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[321, 169, 508, 224]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6738, |
|
"image": "images/6738_cup_002554.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[224, 293, 277, 356]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5909, |
|
"image": "images/5909_bicycle_001556.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[117, 113, 227, 192]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7259, |
|
"image": "images/7259_golf_clubs_001094.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"swing the golf_clubs\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[358, 117, 424, 229]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6523, |
|
"image": "images/6523_soccer_ball_002817.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[231, 117, 378, 275]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3428, |
|
"image": "images/3428_testset_0001985.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[68, 36, 100, 62]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7249, |
|
"image": "images/7249_banana_000253.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"peel the banana\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[216, 119, 387, 396]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6568, |
|
"image": "images/6568_testset_0000092.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[120, 189, 148, 217]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5999, |
|
"image": "images/5999_ego4d_6b889d80-1c10-413c-9532-fbd629305a43_36300.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"type_on the laptop\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1507, 115, 1918, 315]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2950, |
|
"image": "images/2950_bicycle_000661.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"push the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[483, 0, 533, 24]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2050, |
|
"image": "images/2050_baseball_002670.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the baseball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[307, 166, 364, 222]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3501, |
|
"image": "images/3501_testset_0000898.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[409, 261, 459, 319]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5111, |
|
"image": "images/5111_youtube_Making a T-Bone Steak POV First Person Cooking-znuEJcLMSjw-1080p-1644527668544_003600.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"wash the bowl\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[817, 585, 1186, 1005]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4416, |
|
"image": "images/4416_testset_0000636.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the skateboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[325, 370, 466, 444]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5277, |
|
"image": "images/5277_epic_P23_02_frame_0000071911.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"stir the bowl\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[222, 96, 293, 163]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5348, |
|
"image": "images/5348_ego4d_8af2ed7d-4435-40ce-b3cc-685f28bfcb2d_37500.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take_photo the cell_phone\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1182, 355, 1372, 505]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2722, |
|
"image": "images/2722_testset_0001597.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the surfboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[34, 222, 82, 342]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4035, |
|
"image": "images/4035_testset_0001460.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[504, 370, 576, 417]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2999, |
|
"image": "images/2999_thu_cut_fruit_RGB_cut_fruit_tys_3_99.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[355, 60, 408, 113]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1217, |
|
"image": "images/1217_testset_0001818.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the chair\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[582, 487, 647, 537]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6395, |
|
"image": "images/6395_knife_001124.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[412, 62, 577, 189]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4496, |
|
"image": "images/4496_frisbee_002270.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[58, 73, 925, 931]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3348, |
|
"image": "images/3348_motorcycle_002093.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the motorcycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[286, 134, 337, 184]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5290, |
|
"image": "images/5290_ego4d_de8ea8a7-6bdb-4679-9b17-9dc6a79e45e7_4800.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take_photo the cell_phone\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[889, 424, 1018, 560]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6667, |
|
"image": "images/6667_bicycle_002350.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[389, 72, 455, 120]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7248, |
|
"image": "images/7248_banana_000480.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"peel the banana\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[289, 112, 359, 296]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4655, |
|
"image": "images/4655_youtube_how to cook PASTA RISOTTATA UNDER 30 MINUTE PASTA MADE LIKE RISOTTO JONAH IN THE KITCHEN-QHjXmwMg3MY-1080p-1644528336886 (1)_012420.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"open the bottle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[760, 284, 885, 409]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4731, |
|
"image": "images/4731_thu_read_book_RGB_read_book_cwx_3_86.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"open the book\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[275, 255, 526, 348]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3375, |
|
"image": "images/3375_testset_0001667.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the motorbike\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[493, 181, 548, 227]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3059, |
|
"image": "images/3059_knife_000029.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[407, 149, 456, 204]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5885, |
|
"image": "images/5885_testset_0001139.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[437, 235, 481, 275]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1000, |
|
"image": "images/1000_bench_001222.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[316, 286, 394, 330]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1705, |
|
"image": "images/1705_skateboard_002490.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"carry the skateboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[279, 191, 350, 248]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7081, |
|
"image": "images/7081_thu_stir_RGB_stir_lhh_3_60.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sip the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[289, 132, 378, 211]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 647, |
|
"image": "images/647_thu_stir_RGB_stir_lhh_3_60.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"drink_with the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[289, 132, 378, 211]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5360, |
|
"image": "images/5360_ego4d_f5b87541-3879-444f-be65-2b17f1d95735_24450.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take_photo the cell_phone\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1547, 200, 1701, 413]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5, |
|
"image": "images/5_17.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the basket\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[441, 960, 679, 1055]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3682, |
|
"image": "images/3682_ego4d_9b8e8dc6-4069-4ef4-b8b2-edeb76894d53_17250.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the bottle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[992, 0, 1157, 158]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2483, |
|
"image": "images/2483_testset_0000557.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[267, 276, 509, 322]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5378, |
|
"image": "images/5378_camera_000542.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take_photo the camera\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[252, 284, 492, 425]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4480, |
|
"image": "images/4480_axe_003531.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the axe\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[341, 136, 803, 508]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 432, |
|
"image": "images/432_testset_0000327.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"catch the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[164, 98, 198, 129]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5440, |
|
"image": "images/5440_testset_0000972.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pick_up the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[116, 68, 159, 94]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 0, |
|
"image": "images/0_8.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the basket\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1158, 344, 1382, 452]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 200, |
|
"image": "images/200_punching_bag_002699.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"boxing the punching_bag\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[141, 173, 284, 425]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6410, |
|
"image": "images/6410_knife_003240.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[334, 0, 428, 116]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5567, |
|
"image": "images/5567_banana_002174.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"peel the banana\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[49, 67, 209, 245]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4535, |
|
"image": "images/4535_testset_0002282.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[267, 45, 366, 107]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3242, |
|
"image": "images/3242_testset_0001182.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[250, 171, 277, 200]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6210, |
|
"image": "images/6210_bed_002956.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bed\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[234, 207, 496, 252]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4004, |
|
"image": "images/4004_ego4d_8a280483-bbfd-4f24-9654-bf6a008e406d_25050.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the scissors\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1188, 262, 1457, 527]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4217, |
|
"image": "images/4217_epic_P02_122_frame_0000010441.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the bowl\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[373, 79, 409, 119]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2205, |
|
"image": "images/2205_testset_0001880.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[152, 86, 230, 136]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4052, |
|
"image": "images/4052_testset_0001986.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[153, 24, 243, 76]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6386, |
|
"image": "images/6386_youtube_Making Japanese Katsudon POV First Person Cooking-YYz5a6wS2oE-1080p-1644527980851_006060.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[917, 481, 1075, 608]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6127, |
|
"image": "images/6127_youtube_Good_012720.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"stick the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[418, 0, 541, 141]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4507, |
|
"image": "images/4507_frisbee_003223.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[33, 0, 839, 839]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6794, |
|
"image": "images/6794_ego4d_6e0a6558-c212-4cab-b374-007671edb59c_14100.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1087, 441, 1206, 549]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1420, |
|
"image": "images/1420_testset_0001955.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the skateboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[416, 145, 590, 240]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5110, |
|
"image": "images/5110_bowl_001170.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"wash the bowl\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[0, 171, 213, 228]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3092, |
|
"image": "images/3092_thu_cut_fruit_RGB_cut_fruit_wyy_1_82.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[293, 227, 369, 279]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2760, |
|
"image": "images/2760_testset_0001353.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the surfboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[122, 38, 151, 127]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3216, |
|
"image": "images/3216_testset_0002265.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[172, 299, 384, 543]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6724, |
|
"image": "images/6724_broccoli_003228.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"eat the broccoli\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[97, 128, 241, 251]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4638, |
|
"image": "images/4638_ego4d_11f51f72-c0bd-4631-aabb-6ecbe17d95c9_10050.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hit the axe\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[858, 154, 1091, 263]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2269, |
|
"image": "images/2269_banana_000977.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut the banana\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[162, 203, 307, 429]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1867, |
|
"image": "images/1867_keyboard_000219.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"type_on the keyboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[289, 114, 554, 219]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 382, |
|
"image": "images/382_ego4d_4ac59908-3f13-4085-9479-7348b13803ee_6450.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"write the pen\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1557, 221, 1918, 432]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2939, |
|
"image": "images/2939_testset_0002111.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"push the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[213, 56, 239, 76]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6040, |
|
"image": "images/6040_ego4d_20d62117-8b5f-44b3-b86f-a936e6cdd2c2_4950.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"type_on the laptop\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[2291, 558, 2560, 771]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3326, |
|
"image": "images/3326_motorcycle_002068.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the motorcycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[362, 291, 417, 339]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 790, |
|
"image": "images/790_testset_0002285.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the couch\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[344, 242, 444, 308]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5435, |
|
"image": "images/5435_testset_0001206.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pick_up the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[250, 24, 274, 46]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3696, |
|
"image": "images/3696_wine_glass_002765.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the wine_glass\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[184, 273, 256, 506]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3721, |
|
"image": "images/3721_baseball_bat_003712.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the baseball_bat\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[105, 192, 289, 258]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1628, |
|
"image": "images/1628_testset_0001409.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"carry the surfboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[132, 184, 183, 220]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7126, |
|
"image": "images/7126_thu_use_chopstick_RGB_use_chopstick_cwx_3_54.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"wash the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[339, 151, 442, 252]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2982, |
|
"image": "images/2982_youtube_First Person Cooking ITALIAN SAUSAGE & PEPPERS WITH POLENTA JONAH IN THE KITCHEN POV COOKING-2YPjWvmOb5o-1080p-1644528307790_002580.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[900, 142, 1084, 276]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4162, |
|
"image": "images/4162_testset_0001746.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the surfboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[72, 122, 190, 172]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4174, |
|
"image": "images/4174_snowboard_001840.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the snowboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[679, 54, 805, 260]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7266, |
|
"image": "images/7266_golf_clubs_000638.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"swing the golf_clubs\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[26, 80, 165, 218]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3592, |
|
"image": "images/3592_banana_002623.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"eat the banana\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[49, 254, 169, 449]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4754, |
|
"image": "images/4754_microwave_001049.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"open the microwave\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[112, 108, 621, 171]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3571, |
|
"image": "images/3571_apple_000735.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"eat the apple\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[174, 76, 535, 307]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2979, |
|
"image": "images/2979_knife_000695.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[138, 174, 241, 224]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5354, |
|
"image": "images/5354_ego4d_9a0a2d01-c4e0-4e30-8a69-642070d59ea9_6450.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take_photo the cell_phone\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1537, 374, 1701, 548]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1444, |
|
"image": "images/1444_bicycle_001404.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[298, 80, 348, 126]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5447, |
|
"image": "images/5447_testset_0001906.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pick_up the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[285, 0, 333, 60]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6518, |
|
"image": "images/6518_testset_0000299.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[277, 240, 300, 263]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 789, |
|
"image": "images/789_testset_0000920.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the couch\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[214, 258, 456, 348]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 878, |
|
"image": "images/878_testset_0000949.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[274, 201, 400, 250]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4620, |
|
"image": "images/4620_tennis_racket_001468.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hit the tennis_racket\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[200, 1105, 273, 1407]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6169, |
|
"image": "images/6169_thu_bounce_ball_RGB_bounce_ball_cwx_2_97.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the basketball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[218, 173, 336, 288]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 483, |
|
"image": "images/483_testset_0000152.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"catch the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[222, 119, 249, 144]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2095, |
|
"image": "images/2095_rugby_ball_002080.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the rugby_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[81, 93, 207, 174]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5299, |
|
"image": "images/5299_ego4d_5cae3bb3-1e1e-4b9f-82e8-87dbaad8f73d_9900.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take_photo the cell_phone\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[744, 386, 882, 666]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5542, |
|
"image": "images/5542_epic_P33_102_frame_0000008911.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"peel the carrot\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[26, 25, 82, 89]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1766, |
|
"image": "images/1766_ego4d_70e81c43-3bf3-4a3c-81df-16f5108a6918_24150.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"type_on the laptop\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[588, 645, 884, 764]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6663, |
|
"image": "images/6663_testset_0001042.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[136, 69, 162, 102]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3264, |
|
"image": "images/3264_testset_0000353.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the punching_bag\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[249, 124, 345, 468]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6274, |
|
"image": "images/6274_testset_0001830.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"push the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[252, 123, 278, 150]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6209, |
|
"image": "images/6209_testset_0001089.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bed\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[219, 306, 464, 413]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2328, |
|
"image": "images/2328_testset_0000188.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"drag the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[186, 112, 216, 137]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1535, |
|
"image": "images/1535_testset_0000898.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[412, 274, 463, 322]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2945, |
|
"image": "images/2945_testset_0002150.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"push the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[204, 120, 291, 210]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6767, |
|
"image": "images/6767_escape_S05_short_cut_570.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[938, 372, 1061, 494]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7211, |
|
"image": "images/7211_testset_0001668.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pick_up the skis\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[344, 236, 408, 299]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1393, |
|
"image": "images/1393_testset_0000436.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the motorbike\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[524, 223, 603, 283]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 77, |
|
"image": "images/77_testset_0000454.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"jump the surfboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[244, 372, 320, 427]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5328, |
|
"image": "images/5328_youtube_First Person Cooking POPEYES CHICKEN SANDWICH COPYCAT RECIPE JONAH IN THE KITCHEN POV COOKING-lNAPNvF0kGg-1080p-1644528296768_001740.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take_photo the cell_phone\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[295, 573, 437, 780]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7178, |
|
"image": "images/7178_youtube_First Person Cooking Red Pesto Pasta 30 Minute Pasta JONAH IN THE KITCHEN-FeAspQ44YFk-1080p-1644528516587_005160.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"wash the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[691, 446, 1100, 872]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 587, |
|
"image": "images/587_testset_0000198.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"catch the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[0, 167, 14, 234]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1545, |
|
"image": "images/1545_testset_0001366.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[171, 254, 216, 299]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2547, |
|
"image": "images/2547_testset_0000277.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[94, 469, 227, 556]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 39, |
|
"image": "images/39_testset_0000686.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"jump the surfboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[92, 59, 130, 90]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6559, |
|
"image": "images/6559_testset_0000893.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[164, 122, 208, 160]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5156, |
|
"image": "images/5156_epic_P01_101_frame_0000035461.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"stir the bowl\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[197, 54, 291, 160]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3021, |
|
"image": "images/3021_knife_000942.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[288, 236, 394, 286]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1392, |
|
"image": "images/1392_testset_0001231.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the motorbike\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[300, 295, 383, 343]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6450, |
|
"image": "images/6450_youtube_Really_001140.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[833, 546, 979, 746]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5683, |
|
"image": "images/5683_testset_0000166.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"catch the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[83, 288, 128, 330]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3417, |
|
"image": "images/3417_bicycle_001546.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[234, 80, 289, 126]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 284, |
|
"image": "images/284_ego4d_8f847cfc-ae36-461c-a96b-8fc58766fb2d_8700.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"write the pen\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[717, 735, 1085, 909]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4984, |
|
"image": "images/4984_ego4d_5c3be03c-07a5-4427-a2fe-449f16e8017b_3900.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"text_on the cell_phone\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[327, 650, 522, 856]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1514, |
|
"image": "images/1514_testset_0001042.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[134, 89, 160, 113]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7066, |
|
"image": "images/7066_cup_002410.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sip the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[129, 130, 462, 234]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 474, |
|
"image": "images/474_testset_0000092.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"catch the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[120, 189, 148, 217]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2393, |
|
"image": "images/2393_testset_0002092.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the couch\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[353, 275, 619, 324]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1916, |
|
"image": "images/1916_thu_cut_fruit_RGB_cut_fruit_lzy_2_49.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"stick the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[387, 69, 435, 117]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2417, |
|
"image": "images/2417_testset_0001465.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the couch\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[209, 321, 369, 366]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 12, |
|
"image": "images/12_20.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the basket\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[544, 210, 1053, 378]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2667, |
|
"image": "images/2667_testset_0001397.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[188, 208, 356, 250]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5140, |
|
"image": "images/5140_toothbrush_000575.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"wash the toothbrush\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[130, 43, 319, 196]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4777, |
|
"image": "images/4777_youtube_First Person Cooking CHICKEN QUESADILLAS JONAH IN THE KITCHEN POV COOKING-zma2SgJg6l0-1080p-1644528321483 (1)_002520.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"open the oven\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[908, 824, 1261, 959]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6997, |
|
"image": "images/6997_ego4d_61f0a3d8-01f9-4149-bd0c-c31ad5d319ed_12750.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the axe\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1328, 819, 1618, 1022]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2067, |
|
"image": "images/2067_basketball_003534.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the basketball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[458, 244, 586, 373]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1668, |
|
"image": "images/1668_testset_0001463.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"carry the snowboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[132, 198, 165, 231]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6716, |
|
"image": "images/6716_banana_002934.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"eat the banana\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[83, 390, 213, 431]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1431, |
|
"image": "images/1431_testset_0001432.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[203, 175, 256, 223]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4379, |
|
"image": "images/4379_wine_glass_003375.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the wine_glass\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[288, 152, 439, 295]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2492, |
|
"image": "images/2492_testset_0001207.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[185, 273, 356, 318]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1511, |
|
"image": "images/1511_bicycle_002266.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[145, 106, 209, 159]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5865, |
|
"image": "images/5865_bicycle_001224.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[325, 181, 426, 232]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1780, |
|
"image": "images/1780_ego4d_6a85ad1a-746c-45ac-b372-b677cb6e0a3d_5850.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"type_on the laptop\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1989, 1315, 2237, 1564]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3070, |
|
"image": "images/3070_youtube_Really_011880.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[831, 587, 988, 785]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3693, |
|
"image": "images/3693_wine_glass_002922.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the wine_glass\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[300, 331, 362, 425]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3730, |
|
"image": "images/3730_baseball_bat_000452.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the baseball_bat\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1084, 91, 1299, 280]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 165, |
|
"image": "images/165_testset_0000353.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"boxing the punching_bag\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[249, 124, 345, 468]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3170, |
|
"image": "images/3170_scissors_002226.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the scissors\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[254, 250, 330, 362]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 5718, |
|
"image": "images/5718_soccer_ball_002743.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"catch the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[151, 86, 222, 154]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 26, |
|
"image": "images/26_3.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the basket\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1182, 373, 1398, 522]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4852, |
|
"image": "images/4852_bottle_001569.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sip the bottle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[168, 83, 218, 125]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4569, |
|
"image": "images/4569_testset_0001135.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[162, 105, 186, 128]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 197, |
|
"image": "images/197_testset_0000909.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"boxing the punching_bag\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[192, 183, 265, 490]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3282, |
|
"image": "images/3282_punching_bag_000047.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"kick the punching_bag\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[216, 257, 271, 383]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 477, |
|
"image": "images/477_testset_0000911.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"catch the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[158, 106, 193, 142]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1397, |
|
"image": "images/1397_testset_0000611.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the motorbike\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[148, 199, 246, 263]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2244, |
|
"image": "images/2244_carrot_000953.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut the carrot\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[341, 99, 463, 263]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 684, |
|
"image": "images/684_bottle_002780.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"drink_with the bottle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[444, 270, 517, 339]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2135, |
|
"image": "images/2135_testset_0000991.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[494, 188, 670, 332]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2680, |
|
"image": "images/2680_testset_0001407.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[184, 513, 403, 572]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4619, |
|
"image": "images/4619_tennis_racket_000964.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hit the tennis_racket\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[191, 1159, 265, 1420]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1939, |
|
"image": "images/1939_ego4d_c2a3d4ac-8d13-4daa-a9a9-a867999e956f_29400.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"stick the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[884, 356, 989, 462]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 16, |
|
"image": "images/16_26.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the knife\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[976, 517, 1215, 807]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1378, |
|
"image": "images/1378_testset_0001495.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the motorbike\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[428, 256, 495, 307]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4053, |
|
"image": "images/4053_testset_0001112.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[94, 0, 139, 17]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6238, |
|
"image": "images/6238_testset_0002145.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"lie_on the bed\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[131, 164, 260, 212]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4834, |
|
"image": "images/4834_cup_002062.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sip the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[346, 120, 542, 221]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 21, |
|
"image": "images/21_4.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the knife\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1391, 905, 1728, 1076]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3691, |
|
"image": "images/3691_thu_use_chopstick_RGB_use_chopstick_gs_2_176.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the wine_glass\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[273, 346, 327, 406]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1160, |
|
"image": "images/1160_testset_0001004.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the chair\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[291, 393, 366, 463]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 160, |
|
"image": "images/160_punching_bag_000345.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"boxing the punching_bag\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[90, 442, 276, 1093]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3035, |
|
"image": "images/3035_thu_cut_fruit_RGB_cut_fruit_tys_1_79.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[398, 68, 449, 126]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 979, |
|
"image": "images/979_testset_0001849.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[365, 253, 491, 320]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4495, |
|
"image": "images/4495_testset_0001819.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the frisbee\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[182, 78, 216, 107]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6421, |
|
"image": "images/6421_ego4d_c2a3d4ac-8d13-4daa-a9a9-a867999e956f_15000.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[927, 122, 1035, 226]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 22, |
|
"image": "images/22_5.png", |
|
"conversations": [ |
|
{ |
|
"from": "human", |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"take the basket\". Please predict a possible affordance area of the end effector?" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[683, 213, 916, 351]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6463, |
|
"image": "images/6463_ego4d_002d2729-df71-438d-8396-5895b349e8fd_67200.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[779, 482, 938, 705]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2002, |
|
"image": "images/2002_testset_0000962.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"throw the soccer_ball\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[290, 170, 323, 200]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 2335, |
|
"image": "images/2335_testset_0001233.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"drag the suitcase\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[143, 128, 170, 150]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 4395, |
|
"image": "images/4395_wine_glass_002282.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the wine_glass\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[126, 236, 195, 299]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3638, |
|
"image": "images/3638_ego4d_6be77236-230f-4d9b-98d5-1cbf3be34023_5700.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1524, 264, 1681, 446]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1031, |
|
"image": "images/1031_testset_0000902.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the bench\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[280, 355, 468, 402]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 727, |
|
"image": "images/727_wine_glass_001599.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"drink_with the wine_glass\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[130, 186, 223, 243]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3089, |
|
"image": "images/3089_youtube_Good_006000.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[606, 312, 794, 476]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 1344, |
|
"image": "images/1344_testset_0002270.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"sit_on the surfboard\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[124, 93, 322, 156]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 7262, |
|
"image": "images/7262_golf_clubs_003141.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"swing the golf_clubs\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[818, 213, 948, 425]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6087, |
|
"image": "images/6087_knife_001875.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"stick the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[160, 284, 225, 339]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3650, |
|
"image": "images/3650_cup_000508.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"pour the cup\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[433, 250, 483, 313]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3108, |
|
"image": "images/3108_youtube_Why_013980.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[1294, 641, 1428, 779]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 3816, |
|
"image": "images/3816_bottle_001227.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"hold the bottle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[87, 181, 220, 317]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6135, |
|
"image": "images/6135_youtube_POV 'From-Scratch' Meatballs & Spaghetti for My Wife First Person Cooking-RN4LfG5ZHE4-1080p-1644527646933_003840.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"stick the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[364, 171, 491, 295]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6688, |
|
"image": "images/6688_bicycle_001334.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"ride the bicycle\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[190, 163, 245, 210]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6355, |
|
"image": "images/6355_knife_001749.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[90, 181, 279, 224]" |
|
} |
|
] |
|
}, |
|
{ |
|
"id": 6391, |
|
"image": "images/6391_youtube_First Person Cooking CHICKEN QUESADILLAS JONAH IN THE KITCHEN POV COOKING-zma2SgJg6l0-1080p-1644528321483 (1)_007380.jpg", |
|
"conversations": [ |
|
{ |
|
"value": "<image>\nYou are a robot using the joint control. The task is \"cut_with the knife\". Please predict a possible affordance area of the end effector?", |
|
"from": "human" |
|
}, |
|
{ |
|
"from": "gpt", |
|
"value": "[869, 0, 1031, 218]" |
|
} |
|
] |
|
} |
|
] |