# Omniscient / geo_bot.py
# Last commit: "chore: adapt to hf studio" (6fda968) by Andy Lee
# File size: 10.2 kB
import base64
import json
import re
from io import BytesIO
from typing import Tuple, List, Optional, Dict, Any, Type
from PIL import Image
from langchain_core.messages import HumanMessage, BaseMessage
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from mapcrunch_controller import MapCrunchController
# The "Golden" Prompt (v6): Combines clear mechanics with robust strategic principles.
#
# This template is rendered with ``str.format`` in ``GeoBot.run_agent_loop``,
# which supplies three placeholders:
#   - {remaining_steps}:   the loop's countdown value for the current step
#   - {available_actions}: JSON-encoded result of the controller's
#                          ``get_available_actions()``
#   - {history_text}:      textual summary of earlier reasoning/actions
# Because of ``str.format``, literal braces in the JSON examples below are
# written doubled (``{{`` / ``}}``) and come out as single braces in the
# rendered prompt.
AGENT_PROMPT_TEMPLATE = """
**Mission:** You are an expert geo-location agent. Your goal is to find clues to determine your location within a limited number of steps.
**Current Status:**
- **Remaining Steps: {remaining_steps}**
- **Available Actions This Turn: {available_actions}**
---
**Core Principles of an Expert Player:**
1. **Navigate with Labels:** `MOVE_FORWARD` follows the green 'UP' arrow. `MOVE_BACKWARD` follows the red 'DOWN' arrow. These labels are your most reliable compass. If there are no arrows, you cannot move.
2. **Efficient Exploration (to avoid "Bulldozer" mode):**
- **Pan Before You Move:** At a new location or an intersection, it's often wise to use `PAN_LEFT` or `PAN_RIGHT` to quickly survey your surroundings before committing to a move.
- **Don't Get Stuck:** If you've moved forward 2-3 times down a path and found nothing but repetitive scenery (like an empty forest or highway), consider it a barren path. It's smarter to turn around (using `PAN`) and check another direction.
3. **Be Decisive:** If you find a truly definitive clue (like a full, readable address or a sign with a unique town name), `GUESS` immediately. Don't waste steps.
4. **Final Step Rule:** If `remaining_steps` is **exactly 1**, your action **MUST be `GUESS`**.
---
**Context & Task:**
Analyze your full journey history and current view, apply the Core Principles, and decide your next action in the required JSON format.
**Action History:**
{history_text}
**JSON Output Format:**
Your response MUST be a valid JSON object wrapped in ```json ... ```.
- For exploration: `{{"reasoning": "...", "action_details": {{"action": "ACTION_NAME"}} }}`
- For the final guess: `{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}`
"""
# Single-image benchmark prompt, sent as-is (no formatting) by
# ``GeoBot.analyze_image``. The coordinate format requested in item 4 matters:
# ``analyze_image`` scans the reply from the bottom for a line containing both
# "lat" and "lon" and reads the first two numbers on that line.
BENCHMARK_PROMPT = """
Analyze the image and determine its geographic coordinates.
1. Describe visual clues.
2. Suggest potential regions.
3. State your most probable location.
4. Provide coordinates in the last line in this exact format: `Lat: XX.XXXX, Lon: XX.XXXX`
"""
class GeoBot:
    """Geo-location agent pairing a chat model with a MapCrunch controller.

    Two entry points are offered:

    * ``run_agent_loop`` - multi-step exploration: each step screenshots the
      street view, asks the model for a JSON decision, and executes it until
      the model issues a ``GUESS`` or the step budget runs out.
    * ``analyze_image`` - single-shot benchmark: one image in, one
      ``(lat, lon)`` pair out.

    Instances are context managers; leaving the ``with`` block calls
    ``close()``.
    """

    # JSON object wrapped in a ```json fenced block (DOTALL: may span lines).
    _JSON_FENCE_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
    # Signed decimal or integer. The sign is attached to BOTH forms so that
    # integer coordinates such as "-33" keep their sign.
    _NUMBER_RE = re.compile(r"[-+]?(?:\d+\.\d*|\.\d+|\d+)")

    def __init__(
        self,
        model: Type,
        model_name: str,
        use_selenium: bool = True,
        headless: bool = False,
        temperature: float = 0.0,
    ):
        """Instantiate the chat model and the MapCrunch controller.

        Args:
            model: Chat-model class (not an instance); it is called with
                ``model=`` and ``temperature=`` keyword arguments.
            model_name: Model identifier forwarded to ``model``.
            use_selenium: Stored on the instance; not otherwise consulted by
                this class.
            headless: Forwarded to ``MapCrunchController``.
            temperature: Sampling temperature forwarded to ``model``.
        """
        model_kwargs = {
            "model": model_name,
            "temperature": temperature,
        }
        self.model = model(**model_kwargs)
        self.model_name = model_name
        self.temperature = temperature
        self.use_selenium = use_selenium
        # NOTE(review): the controller is created even when use_selenium is
        # False - confirm whether that flag is meant to gate this.
        self.controller = MapCrunchController(headless=headless)

    @staticmethod
    def pil_to_base64(image: Image.Image) -> str:
        """Return ``image`` as a base64 PNG string, bounded to 1024x1024.

        The resize is applied to a copy: ``Image.thumbnail`` works in place,
        and the caller's image must not be shrunk as a side effect.
        """
        resized = image.copy()
        resized.thumbnail((1024, 1024))
        buffered = BytesIO()
        resized.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def _create_message_with_history(
        self, prompt: str, image_b64_list: List[str]
    ) -> List[HumanMessage]:
        """Creates a message for the LLM that includes text and a sequence of images."""
        content = [{"type": "text", "text": prompt}]
        # Add the JSON format instructions right after the main prompt text.
        # This string is sent verbatim (it is never passed through
        # ``str.format``), so braces are single here; doubled braces would
        # reach the model literally and contradict the formatted prompt.
        content.append(
            {
                "type": "text",
                "text": '\n**JSON Output Format:**\nYour response MUST be a valid JSON object wrapped in ```json ... ```.\n- For exploration: `{"reasoning": "...", "action_details": {"action": "ACTION_NAME"} }`\n- For the final guess: `{"reasoning": "...", "action_details": {"action": "GUESS", "lat": <float>, "lon": <float>} }`',
            }
        )
        content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{b64_string}"},
            }
            for b64_string in image_b64_list
        )
        return [HumanMessage(content=content)]

    def _create_llm_message(self, prompt: str, image_b64: str) -> List[HumanMessage]:
        """Original method for single-image analysis (benchmark)."""
        return [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ]
            )
        ]

    @staticmethod
    def _response_text(response: BaseMessage) -> Optional[str]:
        """Return the textual content of ``response``, or ``None`` if it has none.

        Message content may be a plain string or a list of parts (strings or
        dicts carrying a ``"text"`` entry); list parts are concatenated.
        """
        content = getattr(response, "content", None)
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts = []
            for part in content:
                if isinstance(part, str):
                    parts.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    parts.append(part["text"])
            if parts:
                return "".join(parts)
        return None

    def _parse_agent_response(self, response: BaseMessage) -> Optional[Dict[str, Any]]:
        """
        Robustly parses JSON from the LLM response, handling markdown code blocks.

        Returns the decoded JSON object, or ``None`` (after printing a
        diagnostic) when the response has no text, is not valid JSON, or
        decodes to something other than a JSON object.
        """
        raw_content = getattr(response, "content", None)
        text = self._response_text(response)
        if text is None:
            print(
                "Invalid JSON from LLM: response content is not text"
                f"\nFull response was:\n{raw_content}"
            )
            return None

        text = text.strip()
        fenced = self._JSON_FENCE_RE.search(text)
        if fenced:
            json_str = fenced.group(1)
        else:
            # No fenced block: take the outermost {...} span if there is one,
            # so chatter around the object does not defeat parsing.
            start, end = text.find("{"), text.rfind("}")
            json_str = text[start : end + 1] if 0 <= start < end else text

        try:
            parsed = json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Invalid JSON from LLM: {e}\nFull response was:\n{raw_content}")
            return None

        if not isinstance(parsed, dict):
            # Callers index the decision by key; a list/number is unusable.
            print(
                "Invalid JSON from LLM: expected a JSON object"
                f"\nFull response was:\n{raw_content}"
            )
            return None
        return parsed

    @staticmethod
    def _format_history(history: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        """Render past decisions as prompt text plus their screenshots, in order."""
        if not history:
            return "No history yet. This is the first step.", []
        chunks: List[str] = []
        screenshots: List[str] = []
        for i, h in enumerate(history):
            chunks.append(
                f"--- History Step {i + 1} ---\n"
                f"Reasoning: {h.get('reasoning', 'N/A')}\n"
                f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
            )
            screenshots.append(h["screenshot_b64"])
        return "".join(chunks), screenshots

    @staticmethod
    def _extract_guess(action_details: Dict[str, Any]) -> Optional[Tuple[float, float]]:
        """Return ``(lat, lon)`` as floats from a GUESS payload, or ``None`` if unusable."""
        lat, lon = action_details.get("lat"), action_details.get("lon")
        if lat is None or lon is None:
            return None
        try:
            return float(lat), float(lon)
        except (TypeError, ValueError):
            return None

    def run_agent_loop(self, max_steps: int = 10) -> Optional[Tuple[float, float]]:
        """Explore the street view for up to ``max_steps`` steps and return a guess.

        Each step takes a screenshot, sends the model the full history plus
        the current view, and executes the action it chooses.

        Returns:
            ``(lat, lon)`` as soon as the model issues a ``GUESS`` with usable
            coordinates; ``None`` if a screenshot cannot be taken or the step
            budget is exhausted without such a guess.
        """
        history: List[Dict[str, Any]] = []
        # ``step`` counts down so it doubles as "remaining steps" in the prompt.
        for step in range(max_steps, 0, -1):
            print(f"\n--- Step {max_steps - step + 1}/{max_steps} ---")
            self.controller.setup_clean_environment()
            self.controller.label_arrows_on_screen()
            screenshot_bytes = self.controller.take_street_view_screenshot()
            if not screenshot_bytes:
                print("Failed to take screenshot. Ending agent loop.")
                return None
            current_screenshot_b64 = self.pil_to_base64(
                image=Image.open(BytesIO(screenshot_bytes))
            )
            available_actions = self.controller.get_available_actions()
            print(f"Available actions: {available_actions}")

            history_text, image_b64_for_prompt = self._format_history(history)
            image_b64_for_prompt.append(current_screenshot_b64)
            prompt = AGENT_PROMPT_TEMPLATE.format(
                remaining_steps=step,
                history_text=history_text,
                available_actions=json.dumps(available_actions),
            )
            message = self._create_message_with_history(prompt, image_b64_for_prompt)
            response = self.model.invoke(message)
            decision = self._parse_agent_response(response)
            if not decision:
                print(
                    "Response parsing failed. Using default recovery action: PAN_RIGHT."
                )
                decision = {
                    "reasoning": "Recovery due to parsing failure.",
                    "action_details": {"action": "PAN_RIGHT"},
                }

            # Normalise before storing: later steps read
            # ``action_details`` from history with ``.get``.
            action_details = decision.get("action_details")
            if not isinstance(action_details, dict):
                action_details = {}
                decision["action_details"] = action_details
            decision["screenshot_b64"] = current_screenshot_b64
            history.append(decision)

            action = action_details.get("action")
            print(f"AI Reasoning: {decision.get('reasoning', 'N/A')}")
            print(f"AI Action: {action}")
            if action == "GUESS":
                guess = self._extract_guess(action_details)
                if guess is not None:
                    return guess
                # An unusable guess consumes the step, as before.
                print("GUESS did not include usable lat/lon values.")
            elif action == "MOVE_FORWARD":
                self.controller.move("forward")
            elif action == "MOVE_BACKWARD":
                self.controller.move("backward")
            elif action == "PAN_LEFT":
                self.controller.pan_view("left")
            elif action == "PAN_RIGHT":
                self.controller.pan_view("right")
        print("Max steps reached. Agent did not make a final guess.")
        return None

    @classmethod
    def _parse_coordinates(cls, text: str) -> Optional[Tuple[float, float]]:
        """Read ``(lat, lon)`` from the last line of ``text`` naming both.

        Lines are scanned bottom-up for one containing "lat" and "lon"
        (case-insensitive); the first two numbers on that line are returned.
        ``None`` is returned if no such line exists or it has fewer than two
        numbers.
        """
        for line in reversed(text.strip().split("\n")):
            lowered = line.lower()
            if "lat" in lowered and "lon" in lowered:
                numbers = cls._NUMBER_RE.findall(line)
                if len(numbers) < 2:
                    return None
                return float(numbers[0]), float(numbers[1])
        return None

    def analyze_image(self, image: Image.Image) -> Optional[Tuple[float, float]]:
        """Ask the model to geolocate a single image.

        Returns:
            ``(lat, lon)`` parsed from the reply, or ``None`` if the reply has
            no text or no parsable coordinate line.
        """
        image_b64 = self.pil_to_base64(image)
        message = self._create_llm_message(BENCHMARK_PROMPT, image_b64)
        response = self.model.invoke(message)
        print(f"\nLLM Response:\n{response.content}")
        text = self._response_text(response)
        if text is None:
            return None
        return self._parse_coordinates(text)

    def take_screenshot(self) -> Optional[Image.Image]:
        """Return the current street view as a PIL image, or ``None`` if unavailable."""
        screenshot_bytes = self.controller.take_street_view_screenshot()
        if screenshot_bytes:
            return Image.open(BytesIO(screenshot_bytes))
        return None

    def close(self):
        """Close the controller, if one was created."""
        # getattr: __init__ may have failed before ``controller`` was assigned.
        controller = getattr(self, "controller", None)
        if controller:
            controller.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()