"""Geo-location agent that drives a MapCrunch street-view session with an LLM.

The module exposes two prompts and the :class:`GeoBot` class.  ``GeoBot`` can
either explore interactively (``run_agent_loop``) or guess coordinates from a
single image (``analyze_image``).
"""

import base64
import json
import re
from io import BytesIO
from typing import Tuple, List, Optional, Dict, Any, Type

from PIL import Image
from langchain_core.messages import HumanMessage, BaseMessage
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from mapcrunch_controller import MapCrunchController

# The "Golden" Prompt (v6): Combines clear mechanics with robust strategic principles.
# NOTE: this template is rendered with ``str.format``; literal braces are doubled.
AGENT_PROMPT_TEMPLATE = """
**Mission:** You are an expert geo-location agent. Your goal is to find clues to determine your location within a limited number of steps.

**Current Status:**
- **Remaining Steps: {remaining_steps}**
- **Available Actions This Turn: {available_actions}**

---
**Core Principles of an Expert Player:**

1. **Navigate with Labels:** `MOVE_FORWARD` follows the green 'UP' arrow. `MOVE_BACKWARD` follows the red 'DOWN' arrow. These labels are your most reliable compass. If there are no arrows, you cannot move.

2. **Efficient Exploration (to avoid "Bulldozer" mode):**
    - **Pan Before You Move:** At a new location or an intersection, it's often wise to use `PAN_LEFT` or `PAN_RIGHT` to quickly survey your surroundings before committing to a move.
    - **Don't Get Stuck:** If you've moved forward 2-3 times down a path and found nothing but repetitive scenery (like an empty forest or highway), consider it a barren path. It's smarter to turn around (using `PAN`) and check another direction.

3. **Be Decisive:** If you find a truly definitive clue (like a full, readable address or a sign with a unique town name), `GUESS` immediately. Don't waste steps.

4. **Final Step Rule:** If `remaining_steps` is **exactly 1**, your action **MUST be `GUESS`**.

---
**Context & Task:**
Analyze your full journey history and current view, apply the Core Principles, and decide your next action in the required JSON format.

**Action History:**
{history_text}

**JSON Output Format:**
Your response MUST be a valid JSON object wrapped in ```json ... ```.
- For exploration: `{{"reasoning": "...", "action_details": {{"action": "ACTION_NAME"}} }}`
- For the final guess: `{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}`
"""

BENCHMARK_PROMPT = """
Analyze the image and determine its geographic coordinates.
1. Describe visual clues.
2. Suggest potential regions.
3. State your most probable location.
4. Provide coordinates in the last line in this exact format: `Lat: XX.XXXX, Lon: XX.XXXX`
"""

# Sent verbatim (never passed through ``str.format``), so braces are single
# here; doubled braces would reach the model literally.
_JSON_FORMAT_INSTRUCTIONS = (
    "\n**JSON Output Format:**\n"
    "Your response MUST be a valid JSON object wrapped in ```json ... ```.\n"
    '- For exploration: `{"reasoning": "...", "action_details": {"action": "ACTION_NAME"} }`\n'
    '- For the final guess: `{"reasoning": "...", "action_details": '
    '{"action": "GUESS", "lat": <float>, "lon": <float>} }`'
)

# A JSON object inside a markdown fence; the ``json`` language tag is optional.
_FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)

# Signed integer or decimal.  The sign applies to both alternatives so that
# integer coordinates such as ``-33`` keep their sign.
_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")

_LAT_LABEL_RE = re.compile(r"lat", re.IGNORECASE)


class GeoBot:
    """LLM-driven agent that explores MapCrunch and guesses coordinates."""

    # Maps an exploration action name to (controller method name, argument).
    _CONTROLLER_ACTIONS: Dict[str, Tuple[str, str]] = {
        "MOVE_FORWARD": ("move", "forward"),
        "MOVE_BACKWARD": ("move", "backward"),
        "PAN_LEFT": ("pan_view", "left"),
        "PAN_RIGHT": ("pan_view", "right"),
    }

    def __init__(
        self,
        model: Type[BaseChatModel],
        model_name: str,
        use_selenium: bool = True,
        headless: bool = False,
        temperature: float = 0.0,
    ):
        """Instantiate the chat model and the MapCrunch controller.

        Args:
            model: Chat model class; called as ``model(model=..., temperature=...)``.
            model_name: Model identifier forwarded to the chat model.
            use_selenium: Stored on the instance only; the controller is
                created regardless of this flag.
            headless: Forwarded to ``MapCrunchController``.
            temperature: Sampling temperature forwarded to the chat model.
        """
        model_kwargs = {
            "model": model_name,
            "temperature": temperature,
        }
        self.model = model(**model_kwargs)
        self.model_name = model_name
        self.temperature = temperature
        self.use_selenium = use_selenium
        self.controller = MapCrunchController(headless=headless)

    @staticmethod
    def pil_to_base64(image: Image.Image) -> str:
        """Return *image* as a base64 PNG, downscaled to fit 1024x1024.

        ``Image.thumbnail`` resizes in place, so a copy is resized and the
        caller's image is left untouched.
        """
        resized = image.copy()
        resized.thumbnail((1024, 1024))
        buffered = BytesIO()
        resized.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    @staticmethod
    def _content_to_text(content: Any) -> str:
        """Flatten message content into a plain string.

        Handles a plain string, or a list whose items are strings or dicts
        carrying a ``"text"`` entry.  Anything else contributes nothing.
        """
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts: List[str] = []
            for part in content:
                if isinstance(part, str):
                    parts.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    parts.append(part["text"])
            return "".join(parts)
        return ""

    def _create_message_with_history(
        self, prompt: str, image_b64_list: List[str]
    ) -> List[HumanMessage]:
        """Creates a message for the LLM that includes text and a sequence of images."""
        content: List[Dict[str, Any]] = [
            {"type": "text", "text": prompt},
            # Repeat the JSON format instructions right after the main prompt text.
            {"type": "text", "text": _JSON_FORMAT_INSTRUCTIONS},
        ]
        content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{b64_string}"},
            }
            for b64_string in image_b64_list
        )
        return [HumanMessage(content=content)]

    def _create_llm_message(self, prompt: str, image_b64: str) -> List[HumanMessage]:
        """Original method for single-image analysis (benchmark)."""
        return [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ]
            )
        ]

    def _parse_agent_response(self, response: BaseMessage) -> Optional[Dict[str, Any]]:
        """Parse the agent's JSON decision from an LLM response.

        Looks for a fenced JSON object first, then for the outermost
        ``{...}`` span, and finally tries the whole text.

        Returns:
            The decoded JSON object, or ``None`` when the response contains
            no valid JSON object (including valid JSON that is not an object).
        """
        raw_content = getattr(response, "content", None)
        text = self._content_to_text(raw_content).strip()

        fenced = _FENCED_JSON_RE.search(text)
        if fenced:
            json_str = fenced.group(1)
        else:
            start, end = text.find("{"), text.rfind("}")
            json_str = text[start : end + 1] if 0 <= start < end else text

        try:
            parsed = json.loads(json_str)
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Invalid JSON from LLM: {e}\nFull response was:\n{raw_content}")
            return None

        if not isinstance(parsed, dict):
            # The loop stores extra keys on the decision, so it must be a dict.
            print(
                "Invalid JSON from LLM: expected a JSON object\n"
                f"Full response was:\n{raw_content}"
            )
            return None
        return parsed

    @staticmethod
    def _format_history(history: List[Dict[str, Any]]) -> str:
        """Render previous decisions as the text block used in the agent prompt."""
        if not history:
            return "No history yet. This is the first step."
        chunks = []
        for i, h in enumerate(history):
            details = h.get("action_details")
            action = details.get("action", "N/A") if isinstance(details, dict) else "N/A"
            chunks.append(
                f"--- History Step {i + 1} ---\n"
                f"Reasoning: {h.get('reasoning', 'N/A')}\n"
                f"Action: {action}\n\n"
            )
        return "".join(chunks)

    @staticmethod
    def _extract_guess(action_details: Dict[str, Any]) -> Optional[Tuple[float, float]]:
        """Return ``(lat, lon)`` as floats, or ``None`` if missing or non-numeric."""
        try:
            return float(action_details["lat"]), float(action_details["lon"])
        except (KeyError, TypeError, ValueError):
            return None

    def run_agent_loop(self, max_steps: int = 10) -> Optional[Tuple[float, float]]:
        """Explore for up to *max_steps* steps and return the agent's guess.

        Each step takes a labelled screenshot, sends the full history plus
        the current view to the model, and executes the chosen action.

        Returns:
            ``(lat, lon)`` once the model issues a ``GUESS`` with usable
            coordinates; ``None`` if a screenshot fails or the steps run out.
        """
        history: List[Dict[str, Any]] = []

        for step in range(max_steps, 0, -1):
            print(f"\n--- Step {max_steps - step + 1}/{max_steps} ---")

            self.controller.setup_clean_environment()
            self.controller.label_arrows_on_screen()

            screenshot_bytes = self.controller.take_street_view_screenshot()
            if not screenshot_bytes:
                print("Failed to take screenshot. Ending agent loop.")
                return None

            current_screenshot_b64 = self.pil_to_base64(
                image=Image.open(BytesIO(screenshot_bytes))
            )
            available_actions = self.controller.get_available_actions()
            print(f"Available actions: {available_actions}")

            # Past screenshots in order, then the current view last.
            image_b64_for_prompt = [h["screenshot_b64"] for h in history]
            image_b64_for_prompt.append(current_screenshot_b64)

            prompt = AGENT_PROMPT_TEMPLATE.format(
                remaining_steps=step,
                history_text=self._format_history(history),
                available_actions=json.dumps(available_actions),
            )

            message = self._create_message_with_history(prompt, image_b64_for_prompt)
            response = self.model.invoke(message)

            decision = self._parse_agent_response(response)
            if not decision:
                print(
                    "Response parsing failed. Using default recovery action: PAN_RIGHT."
                )
                decision = {
                    "reasoning": "Recovery due to parsing failure.",
                    "action_details": {"action": "PAN_RIGHT"},
                }

            decision["screenshot_b64"] = current_screenshot_b64
            history.append(decision)

            action_details = decision.get("action_details")
            if not isinstance(action_details, dict):
                # The model may return a string or null here; treat as no action.
                action_details = {}
            action = action_details.get("action")
            print(f"AI Reasoning: {decision.get('reasoning', 'N/A')}")
            print(f"AI Action: {action}")

            if action == "GUESS":
                guess = self._extract_guess(action_details)
                if guess is not None:
                    return guess
                # Unusable coordinates: the step is spent, the loop continues.
                print("GUESS did not include valid lat/lon values.")
            elif action in self._CONTROLLER_ACTIONS:
                method_name, direction = self._CONTROLLER_ACTIONS[action]
                getattr(self.controller, method_name)(direction)

        print("Max steps reached. Agent did not make a final guess.")
        return None

    def analyze_image(self, image: Image.Image) -> Optional[Tuple[float, float]]:
        """Ask the model for the coordinates of a single image.

        The last response line mentioning both "lat" and "lon" is used, and
        the first two numbers after the "lat" label are read as latitude and
        longitude.

        Returns:
            ``(lat, lon)``, or ``None`` if no such line or numbers are found.
        """
        image_b64 = self.pil_to_base64(image)
        message = self._create_llm_message(BENCHMARK_PROMPT, image_b64)

        response = self.model.invoke(message)
        print(f"\nLLM Response:\n{response.content}")

        content = self._content_to_text(response.content).strip()
        coord_line = next(
            (
                line
                for line in reversed(content.split("\n"))
                if "lat" in line.lower() and "lon" in line.lower()
            ),
            "",
        )
        if not coord_line:
            return None

        # Start at the "lat" label so a list prefix such as "4." is skipped.
        label = _LAT_LABEL_RE.search(coord_line)
        tail = coord_line[label.start():] if label else coord_line
        numbers = _NUMBER_RE.findall(tail)
        if len(numbers) < 2:
            return None

        return float(numbers[0]), float(numbers[1])

    def take_screenshot(self) -> Optional[Image.Image]:
        """Return the current street view as a PIL image, or ``None`` on failure."""
        screenshot_bytes = self.controller.take_street_view_screenshot()
        if screenshot_bytes:
            return Image.open(BytesIO(screenshot_bytes))
        return None

    def close(self):
        """Close the controller if one was created."""
        # ``controller`` may be missing if ``__init__`` raised before assigning it.
        controller = getattr(self, "controller", None)
        if controller:
            controller.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()