import base64
import json
import re
from io import BytesIO
from typing import Tuple, List, Optional, Dict, Any, Type

from PIL import Image
from langchain_core.messages import HumanMessage, BaseMessage
from langchain_core.language_models.chat_models import BaseChatModel

# Provider chat-model classes that callers can pass as the `model` argument.
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from mapcrunch_controller import MapCrunchController
# The "Golden" Prompt (v6): Combines clear mechanics with robust strategic principles. | |
AGENT_PROMPT_TEMPLATE = """ | |
**Mission:** You are an expert geo-location agent. Your goal is to find clues to determine your location within a limited number of steps. | |
**Current Status:** | |
- **Remaining Steps: {remaining_steps}** | |
- **Available Actions This Turn: {available_actions}** | |
--- | |
**Core Principles of an Expert Player:** | |
1. **Navigate with Labels:** `MOVE_FORWARD` follows the green 'UP' arrow. `MOVE_BACKWARD` follows the red 'DOWN' arrow. These labels are your most reliable compass. If there are no arrows, you cannot move. | |
2. **Efficient Exploration (to avoid "Bulldozer" mode):** | |
- **Pan Before You Move:** At a new location or an intersection, it's often wise to use `PAN_LEFT` or `PAN_RIGHT` to quickly survey your surroundings before committing to a move. | |
- **Don't Get Stuck:** If you've moved forward 2-3 times down a path and found nothing but repetitive scenery (like an empty forest or highway), consider it a barren path. It's smarter to turn around (using `PAN`) and check another direction. | |
3. **Be Decisive:** If you find a truly definitive clue (like a full, readable address or a sign with a unique town name), `GUESS` immediately. Don't waste steps. | |
4. **Final Step Rule:** If `remaining_steps` is **exactly 1**, your action **MUST be `GUESS`**. | |
--- | |
**Context & Task:** | |
Analyze your full journey history and current view, apply the Core Principles, and decide your next action in the required JSON format. | |
**Action History:** | |
{history_text} | |
**JSON Output Format:** | |
Your response MUST be a valid JSON object wrapped in ```json ... ```. | |
- For exploration: `{{"reasoning": "...", "action_details": {{"action": "ACTION_NAME"}} }}` | |
- For the final guess: `{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}` | |
""" | |
BENCHMARK_PROMPT = """
Analyze the image and determine its geographic coordinates.
1. Describe visual clues.
2. Suggest potential regions.
3. State your most probable location.
4. Provide coordinates in the last line in this exact format: `Lat: XX.XXXX, Lon: XX.XXXX`
"""
class GeoBot:
    """Vision-language agent that explores MapCrunch Street View and guesses coordinates."""

    def __init__(
        self,
        model: Type[BaseChatModel],
        model_name: str,
        use_selenium: bool = True,
        headless: bool = False,
        temperature: float = 0.0,
    ):
        # Instantiate the chat model class (e.g. ChatOpenAI) with the given
        # model name and temperature.
        model_kwargs = {
            "model": model_name,
            "temperature": temperature,
        }
        self.model = model(**model_kwargs)
        self.model_name = model_name
        self.temperature = temperature
        self.use_selenium = use_selenium
        self.controller = MapCrunchController(headless=headless)
    @staticmethod
    def pil_to_base64(image: Image.Image) -> str:
        # Note: thumbnail() resizes the image in place, capping it at 1024x1024.
        buffered = BytesIO()
        image.thumbnail((1024, 1024))
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    def _create_message_with_history(
        self, prompt: str, image_b64_list: List[str]
    ) -> List[HumanMessage]:
        """Creates a message for the LLM that includes text and a sequence of images."""
        content = [{"type": "text", "text": prompt}]
        # Restate the JSON format instructions right after the main prompt text.
        # (This is a plain string, not a .format() template, so braces are single.)
        content.append(
            {
                "type": "text",
                "text": '\n**JSON Output Format:**\nYour response MUST be a valid JSON object wrapped in ```json ... ```.\n- For exploration: `{"reasoning": "...", "action_details": {"action": "ACTION_NAME"}}`\n- For the final guess: `{"reasoning": "...", "action_details": {"action": "GUESS", "lat": <float>, "lon": <float>}}`',
            }
        )
        for b64_string in image_b64_list:
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{b64_string}"},
                }
            )
        return [HumanMessage(content=content)]
    def _create_llm_message(self, prompt: str, image_b64: str) -> List[HumanMessage]:
        """Original method for single-image analysis (benchmark)."""
        return [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ]
            )
        ]
    def _parse_agent_response(self, response: BaseMessage) -> Optional[Dict[str, Any]]:
        """
        Robustly parses JSON from the LLM response, handling markdown code blocks.
        """
        try:
            assert isinstance(response.content, str), "Response content is not a string"
            content = response.content.strip()
            # Prefer a fenced ```json ... ``` block; fall back to the raw content.
            match = re.search(r"```json\s*(\{.*?\})\s*```", content, re.DOTALL)
            json_str = match.group(1) if match else content
            return json.loads(json_str)
        except (json.JSONDecodeError, AttributeError, AssertionError) as e:
            print(f"Invalid JSON from LLM: {e}\nFull response was:\n{response.content}")
            return None
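
    # Quick illustration of the parser's behavior (AIMessage stands in for a
    # real model reply; shown as a comment so nothing runs at import time):
    #
    #   from langchain_core.messages import AIMessage
    #   raw = '```json\n{"reasoning": "...", "action_details": {"action": "PAN_LEFT"}}\n```'
    #   bot._parse_agent_response(AIMessage(content=raw))
    #   # -> {'reasoning': '...', 'action_details': {'action': 'PAN_LEFT'}}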
    def run_agent_loop(self, max_steps: int = 10) -> Optional[Tuple[float, float]]:
        history: List[Dict[str, Any]] = []
        # `step` counts down the remaining steps; the prompt's Final Step Rule
        # fires when it reaches 1.
        for step in range(max_steps, 0, -1):
            print(f"\n--- Step {max_steps - step + 1}/{max_steps} ---")
            self.controller.setup_clean_environment()
            self.controller.label_arrows_on_screen()
            screenshot_bytes = self.controller.take_street_view_screenshot()
            if not screenshot_bytes:
                print("Failed to take screenshot. Ending agent loop.")
                return None
            current_screenshot_b64 = self.pil_to_base64(
                image=Image.open(BytesIO(screenshot_bytes))
            )
            available_actions = self.controller.get_available_actions()
            print(f"Available actions: {available_actions}")

            # Rebuild the textual history and the image sequence for the prompt.
            history_text: str = ""
            image_b64_for_prompt: List[str] = []
            if not history:
                history_text = "No history yet. This is the first step."
            else:
                for i, h in enumerate(history):
                    history_text += f"--- History Step {i + 1} ---\n"
                    history_text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
                    history_text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
                    image_b64_for_prompt.append(h["screenshot_b64"])
            image_b64_for_prompt.append(current_screenshot_b64)

            prompt = AGENT_PROMPT_TEMPLATE.format(
                remaining_steps=step,
                history_text=history_text,
                available_actions=json.dumps(available_actions),
            )
            message = self._create_message_with_history(prompt, image_b64_for_prompt)
            response = self.model.invoke(message)
            decision = self._parse_agent_response(response)
            if not decision:
                print("Response parsing failed. Using default recovery action: PAN_RIGHT.")
                decision = {
                    "reasoning": "Recovery due to parsing failure.",
                    "action_details": {"action": "PAN_RIGHT"},
                }
            decision["screenshot_b64"] = current_screenshot_b64
            history.append(decision)

            action_details = decision.get("action_details", {})
            action = action_details.get("action")
            print(f"AI Reasoning: {decision.get('reasoning', 'N/A')}")
            print(f"AI Action: {action}")

            if action == "GUESS":
                lat, lon = action_details.get("lat"), action_details.get("lon")
                if lat is not None and lon is not None:
                    return lat, lon
                # A GUESS without coordinates falls through and wastes the step.
            elif action == "MOVE_FORWARD":
                self.controller.move("forward")
            elif action == "MOVE_BACKWARD":
                self.controller.move("backward")
            elif action == "PAN_LEFT":
                self.controller.pan_view("left")
            elif action == "PAN_RIGHT":
                self.controller.pan_view("right")

        print("Max steps reached. Agent did not make a final guess.")
        return None
    def analyze_image(self, image: Image.Image) -> Optional[Tuple[float, float]]:
        image_b64 = self.pil_to_base64(image)
        message = self._create_llm_message(BENCHMARK_PROMPT, image_b64)
        response = self.model.invoke(message)
        print(f"\nLLM Response:\n{response.content}")

        # Scan from the bottom for the line that carries the coordinates.
        content = response.content.strip()
        last_line = ""
        for line in reversed(content.split("\n")):
            if "lat" in line.lower() and "lon" in line.lower():
                last_line = line
                break
        if not last_line:
            return None
        # Accept signed integers and decimals, e.g. "-33.8688" or "151".
        numbers = re.findall(r"[-+]?\d+(?:\.\d+)?", last_line)
        if len(numbers) < 2:
            return None
        lat, lon = float(numbers[0]), float(numbers[1])
        return lat, lon
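
    # Example: from the line "Lat: -33.8688, Lon: 151.2093" the regex above
    # yields ["-33.8688", "151.2093"], i.e. the tuple (-33.8688, 151.2093).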
    def take_screenshot(self) -> Optional[Image.Image]:
        screenshot_bytes = self.controller.take_street_view_screenshot()
        if screenshot_bytes:
            return Image.open(BytesIO(screenshot_bytes))
        return None

    def close(self):
        if self.controller:
            self.controller.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()