phanerozoic committed on
Commit
c13fff8
·
verified ·
1 Parent(s): 94fac5d

Update tools/visit_webpage.py

Browse files
Files changed (1) hide show
  1. tools/visit_webpage.py +12 -40
tools/visit_webpage.py CHANGED
@@ -1,57 +1,29 @@
1
  from typing import Any, Optional
2
  from smolagents.tools import Tool
3
- import re
4
  import markdownify
5
- import time
6
  from smolagents.utils import truncate_content
7
 
8
- # Import Selenium modules for JavaScript rendering
9
- from selenium import webdriver
10
- from selenium.webdriver.chrome.options import Options
11
- from selenium.common.exceptions import WebDriverException
12
-
13
  class VisitWebpageTool(Tool):
14
  name = "visit_webpage"
15
- description = (
16
- "Visits a webpage at the given URL and returns its content as a markdown string, "
17
- "using Selenium for JavaScript rendering if needed."
18
- )
19
- inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
20
  output_type = "string"
21
 
22
  def forward(self, url: str) -> str:
23
- # Attempt to render the page using Selenium to capture JavaScript-loaded content
24
  try:
25
- chrome_options = Options()
26
- chrome_options.add_argument("--headless")
27
- chrome_options.add_argument("--disable-gpu")
28
- chrome_options.add_argument("--no-sandbox")
29
- # Initialize the Chrome webdriver; adjust executable_path if needed
30
- driver = webdriver.Chrome(options=chrome_options)
31
- driver.set_page_load_timeout(30)
32
- driver.get(url)
33
- # Wait a few seconds for dynamic content to load
34
- time.sleep(5)
35
- html = driver.page_source
36
- driver.quit()
37
- except WebDriverException as e:
38
- # Fallback: if Selenium fails, use requests
39
- try:
40
- import requests
41
- from requests.exceptions import RequestException
42
- response = requests.get(url, timeout=20)
43
- response.raise_for_status()
44
- html = response.text
45
- except Exception as ex:
46
- return f"Error fetching the webpage with requests: {str(ex)}"
47
  except Exception as e:
48
- return f"An unexpected error occurred during rendering: {str(e)}"
49
-
50
  try:
51
  # Convert the HTML content to Markdown
52
- markdown_content = markdownify.markdownify(html, heading_style="ATX").strip()
53
- # Clean up excessive newlines
54
- markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
55
  return truncate_content(markdown_content, 10000)
56
  except Exception as e:
57
  return f"Error processing content: {str(e)}"
 
1
  from typing import Any, Optional
2
  from smolagents.tools import Tool
3
+ import requests
4
  import markdownify
5
+ import re
6
  from smolagents.utils import truncate_content
7
 
 
 
 
 
 
8
  class VisitWebpageTool(Tool):
9
  name = "visit_webpage"
10
+ description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
11
+ inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
 
 
 
12
  output_type = "string"
13
 
14
  def forward(self, url: str) -> str:
 
15
  try:
16
+ # Send a GET request to the URL with a 20-second timeout
17
+ response = requests.get(url, timeout=20)
18
+ response.raise_for_status() # Raise an exception for bad status codes
19
+ html = response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  except Exception as e:
21
+ return f"Error fetching the webpage: {str(e)}"
 
22
  try:
23
  # Convert the HTML content to Markdown
24
+ markdown_content = markdownify.markdownify(html).strip()
25
+ # Remove multiple line breaks
26
+ markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
27
  return truncate_content(markdown_content, 10000)
28
  except Exception as e:
29
  return f"Error processing content: {str(e)}"