phanerozoic committed on
Commit
d89e740
·
verified ·
1 Parent(s): 9b825c2

Update tools/visit_webpage.py

Browse files
Files changed (1) hide show
  1. tools/visit_webpage.py +41 -27
tools/visit_webpage.py CHANGED
@@ -1,46 +1,60 @@
1
  from typing import Any, Optional
2
  from smolagents.tools import Tool
3
- import requests
4
- import markdownify
5
- import smolagents
6
  import re
 
 
 
 
 
 
 
 
7
 
8
  class VisitWebpageTool(Tool):
9
  name = "visit_webpage"
10
- description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
11
- inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
 
 
 
12
  output_type = "string"
13
 
14
  def forward(self, url: str) -> str:
 
15
  try:
16
- import requests
17
- from markdownify import markdownify
18
- from requests.exceptions import RequestException
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- from smolagents.utils import truncate_content
21
- except ImportError as e:
22
- raise ImportError(
23
- "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
24
- ) from e
25
  try:
26
- # Send a GET request to the URL with a 20-second timeout
27
- response = requests.get(url, timeout=20)
28
- response.raise_for_status() # Raise an exception for bad status codes
29
-
30
  # Convert the HTML content to Markdown
31
- markdown_content = markdownify(response.text).strip()
32
-
33
- # Remove multiple line breaks
34
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
35
-
36
  return truncate_content(markdown_content, 10000)
37
-
38
- except requests.exceptions.Timeout:
39
- return "The request timed out. Please try again later or check the URL."
40
- except RequestException as e:
41
- return f"Error fetching the webpage: {str(e)}"
42
  except Exception as e:
43
- return f"An unexpected error occurred: {str(e)}"
44
 
45
  def __init__(self, *args, **kwargs):
46
  self.is_initialized = False
 
1
  from typing import Any, Optional
2
  from smolagents.tools import Tool
 
 
 
3
  import re
4
+ import markdownify
5
+ import time
6
+ from smolagents.utils import truncate_content
7
+
8
+ # Import Selenium modules for JavaScript rendering
9
+ from selenium import webdriver
10
+ from selenium.webdriver.chrome.options import Options
11
+ from selenium.common.exceptions import WebDriverException
12
 
13
  class VisitWebpageTool(Tool):
14
  name = "visit_webpage"
15
+ description = (
16
+ "Visits a webpage at the given URL and returns its content as a markdown string, "
17
+ "using Selenium for JavaScript rendering if needed."
18
+ )
19
+ inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
20
  output_type = "string"
21
 
22
  def forward(self, url: str) -> str:
23
+ # Attempt to render the page using Selenium to capture JavaScript-loaded content
24
  try:
25
+ chrome_options = Options()
26
+ chrome_options.add_argument("--headless")
27
+ chrome_options.add_argument("--disable-gpu")
28
+ chrome_options.add_argument("--no-sandbox")
29
+ # Initialize the Chrome webdriver; adjust executable_path if needed
30
+ driver = webdriver.Chrome(options=chrome_options)
31
+ driver.set_page_load_timeout(30)
32
+ driver.get(url)
33
+ # Wait a few seconds for dynamic content to load
34
+ time.sleep(5)
35
+ html = driver.page_source
36
+ driver.quit()
37
+ except WebDriverException as e:
38
+ # Fallback: if Selenium fails, use requests
39
+ try:
40
+ import requests
41
+ from requests.exceptions import RequestException
42
+ response = requests.get(url, timeout=20)
43
+ response.raise_for_status()
44
+ html = response.text
45
+ except Exception as ex:
46
+ return f"Error fetching the webpage with requests: {str(ex)}"
47
+ except Exception as e:
48
+ return f"An unexpected error occurred during rendering: {str(e)}"
49
 
 
 
 
 
 
50
  try:
 
 
 
 
51
  # Convert the HTML content to Markdown
52
+ markdown_content = markdownify.markdownify(html, heading_style="ATX").strip()
53
+ # Clean up excessive newlines
 
54
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
 
55
  return truncate_content(markdown_content, 10000)
 
 
 
 
 
56
  except Exception as e:
57
+ return f"Error processing content: {str(e)}"
58
 
59
  def __init__(self, *args, **kwargs):
60
  self.is_initialized = False