Spaces:
Runtime error
Runtime error
Update tools/visit_webpage.py
Browse files- tools/visit_webpage.py +41 -27
tools/visit_webpage.py
CHANGED
@@ -1,46 +1,60 @@
|
|
1 |
from typing import Any, Optional
|
2 |
from smolagents.tools import Tool
|
3 |
-
import requests
|
4 |
-
import markdownify
|
5 |
-
import smolagents
|
6 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
class VisitWebpageTool(Tool):
|
9 |
name = "visit_webpage"
|
10 |
-
description =
|
11 |
-
|
|
|
|
|
|
|
12 |
output_type = "string"
|
13 |
|
14 |
def forward(self, url: str) -> str:
|
|
|
15 |
try:
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
from smolagents.utils import truncate_content
|
21 |
-
except ImportError as e:
|
22 |
-
raise ImportError(
|
23 |
-
"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
|
24 |
-
) from e
|
25 |
try:
|
26 |
-
# Send a GET request to the URL with a 20-second timeout
|
27 |
-
response = requests.get(url, timeout=20)
|
28 |
-
response.raise_for_status() # Raise an exception for bad status codes
|
29 |
-
|
30 |
# Convert the HTML content to Markdown
|
31 |
-
markdown_content = markdownify(
|
32 |
-
|
33 |
-
# Remove multiple line breaks
|
34 |
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
|
35 |
-
|
36 |
return truncate_content(markdown_content, 10000)
|
37 |
-
|
38 |
-
except requests.exceptions.Timeout:
|
39 |
-
return "The request timed out. Please try again later or check the URL."
|
40 |
-
except RequestException as e:
|
41 |
-
return f"Error fetching the webpage: {str(e)}"
|
42 |
except Exception as e:
|
43 |
-
return f"
|
44 |
|
45 |
def __init__(self, *args, **kwargs):
|
46 |
self.is_initialized = False
|
|
|
1 |
from typing import Any, Optional
|
2 |
from smolagents.tools import Tool
|
|
|
|
|
|
|
3 |
import re
|
4 |
+
import markdownify
|
5 |
+
import time
|
6 |
+
from smolagents.utils import truncate_content
|
7 |
+
|
8 |
+
# Import Selenium modules for JavaScript rendering
|
9 |
+
from selenium import webdriver
|
10 |
+
from selenium.webdriver.chrome.options import Options
|
11 |
+
from selenium.common.exceptions import WebDriverException
|
12 |
|
13 |
class VisitWebpageTool(Tool):
    """Tool that fetches a webpage and returns its content as Markdown.

    The page is first rendered in headless Chrome through Selenium so that
    JavaScript-loaded content is captured. If Selenium raises a
    ``WebDriverException``, the page is fetched with a plain ``requests``
    GET instead. Failures are reported as human-readable strings rather
    than raised.
    """

    name = "visit_webpage"
    description = (
        "Visits a webpage at the given URL and returns its content as a markdown string, "
        "using Selenium for JavaScript rendering if needed."
    )
    inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
    output_type = "string"

    def forward(self, url: str) -> str:
        """Fetch ``url`` and return its content converted to Markdown.

        Args:
            url: The URL of the webpage to visit.

        Returns:
            The Markdown rendering of the page, passed through
            ``truncate_content`` with a limit of 10000, or an error message
            string if fetching or conversion failed.
        """
        # Attempt to render the page using Selenium to capture JavaScript-loaded content
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            # Initialize the Chrome webdriver; adjust executable_path if needed
            driver = webdriver.Chrome(options=chrome_options)
            try:
                driver.set_page_load_timeout(30)
                driver.get(url)
                # Wait a few seconds for dynamic content to load
                time.sleep(5)
                html = driver.page_source
            finally:
                # Always shut the browser down, even when loading the page
                # fails, so a failed visit does not leak a Chrome process.
                driver.quit()
        except WebDriverException:
            # Fallback: if Selenium fails, use requests
            try:
                # Imported lazily so a missing `requests` package is reported
                # as a fetch error instead of breaking module import.
                import requests

                response = requests.get(url, timeout=20)
                response.raise_for_status()
                html = response.text
            except Exception as ex:
                return f"Error fetching the webpage with requests: {str(ex)}"
        except Exception as e:
            return f"An unexpected error occurred during rendering: {str(e)}"

        try:
            # Convert the HTML content to Markdown
            markdown_content = markdownify.markdownify(html, heading_style="ATX").strip()
            # Clean up excessive newlines
            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
            return truncate_content(markdown_content, 10000)
        except Exception as e:
            return f"Error processing content: {str(e)}"

    def __init__(self, *args, **kwargs):
        """Create the tool and flag it as not yet initialized.

        Positional and keyword arguments are accepted but ignored.
        """
        self.is_initialized = False
|