Spaces:
Runtime error
Runtime error
Update tools/visit_webpage.py
Browse files- tools/visit_webpage.py +12 -40
tools/visit_webpage.py
CHANGED
@@ -1,57 +1,29 @@
|
|
1 |
from typing import Any, Optional
|
2 |
from smolagents.tools import Tool
|
3 |
-
import
|
4 |
import markdownify
|
5 |
-
import
|
6 |
from smolagents.utils import truncate_content
|
7 |
|
8 |
-
# Import Selenium modules for JavaScript rendering
|
9 |
-
from selenium import webdriver
|
10 |
-
from selenium.webdriver.chrome.options import Options
|
11 |
-
from selenium.common.exceptions import WebDriverException
|
12 |
-
|
13 |
class VisitWebpageTool(Tool):
|
14 |
name = "visit_webpage"
|
15 |
-
description =
|
16 |
-
|
17 |
-
"using Selenium for JavaScript rendering if needed."
|
18 |
-
)
|
19 |
-
inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
|
20 |
output_type = "string"
|
21 |
|
22 |
def forward(self, url: str) -> str:
|
23 |
-
# Attempt to render the page using Selenium to capture JavaScript-loaded content
|
24 |
try:
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
# Initialize the Chrome webdriver; adjust executable_path if needed
|
30 |
-
driver = webdriver.Chrome(options=chrome_options)
|
31 |
-
driver.set_page_load_timeout(30)
|
32 |
-
driver.get(url)
|
33 |
-
# Wait a few seconds for dynamic content to load
|
34 |
-
time.sleep(5)
|
35 |
-
html = driver.page_source
|
36 |
-
driver.quit()
|
37 |
-
except WebDriverException as e:
|
38 |
-
# Fallback: if Selenium fails, use requests
|
39 |
-
try:
|
40 |
-
import requests
|
41 |
-
from requests.exceptions import RequestException
|
42 |
-
response = requests.get(url, timeout=20)
|
43 |
-
response.raise_for_status()
|
44 |
-
html = response.text
|
45 |
-
except Exception as ex:
|
46 |
-
return f"Error fetching the webpage with requests: {str(ex)}"
|
47 |
except Exception as e:
|
48 |
-
return f"
|
49 |
-
|
50 |
try:
|
51 |
# Convert the HTML content to Markdown
|
52 |
-
markdown_content = markdownify.markdownify(html
|
53 |
-
#
|
54 |
-
markdown_content = re.sub(r
|
55 |
return truncate_content(markdown_content, 10000)
|
56 |
except Exception as e:
|
57 |
return f"Error processing content: {str(e)}"
|
|
|
1 |
from typing import Any, Optional
|
2 |
from smolagents.tools import Tool
|
3 |
+
import requests
|
4 |
import markdownify
|
5 |
+
import re
|
6 |
from smolagents.utils import truncate_content
|
7 |
|
|
|
|
|
|
|
|
|
|
|
8 |
class VisitWebpageTool(Tool):
    """Tool that downloads a web page and returns its content as Markdown.

    The page is retrieved with a single HTTP GET via ``requests``; the HTML
    is then converted with ``markdownify``, runs of blank lines are
    collapsed, and the result is shortened with ``truncate_content``.
    Failures are reported through the returned string instead of being
    raised to the caller.
    """

    name = "visit_webpage"
    description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
    inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
    output_type = "string"

    def forward(self, url: str) -> str:
        """Fetch ``url`` and return its content as truncated Markdown.

        Args:
            url: Address of the page to download.

        Returns:
            The Markdown text of the page after it has been passed through
            ``truncate_content`` with a limit of 10000, or an error message
            string when fetching or processing fails.
        """
        # Stage 1: download the page. Any failure (connection problem,
        # timeout after 20 seconds, bad HTTP status, ...) becomes an error
        # string so the caller always receives text.
        try:
            page = requests.get(url, timeout=20)
            page.raise_for_status()
            html = page.text
        except Exception as fetch_error:
            return f"Error fetching the webpage: {fetch_error!s}"

        # Stage 2: convert HTML to Markdown, trim surrounding whitespace and
        # squeeze three or more consecutive newlines down to exactly two.
        try:
            converted = markdownify.markdownify(html).strip()
            tidy_markdown = re.sub(r'\n{3,}', '\n\n', converted)
            return truncate_content(tidy_markdown, 10000)
        except Exception as processing_error:
            return f"Error processing content: {processing_error!s}"
|