-
Notifications
You must be signed in to change notification settings - Fork 5.9k
feat(scrape_tool): add support for JavaScript rendering using Playwright #4402
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| import pytest | ||
| from unittest.mock import patch, MagicMock | ||
| from crewai_tools.tools.scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool | ||
|
|
||
| def test_scrape_website_tool_render_js_logic(): | ||
| """JavaScript rendering performance validation test with playwright""" | ||
| tool = ScrapeWebsiteTool(website_url="https://example.com", render_js=True) | ||
|
|
||
| with patch("playwright.sync_api.sync_playwright") as mock_playwright: | ||
| # Simulate Playwright structure | ||
| mock_context = mock_playwright.return_value.__enter__.return_value | ||
| mock_browser = mock_context.chromium.launch.return_value | ||
| mock_page = mock_browser.new_page.return_value | ||
| mock_page.content.return_value = "<html><body>JS Content</body></html>" | ||
|
|
||
| result = tool._run() | ||
|
|
||
| assert "JS Content" in result | ||
| mock_playwright.assert_called_once() | ||
|
|
||
| def test_scrape_website_tool_default_behavior(): | ||
| """Test that there is no change to the old behavior in the default state""" | ||
| tool = ScrapeWebsiteTool(website_url="https://example.com") | ||
|
|
||
| with patch("requests.get") as mock_get: | ||
| mock_response = MagicMock() | ||
| mock_response.text = "Normal Content" | ||
| mock_response.status_code = 200 | ||
| mock_get.return_value = mock_response | ||
|
|
||
| result = tool._run() | ||
|
|
||
| assert "Normal Content" in result | ||
| # We make sure that it doesn't go to Playwright in normal mode | ||
| mock_get.assert_called_once() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,8 @@ | ||
| import os | ||
| from typing import Any | ||
| from urllib.parse import urljoin | ||
| import os | ||
|
|
||
| import httpx | ||
| import requests | ||
|
|
||
| from crewai.cli.config import Settings | ||
|
|
@@ -33,7 +35,11 @@ def __init__(self, api_key: str) -> None: | |
| if settings.org_uuid: | ||
| self.headers["X-Crewai-Organization-Id"] = settings.org_uuid | ||
|
|
||
| self.base_url = os.getenv("CREWAI_PLUS_URL") or str(settings.enterprise_base_url) or DEFAULT_CREWAI_ENTERPRISE_URL | ||
| self.base_url = ( | ||
| os.getenv("CREWAI_PLUS_URL") | ||
| or str(settings.enterprise_base_url) | ||
| or DEFAULT_CREWAI_ENTERPRISE_URL | ||
| ) | ||
|
|
||
| def _make_request( | ||
| self, method: str, endpoint: str, **kwargs: Any | ||
|
|
@@ -49,8 +55,10 @@ def login_to_tool_repository(self) -> requests.Response: | |
| def get_tool(self, handle: str) -> requests.Response: | ||
| return self._make_request("GET", f"{self.TOOLS_RESOURCE}/{handle}") | ||
|
|
||
| def get_agent(self, handle: str) -> requests.Response: | ||
| return self._make_request("GET", f"{self.AGENTS_RESOURCE}/{handle}") | ||
| async def get_agent(self, handle: str) -> httpx.Response: | ||
| url = urljoin(self.base_url, f"{self.AGENTS_RESOURCE}/{handle}") | ||
| async with httpx.AsyncClient() as client: | ||
| return await client.get(url, headers=self.headers) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Async
|
||
|
|
||
| def publish_tool( | ||
| self, | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Playwright added as hard dependency instead of optional
Medium Severity
`playwright` is added to the required `dependencies` list, making it mandatory for all `crewai-tools` users. Playwright is a heavyweight package that also requires separate browser binary installation (`playwright install`). The implementation already handles `ImportError` at runtime (lines 87–88 of the scrape tool), clearly indicating it was designed to be optional. It belongs in `[project.optional-dependencies]` alongside similar optional packages like `selenium` and `browserbase`.