chore(api/core): apply ruff reformatting (#7624)

Author: Bowen Liang
Date: 2024-09-10 17:00:20 +08:00
Committed by: GitHub
parent 178730266d
commit 2cf1187b32
724 changed files with 21180 additions and 21123 deletions


@@ -7,15 +7,8 @@ class FirecrawlProvider(BuiltinToolProviderController):
     def _validate_credentials(self, credentials: dict) -> None:
         try:
             # Example validation using the ScrapeTool, only scraping title for minimize content
-            ScrapeTool().fork_tool_runtime(
-                runtime={"credentials": credentials}
-            ).invoke(
-                user_id='',
-                tool_parameters={
-                    "url": "https://google.com",
-                    "onlyIncludeTags": 'title'
-                }
+            ScrapeTool().fork_tool_runtime(runtime={"credentials": credentials}).invoke(
+                user_id="", tool_parameters={"url": "https://google.com", "onlyIncludeTags": "title"}
             )
         except Exception as e:
             raise ToolProviderCredentialValidationError(str(e))


@@ -13,27 +13,24 @@ logger = logging.getLogger(__name__)
 class FirecrawlApp:
     def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
-        self.base_url = base_url or 'https://api.firecrawl.dev'
+        self.base_url = base_url or "https://api.firecrawl.dev"
         if not self.api_key:
             raise ValueError("API key is required")

     def _prepare_headers(self, idempotency_key: str | None = None):
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
         if idempotency_key:
-            headers['Idempotency-Key'] = idempotency_key
+            headers["Idempotency-Key"] = idempotency_key
         return headers

     def _request(
-            self,
-            method: str,
-            url: str,
-            data: Mapping[str, Any] | None = None,
-            headers: Mapping[str, str] | None = None,
-            retries: int = 3,
-            backoff_factor: float = 0.3,
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
     ) -> Mapping[str, Any] | None:
         if not headers:
             headers = self._prepare_headers()
@@ -44,54 +41,54 @@ class FirecrawlApp:
                 return response.json()
             except requests.exceptions.RequestException as e:
                 if i < retries - 1:
-                    time.sleep(backoff_factor * (2 ** i))
+                    time.sleep(backoff_factor * (2**i))
                 else:
                     raise
         return None

     def scrape_url(self, url: str, **kwargs):
-        endpoint = f'{self.base_url}/v0/scrape'
-        data = {'url': url, **kwargs}
+        endpoint = f"{self.base_url}/v0/scrape"
+        data = {"url": url, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
-        response = self._request('POST', endpoint, data)
+        response = self._request("POST", endpoint, data)
         if response is None:
             raise HTTPError("Failed to scrape URL after multiple retries")
         return response

     def search(self, query: str, **kwargs):
-        endpoint = f'{self.base_url}/v0/search'
-        data = {'query': query, **kwargs}
+        endpoint = f"{self.base_url}/v0/search"
+        data = {"query": query, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
-        response = self._request('POST', endpoint, data)
+        response = self._request("POST", endpoint, data)
         if response is None:
             raise HTTPError("Failed to perform search after multiple retries")
         return response

     def crawl_url(
-            self, url: str, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+        self, url: str, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
     ):
-        endpoint = f'{self.base_url}/v0/crawl'
+        endpoint = f"{self.base_url}/v0/crawl"
         headers = self._prepare_headers(idempotency_key)
-        data = {'url': url, **kwargs}
+        data = {"url": url, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
-        response = self._request('POST', endpoint, data, headers)
+        response = self._request("POST", endpoint, data, headers)
         if response is None:
             raise HTTPError("Failed to initiate crawl after multiple retries")
-        job_id: str = response['jobId']
+        job_id: str = response["jobId"]
         if wait:
             return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
         return response

     def check_crawl_status(self, job_id: str):
-        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
-        response = self._request('GET', endpoint)
+        endpoint = f"{self.base_url}/v0/crawl/status/{job_id}"
+        response = self._request("GET", endpoint)
         if response is None:
             raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
         return response

     def cancel_crawl_job(self, job_id: str):
-        endpoint = f'{self.base_url}/v0/crawl/cancel/{job_id}'
-        response = self._request('DELETE', endpoint)
+        endpoint = f"{self.base_url}/v0/crawl/cancel/{job_id}"
+        response = self._request("DELETE", endpoint)
         if response is None:
             raise HTTPError(f"Failed to cancel job {job_id} after multiple retries")
         return response
@@ -99,9 +96,9 @@ class FirecrawlApp:
     def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
             status = self.check_crawl_status(job_id)
-            if status['status'] == 'completed':
+            if status["status"] == "completed":
                 return status
-            elif status['status'] == 'failed':
+            elif status["status"] == "failed":
                 raise HTTPError(f'Job {job_id} failed: {status["error"]}')
             time.sleep(poll_interval)
@@ -109,7 +106,7 @@ class FirecrawlApp:
 def get_array_params(tool_parameters: dict[str, Any], key):
     param = tool_parameters.get(key)
     if param:
-        return param.split(',')
+        return param.split(",")


 def get_json_params(tool_parameters: dict[str, Any], key):

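For reference, a minimal usage sketch of the FirecrawlApp client reformatted above; the API key and URLs are placeholders, and the comments only restate behaviour visible in the diff (retry delays of backoff_factor * (2**i) seconds in _request, status polling in _monitor_job_status):

    # Hypothetical standalone use of the client; with the defaults retries=3, backoff_factor=0.3
    # a failing request sleeps 0.3s and then 0.6s before the final attempt re-raises.
    app = FirecrawlApp(api_key="fc-xxxx", base_url="https://api.firecrawl.dev")  # placeholder key
    page = app.scrape_url("https://example.com", pageOptions={"onlyMainContent": True})
    job = app.crawl_url("https://example.com", wait=False, crawlerOptions={"limit": 5})
    status = app.check_crawl_status(job_id=job["jobId"])
    if status["status"] not in ("completed", "failed"):
        app.cancel_crawl_job(job_id=job["jobId"])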

@@ -11,38 +11,36 @@ class CrawlTool(BuiltinTool):
         the crawlerOptions and pageOptions comes from doc here:
         https://docs.firecrawl.dev/api-reference/endpoint/crawl
         """
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
         crawlerOptions = {}
         pageOptions = {}

-        wait_for_results = tool_parameters.get('wait_for_results', True)
+        wait_for_results = tool_parameters.get("wait_for_results", True)

-        crawlerOptions['excludes'] = get_array_params(tool_parameters, 'excludes')
-        crawlerOptions['includes'] = get_array_params(tool_parameters, 'includes')
-        crawlerOptions['returnOnlyUrls'] = tool_parameters.get('returnOnlyUrls', False)
-        crawlerOptions['maxDepth'] = tool_parameters.get('maxDepth')
-        crawlerOptions['mode'] = tool_parameters.get('mode')
-        crawlerOptions['ignoreSitemap'] = tool_parameters.get('ignoreSitemap', False)
-        crawlerOptions['limit'] = tool_parameters.get('limit', 5)
-        crawlerOptions['allowBackwardCrawling'] = tool_parameters.get('allowBackwardCrawling', False)
-        crawlerOptions['allowExternalContentLinks'] = tool_parameters.get('allowExternalContentLinks', False)
+        crawlerOptions["excludes"] = get_array_params(tool_parameters, "excludes")
+        crawlerOptions["includes"] = get_array_params(tool_parameters, "includes")
+        crawlerOptions["returnOnlyUrls"] = tool_parameters.get("returnOnlyUrls", False)
+        crawlerOptions["maxDepth"] = tool_parameters.get("maxDepth")
+        crawlerOptions["mode"] = tool_parameters.get("mode")
+        crawlerOptions["ignoreSitemap"] = tool_parameters.get("ignoreSitemap", False)
+        crawlerOptions["limit"] = tool_parameters.get("limit", 5)
+        crawlerOptions["allowBackwardCrawling"] = tool_parameters.get("allowBackwardCrawling", False)
+        crawlerOptions["allowExternalContentLinks"] = tool_parameters.get("allowExternalContentLinks", False)

-        pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
-        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
-        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
-        pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
-        pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
-        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
-        pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
-        pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
-        pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
+        pageOptions["headers"] = get_json_params(tool_parameters, "headers")
+        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
+        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
+        pageOptions["onlyIncludeTags"] = get_array_params(tool_parameters, "onlyIncludeTags")
+        pageOptions["removeTags"] = get_array_params(tool_parameters, "removeTags")
+        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
+        pageOptions["replaceAllPathsWithAbsolutePaths"] = tool_parameters.get("replaceAllPathsWithAbsolutePaths", False)
+        pageOptions["screenshot"] = tool_parameters.get("screenshot", False)
+        pageOptions["waitFor"] = tool_parameters.get("waitFor", 0)

         crawl_result = app.crawl_url(
-            url=tool_parameters['url'],
-            wait=wait_for_results,
-            crawlerOptions=crawlerOptions,
-            pageOptions=pageOptions
+            url=tool_parameters["url"], wait=wait_for_results, crawlerOptions=crawlerOptions, pageOptions=pageOptions
         )

         return self.create_json_message(crawl_result)

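To make the CrawlTool change above easier to read, here is a sketch of how a flat tool_parameters dict maps onto the nested options passed to crawl_url; the values are hypothetical, the comma-separated string is what get_array_params splits into a list, and keys left out fall back to the defaults shown in the diff:

    # Hypothetical flat parameters as the tool receives them
    tool_parameters = {
        "url": "https://example.com",
        "wait_for_results": True,
        "excludes": "blog/*,admin/*",
        "limit": 5,
        "onlyMainContent": True,
    }
    # Roughly what _invoke builds from them before calling app.crawl_url(...)
    crawlerOptions = {"excludes": ["blog/*", "admin/*"], "limit": 5, "returnOnlyUrls": False}
    pageOptions = {"onlyMainContent": True, "waitFor": 0}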

@@ -7,14 +7,15 @@ from core.tools.tool.builtin_tool import BuiltinTool
 class CrawlJobTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
-        operation = tool_parameters.get('operation', 'get')
-        if operation == 'get':
-            result = app.check_crawl_status(job_id=tool_parameters['job_id'])
-        elif operation == 'cancel':
-            result = app.cancel_crawl_job(job_id=tool_parameters['job_id'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
+        operation = tool_parameters.get("operation", "get")
+        if operation == "get":
+            result = app.check_crawl_status(job_id=tool_parameters["job_id"])
+        elif operation == "cancel":
+            result = app.cancel_crawl_job(job_id=tool_parameters["job_id"])
         else:
-            raise ValueError(f'Invalid operation: {operation}')
+            raise ValueError(f"Invalid operation: {operation}")

         return self.create_json_message(result)

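The job tool above simply routes on an operation string; a short sketch of the two accepted inputs (the job id is a placeholder):

    tool_parameters = {"operation": "get", "job_id": "job-123"}     # -> app.check_crawl_status(job_id="job-123")
    tool_parameters = {"operation": "cancel", "job_id": "job-123"}  # -> app.cancel_crawl_job(job_id="job-123")
    # any other operation value raises ValueError(f"Invalid operation: {operation}")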

@@ -6,34 +6,34 @@ from core.tools.tool.builtin_tool import BuiltinTool
 class ScrapeTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
         """
         the pageOptions and extractorOptions comes from doc here:
         https://docs.firecrawl.dev/api-reference/endpoint/scrape
         """
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
         pageOptions = {}
         extractorOptions = {}

-        pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
-        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
-        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
-        pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
-        pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
-        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
-        pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
-        pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
-        pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
+        pageOptions["headers"] = get_json_params(tool_parameters, "headers")
+        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
+        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
+        pageOptions["onlyIncludeTags"] = get_array_params(tool_parameters, "onlyIncludeTags")
+        pageOptions["removeTags"] = get_array_params(tool_parameters, "removeTags")
+        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
+        pageOptions["replaceAllPathsWithAbsolutePaths"] = tool_parameters.get("replaceAllPathsWithAbsolutePaths", False)
+        pageOptions["screenshot"] = tool_parameters.get("screenshot", False)
+        pageOptions["waitFor"] = tool_parameters.get("waitFor", 0)

-        extractorOptions['mode'] = tool_parameters.get('mode', '')
-        extractorOptions['extractionPrompt'] = tool_parameters.get('extractionPrompt', '')
-        extractorOptions['extractionSchema'] = get_json_params(tool_parameters, 'extractionSchema')
+        extractorOptions["mode"] = tool_parameters.get("mode", "")
+        extractorOptions["extractionPrompt"] = tool_parameters.get("extractionPrompt", "")
+        extractorOptions["extractionSchema"] = get_json_params(tool_parameters, "extractionSchema")

-        crawl_result = app.scrape_url(url=tool_parameters['url'],
-                                      pageOptions=pageOptions,
-                                      extractorOptions=extractorOptions)
+        crawl_result = app.scrape_url(
+            url=tool_parameters["url"], pageOptions=pageOptions, extractorOptions=extractorOptions
+        )

         return self.create_json_message(crawl_result)

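The scrape tool adds extractorOptions on top of the same pageOptions; a hedged sketch of an extraction-style call, with a made-up prompt and schema (the option names come from the diff and the linked Firecrawl scrape docs, and extractionSchema arrives as a JSON string that get_json_params parses):

    # Hypothetical parameters; "llm-extraction" is one of the modes documented for the Firecrawl v0 scrape endpoint
    tool_parameters = {
        "url": "https://example.com/pricing",
        "onlyMainContent": True,
        "mode": "llm-extraction",
        "extractionPrompt": "Extract the plan names and monthly prices.",
        "extractionSchema": '{"type": "object", "properties": {"plans": {"type": "array"}}}',
    }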

@@ -11,18 +11,17 @@ class SearchTool(BuiltinTool):
         the pageOptions and searchOptions comes from doc here:
         https://docs.firecrawl.dev/api-reference/endpoint/search
         """
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
         pageOptions = {}
-        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
-        pageOptions['fetchPageContent'] = tool_parameters.get('fetchPageContent', True)
-        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
-        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
-        searchOptions = {'limit': tool_parameters.get('limit')}
+        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
+        pageOptions["fetchPageContent"] = tool_parameters.get("fetchPageContent", True)
+        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
+        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
+        searchOptions = {"limit": tool_parameters.get("limit")}
         search_result = app.search(
-            query=tool_parameters['keyword'],
-            pageOptions=pageOptions,
-            searchOptions=searchOptions
+            query=tool_parameters["keyword"], pageOptions=pageOptions, searchOptions=searchOptions
         )

         return self.create_json_message(search_result)
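And the equivalent sketch for the search tool, again with placeholder values; note that the tool parameter is named keyword but is forwarded as the query argument of FirecrawlApp.search:

    tool_parameters = {"keyword": "firecrawl changelog", "limit": 3, "fetchPageContent": True}
    # -> app.search(
    #        query="firecrawl changelog",
    #        pageOptions={"onlyMainContent": False, "fetchPageContent": True, "includeHtml": False, "includeRawHtml": False},
    #        searchOptions={"limit": 3},
    #    )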