chore(api/core): apply ruff reformatting (#7624)
@@ -7,15 +7,8 @@ class FirecrawlProvider(BuiltinToolProviderController):
     def _validate_credentials(self, credentials: dict) -> None:
         try:
             # Example validation using the ScrapeTool, only scraping title for minimize content
-            ScrapeTool().fork_tool_runtime(
-                runtime={"credentials": credentials}
-            ).invoke(
-                user_id='',
-                tool_parameters={
-                    "url": "https://google.com",
-                    "onlyIncludeTags": 'title'
-                }
+            ScrapeTool().fork_tool_runtime(runtime={"credentials": credentials}).invoke(
+                user_id="", tool_parameters={"url": "https://google.com", "onlyIncludeTags": "title"}
             )
         except Exception as e:
             raise ToolProviderCredentialValidationError(str(e))

@@ -13,27 +13,24 @@ logger = logging.getLogger(__name__)
 class FirecrawlApp:
     def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
-        self.base_url = base_url or 'https://api.firecrawl.dev'
+        self.base_url = base_url or "https://api.firecrawl.dev"
         if not self.api_key:
             raise ValueError("API key is required")
 
     def _prepare_headers(self, idempotency_key: str | None = None):
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
         if idempotency_key:
-            headers['Idempotency-Key'] = idempotency_key
+            headers["Idempotency-Key"] = idempotency_key
         return headers
 
     def _request(
-            self,
-            method: str,
-            url: str,
-            data: Mapping[str, Any] | None = None,
-            headers: Mapping[str, str] | None = None,
-            retries: int = 3,
-            backoff_factor: float = 0.3,
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
     ) -> Mapping[str, Any] | None:
         if not headers:
             headers = self._prepare_headers()

@@ -44,54 +41,54 @@ class FirecrawlApp:
                 return response.json()
             except requests.exceptions.RequestException as e:
                 if i < retries - 1:
-                    time.sleep(backoff_factor * (2 ** i))
+                    time.sleep(backoff_factor * (2**i))
                 else:
                     raise
         return None
 
     def scrape_url(self, url: str, **kwargs):
-        endpoint = f'{self.base_url}/v0/scrape'
-        data = {'url': url, **kwargs}
+        endpoint = f"{self.base_url}/v0/scrape"
+        data = {"url": url, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
-        response = self._request('POST', endpoint, data)
+        response = self._request("POST", endpoint, data)
         if response is None:
             raise HTTPError("Failed to scrape URL after multiple retries")
         return response
 
     def search(self, query: str, **kwargs):
-        endpoint = f'{self.base_url}/v0/search'
-        data = {'query': query, **kwargs}
+        endpoint = f"{self.base_url}/v0/search"
+        data = {"query": query, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
-        response = self._request('POST', endpoint, data)
+        response = self._request("POST", endpoint, data)
         if response is None:
             raise HTTPError("Failed to perform search after multiple retries")
         return response
 
     def crawl_url(
-            self, url: str, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+        self, url: str, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
     ):
-        endpoint = f'{self.base_url}/v0/crawl'
+        endpoint = f"{self.base_url}/v0/crawl"
         headers = self._prepare_headers(idempotency_key)
-        data = {'url': url, **kwargs}
+        data = {"url": url, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
-        response = self._request('POST', endpoint, data, headers)
+        response = self._request("POST", endpoint, data, headers)
         if response is None:
             raise HTTPError("Failed to initiate crawl after multiple retries")
-        job_id: str = response['jobId']
+        job_id: str = response["jobId"]
         if wait:
             return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
         return response
 
     def check_crawl_status(self, job_id: str):
-        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
-        response = self._request('GET', endpoint)
+        endpoint = f"{self.base_url}/v0/crawl/status/{job_id}"
+        response = self._request("GET", endpoint)
         if response is None:
             raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
         return response
 
     def cancel_crawl_job(self, job_id: str):
-        endpoint = f'{self.base_url}/v0/crawl/cancel/{job_id}'
-        response = self._request('DELETE', endpoint)
+        endpoint = f"{self.base_url}/v0/crawl/cancel/{job_id}"
+        response = self._request("DELETE", endpoint)
         if response is None:
             raise HTTPError(f"Failed to cancel job {job_id} after multiple retries")
         return response

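Note on the `2 ** i` → `2**i` change above: ruff only removes the spaces around the power operator, so the retry schedule is unchanged. A minimal standalone sketch of the delays `_request` sleeps between failed attempts, assuming its defaults of `retries=3` and `backoff_factor=0.3`:

# Illustrative sketch: the back-off delays produced by time.sleep(backoff_factor * (2**i))
# under the guard "if i < retries - 1" (no sleep after the final attempt).
def backoff_delays(retries: int = 3, backoff_factor: float = 0.3) -> list[float]:
    return [backoff_factor * (2**i) for i in range(retries - 1)]

print(backoff_delays())  # [0.3, 0.6]: ~0.3s after the first failure, ~0.6s after the second
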
@@ -99,9 +96,9 @@ class FirecrawlApp:
     def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
             status = self.check_crawl_status(job_id)
-            if status['status'] == 'completed':
+            if status["status"] == "completed":
                 return status
-            elif status['status'] == 'failed':
+            elif status["status"] == "failed":
                 raise HTTPError(f'Job {job_id} failed: {status["error"]}')
             time.sleep(poll_interval)

@@ -109,7 +106,7 @@ class FirecrawlApp:
 def get_array_params(tool_parameters: dict[str, Any], key):
     param = tool_parameters.get(key)
     if param:
-        return param.split(',')
+        return param.split(",")
 
 
 def get_json_params(tool_parameters: dict[str, Any], key):

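For context on `get_array_params` above: the tools below pass list-style options as comma-separated strings, and this helper splits them; missing or falsy values simply fall through, so the call yields None. A small self-contained sketch with hypothetical parameter values:

from typing import Any


def get_array_params(tool_parameters: dict[str, Any], key):
    # Same helper as above: split a comma-separated string, otherwise implicitly return None.
    param = tool_parameters.get(key)
    if param:
        return param.split(",")


# Hypothetical values for illustration.
tool_parameters = {"onlyIncludeTags": "title,h1,h2"}
print(get_array_params(tool_parameters, "onlyIncludeTags"))  # ['title', 'h1', 'h2']
print(get_array_params(tool_parameters, "excludes"))  # None (key absent, nothing returned)
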
@@ -11,38 +11,36 @@ class CrawlTool(BuiltinTool):
         the crawlerOptions and pageOptions comes from doc here:
         https://docs.firecrawl.dev/api-reference/endpoint/crawl
         """
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
         crawlerOptions = {}
         pageOptions = {}
 
-        wait_for_results = tool_parameters.get('wait_for_results', True)
+        wait_for_results = tool_parameters.get("wait_for_results", True)
 
-        crawlerOptions['excludes'] = get_array_params(tool_parameters, 'excludes')
-        crawlerOptions['includes'] = get_array_params(tool_parameters, 'includes')
-        crawlerOptions['returnOnlyUrls'] = tool_parameters.get('returnOnlyUrls', False)
-        crawlerOptions['maxDepth'] = tool_parameters.get('maxDepth')
-        crawlerOptions['mode'] = tool_parameters.get('mode')
-        crawlerOptions['ignoreSitemap'] = tool_parameters.get('ignoreSitemap', False)
-        crawlerOptions['limit'] = tool_parameters.get('limit', 5)
-        crawlerOptions['allowBackwardCrawling'] = tool_parameters.get('allowBackwardCrawling', False)
-        crawlerOptions['allowExternalContentLinks'] = tool_parameters.get('allowExternalContentLinks', False)
+        crawlerOptions["excludes"] = get_array_params(tool_parameters, "excludes")
+        crawlerOptions["includes"] = get_array_params(tool_parameters, "includes")
+        crawlerOptions["returnOnlyUrls"] = tool_parameters.get("returnOnlyUrls", False)
+        crawlerOptions["maxDepth"] = tool_parameters.get("maxDepth")
+        crawlerOptions["mode"] = tool_parameters.get("mode")
+        crawlerOptions["ignoreSitemap"] = tool_parameters.get("ignoreSitemap", False)
+        crawlerOptions["limit"] = tool_parameters.get("limit", 5)
+        crawlerOptions["allowBackwardCrawling"] = tool_parameters.get("allowBackwardCrawling", False)
+        crawlerOptions["allowExternalContentLinks"] = tool_parameters.get("allowExternalContentLinks", False)
 
-        pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
-        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
-        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
-        pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
-        pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
-        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
-        pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
-        pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
-        pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
+        pageOptions["headers"] = get_json_params(tool_parameters, "headers")
+        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
+        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
+        pageOptions["onlyIncludeTags"] = get_array_params(tool_parameters, "onlyIncludeTags")
+        pageOptions["removeTags"] = get_array_params(tool_parameters, "removeTags")
+        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
+        pageOptions["replaceAllPathsWithAbsolutePaths"] = tool_parameters.get("replaceAllPathsWithAbsolutePaths", False)
+        pageOptions["screenshot"] = tool_parameters.get("screenshot", False)
+        pageOptions["waitFor"] = tool_parameters.get("waitFor", 0)
 
         crawl_result = app.crawl_url(
-            url=tool_parameters['url'],
-            wait=wait_for_results,
-            crawlerOptions=crawlerOptions,
-            pageOptions=pageOptions
+            url=tool_parameters["url"], wait=wait_for_results, crawlerOptions=crawlerOptions, pageOptions=pageOptions
         )
 
         return self.create_json_message(crawl_result)

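For reference, the two dictionaries CrawlTool builds above are forwarded as keyword arguments to `crawl_url`, which merges them into the POST body via `data = {"url": url, **kwargs}`; `wait` and `poll_interval` are consumed by `crawl_url` itself and never reach the request. A hedged sketch of the resulting `/v0/crawl` body for a hypothetical invocation, with the URL and option values made up for illustration:

# Hypothetical body POSTed to {base_url}/v0/crawl when the tool is invoked with
# url="https://example.com" and every option left at its default.
data = {
    "url": "https://example.com",
    "crawlerOptions": {
        "excludes": None,  # the get_*_params helpers return None when an option is unset
        "includes": None,
        "returnOnlyUrls": False,
        "maxDepth": None,
        "mode": None,
        "ignoreSitemap": False,
        "limit": 5,
        "allowBackwardCrawling": False,
        "allowExternalContentLinks": False,
    },
    "pageOptions": {
        "headers": None,
        "includeHtml": False,
        "includeRawHtml": False,
        "onlyIncludeTags": None,
        "removeTags": None,
        "onlyMainContent": False,
        "replaceAllPathsWithAbsolutePaths": False,
        "screenshot": False,
        "waitFor": 0,
    },
}
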
@@ -7,14 +7,15 @@ from core.tools.tool.builtin_tool import BuiltinTool
 
 
 class CrawlJobTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
-        operation = tool_parameters.get('operation', 'get')
-        if operation == 'get':
-            result = app.check_crawl_status(job_id=tool_parameters['job_id'])
-        elif operation == 'cancel':
-            result = app.cancel_crawl_job(job_id=tool_parameters['job_id'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
+        operation = tool_parameters.get("operation", "get")
+        if operation == "get":
+            result = app.check_crawl_status(job_id=tool_parameters["job_id"])
+        elif operation == "cancel":
+            result = app.cancel_crawl_job(job_id=tool_parameters["job_id"])
         else:
-            raise ValueError(f'Invalid operation: {operation}')
+            raise ValueError(f"Invalid operation: {operation}")
 
         return self.create_json_message(result)

@@ -6,34 +6,34 @@ from core.tools.tool.builtin_tool import BuiltinTool
 
 
 class ScrapeTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
         """
         the pageOptions and extractorOptions comes from doc here:
         https://docs.firecrawl.dev/api-reference/endpoint/scrape
         """
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
 
         pageOptions = {}
         extractorOptions = {}
 
-        pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
-        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
-        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
-        pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
-        pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
-        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
-        pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
-        pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
-        pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
+        pageOptions["headers"] = get_json_params(tool_parameters, "headers")
+        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
+        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
+        pageOptions["onlyIncludeTags"] = get_array_params(tool_parameters, "onlyIncludeTags")
+        pageOptions["removeTags"] = get_array_params(tool_parameters, "removeTags")
+        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
+        pageOptions["replaceAllPathsWithAbsolutePaths"] = tool_parameters.get("replaceAllPathsWithAbsolutePaths", False)
+        pageOptions["screenshot"] = tool_parameters.get("screenshot", False)
+        pageOptions["waitFor"] = tool_parameters.get("waitFor", 0)
 
-        extractorOptions['mode'] = tool_parameters.get('mode', '')
-        extractorOptions['extractionPrompt'] = tool_parameters.get('extractionPrompt', '')
-        extractorOptions['extractionSchema'] = get_json_params(tool_parameters, 'extractionSchema')
+        extractorOptions["mode"] = tool_parameters.get("mode", "")
+        extractorOptions["extractionPrompt"] = tool_parameters.get("extractionPrompt", "")
+        extractorOptions["extractionSchema"] = get_json_params(tool_parameters, "extractionSchema")
 
-        crawl_result = app.scrape_url(url=tool_parameters['url'],
-                                      pageOptions=pageOptions,
-                                      extractorOptions=extractorOptions)
+        crawl_result = app.scrape_url(
+            url=tool_parameters["url"], pageOptions=pageOptions, extractorOptions=extractorOptions
+        )
 
         return self.create_json_message(crawl_result)

@@ -11,18 +11,17 @@ class SearchTool(BuiltinTool):
         the pageOptions and searchOptions comes from doc here:
         https://docs.firecrawl.dev/api-reference/endpoint/search
         """
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
-                           base_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(
+            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
+        )
         pageOptions = {}
-        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
-        pageOptions['fetchPageContent'] = tool_parameters.get('fetchPageContent', True)
-        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
-        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
-        searchOptions = {'limit': tool_parameters.get('limit')}
+        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
+        pageOptions["fetchPageContent"] = tool_parameters.get("fetchPageContent", True)
+        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
+        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
+        searchOptions = {"limit": tool_parameters.get("limit")}
         search_result = app.search(
-            query=tool_parameters['keyword'],
-            pageOptions=pageOptions,
-            searchOptions=searchOptions
+            query=tool_parameters["keyword"], pageOptions=pageOptions, searchOptions=searchOptions
        )
 
         return self.create_json_message(search_result)