mirror of
http://112.124.100.131/huang.ze/ebiz-dify-ai.git
synced 2025-12-24 10:13:01 +08:00
Feat/firecrawl data source (#5232)
Co-authored-by: Nicolas <nicolascamara29@gmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: takatost <takatost@gmail.com>
This commit is contained in:
@@ -29,13 +29,13 @@ from .app import (
|
||||
)
|
||||
|
||||
# Import auth controllers
|
||||
from .auth import activate, data_source_oauth, login, oauth
|
||||
from .auth import activate, data_source_bearer_auth, data_source_oauth, login, oauth
|
||||
|
||||
# Import billing controllers
|
||||
from .billing import billing
|
||||
|
||||
# Import datasets controllers
|
||||
from .datasets import data_source, datasets, datasets_document, datasets_segments, file, hit_testing
|
||||
from .datasets import data_source, datasets, datasets_document, datasets_segments, file, hit_testing, website
|
||||
|
||||
# Import explore controllers
|
||||
from .explore import (
|
||||
|
||||
67
api/controllers/console/auth/data_source_bearer_auth.py
Normal file
67
api/controllers/console/auth/data_source_bearer_auth.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from flask_login import current_user
|
||||
from flask_restful import Resource, reqparse
|
||||
from werkzeug.exceptions import Forbidden
|
||||
|
||||
from controllers.console import api
|
||||
from controllers.console.auth.error import ApiKeyAuthFailedError
|
||||
from libs.login import login_required
|
||||
from services.auth.api_key_auth_service import ApiKeyAuthService
|
||||
|
||||
from ..setup import setup_required
|
||||
from ..wraps import account_initialization_required
|
||||
|
||||
|
||||
class ApiKeyAuthDataSource(Resource):
|
||||
@setup_required
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def get(self):
|
||||
# The role of the current user in the table must be admin or owner
|
||||
if not current_user.is_admin_or_owner:
|
||||
raise Forbidden()
|
||||
data_source_api_key_bindings = ApiKeyAuthService.get_provider_auth_list(current_user.current_tenant_id)
|
||||
if data_source_api_key_bindings:
|
||||
return {
|
||||
'settings': [data_source_api_key_binding.to_dict() for data_source_api_key_binding in
|
||||
data_source_api_key_bindings]}
|
||||
return {'settings': []}
|
||||
|
||||
|
||||
class ApiKeyAuthDataSourceBinding(Resource):
|
||||
@setup_required
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def post(self):
|
||||
# The role of the current user in the table must be admin or owner
|
||||
if not current_user.is_admin_or_owner:
|
||||
raise Forbidden()
|
||||
parser = reqparse.RequestParser()
|
||||
parser.add_argument('category', type=str, required=True, nullable=False, location='json')
|
||||
parser.add_argument('provider', type=str, required=True, nullable=False, location='json')
|
||||
parser.add_argument('credentials', type=dict, required=True, nullable=False, location='json')
|
||||
args = parser.parse_args()
|
||||
ApiKeyAuthService.validate_api_key_auth_args(args)
|
||||
try:
|
||||
ApiKeyAuthService.create_provider_auth(current_user.current_tenant_id, args)
|
||||
except Exception as e:
|
||||
raise ApiKeyAuthFailedError(str(e))
|
||||
return {'result': 'success'}, 200
|
||||
|
||||
|
||||
class ApiKeyAuthDataSourceBindingDelete(Resource):
|
||||
@setup_required
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def delete(self, binding_id):
|
||||
# The role of the current user in the table must be admin or owner
|
||||
if not current_user.is_admin_or_owner:
|
||||
raise Forbidden()
|
||||
|
||||
ApiKeyAuthService.delete_provider_auth(current_user.current_tenant_id, binding_id)
|
||||
|
||||
return {'result': 'success'}, 200
|
||||
|
||||
|
||||
api.add_resource(ApiKeyAuthDataSource, '/api-key-auth/data-source')
|
||||
api.add_resource(ApiKeyAuthDataSourceBinding, '/api-key-auth/data-source/binding')
|
||||
api.add_resource(ApiKeyAuthDataSourceBindingDelete, '/api-key-auth/data-source/<uuid:binding_id>')
|
||||
7
api/controllers/console/auth/error.py
Normal file
7
api/controllers/console/auth/error.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from libs.exception import BaseHTTPException
|
||||
|
||||
|
||||
class ApiKeyAuthFailedError(BaseHTTPException):
|
||||
error_code = 'auth_failed'
|
||||
description = "{message}"
|
||||
code = 500
|
||||
@@ -16,7 +16,7 @@ from extensions.ext_database import db
|
||||
from fields.data_source_fields import integrate_list_fields, integrate_notion_info_list_fields
|
||||
from libs.login import login_required
|
||||
from models.dataset import Document
|
||||
from models.source import DataSourceBinding
|
||||
from models.source import DataSourceOauthBinding
|
||||
from services.dataset_service import DatasetService, DocumentService
|
||||
from tasks.document_indexing_sync_task import document_indexing_sync_task
|
||||
|
||||
@@ -29,9 +29,9 @@ class DataSourceApi(Resource):
|
||||
@marshal_with(integrate_list_fields)
|
||||
def get(self):
|
||||
# get workspace data source integrates
|
||||
data_source_integrates = db.session.query(DataSourceBinding).filter(
|
||||
DataSourceBinding.tenant_id == current_user.current_tenant_id,
|
||||
DataSourceBinding.disabled == False
|
||||
data_source_integrates = db.session.query(DataSourceOauthBinding).filter(
|
||||
DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
|
||||
DataSourceOauthBinding.disabled == False
|
||||
).all()
|
||||
|
||||
base_url = request.url_root.rstrip('/')
|
||||
@@ -71,7 +71,7 @@ class DataSourceApi(Resource):
|
||||
def patch(self, binding_id, action):
|
||||
binding_id = str(binding_id)
|
||||
action = str(action)
|
||||
data_source_binding = DataSourceBinding.query.filter_by(
|
||||
data_source_binding = DataSourceOauthBinding.query.filter_by(
|
||||
id=binding_id
|
||||
).first()
|
||||
if data_source_binding is None:
|
||||
@@ -124,7 +124,7 @@ class DataSourceNotionListApi(Resource):
|
||||
data_source_info = json.loads(document.data_source_info)
|
||||
exist_page_ids.append(data_source_info['notion_page_id'])
|
||||
# get all authorized pages
|
||||
data_source_bindings = DataSourceBinding.query.filter_by(
|
||||
data_source_bindings = DataSourceOauthBinding.query.filter_by(
|
||||
tenant_id=current_user.current_tenant_id,
|
||||
provider='notion',
|
||||
disabled=False
|
||||
@@ -163,12 +163,12 @@ class DataSourceNotionApi(Resource):
|
||||
def get(self, workspace_id, page_id, page_type):
|
||||
workspace_id = str(workspace_id)
|
||||
page_id = str(page_id)
|
||||
data_source_binding = DataSourceBinding.query.filter(
|
||||
data_source_binding = DataSourceOauthBinding.query.filter(
|
||||
db.and_(
|
||||
DataSourceBinding.tenant_id == current_user.current_tenant_id,
|
||||
DataSourceBinding.provider == 'notion',
|
||||
DataSourceBinding.disabled == False,
|
||||
DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
|
||||
DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
|
||||
DataSourceOauthBinding.provider == 'notion',
|
||||
DataSourceOauthBinding.disabled == False,
|
||||
DataSourceOauthBinding.source_info['workspace_id'] == f'"{workspace_id}"'
|
||||
)
|
||||
).first()
|
||||
if not data_source_binding:
|
||||
|
||||
@@ -315,6 +315,22 @@ class DatasetIndexingEstimateApi(Resource):
|
||||
document_model=args['doc_form']
|
||||
)
|
||||
extract_settings.append(extract_setting)
|
||||
elif args['info_list']['data_source_type'] == 'website_crawl':
|
||||
website_info_list = args['info_list']['website_info_list']
|
||||
for url in website_info_list['urls']:
|
||||
extract_setting = ExtractSetting(
|
||||
datasource_type="website_crawl",
|
||||
website_info={
|
||||
"provider": website_info_list['provider'],
|
||||
"job_id": website_info_list['job_id'],
|
||||
"url": url,
|
||||
"tenant_id": current_user.current_tenant_id,
|
||||
"mode": 'crawl',
|
||||
"only_main_content": website_info_list['only_main_content']
|
||||
},
|
||||
document_model=args['doc_form']
|
||||
)
|
||||
extract_settings.append(extract_setting)
|
||||
else:
|
||||
raise ValueError('Data source type not support')
|
||||
indexing_runner = IndexingRunner()
|
||||
@@ -519,6 +535,7 @@ class DatasetRetrievalSettingMockApi(Resource):
|
||||
raise ValueError(f"Unsupported vector db type {vector_type}.")
|
||||
|
||||
|
||||
|
||||
class DatasetErrorDocs(Resource):
|
||||
@setup_required
|
||||
@login_required
|
||||
|
||||
@@ -465,6 +465,20 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
|
||||
document_model=document.doc_form
|
||||
)
|
||||
extract_settings.append(extract_setting)
|
||||
elif document.data_source_type == 'website_crawl':
|
||||
extract_setting = ExtractSetting(
|
||||
datasource_type="website_crawl",
|
||||
website_info={
|
||||
"provider": data_source_info['provider'],
|
||||
"job_id": data_source_info['job_id'],
|
||||
"url": data_source_info['url'],
|
||||
"tenant_id": current_user.current_tenant_id,
|
||||
"mode": data_source_info['mode'],
|
||||
"only_main_content": data_source_info['only_main_content']
|
||||
},
|
||||
document_model=document.doc_form
|
||||
)
|
||||
extract_settings.append(extract_setting)
|
||||
|
||||
else:
|
||||
raise ValueError('Data source type not support')
|
||||
@@ -952,6 +966,33 @@ class DocumentRenameApi(DocumentResource):
|
||||
return document
|
||||
|
||||
|
||||
class WebsiteDocumentSyncApi(DocumentResource):
|
||||
@setup_required
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def get(self, dataset_id, document_id):
|
||||
"""sync website document."""
|
||||
dataset_id = str(dataset_id)
|
||||
dataset = DatasetService.get_dataset(dataset_id)
|
||||
if not dataset:
|
||||
raise NotFound('Dataset not found.')
|
||||
document_id = str(document_id)
|
||||
document = DocumentService.get_document(dataset.id, document_id)
|
||||
if not document:
|
||||
raise NotFound('Document not found.')
|
||||
if document.tenant_id != current_user.current_tenant_id:
|
||||
raise Forbidden('No permission.')
|
||||
if document.data_source_type != 'website_crawl':
|
||||
raise ValueError('Document is not a website document.')
|
||||
# 403 if document is archived
|
||||
if DocumentService.check_archived(document):
|
||||
raise ArchivedDocumentImmutableError()
|
||||
# sync document
|
||||
DocumentService.sync_website_document(dataset_id, document)
|
||||
|
||||
return {'result': 'success'}, 200
|
||||
|
||||
|
||||
api.add_resource(GetProcessRuleApi, '/datasets/process-rule')
|
||||
api.add_resource(DatasetDocumentListApi,
|
||||
'/datasets/<uuid:dataset_id>/documents')
|
||||
@@ -980,3 +1021,5 @@ api.add_resource(DocumentRecoverApi, '/datasets/<uuid:dataset_id>/documents/<uui
|
||||
api.add_resource(DocumentRetryApi, '/datasets/<uuid:dataset_id>/retry')
|
||||
api.add_resource(DocumentRenameApi,
|
||||
'/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename')
|
||||
|
||||
api.add_resource(WebsiteDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync')
|
||||
|
||||
@@ -73,6 +73,12 @@ class InvalidMetadataError(BaseHTTPException):
|
||||
code = 400
|
||||
|
||||
|
||||
class WebsiteCrawlError(BaseHTTPException):
|
||||
error_code = 'crawl_failed'
|
||||
description = "{message}"
|
||||
code = 500
|
||||
|
||||
|
||||
class DatasetInUseError(BaseHTTPException):
|
||||
error_code = 'dataset_in_use'
|
||||
description = "The dataset is being used by some apps. Please remove the dataset from the apps before deleting it."
|
||||
|
||||
49
api/controllers/console/datasets/website.py
Normal file
49
api/controllers/console/datasets/website.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from flask_restful import Resource, reqparse
|
||||
|
||||
from controllers.console import api
|
||||
from controllers.console.datasets.error import WebsiteCrawlError
|
||||
from controllers.console.setup import setup_required
|
||||
from controllers.console.wraps import account_initialization_required
|
||||
from libs.login import login_required
|
||||
from services.website_service import WebsiteService
|
||||
|
||||
|
||||
class WebsiteCrawlApi(Resource):
|
||||
|
||||
@setup_required
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def post(self):
|
||||
parser = reqparse.RequestParser()
|
||||
parser.add_argument('provider', type=str, choices=['firecrawl'],
|
||||
required=True, nullable=True, location='json')
|
||||
parser.add_argument('url', type=str, required=True, nullable=True, location='json')
|
||||
parser.add_argument('options', type=dict, required=True, nullable=True, location='json')
|
||||
args = parser.parse_args()
|
||||
WebsiteService.document_create_args_validate(args)
|
||||
# crawl url
|
||||
try:
|
||||
result = WebsiteService.crawl_url(args)
|
||||
except Exception as e:
|
||||
raise WebsiteCrawlError(str(e))
|
||||
return result, 200
|
||||
|
||||
|
||||
class WebsiteCrawlStatusApi(Resource):
|
||||
@setup_required
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def get(self, job_id: str):
|
||||
parser = reqparse.RequestParser()
|
||||
parser.add_argument('provider', type=str, choices=['firecrawl'], required=True, location='args')
|
||||
args = parser.parse_args()
|
||||
# get crawl status
|
||||
try:
|
||||
result = WebsiteService.get_crawl_status(job_id, args['provider'])
|
||||
except Exception as e:
|
||||
raise WebsiteCrawlError(str(e))
|
||||
return result, 200
|
||||
|
||||
|
||||
api.add_resource(WebsiteCrawlApi, '/website/crawl')
|
||||
api.add_resource(WebsiteCrawlStatusApi, '/website/crawl/status/<string:job_id>')
|
||||
Reference in New Issue
Block a user