Feat/dataset notion import (#392)

Co-authored-by: StyleZhang <jasonapring2015@outlook.com>
Co-authored-by: JzoNg <jzongcode@gmail.com>
This commit is contained in:
Jyong
2023-06-16 21:47:51 +08:00
committed by GitHub
parent f350948bde
commit 9253f72dea
96 changed files with 4479 additions and 367 deletions

View File

@@ -0,0 +1,303 @@
import datetime
import json
from cachetools import TTLCache
from flask import request, current_app
from flask_login import login_required, current_user
from flask_restful import Resource, marshal_with, fields, reqparse, marshal
from werkzeug.exceptions import NotFound
from controllers.console import api
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from core.data_source.notion import NotionPageReader
from core.indexing_runner import IndexingRunner
from extensions.ext_database import db
from libs.helper import TimestampField
from libs.oauth_data_source import NotionOAuth
from models.dataset import Document
from models.source import DataSourceBinding
from services.dataset_service import DatasetService, DocumentService
from tasks.document_indexing_sync_task import document_indexing_sync_task
cache = TTLCache(maxsize=None, ttl=30)
FILE_SIZE_LIMIT = 15 * 1024 * 1024 # 15MB
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm']
PREVIEW_WORDS_LIMIT = 3000
class DataSourceApi(Resource):
integrate_icon_fields = {
'type': fields.String,
'url': fields.String,
'emoji': fields.String
}
integrate_page_fields = {
'page_name': fields.String,
'page_id': fields.String,
'page_icon': fields.Nested(integrate_icon_fields, allow_null=True),
'parent_id': fields.String,
'type': fields.String
}
integrate_workspace_fields = {
'workspace_name': fields.String,
'workspace_id': fields.String,
'workspace_icon': fields.String,
'pages': fields.List(fields.Nested(integrate_page_fields)),
'total': fields.Integer
}
integrate_fields = {
'id': fields.String,
'provider': fields.String,
'created_at': TimestampField,
'is_bound': fields.Boolean,
'disabled': fields.Boolean,
'link': fields.String,
'source_info': fields.Nested(integrate_workspace_fields)
}
integrate_list_fields = {
'data': fields.List(fields.Nested(integrate_fields)),
}
@setup_required
@login_required
@account_initialization_required
@marshal_with(integrate_list_fields)
def get(self):
# get workspace data source integrates
data_source_integrates = db.session.query(DataSourceBinding).filter(
DataSourceBinding.tenant_id == current_user.current_tenant_id,
DataSourceBinding.disabled == False
).all()
base_url = request.url_root.rstrip('/')
data_source_oauth_base_path = "/console/api/oauth/data-source"
providers = ["notion"]
integrate_data = []
for provider in providers:
# existing_integrate = next((ai for ai in data_source_integrates if ai.provider == provider), None)
existing_integrates = filter(lambda item: item.provider == provider, data_source_integrates)
if existing_integrates:
for existing_integrate in list(existing_integrates):
integrate_data.append({
'id': existing_integrate.id,
'provider': provider,
'created_at': existing_integrate.created_at,
'is_bound': True,
'disabled': existing_integrate.disabled,
'source_info': existing_integrate.source_info,
'link': f'{base_url}{data_source_oauth_base_path}/{provider}'
})
else:
integrate_data.append({
'id': None,
'provider': provider,
'created_at': None,
'source_info': None,
'is_bound': False,
'disabled': None,
'link': f'{base_url}{data_source_oauth_base_path}/{provider}'
})
return {'data': integrate_data}, 200
@setup_required
@login_required
@account_initialization_required
def patch(self, binding_id, action):
binding_id = str(binding_id)
action = str(action)
data_source_binding = DataSourceBinding.query.filter_by(
id=binding_id
).first()
if data_source_binding is None:
raise NotFound('Data source binding not found.')
# enable binding
if action == 'enable':
if data_source_binding.disabled:
data_source_binding.disabled = False
data_source_binding.updated_at = datetime.datetime.utcnow()
db.session.add(data_source_binding)
db.session.commit()
else:
raise ValueError('Data source is not disabled.')
# disable binding
if action == 'disable':
if not data_source_binding.disabled:
data_source_binding.disabled = True
data_source_binding.updated_at = datetime.datetime.utcnow()
db.session.add(data_source_binding)
db.session.commit()
else:
raise ValueError('Data source is disabled.')
return {'result': 'success'}, 200
class DataSourceNotionListApi(Resource):
integrate_icon_fields = {
'type': fields.String,
'url': fields.String,
'emoji': fields.String
}
integrate_page_fields = {
'page_name': fields.String,
'page_id': fields.String,
'page_icon': fields.Nested(integrate_icon_fields, allow_null=True),
'is_bound': fields.Boolean,
'parent_id': fields.String,
'type': fields.String
}
integrate_workspace_fields = {
'workspace_name': fields.String,
'workspace_id': fields.String,
'workspace_icon': fields.String,
'pages': fields.List(fields.Nested(integrate_page_fields))
}
integrate_notion_info_list_fields = {
'notion_info': fields.List(fields.Nested(integrate_workspace_fields)),
}
@setup_required
@login_required
@account_initialization_required
@marshal_with(integrate_notion_info_list_fields)
def get(self):
dataset_id = request.args.get('dataset_id', default=None, type=str)
exist_page_ids = []
# import notion in the exist dataset
if dataset_id:
dataset = DatasetService.get_dataset(dataset_id)
if not dataset:
raise NotFound('Dataset not found.')
if dataset.data_source_type != 'notion_import':
raise ValueError('Dataset is not notion type.')
documents = Document.query.filter_by(
dataset_id=dataset_id,
tenant_id=current_user.current_tenant_id,
data_source_type='notion_import',
enabled=True
).all()
if documents:
for document in documents:
data_source_info = json.loads(document.data_source_info)
exist_page_ids.append(data_source_info['notion_page_id'])
# get all authorized pages
data_source_bindings = DataSourceBinding.query.filter_by(
tenant_id=current_user.current_tenant_id,
provider='notion',
disabled=False
).all()
if not data_source_bindings:
return {
'notion_info': []
}, 200
pre_import_info_list = []
for data_source_binding in data_source_bindings:
source_info = data_source_binding.source_info
pages = source_info['pages']
# Filter out already bound pages
for page in pages:
if page['page_id'] in exist_page_ids:
page['is_bound'] = True
else:
page['is_bound'] = False
pre_import_info = {
'workspace_name': source_info['workspace_name'],
'workspace_icon': source_info['workspace_icon'],
'workspace_id': source_info['workspace_id'],
'pages': pages,
}
pre_import_info_list.append(pre_import_info)
return {
'notion_info': pre_import_info_list
}, 200
class DataSourceNotionApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self, workspace_id, page_id, page_type):
workspace_id = str(workspace_id)
page_id = str(page_id)
data_source_binding = DataSourceBinding.query.filter(
db.and_(
DataSourceBinding.tenant_id == current_user.current_tenant_id,
DataSourceBinding.provider == 'notion',
DataSourceBinding.disabled == False,
DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
)
).first()
if not data_source_binding:
raise NotFound('Data source binding not found.')
reader = NotionPageReader(integration_token=data_source_binding.access_token)
if page_type == 'page':
page_content = reader.read_page(page_id)
elif page_type == 'database':
page_content = reader.query_database_data(page_id)
else:
page_content = ""
return {
'content': page_content
}, 200
@setup_required
@login_required
@account_initialization_required
def post(self):
parser = reqparse.RequestParser()
parser.add_argument('notion_info_list', type=list, required=True, nullable=True, location='json')
parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
args = parser.parse_args()
# validate args
DocumentService.estimate_args_validate(args)
indexing_runner = IndexingRunner()
response = indexing_runner.notion_indexing_estimate(args['notion_info_list'], args['process_rule'])
return response, 200
class DataSourceNotionDatasetSyncApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self, dataset_id):
dataset_id_str = str(dataset_id)
dataset = DatasetService.get_dataset(dataset_id_str)
if dataset is None:
raise NotFound("Dataset not found.")
documents = DocumentService.get_document_by_dataset_id(dataset_id_str)
for document in documents:
document_indexing_sync_task.delay(dataset_id_str, document.id)
return 200
class DataSourceNotionDocumentSyncApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self, dataset_id, document_id):
dataset_id_str = str(dataset_id)
document_id_str = str(document_id)
dataset = DatasetService.get_dataset(dataset_id_str)
if dataset is None:
raise NotFound("Dataset not found.")
document = DocumentService.get_document(dataset_id_str, document_id_str)
if document is None:
raise NotFound("Document not found.")
document_indexing_sync_task.delay(dataset_id_str, document_id_str)
return 200
api.add_resource(DataSourceApi, '/data-source/integrates', '/data-source/integrates/<uuid:binding_id>/<string:action>')
api.add_resource(DataSourceNotionListApi, '/notion/pre-import/pages')
api.add_resource(DataSourceNotionApi,
'/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/<string:page_type>/preview',
'/datasets/notion-indexing-estimate')
api.add_resource(DataSourceNotionDatasetSyncApi, '/datasets/<uuid:dataset_id>/notion/sync')
api.add_resource(DataSourceNotionDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/notion/sync')

View File

@@ -12,8 +12,9 @@ from controllers.console.wraps import account_initialization_required
from core.indexing_runner import IndexingRunner
from libs.helper import TimestampField
from extensions.ext_database import db
from models.dataset import DocumentSegment, Document
from models.model import UploadFile
from services.dataset_service import DatasetService
from services.dataset_service import DatasetService, DocumentService
dataset_detail_fields = {
'id': fields.String,
@@ -217,17 +218,31 @@ class DatasetIndexingEstimateApi(Resource):
@login_required
@account_initialization_required
def post(self):
segment_rule = request.get_json()
file_detail = db.session.query(UploadFile).filter(
UploadFile.tenant_id == current_user.current_tenant_id,
UploadFile.id == segment_rule["file_id"]
).first()
parser = reqparse.RequestParser()
parser.add_argument('info_list', type=dict, required=True, nullable=True, location='json')
parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
args = parser.parse_args()
# validate args
DocumentService.estimate_args_validate(args)
if args['info_list']['data_source_type'] == 'upload_file':
file_ids = args['info_list']['file_info_list']['file_ids']
file_details = db.session.query(UploadFile).filter(
UploadFile.tenant_id == current_user.current_tenant_id,
UploadFile.id.in_(file_ids)
).all()
if file_detail is None:
raise NotFound("File not found.")
if file_details is None:
raise NotFound("File not found.")
indexing_runner = IndexingRunner()
response = indexing_runner.indexing_estimate(file_detail, segment_rule['process_rule'])
indexing_runner = IndexingRunner()
response = indexing_runner.file_indexing_estimate(file_details, args['process_rule'])
elif args['info_list']['data_source_type'] == 'notion_import':
indexing_runner = IndexingRunner()
response = indexing_runner.notion_indexing_estimate(args['info_list']['notion_info_list'],
args['process_rule'])
else:
raise ValueError('Data source type not support')
return response, 200
@@ -274,8 +289,54 @@ class DatasetRelatedAppListApi(Resource):
}, 200
class DatasetIndexingStatusApi(Resource):
document_status_fields = {
'id': fields.String,
'indexing_status': fields.String,
'processing_started_at': TimestampField,
'parsing_completed_at': TimestampField,
'cleaning_completed_at': TimestampField,
'splitting_completed_at': TimestampField,
'completed_at': TimestampField,
'paused_at': TimestampField,
'error': fields.String,
'stopped_at': TimestampField,
'completed_segments': fields.Integer,
'total_segments': fields.Integer,
}
document_status_fields_list = {
'data': fields.List(fields.Nested(document_status_fields))
}
@setup_required
@login_required
@account_initialization_required
def get(self, dataset_id):
dataset_id = str(dataset_id)
documents = db.session.query(Document).filter(
Document.dataset_id == dataset_id,
Document.tenant_id == current_user.current_tenant_id
).all()
documents_status = []
for document in documents:
completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
DocumentSegment.document_id == str(document.id),
DocumentSegment.status != 're_segment').count()
total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
DocumentSegment.status != 're_segment').count()
document.completed_segments = completed_segments
document.total_segments = total_segments
documents_status.append(marshal(document, self.document_status_fields))
data = {
'data': documents_status
}
return data
api.add_resource(DatasetListApi, '/datasets')
api.add_resource(DatasetApi, '/datasets/<uuid:dataset_id>')
api.add_resource(DatasetQueryApi, '/datasets/<uuid:dataset_id>/queries')
api.add_resource(DatasetIndexingEstimateApi, '/datasets/file-indexing-estimate')
api.add_resource(DatasetIndexingEstimateApi, '/datasets/indexing-estimate')
api.add_resource(DatasetRelatedAppListApi, '/datasets/<uuid:dataset_id>/related-apps')
api.add_resource(DatasetIndexingStatusApi, '/datasets/<uuid:dataset_id>/indexing-status')

View File

@@ -1,6 +1,7 @@
# -*- coding:utf-8 -*-
import random
from datetime import datetime
from typing import List
from flask import request
from flask_login import login_required, current_user
@@ -61,6 +62,29 @@ document_fields = {
'hit_count': fields.Integer,
}
document_with_segments_fields = {
'id': fields.String,
'position': fields.Integer,
'data_source_type': fields.String,
'data_source_info': fields.Raw(attribute='data_source_info_dict'),
'dataset_process_rule_id': fields.String,
'name': fields.String,
'created_from': fields.String,
'created_by': fields.String,
'created_at': TimestampField,
'tokens': fields.Integer,
'indexing_status': fields.String,
'error': fields.String,
'enabled': fields.Boolean,
'disabled_at': TimestampField,
'disabled_by': fields.String,
'archived': fields.Boolean,
'display_status': fields.String,
'word_count': fields.Integer,
'hit_count': fields.Integer,
'completed_segments': fields.Integer,
'total_segments': fields.Integer
}
class DocumentResource(Resource):
def get_document(self, dataset_id: str, document_id: str) -> Document:
@@ -83,6 +107,23 @@ class DocumentResource(Resource):
return document
def get_batch_documents(self, dataset_id: str, batch: str) -> List[Document]:
dataset = DatasetService.get_dataset(dataset_id)
if not dataset:
raise NotFound('Dataset not found.')
try:
DatasetService.check_dataset_permission(dataset, current_user)
except services.errors.account.NoPermissionError as e:
raise Forbidden(str(e))
documents = DocumentService.get_batch_documents(dataset_id, batch)
if not documents:
raise NotFound('Documents not found.')
return documents
class GetProcessRuleApi(Resource):
@setup_required
@@ -132,9 +173,9 @@ class DatasetDocumentListApi(Resource):
dataset_id = str(dataset_id)
page = request.args.get('page', default=1, type=int)
limit = request.args.get('limit', default=20, type=int)
search = request.args.get('search', default=None, type=str)
search = request.args.get('keyword', default=None, type=str)
sort = request.args.get('sort', default='-created_at', type=str)
fetch = request.args.get('fetch', default=False, type=bool)
dataset = DatasetService.get_dataset(dataset_id)
if not dataset:
raise NotFound('Dataset not found.')
@@ -173,9 +214,20 @@ class DatasetDocumentListApi(Resource):
paginated_documents = query.paginate(
page=page, per_page=limit, max_per_page=100, error_out=False)
documents = paginated_documents.items
if fetch:
for document in documents:
completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
DocumentSegment.document_id == str(document.id),
DocumentSegment.status != 're_segment').count()
total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
DocumentSegment.status != 're_segment').count()
document.completed_segments = completed_segments
document.total_segments = total_segments
data = marshal(documents, document_with_segments_fields)
else:
data = marshal(documents, document_fields)
response = {
'data': marshal(documents, document_fields),
'data': data,
'has_more': len(documents) == limit,
'limit': limit,
'total': paginated_documents.total,
@@ -184,10 +236,15 @@ class DatasetDocumentListApi(Resource):
return response
documents_and_batch_fields = {
'documents': fields.List(fields.Nested(document_fields)),
'batch': fields.String
}
@setup_required
@login_required
@account_initialization_required
@marshal_with(document_fields)
@marshal_with(documents_and_batch_fields)
def post(self, dataset_id):
dataset_id = str(dataset_id)
@@ -221,7 +278,7 @@ class DatasetDocumentListApi(Resource):
DocumentService.document_create_args_validate(args)
try:
document = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
except ProviderTokenNotInitError:
raise ProviderNotInitializeError()
except QuotaExceededError:
@@ -229,13 +286,17 @@ class DatasetDocumentListApi(Resource):
except ModelCurrentlyNotSupportError:
raise ProviderModelCurrentlyNotSupportError()
return document
return {
'documents': documents,
'batch': batch
}
class DatasetInitApi(Resource):
dataset_and_document_fields = {
'dataset': fields.Nested(dataset_fields),
'document': fields.Nested(document_fields)
'documents': fields.List(fields.Nested(document_fields)),
'batch': fields.String
}
@setup_required
@@ -258,7 +319,7 @@ class DatasetInitApi(Resource):
DocumentService.document_create_args_validate(args)
try:
dataset, document = DocumentService.save_document_without_dataset_id(
dataset, documents, batch = DocumentService.save_document_without_dataset_id(
tenant_id=current_user.current_tenant_id,
document_data=args,
account=current_user
@@ -272,7 +333,8 @@ class DatasetInitApi(Resource):
response = {
'dataset': dataset,
'document': document
'documents': documents,
'batch': batch
}
return response
@@ -317,11 +379,122 @@ class DocumentIndexingEstimateApi(DocumentResource):
raise NotFound('File not found.')
indexing_runner = IndexingRunner()
response = indexing_runner.indexing_estimate(file, data_process_rule_dict)
response = indexing_runner.file_indexing_estimate([file], data_process_rule_dict)
return response
class DocumentBatchIndexingEstimateApi(DocumentResource):
@setup_required
@login_required
@account_initialization_required
def get(self, dataset_id, batch):
dataset_id = str(dataset_id)
batch = str(batch)
dataset = DatasetService.get_dataset(dataset_id)
if dataset is None:
raise NotFound("Dataset not found.")
documents = self.get_batch_documents(dataset_id, batch)
response = {
"tokens": 0,
"total_price": 0,
"currency": "USD",
"total_segments": 0,
"preview": []
}
if not documents:
return response
data_process_rule = documents[0].dataset_process_rule
data_process_rule_dict = data_process_rule.to_dict()
info_list = []
for document in documents:
if document.indexing_status in ['completed', 'error']:
raise DocumentAlreadyFinishedError()
data_source_info = document.data_source_info_dict
# format document files info
if data_source_info and 'upload_file_id' in data_source_info:
file_id = data_source_info['upload_file_id']
info_list.append(file_id)
# format document notion info
elif data_source_info and 'notion_workspace_id' in data_source_info and 'notion_page_id' in data_source_info:
pages = []
page = {
'page_id': data_source_info['notion_page_id'],
'type': data_source_info['type']
}
pages.append(page)
notion_info = {
'workspace_id': data_source_info['notion_workspace_id'],
'pages': pages
}
info_list.append(notion_info)
if dataset.data_source_type == 'upload_file':
file_details = db.session.query(UploadFile).filter(
UploadFile.tenant_id == current_user.current_tenant_id,
UploadFile.id in info_list
).all()
if file_details is None:
raise NotFound("File not found.")
indexing_runner = IndexingRunner()
response = indexing_runner.file_indexing_estimate(file_details, data_process_rule_dict)
elif dataset.data_source_type:
indexing_runner = IndexingRunner()
response = indexing_runner.notion_indexing_estimate(info_list,
data_process_rule_dict)
else:
raise ValueError('Data source type not support')
return response
class DocumentBatchIndexingStatusApi(DocumentResource):
document_status_fields = {
'id': fields.String,
'indexing_status': fields.String,
'processing_started_at': TimestampField,
'parsing_completed_at': TimestampField,
'cleaning_completed_at': TimestampField,
'splitting_completed_at': TimestampField,
'completed_at': TimestampField,
'paused_at': TimestampField,
'error': fields.String,
'stopped_at': TimestampField,
'completed_segments': fields.Integer,
'total_segments': fields.Integer,
}
document_status_fields_list = {
'data': fields.List(fields.Nested(document_status_fields))
}
@setup_required
@login_required
@account_initialization_required
def get(self, dataset_id, batch):
dataset_id = str(dataset_id)
batch = str(batch)
documents = self.get_batch_documents(dataset_id, batch)
documents_status = []
for document in documents:
completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
DocumentSegment.document_id == str(document.id),
DocumentSegment.status != 're_segment').count()
total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
DocumentSegment.status != 're_segment').count()
document.completed_segments = completed_segments
document.total_segments = total_segments
documents_status.append(marshal(document, self.document_status_fields))
data = {
'data': documents_status
}
return data
class DocumentIndexingStatusApi(DocumentResource):
document_status_fields = {
'id': fields.String,
@@ -408,7 +581,7 @@ class DocumentDetailApi(DocumentResource):
'disabled_by': document.disabled_by,
'archived': document.archived,
'segment_count': document.segment_count,
'average_segment_length': document.average_segment_length,
'average_segment_length': document.average_segment_length,
'hit_count': document.hit_count,
'display_status': document.display_status
}
@@ -428,7 +601,7 @@ class DocumentDetailApi(DocumentResource):
'created_at': document.created_at.timestamp(),
'tokens': document.tokens,
'indexing_status': document.indexing_status,
'completed_at': int(document.completed_at.timestamp())if document.completed_at else None,
'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
'indexing_latency': document.indexing_latency,
'error': document.error,
@@ -579,6 +752,8 @@ class DocumentStatusApi(DocumentResource):
return {'result': 'success'}, 200
elif action == "disable":
if not document.completed_at or document.indexing_status != 'completed':
raise InvalidActionError('Document is not completed.')
if not document.enabled:
raise InvalidActionError('Document already disabled.')
@@ -678,6 +853,10 @@ api.add_resource(DatasetInitApi,
'/datasets/init')
api.add_resource(DocumentIndexingEstimateApi,
'/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
api.add_resource(DocumentBatchIndexingEstimateApi,
'/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
api.add_resource(DocumentBatchIndexingStatusApi,
'/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
api.add_resource(DocumentIndexingStatusApi,
'/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status')
api.add_resource(DocumentDetailApi,