mirror of http://112.124.100.131/huang.ze/ebiz-dify-ai.git (synced 2025-12-10 03:16:51 +08:00)
Feat/support parent child chunk (#12092)
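The hunks below repeat one pattern: when a document uses the parent-child index form (IndexType.PARENT_CHILD_INDEX), each segment's child chunks are wrapped in ChildDocument objects and attached to the parent Document before the index processor loads them. A minimal sketch of that structure, built only from the fields visible in the diff; the simplified dataclasses are illustrative stand-ins for core.rag.models.document, not the real models:

from dataclasses import dataclass
from typing import Optional

# Illustrative stand-ins for core.rag.models.document.ChildDocument / Document.
@dataclass
class ChildDocument:
    page_content: str
    metadata: dict

@dataclass
class Document:
    page_content: str
    metadata: dict
    children: Optional[list[ChildDocument]] = None

def build_parent_with_children(segment, child_chunks) -> Document:
    """Mirror the pattern used in the tasks below: one parent Document per
    segment, with each child chunk carried as a ChildDocument in .children."""
    document = Document(
        page_content=segment.content,
        metadata={
            "doc_id": segment.index_node_id,
            "doc_hash": segment.index_node_hash,
            "document_id": segment.document_id,
            "dataset_id": segment.dataset_id,
        },
    )
    document.children = [
        ChildDocument(
            page_content=chunk.content,
            metadata={
                "doc_id": chunk.index_node_id,
                "doc_hash": chunk.index_node_hash,
                "document_id": segment.document_id,
                "dataset_id": segment.dataset_id,
            },
        )
        for chunk in child_chunks
    ]
    return document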
api/tasks/add_document_to_index_task.py
@@ -6,12 +6,13 @@ import click
from celery import shared_task  # type: ignore
from werkzeug.exceptions import NotFound

from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import Document
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import DatasetAutoDisableLog, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.dataset import DocumentSegment


@shared_task(queue="dataset")
@@ -53,7 +54,22 @@ def add_document_to_index_task(dataset_document_id: str):
                    "dataset_id": segment.dataset_id,
                },
            )

            if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                child_chunks = segment.child_chunks
                if child_chunks:
                    child_documents = []
                    for child_chunk in child_chunks:
                        child_document = ChildDocument(
                            page_content=child_chunk.content,
                            metadata={
                                "doc_id": child_chunk.index_node_id,
                                "doc_hash": child_chunk.index_node_hash,
                                "document_id": segment.document_id,
                                "dataset_id": segment.dataset_id,
                            },
                        )
                        child_documents.append(child_document)
                    document.children = child_documents
            documents.append(document)

        dataset = dataset_document.dataset
@@ -65,6 +81,12 @@ def add_document_to_index_task(dataset_document_id: str):
        index_processor = IndexProcessorFactory(index_type).init_index_processor()
        index_processor.load(dataset, documents)

        # delete auto disable log
        db.session.query(DatasetAutoDisableLog).filter(
            DatasetAutoDisableLog.document_id == dataset_document.id
        ).delete()
        db.session.commit()

        end_at = time.perf_counter()
        logging.info(
            click.style(

api/tasks/batch_clean_document_task.py (new file, 75 lines)
@@ -0,0 +1,75 @@
import logging
import time

import click
from celery import shared_task

from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.tools.utils.web_reader_tool import get_image_upload_file_ids
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, DocumentSegment
from models.model import UploadFile


@shared_task(queue="dataset")
def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form: str, file_ids: list[str]):
    """
    Clean documents when documents are deleted.
    :param document_ids: document ids
    :param dataset_id: dataset id
    :param doc_form: doc_form
    :param file_ids: file ids

    Usage: batch_clean_document_task.delay(document_ids, dataset_id, doc_form, file_ids)
    """
    logging.info(click.style("Start batch clean documents when documents deleted", fg="green"))
    start_at = time.perf_counter()

    try:
        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()

        if not dataset:
            raise Exception("Document has no dataset")

        segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id.in_(document_ids)).all()
        # check if segments exist
        if segments:
            index_node_ids = [segment.index_node_id for segment in segments]
            index_processor = IndexProcessorFactory(doc_form).init_index_processor()
            index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

            for segment in segments:
                image_upload_file_ids = get_image_upload_file_ids(segment.content)
                for upload_file_id in image_upload_file_ids:
                    image_file = db.session.query(UploadFile).filter(UploadFile.id == upload_file_id).first()
                    try:
                        storage.delete(image_file.key)
                    except Exception:
                        logging.exception(
                            "Delete image_files failed when storage deleted, \
                            image_upload_file_id: {}".format(upload_file_id)
                        )
                    db.session.delete(image_file)
                db.session.delete(segment)

            db.session.commit()
        if file_ids:
            files = db.session.query(UploadFile).filter(UploadFile.id.in_(file_ids)).all()
            for file in files:
                try:
                    storage.delete(file.key)
                except Exception:
                    logging.exception("Delete file failed when document deleted, file_id: {}".format(file.id))
                db.session.delete(file)
            db.session.commit()

        end_at = time.perf_counter()
        logging.info(
            click.style(
                "Cleaned documents when documents deleted latency: {}".format(end_at - start_at),
                fg="green",
            )
        )
    except Exception:
        logging.exception("Cleaned documents when documents deleted failed")
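For reference, a hedged dispatch sketch for the new batch task; the ids below are placeholders and the import path assumes the api package layout shown above (api/tasks/batch_clean_document_task.py):

# Hypothetical call site; ids are placeholders, not values from the commit.
from tasks.batch_clean_document_task import batch_clean_document_task

document_ids = ["document-id-1", "document-id-2"]  # documents being deleted
file_ids = ["upload-file-id-1"]                    # their uploaded source files
dataset_id = "dataset-id"
doc_form = "text_model"                            # placeholder for the documents' doc_form value

# Enqueued on the "dataset" Celery queue; segments, vectors, and stored files are cleaned asynchronously.
batch_clean_document_task.delay(document_ids, dataset_id, doc_form, file_ids)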

api/tasks/batch_create_segment_to_index_task.py
@@ -7,13 +7,13 @@ import click
from celery import shared_task  # type: ignore
from sqlalchemy import func

from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from libs import helper
from models.dataset import Dataset, Document, DocumentSegment
from services.vector_service import VectorService


@shared_task(queue="dataset")
@@ -96,8 +96,7 @@ def batch_create_segment_to_index_task(
        dataset_document.word_count += word_count_change
        db.session.add(dataset_document)
        # add index to db
        indexing_runner = IndexingRunner()
        indexing_runner.batch_add_segments(document_segments, dataset)
        VectorService.create_segments_vector(None, document_segments, dataset, dataset_document.doc_form)
        db.session.commit()
        redis_client.setex(indexing_cache_key, 600, "completed")
        end_at = time.perf_counter()

api/tasks/clean_dataset_task.py
@@ -62,7 +62,7 @@ def clean_dataset_task(
        if doc_form is None:
            raise ValueError("Index type must be specified.")
        index_processor = IndexProcessorFactory(doc_form).init_index_processor()
        index_processor.clean(dataset, None)
        index_processor.clean(dataset, None, with_keywords=True, delete_child_chunks=True)

        for document in documents:
            db.session.delete(document)
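Most call-site changes in this commit are the same one-line edit: index_processor.clean() gains explicit with_keywords and delete_child_chunks arguments, so each task can state whether keyword entries and parent-child child chunks should be purged along with the listed nodes. A hedged sketch of the interface those call sites imply; the class name, defaults, and docstring are assumptions based on the call sites, not code from the commit:

from abc import ABC, abstractmethod
from typing import Optional


class BaseIndexProcessor(ABC):
    """Assumed shape of the index-processor interface implied by the call sites in this diff."""

    @abstractmethod
    def clean(
        self,
        dataset,
        node_ids: Optional[list[str]],
        with_keywords: bool = True,
        delete_child_chunks: bool = True,
    ) -> None:
        """Remove nodes from the index.

        node_ids=None            -> wipe the whole dataset index
        with_keywords=True       -> also drop keyword-table entries
        delete_child_chunks=True -> for parent-child datasets, purge the child
                                    chunks hanging off the listed parent nodes
        """
        raise NotImplementedError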

api/tasks/clean_document_task.py
@@ -38,7 +38,7 @@ def clean_document_task(document_id: str, dataset_id: str, doc_form: str, file_i
        if segments:
            index_node_ids = [segment.index_node_id for segment in segments]
            index_processor = IndexProcessorFactory(doc_form).init_index_processor()
            index_processor.clean(dataset, index_node_ids)
            index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

            for segment in segments:
                image_upload_file_ids = get_image_upload_file_ids(segment.content)

api/tasks/clean_notion_document_task.py
@@ -37,7 +37,7 @@ def clean_notion_document_task(document_ids: list[str], dataset_id: str):
            segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
            index_node_ids = [segment.index_node_id for segment in segments]

            index_processor.clean(dataset, index_node_ids)
            index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

            for segment in segments:
                db.session.delete(segment)

api/tasks/deal_dataset_vector_index_task.py
@@ -4,8 +4,9 @@ import time
import click
from celery import shared_task  # type: ignore

from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import Document
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
@@ -105,7 +106,7 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str):
                db.session.commit()

                # clean index
                index_processor.clean(dataset, None, with_keywords=False)
                index_processor.clean(dataset, None, with_keywords=False, delete_child_chunks=False)

                for dataset_document in dataset_documents:
                    # update from vector index
@@ -128,7 +129,22 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str):
                                    "dataset_id": segment.dataset_id,
                                },
                            )

                            if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                                child_chunks = segment.child_chunks
                                if child_chunks:
                                    child_documents = []
                                    for child_chunk in child_chunks:
                                        child_document = ChildDocument(
                                            page_content=child_chunk.content,
                                            metadata={
                                                "doc_id": child_chunk.index_node_id,
                                                "doc_hash": child_chunk.index_node_hash,
                                                "document_id": segment.document_id,
                                                "dataset_id": segment.dataset_id,
                                            },
                                        )
                                        child_documents.append(child_document)
                                    document.children = child_documents
                            documents.append(document)
                        # save vector index
                        index_processor.load(dataset, documents, with_keywords=False)

api/tasks/delete_segment_from_index_task.py
@@ -6,48 +6,38 @@ from celery import shared_task  # type: ignore

from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, Document


@shared_task(queue="dataset")
def delete_segment_from_index_task(segment_id: str, index_node_id: str, dataset_id: str, document_id: str):
def delete_segment_from_index_task(index_node_ids: list, dataset_id: str, document_id: str):
    """
    Async Remove segment from index
    :param segment_id:
    :param index_node_id:
    :param index_node_ids:
    :param dataset_id:
    :param document_id:

    Usage: delete_segment_from_index_task.delay(segment_id)
    Usage: delete_segment_from_index_task.delay(segment_ids)
    """
    logging.info(click.style("Start delete segment from index: {}".format(segment_id), fg="green"))
    logging.info(click.style("Start delete segment from index", fg="green"))
    start_at = time.perf_counter()
    indexing_cache_key = "segment_{}_delete_indexing".format(segment_id)
    try:
        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
        if not dataset:
            logging.info(click.style("Segment {} has no dataset, pass.".format(segment_id), fg="cyan"))
            return

        dataset_document = db.session.query(Document).filter(Document.id == document_id).first()
        if not dataset_document:
            logging.info(click.style("Segment {} has no document, pass.".format(segment_id), fg="cyan"))
            return

        if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
            logging.info(click.style("Segment {} document status is invalid, pass.".format(segment_id), fg="cyan"))
            return

        index_type = dataset_document.doc_form
        index_processor = IndexProcessorFactory(index_type).init_index_processor()
        index_processor.clean(dataset, [index_node_id])
        index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

        end_at = time.perf_counter()
        logging.info(
            click.style("Segment deleted from index: {} latency: {}".format(segment_id, end_at - start_at), fg="green")
        )
        logging.info(click.style("Segment deleted from index latency: {}".format(end_at - start_at), fg="green"))
    except Exception:
        logging.exception("delete segment from index failed")
    finally:
        redis_client.delete(indexing_cache_key)
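The hunk above changes the task signature from a single segment_id/index_node_id pair to a list of index node ids (and drops the per-segment Redis cache key). A hedged call-site sketch; the ids are placeholders and the import path assumes the api package layout:

# Hypothetical call site; ids are placeholders, not values from the commit.
from tasks.delete_segment_from_index_task import delete_segment_from_index_task

# index_node_id of every segment being removed, gathered by the caller.
index_node_ids = ["node-id-1", "node-id-2", "node-id-3"]
dataset_id = "dataset-id"
document_id = "document-id"

# One task invocation now cleans all listed nodes; the task itself calls
# index_processor.clean(..., with_keywords=True, delete_child_chunks=True).
delete_segment_from_index_task.delay(index_node_ids, dataset_id, document_id)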

api/tasks/disable_segments_from_index_task.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import logging
import time

import click
from celery import shared_task

from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument


@shared_task(queue="dataset")
def disable_segments_from_index_task(segment_ids: list, dataset_id: str, document_id: str):
    """
    Async disable segments from index
    :param segment_ids:

    Usage: disable_segments_from_index_task.delay(segment_ids, dataset_id, document_id)
    """
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        logging.info(click.style("Dataset {} not found, pass.".format(dataset_id), fg="cyan"))
        return

    dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first()

    if not dataset_document:
        logging.info(click.style("Document {} not found, pass.".format(document_id), fg="cyan"))
        return
    if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
        logging.info(click.style("Document {} status is invalid, pass.".format(document_id), fg="cyan"))
        return
    # sync index processor
    index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()

    segments = (
        db.session.query(DocumentSegment)
        .filter(
            DocumentSegment.id.in_(segment_ids),
            DocumentSegment.dataset_id == dataset_id,
            DocumentSegment.document_id == document_id,
        )
        .all()
    )

    if not segments:
        return

    try:
        index_node_ids = [segment.index_node_id for segment in segments]
        index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=False)

        end_at = time.perf_counter()
        logging.info(click.style("Segments removed from index latency: {}".format(end_at - start_at), fg="green"))
    except Exception:
        # roll the segments back to enabled if index cleanup failed
        db.session.query(DocumentSegment).filter(
            DocumentSegment.id.in_(segment_ids),
            DocumentSegment.dataset_id == dataset_id,
            DocumentSegment.document_id == document_id,
        ).update(
            {
                "disabled_at": None,
                "disabled_by": None,
                "enabled": True,
            }
        )
        db.session.commit()
    finally:
        for segment in segments:
            indexing_cache_key = "segment_{}_indexing".format(segment.id)
            redis_client.delete(indexing_cache_key)
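Both of the new bulk tasks clear a per-segment Redis key in their finally block, which suggests the caller marks segments as "indexing in progress" before dispatch. A hedged sketch of that convention; the key format is taken from the finally block above, while the 600-second TTL and the value are assumptions borrowed from the existing single-segment tasks:

# Hypothetical call site; ids are placeholders, not values from the commit.
from extensions.ext_redis import redis_client
from tasks.disable_segments_from_index_task import disable_segments_from_index_task

segment_ids = ["segment-id-1", "segment-id-2"]
dataset_id = "dataset-id"
document_id = "document-id"

for segment_id in segment_ids:
    # Same key format the task deletes in its finally block.
    redis_client.setex("segment_{}_indexing".format(segment_id), 600, 1)

disable_segments_from_index_task.delay(segment_ids, dataset_id, document_id)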

api/tasks/document_indexing_sync_task.py
@@ -82,7 +82,7 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
            index_node_ids = [segment.index_node_id for segment in segments]

            # delete from vector index
            index_processor.clean(dataset, index_node_ids)
            index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

            for segment in segments:
                db.session.delete(segment)

api/tasks/document_indexing_update_task.py
@@ -47,7 +47,7 @@ def document_indexing_update_task(dataset_id: str, document_id: str):
            index_node_ids = [segment.index_node_id for segment in segments]

            # delete from vector index
            index_processor.clean(dataset, index_node_ids)
            index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

            for segment in segments:
                db.session.delete(segment)

api/tasks/duplicate_document_indexing_task.py
@@ -51,7 +51,7 @@ def duplicate_document_indexing_task(dataset_id: str, document_ids: list):
            if document:
                document.indexing_status = "error"
                document.error = str(e)
                document.stopped_at = datetime.datetime.utcnow()
                document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
                db.session.add(document)
        db.session.commit()
        return
@@ -73,14 +73,14 @@ def duplicate_document_indexing_task(dataset_id: str, document_ids: list):
                    index_node_ids = [segment.index_node_id for segment in segments]

                    # delete from vector index
                    index_processor.clean(dataset, index_node_ids)
                    index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

                    for segment in segments:
                        db.session.delete(segment)
                    db.session.commit()

                document.indexing_status = "parsing"
                document.processing_started_at = datetime.datetime.utcnow()
                document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
                documents.append(document)
                db.session.add(document)
        db.session.commit()

api/tasks/enable_segment_to_index_task.py
@@ -6,8 +6,9 @@ import click
from celery import shared_task  # type: ignore
from werkzeug.exceptions import NotFound

from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import Document
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import DocumentSegment
@@ -61,6 +62,22 @@ def enable_segment_to_index_task(segment_id: str):
            return

        index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()
        if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
            child_chunks = segment.child_chunks
            if child_chunks:
                child_documents = []
                for child_chunk in child_chunks:
                    child_document = ChildDocument(
                        page_content=child_chunk.content,
                        metadata={
                            "doc_id": child_chunk.index_node_id,
                            "doc_hash": child_chunk.index_node_hash,
                            "document_id": segment.document_id,
                            "dataset_id": segment.dataset_id,
                        },
                    )
                    child_documents.append(child_document)
                document.children = child_documents
        # save vector index
        index_processor.load(dataset, [document])

api/tasks/enable_segments_to_index_task.py (new file, 108 lines)
@@ -0,0 +1,108 @@
import datetime
import logging
import time

import click
from celery import shared_task

from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument


@shared_task(queue="dataset")
def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_id: str):
    """
    Async enable segments to index
    :param segment_ids:

    Usage: enable_segments_to_index_task.delay(segment_ids)
    """
    start_at = time.perf_counter()
    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        logging.info(click.style("Dataset {} not found, pass.".format(dataset_id), fg="cyan"))
        return

    dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first()

    if not dataset_document:
        logging.info(click.style("Document {} not found, pass.".format(document_id), fg="cyan"))
        return
    if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
        logging.info(click.style("Document {} status is invalid, pass.".format(document_id), fg="cyan"))
        return
    # sync index processor
    index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()

    segments = (
        db.session.query(DocumentSegment)
        .filter(
            DocumentSegment.id.in_(segment_ids),
            DocumentSegment.dataset_id == dataset_id,
            DocumentSegment.document_id == document_id,
        )
        .all()
    )
    if not segments:
        return

    try:
        documents = []
        for segment in segments:
            document = Document(
                page_content=segment.content,
                metadata={
                    "doc_id": segment.index_node_id,
                    "doc_hash": segment.index_node_hash,
                    "document_id": document_id,
                    "dataset_id": dataset_id,
                },
            )

            if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                child_chunks = segment.child_chunks
                if child_chunks:
                    child_documents = []
                    for child_chunk in child_chunks:
                        child_document = ChildDocument(
                            page_content=child_chunk.content,
                            metadata={
                                "doc_id": child_chunk.index_node_id,
                                "doc_hash": child_chunk.index_node_hash,
                                "document_id": document_id,
                                "dataset_id": dataset_id,
                            },
                        )
                        child_documents.append(child_document)
                    document.children = child_documents
            documents.append(document)
        # save vector index
        index_processor.load(dataset, documents)

        end_at = time.perf_counter()
        logging.info(click.style("Segments enabled to index latency: {}".format(end_at - start_at), fg="green"))
    except Exception as e:
        logging.exception("enable segments to index failed")
        # update segment error msg
        db.session.query(DocumentSegment).filter(
            DocumentSegment.id.in_(segment_ids),
            DocumentSegment.dataset_id == dataset_id,
            DocumentSegment.document_id == document_id,
        ).update(
            {
                "error": str(e),
                "status": "error",
                "disabled_at": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
                "enabled": False,
            }
        )
        db.session.commit()
    finally:
        for segment in segments:
            indexing_cache_key = "segment_{}_indexing".format(segment.id)
            redis_client.delete(indexing_cache_key)
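A hedged sketch of the disable/enable round trip the two new tasks support; again the ids are placeholders and the import paths assume the api package layout:

# Hypothetical round trip; ids are placeholders, not values from the commit.
from tasks.disable_segments_from_index_task import disable_segments_from_index_task
from tasks.enable_segments_to_index_task import enable_segments_to_index_task

segment_ids = ["segment-id-1", "segment-id-2"]
dataset_id = "dataset-id"
document_id = "document-id"

# Take the segments out of the index
# (the task calls index_processor.clean(..., with_keywords=True, delete_child_chunks=False))...
disable_segments_from_index_task.delay(segment_ids, dataset_id, document_id)

# ...and later rebuild their parent Documents (plus ChildDocuments for
# parent-child datasets) and load them back into the vector index.
enable_segments_to_index_task.delay(segment_ids, dataset_id, document_id)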

api/tasks/remove_document_from_index_task.py
@@ -43,7 +43,7 @@ def remove_document_from_index_task(document_id: str):
        index_node_ids = [segment.index_node_id for segment in segments]
        if index_node_ids:
            try:
                index_processor.clean(dataset, index_node_ids)
                index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=False)
            except Exception:
                logging.exception(f"clean dataset {dataset.id} from index failed")

api/tasks/retry_document_indexing_task.py
@@ -48,7 +48,7 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str]):
            if document:
                document.indexing_status = "error"
                document.error = str(e)
                document.stopped_at = datetime.datetime.utcnow()
                document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
                db.session.add(document)
                db.session.commit()
            redis_client.delete(retry_indexing_cache_key)
@@ -69,14 +69,14 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str]):
            if segments:
                index_node_ids = [segment.index_node_id for segment in segments]
                # delete from vector index
                index_processor.clean(dataset, index_node_ids)
                index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

                for segment in segments:
                    db.session.delete(segment)
                db.session.commit()
                for segment in segments:
                    db.session.delete(segment)
                db.session.commit()

            document.indexing_status = "parsing"
            document.processing_started_at = datetime.datetime.utcnow()
            document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
            db.session.add(document)
            db.session.commit()
@@ -86,7 +86,7 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str]):
        except Exception as ex:
            document.indexing_status = "error"
            document.error = str(ex)
            document.stopped_at = datetime.datetime.utcnow()
            document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
            db.session.add(document)
            db.session.commit()
            logging.info(click.style(str(ex), fg="yellow"))

api/tasks/sync_website_document_indexing_task.py
@@ -46,7 +46,7 @@ def sync_website_document_indexing_task(dataset_id: str, document_id: str):
        if document:
            document.indexing_status = "error"
            document.error = str(e)
            document.stopped_at = datetime.datetime.utcnow()
            document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
            db.session.add(document)
            db.session.commit()
        redis_client.delete(sync_indexing_cache_key)
@@ -65,14 +65,14 @@ def sync_website_document_indexing_task(dataset_id: str, document_id: str):
    if segments:
        index_node_ids = [segment.index_node_id for segment in segments]
        # delete from vector index
        index_processor.clean(dataset, index_node_ids)
        index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

        for segment in segments:
            db.session.delete(segment)
        db.session.commit()
        for segment in segments:
            db.session.delete(segment)
        db.session.commit()

    document.indexing_status = "parsing"
    document.processing_started_at = datetime.datetime.utcnow()
    document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    db.session.add(document)
    db.session.commit()
@@ -82,7 +82,7 @@ def sync_website_document_indexing_task(dataset_id: str, document_id: str):
    except Exception as ex:
        document.indexing_status = "error"
        document.error = str(ex)
        document.stopped_at = datetime.datetime.utcnow()
        document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        db.session.add(document)
        db.session.commit()
        logging.info(click.style(str(ex), fg="yellow"))
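Several hunks above also make the same timestamp change: datetime.datetime.utcnow(), deprecated since Python 3.12, is replaced with a timezone-aware now() that is stripped back to a naive value, so the stored columns keep their existing naive-UTC convention. A small self-contained comparison:

import datetime

# Old style: naive UTC timestamp, deprecated in Python 3.12.
legacy = datetime.datetime.utcnow()

# New style used in the diff: aware UTC time, made naive again for the existing columns.
replacement = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

# Both are naive datetimes representing UTC "now"; they differ only by call latency.
assert legacy.tzinfo is None and replacement.tzinfo is None
print(abs((replacement - legacy).total_seconds()) < 1.0)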