Feature/mutil embedding model (#908)

Co-authored-by: JzoNg <jzongcode@gmail.com>
Co-authored-by: jyong <jyong@dify.ai>
Co-authored-by: StyleZhang <jasonapring2015@outlook.com>
This commit is contained in:
Jyong
2023-08-18 17:37:31 +08:00
committed by GitHub
parent 4420281d96
commit db7156dafd
54 changed files with 1704 additions and 278 deletions

View File

@@ -0,0 +1,95 @@
import datetime
import logging
import time
import uuid
from typing import Optional, List
import click
from celery import shared_task
from sqlalchemy import func
from werkzeug.exceptions import NotFound
from core.index.index import IndexBuilder
from core.indexing_runner import IndexingRunner
from core.model_providers.model_factory import ModelFactory
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from libs import helper
from models.dataset import DocumentSegment, Dataset, Document
@shared_task(queue='dataset')
def batch_create_segment_to_index_task(job_id: str, content: List, dataset_id: str, document_id: str,
                                       tenant_id: str, user_id: str):
    """
    Async batch create segments for a document and add them to the index.

    The outcome of the job is written to Redis under the key
    ``segment_batch_import_<job_id>`` as ``'completed'`` or ``'error'``
    (600 second expiry). Exceptions are logged and not re-raised.

    :param job_id: id of the import job; used for logging and for the Redis status key
    :param content: list of mappings, each with a ``'content'`` entry and, when the
        document's ``doc_form`` is ``'qa_model'``, an ``'answer'`` entry
    :param dataset_id: id of the dataset the segments are added to
    :param document_id: id of the document the segments belong to
    :param tenant_id: tenant id stored on every created segment
    :param user_id: id stored as ``created_by`` on every created segment

    Usage: batch_create_segment_to_index_task.delay(job_id, content, dataset_id, document_id,
                                                    tenant_id, user_id)
    """
    logging.info(click.style('Start batch create segment jobId: {}'.format(job_id), fg='green'))
    start_at = time.perf_counter()

    indexing_cache_key = 'segment_batch_import_{}'.format(job_id)

    try:
        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
        if not dataset:
            raise ValueError('Dataset not exist.')

        dataset_document = db.session.query(Document).filter(Document.id == document_id).first()
        if not dataset_document:
            raise ValueError('Document not exist.')

        if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != 'completed':
            raise ValueError('Document is not available.')

        # The embedding model and the starting position are the same for every
        # segment of the batch, so resolve them once instead of once per segment.
        # They are only resolved when there is something to import, so an empty
        # batch still completes without touching the model provider.
        embedding_model = None
        next_position = 1
        if content:
            embedding_model = ModelFactory.get_embedding_model(
                tenant_id=dataset.tenant_id,
                model_provider_name=dataset.embedding_model_provider,
                model_name=dataset.embedding_model
            )
            max_position = db.session.query(func.max(DocumentSegment.position)).filter(
                DocumentSegment.document_id == dataset_document.id
            ).scalar()
            if max_position:
                next_position = max_position + 1

        document_segments = []
        for segment in content:
            # Use a dedicated name: rebinding `content` here would shadow the
            # parameter that is being iterated.
            segment_content = segment['content']
            doc_id = str(uuid.uuid4())
            segment_hash = helper.generate_text_hash(segment_content)
            # calc embedding use tokens
            tokens = embedding_model.get_num_tokens(segment_content)

            segment_document = DocumentSegment(
                tenant_id=tenant_id,
                dataset_id=dataset_id,
                document_id=document_id,
                index_node_id=doc_id,
                index_node_hash=segment_hash,
                position=next_position,
                content=segment_content,
                word_count=len(segment_content),
                tokens=tokens,
                created_by=user_id,
                indexing_at=datetime.datetime.utcnow(),
                status='completed',
                completed_at=datetime.datetime.utcnow()
            )
            # Positions are handed out locally so consecutive segments get
            # consecutive values without depending on the pending rows being
            # flushed before the next lookup.
            next_position += 1

            if dataset_document.doc_form == 'qa_model':
                segment_document.answer = segment['answer']

            db.session.add(segment_document)
            document_segments.append(segment_document)

        # add index to db
        indexing_runner = IndexingRunner()
        indexing_runner.batch_add_segments(document_segments, dataset)
        db.session.commit()

        redis_client.setex(indexing_cache_key, 600, 'completed')
        end_at = time.perf_counter()
        logging.info(click.style('Segment batch created job: {} latency: {}'.format(job_id, end_at - start_at), fg='green'))
    except Exception as e:
        logging.exception("Segments batch created index failed:{}".format(str(e)))
        redis_client.setex(indexing_cache_key, 600, 'error')
        # Discard segments that were added to the session but never committed,
        # so a failed job does not leave them pending in the session.
        db.session.rollback()

View File

@@ -0,0 +1,58 @@
import logging
import time
import click
from celery import shared_task
from werkzeug.exceptions import NotFound
from core.index.index import IndexBuilder
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import DocumentSegment, Dataset, Document
@shared_task(queue='dataset')
def delete_segment_from_index_task(segment_id: str, index_node_id: str, dataset_id: str, document_id: str):
    """
    Async removal of a segment's node from the dataset indexes.

    Nothing is deleted when the dataset or the document cannot be found, or when
    the document is disabled, archived, or its indexing status is not
    ``'completed'``. Errors are logged and not re-raised. The Redis key
    ``segment_<segment_id>_delete_indexing`` is deleted on every exit path.

    :param segment_id: id of the segment; used for logging and for the Redis key
    :param index_node_id: id of the index node to delete from the indexes
    :param dataset_id: id of the dataset whose indexes are modified
    :param document_id: id of the document the segment belongs to

    Usage: delete_segment_from_index_task.delay(segment_id, index_node_id, dataset_id, document_id)
    """
    logging.info(click.style('Start delete segment from index: {}'.format(segment_id), fg='green'))
    began = time.perf_counter()

    cache_key = 'segment_{}_delete_indexing'.format(segment_id)

    try:
        owning_dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
        if not owning_dataset:
            message = 'Segment {} has no dataset, pass.'.format(segment_id)
            logging.info(click.style(message, fg='cyan'))
            return

        parent_document = db.session.query(Document).filter(Document.id == document_id).first()
        if not parent_document:
            message = 'Segment {} has no document, pass.'.format(segment_id)
            logging.info(click.style(message, fg='cyan'))
            return

        # Only documents that are enabled, not archived and fully indexed are touched.
        document_usable = (
            parent_document.enabled
            and not parent_document.archived
            and parent_document.indexing_status == 'completed'
        )
        if not document_usable:
            message = 'Segment {} document status is invalid, pass.'.format(segment_id)
            logging.info(click.style(message, fg='cyan'))
            return

        # Resolve both indexes before deleting from either of them.
        vector_index = IndexBuilder.get_index(owning_dataset, 'high_quality')
        keyword_index = IndexBuilder.get_index(owning_dataset, 'economy')

        node_ids = [index_node_id]

        # delete from vector index (skipped when no vector index is returned)
        if vector_index:
            vector_index.delete_by_ids(node_ids)

        # delete from keyword index
        keyword_index.delete_by_ids(node_ids)

        elapsed = time.perf_counter() - began
        logging.info(click.style('Segment deleted from index: {} latency: {}'.format(segment_id, elapsed), fg='green'))
    except Exception:
        logging.exception("delete segment from index failed")
    finally:
        # Runs for the early returns above as well as for success and failure.
        redis_client.delete(cache_key)

View File

@@ -12,14 +12,14 @@ from models.dataset import DocumentSegment
@shared_task(queue='dataset')
def remove_segment_from_index_task(segment_id: str):
def disable_segment_from_index_task(segment_id: str):
"""
Async Remove segment from index
Async disable segment from index
:param segment_id:
Usage: remove_segment_from_index.delay(segment_id)
Usage: disable_segment_from_index_task.delay(segment_id)
"""
logging.info(click.style('Start remove segment from index: {}'.format(segment_id), fg='green'))
logging.info(click.style('Start disable segment from index: {}'.format(segment_id), fg='green'))
start_at = time.perf_counter()
segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()

View File

@@ -52,17 +52,6 @@ def update_segment_keyword_index_task(segment_id: str):
# delete from keyword index
kw_index.delete_by_ids([segment.index_node_id])
# add new index
document = Document(
page_content=segment.content,
metadata={
"doc_id": segment.index_node_id,
"doc_hash": segment.index_node_hash,
"document_id": segment.document_id,
"dataset_id": segment.dataset_id,
}
)
# save keyword index
index = IndexBuilder.get_index(dataset, 'economy')
if index: