mirror of
http://112.124.100.131/huang.ze/ebiz-dify-ai.git
synced 2025-12-09 02:46:52 +08:00
Feat/change split length method (#18097)
Co-authored-by: JzoNg <jzongcode@gmail.com>
This commit is contained in:
@@ -39,6 +39,12 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
|
||||
else:
|
||||
return [GPT2Tokenizer.get_num_tokens(text) for text in texts]
|
||||
|
||||
def _character_encoder(texts: list[str]) -> list[int]:
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
return [len(text) for text in texts]
|
||||
|
||||
if issubclass(cls, TokenTextSplitter):
|
||||
extra_kwargs = {
|
||||
"model_name": embedding_model_instance.model if embedding_model_instance else "gpt2",
|
||||
@@ -47,7 +53,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
|
||||
}
|
||||
kwargs = {**kwargs, **extra_kwargs}
|
||||
|
||||
return cls(length_function=_token_encoder, **kwargs)
|
||||
return cls(length_function=_character_encoder, **kwargs)
|
||||
|
||||
|
||||
class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
|
||||
@@ -103,7 +109,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
||||
_good_splits_lengths = [] # cache the lengths of the splits
|
||||
_separator = "" if self._keep_separator else separator
|
||||
s_lens = self._length_function(splits)
|
||||
if _separator != "":
|
||||
if separator != "":
|
||||
for s, s_len in zip(splits, s_lens):
|
||||
if s_len < self._chunk_size:
|
||||
_good_splits.append(s)
|
||||
|
||||
@@ -553,7 +553,7 @@ class DocumentService:
|
||||
{"id": "remove_extra_spaces", "enabled": True},
|
||||
{"id": "remove_urls_emails", "enabled": False},
|
||||
],
|
||||
"segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
|
||||
"segmentation": {"delimiter": "\n", "max_tokens": 1024, "chunk_overlap": 50},
|
||||
},
|
||||
"limits": {
|
||||
"indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,
|
||||
|
||||
Reference in New Issue
Block a user