Sync INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH between API and Web (#11230)

2025-12-11 03:46:52 +08:00 · 2024-12-02 16:29:25 +09:00
parent f8c966c39c
commit 1d8385f7ac
29 changed files with 51 additions and 40 deletions
--- a/api/.env.example
+++ b/api/.env.example
@@ -383,7 +383,7 @@ LOG_DATEFORMAT=%Y-%m-%d %H:%M:%S
 LOG_TZ=UTC

 # Indexing configuration
-INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000
+INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000

 # Workflow runtime configuration
 WORKFLOW_MAX_EXECUTION_STEPS=500
@@ -413,4 +413,4 @@ RESET_PASSWORD_TOKEN_EXPIRY_MINUTES=5

 CREATE_TIDB_SERVICE_JOB_ENABLED=false

-RETRIEVAL_TOP_N=0
+RETRIEVAL_TOP_N=0
--- a/api/configs/feature/__init__.py
+++ b/api/configs/feature/__init__.py
@@ -647,7 +647,7 @@ class IndexingConfig(BaseSettings):

    INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: PositiveInt = Field(
        description="Maximum token length for text segmentation during indexing",
-        default=1000,
+        default=4000,
    )


--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@@ -106,6 +106,7 @@ class GetProcessRuleApi(Resource):
        # get default rules
        mode = DocumentService.DEFAULT_RULES["mode"]
        rules = DocumentService.DEFAULT_RULES["rules"]
+        limits = DocumentService.DEFAULT_RULES["limits"]
        if document_id:
            # get the latest process rule
            document = Document.query.get_or_404(document_id)
@@ -132,7 +133,7 @@ class GetProcessRuleApi(Resource):
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

-        return {"mode": mode, "rules": rules}
+        return {"mode": mode, "rules": rules, "limits": limits}


 class DatasetDocumentListApi(Resource):
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -406,6 +406,9 @@ class DocumentService:
            ],
            "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
        },
+        "limits": {
+            "indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,
+        },
    }

    DOCUMENT_METADATA_SCHEMA = {