Feat/dify rag (#2528)

Co-authored-by: jyong <jyong@dify.ai>
2025-12-10 11:26:52 +08:00 · 2024-02-22 23:31:57 +08:00
parent 97fe817186
commit 6c4e6bf1d6
119 changed files with 3181 additions and 5892 deletions
--- a/api/core/tool/web_reader_tool.py
+++ b/api/core/tool/web_reader_tool.py
@@ -21,9 +21,9 @@ from pydantic import BaseModel, Field
 from regex import regex

 from core.chain.llm_chain import LLMChain
-from core.data_loader import file_extractor
-from core.data_loader.file_extractor import FileExtractor
 from core.entities.application_entities import ModelConfigEntity
+from core.rag.extractor import extract_processor
+from core.rag.extractor.extract_processor import ExtractProcessor

 FULL_TEMPLATE = """
 TITLE: {title}
@@ -146,7 +146,7 @@ def get_url(url: str) -> str:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
-    supported_content_types = file_extractor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
+    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]

    head_response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

@@ -158,8 +158,8 @@ def get_url(url: str) -> str:
    if main_content_type not in supported_content_types:
        return "Unsupported content-type [{}] of URL.".format(main_content_type)

-    if main_content_type in file_extractor.SUPPORT_URL_CONTENT_TYPES:
-        return FileExtractor.load_from_url(url, return_text=True)
+    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+        return ExtractProcessor.load_from_url(url, return_text=True)

    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))
    a = extract_using_readabilipy(response.text)