fix unstructured extractor settings: make api_url and api_key optional (#12116)

2025-12-09 02:46:52 +08:00 · 2024-12-26 12:08:36 +08:00
parent 49feff082f
commit 811e4bd0cf
9 changed files with 17 additions and 15 deletions
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -1,5 +1,6 @@
 import base64
 import logging
+from typing import Optional

 from bs4 import BeautifulSoup  # type: ignore

@@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
        self,
        file_path: str,
        api_url: Optional[str] = None,
-        api_key: Optional[str] = None,
+        api_key: str = "",
    ):
        """Initialize with file path."""
        self._file_path = file_path
@@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor):
        if self._api_url:
            from unstructured.partition.api import partition_via_api

-            if self._api_key is None:
-                raise ValueError("api_key is required")
-
            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.epub import partition_epub
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
            if the specified encoding fails.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url