install pandoc (#16825)

2025-12-09 02:46:52 +08:00 · 2025-03-26 22:34:10 +08:00
parent 91db2207b3
commit 30792a1e1a
4 changed files with 66 additions and 52 deletions
--- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -1,6 +1,8 @@
 import logging
 from typing import Optional

+import pypandoc  # type: ignore
+
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document

@@ -34,6 +36,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
        else:
            from unstructured.partition.epub import partition_epub

+            pypandoc.download_pandoc()
            elements = partition_epub(filename=self._file_path, xml_keep_tags=True)

        from unstructured.chunking.title import chunk_by_title
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -9,6 +9,7 @@ from typing import Any, cast

 import docx
 import pandas as pd
+import pypandoc  # type: ignore
 import pypdfium2  # type: ignore
 import yaml  # type: ignore
 from docx.document import Document
@@ -369,7 +370,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
    from unstructured.partition.ppt import partition_ppt

    try:
-        if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
+        if dify_config.UNSTRUCTURED_API_URL:
            with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file:
                temp_file.write(file_content)
                temp_file.flush()
@@ -378,7 +379,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
                        file=file,
                        metadata_filename=temp_file.name,
                        api_url=dify_config.UNSTRUCTURED_API_URL,
-                        api_key=dify_config.UNSTRUCTURED_API_KEY,
+                        api_key=dify_config.UNSTRUCTURED_API_KEY,  # type: ignore
                    )
                os.unlink(temp_file.name)
        else:
@@ -395,7 +396,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
    from unstructured.partition.pptx import partition_pptx

    try:
-        if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
+        if dify_config.UNSTRUCTURED_API_URL:
            with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
                temp_file.write(file_content)
                temp_file.flush()
@@ -404,7 +405,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
                        file=file,
                        metadata_filename=temp_file.name,
                        api_url=dify_config.UNSTRUCTURED_API_URL,
-                        api_key=dify_config.UNSTRUCTURED_API_KEY,
+                        api_key=dify_config.UNSTRUCTURED_API_KEY,  # type: ignore
                    )
                os.unlink(temp_file.name)
        else:
@@ -416,11 +417,39 @@ def _extract_text_from_pptx(file_content: bytes) -> str:


 def _extract_text_from_epub(file_content: bytes) -> str:
+    """Extract plain text from the raw bytes of an EPUB file.
+
+    When ``dify_config.UNSTRUCTURED_API_URL`` is set the content is sent to
+    the Unstructured API; otherwise it is partitioned locally after making
+    sure a pandoc binary is available.
+
+    Raises:
+        TextExtractionError: if partitioning fails for any reason.
+    """
+    from unstructured.partition.api import partition_via_api
    from unstructured.partition.epub import partition_epub

    try:
-        with io.BytesIO(file_content) as file:
-            elements = partition_epub(file=file)
+        if dify_config.UNSTRUCTURED_API_URL:
+            # Close the temp file before reopening it by name: a second open
+            # handle on a still-open NamedTemporaryFile is not portable.
+            with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as temp_file:
+                temp_file.write(file_content)
+            try:
+                with open(temp_file.name, "rb") as file:
+                    elements = partition_via_api(
+                        file=file,
+                        metadata_filename=temp_file.name,
+                        api_url=dify_config.UNSTRUCTURED_API_URL,
+                        api_key=dify_config.UNSTRUCTURED_API_KEY,  # type: ignore
+                    )
+            finally:
+                # delete=False means cleanup is ours; do it even if the API call fails.
+                os.unlink(temp_file.name)
+        else:
+            # Downloads pandoc only when no usable binary is already installed,
+            # instead of re-running the installer on every extraction.
+            pypandoc.ensure_pandoc_installed()
+            with io.BytesIO(file_content) as file:
+                elements = partition_epub(file=file)
        return "\n".join([str(element) for element in elements])
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e