install pandoc (#16825)

This commit is contained in:
Jyong
2025-03-26 22:34:10 +08:00
committed by GitHub
parent 91db2207b3
commit 30792a1e1a
4 changed files with 66 additions and 52 deletions

View File

@@ -1,6 +1,8 @@
import logging
from typing import Optional
import pypandoc # type: ignore
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -34,6 +36,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
else:
from unstructured.partition.epub import partition_epub
pypandoc.download_pandoc()
elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title

View File

@@ -9,6 +9,7 @@ from typing import Any, cast
import docx
import pandas as pd
import pypandoc # type: ignore
import pypdfium2 # type: ignore
import yaml # type: ignore
from docx.document import Document
@@ -369,7 +370,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
from unstructured.partition.ppt import partition_ppt
try:
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
if dify_config.UNSTRUCTURED_API_URL:
with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file:
temp_file.write(file_content)
temp_file.flush()
@@ -378,7 +379,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
file=file,
metadata_filename=temp_file.name,
api_url=dify_config.UNSTRUCTURED_API_URL,
api_key=dify_config.UNSTRUCTURED_API_KEY,
api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore
)
os.unlink(temp_file.name)
else:
@@ -395,7 +396,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
from unstructured.partition.pptx import partition_pptx
try:
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
if dify_config.UNSTRUCTURED_API_URL:
with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
temp_file.write(file_content)
temp_file.flush()
@@ -404,7 +405,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
file=file,
metadata_filename=temp_file.name,
api_url=dify_config.UNSTRUCTURED_API_URL,
api_key=dify_config.UNSTRUCTURED_API_KEY,
api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore
)
os.unlink(temp_file.name)
else:
@@ -416,11 +417,26 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
def _extract_text_from_epub(file_content: bytes) -> str:
    """Extract plain text from the raw bytes of an EPUB file.

    When ``dify_config.UNSTRUCTURED_API_URL`` is set, the bytes are written to
    a temporary ``.epub`` file which is sent to ``partition_via_api``.
    Otherwise ``pypandoc.download_pandoc()`` is called and the bytes are
    partitioned in-process with ``partition_epub``.

    Args:
        file_content: Raw bytes of the EPUB file.

    Returns:
        The string form of every extracted element, joined with newlines.

    Raises:
        TextExtractionError: If any step of the extraction fails; the original
            exception is chained as the cause.
    """
    from unstructured.partition.api import partition_via_api
    from unstructured.partition.epub import partition_epub

    try:
        if dify_config.UNSTRUCTURED_API_URL:
            # delete=False keeps the file on disk after the writer is closed so
            # it can be reopened by name for the upload.
            temp_file = tempfile.NamedTemporaryFile(suffix=".epub", delete=False)
            try:
                # Leaving the ``with`` closes (and therefore flushes) the file.
                with temp_file:
                    temp_file.write(file_content)
                with open(temp_file.name, "rb") as file:
                    elements = partition_via_api(
                        file=file,
                        metadata_filename=temp_file.name,
                        api_url=dify_config.UNSTRUCTURED_API_URL,
                        api_key=dify_config.UNSTRUCTURED_API_KEY,  # type: ignore
                    )
            finally:
                # Always remove the temporary file, including when the write or
                # the API call raises.
                os.unlink(temp_file.name)
        else:
            # NOTE(review): this runs on every local extraction; confirm that
            # download_pandoc() is cheap when pandoc is already installed.
            pypandoc.download_pandoc()
            with io.BytesIO(file_content) as file:
                elements = partition_epub(file=file)
        return "\n".join(str(element) for element in elements)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e