mirror of
http://112.124.100.131/huang.ze/ebiz-dify-ai.git
synced 2025-12-09 02:46:52 +08:00
install pandoc (#16825)
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import pypandoc # type: ignore
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
@@ -34,6 +36,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
||||
else:
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
pypandoc.download_pandoc()
|
||||
elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
|
||||
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
|
||||
@@ -9,6 +9,7 @@ from typing import Any, cast
|
||||
|
||||
import docx
|
||||
import pandas as pd
|
||||
import pypandoc # type: ignore
|
||||
import pypdfium2 # type: ignore
|
||||
import yaml # type: ignore
|
||||
from docx.document import Document
|
||||
@@ -369,7 +370,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
|
||||
try:
|
||||
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
|
||||
if dify_config.UNSTRUCTURED_API_URL:
|
||||
with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file:
|
||||
temp_file.write(file_content)
|
||||
temp_file.flush()
|
||||
@@ -378,7 +379,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
|
||||
file=file,
|
||||
metadata_filename=temp_file.name,
|
||||
api_url=dify_config.UNSTRUCTURED_API_URL,
|
||||
api_key=dify_config.UNSTRUCTURED_API_KEY,
|
||||
api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore
|
||||
)
|
||||
os.unlink(temp_file.name)
|
||||
else:
|
||||
@@ -395,7 +396,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
try:
|
||||
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
|
||||
if dify_config.UNSTRUCTURED_API_URL:
|
||||
with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
|
||||
temp_file.write(file_content)
|
||||
temp_file.flush()
|
||||
@@ -404,7 +405,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
|
||||
file=file,
|
||||
metadata_filename=temp_file.name,
|
||||
api_url=dify_config.UNSTRUCTURED_API_URL,
|
||||
api_key=dify_config.UNSTRUCTURED_API_KEY,
|
||||
api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore
|
||||
)
|
||||
os.unlink(temp_file.name)
|
||||
else:
|
||||
@@ -416,11 +417,26 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
|
||||
|
||||
|
||||
def _extract_text_from_epub(file_content: bytes) -> str:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
try:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_epub(file=file)
|
||||
if dify_config.UNSTRUCTURED_API_URL:
|
||||
with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as temp_file:
|
||||
temp_file.write(file_content)
|
||||
temp_file.flush()
|
||||
with open(temp_file.name, "rb") as file:
|
||||
elements = partition_via_api(
|
||||
file=file,
|
||||
metadata_filename=temp_file.name,
|
||||
api_url=dify_config.UNSTRUCTURED_API_URL,
|
||||
api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore
|
||||
)
|
||||
os.unlink(temp_file.name)
|
||||
else:
|
||||
pypandoc.download_pandoc()
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_epub(file=file)
|
||||
return "\n".join([str(element) for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e
|
||||
|
||||
Reference in New Issue
Block a user