Mirror of http://112.124.100.131/huang.ze/ebiz-dify-ai.git (synced 2025-12-10 03:16:51 +08:00)
Feat/support parent child chunk (#12092)
This commit is contained in:
@@ -24,7 +24,6 @@ from core.rag.extractor.unstructured.unstructured_markdown_extractor import Unst
|
||||
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
|
||||
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
|
||||
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
|
||||
from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
|
||||
from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
|
||||
from core.rag.extractor.word_extractor import WordExtractor
|
||||
from core.rag.models.document import Document
|
||||
@@ -141,11 +140,7 @@ class ExtractProcessor:
|
||||
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||
else:
|
||||
# txt
|
||||
extractor = (
|
||||
UnstructuredTextExtractor(file_path, unstructured_api_url)
|
||||
if is_automatic
|
||||
else TextExtractor(file_path, autodetect_encoding=True)
|
||||
)
|
||||
extractor = TextExtractor(file_path, autodetect_encoding=True)
|
||||
else:
|
||||
if file_extension in {".xlsx", ".xls"}:
|
||||
extractor = ExcelExtractor(file_path)
|
||||
|
||||
@@ -267,8 +267,10 @@ class WordExtractor(BaseExtractor):
|
||||
if isinstance(element.tag, str) and element.tag.endswith("p"): # paragraph
|
||||
para = paragraphs.pop(0)
|
||||
parsed_paragraph = parse_paragraph(para)
|
||||
if parsed_paragraph:
|
||||
if parsed_paragraph.strip():
|
||||
content.append(parsed_paragraph)
|
||||
else:
|
||||
content.append("\n")
|
||||
elif isinstance(element.tag, str) and element.tag.endswith("tbl"): # table
|
||||
table = tables.pop(0)
|
||||
content.append(self._table_to_markdown(table, image_map))
|
||||
|
||||
Reference in New Issue
Block a user