mirror of
http://112.124.100.131/huang.ze/ebiz-dify-ai.git
synced 2025-12-09 19:06:51 +08:00
fix document extractor node incorrectly processing doc and ppt files (#12902)
This commit is contained in:
@@ -8,7 +8,7 @@ from core.variables.variables import StringVariable
|
||||
from core.workflow.entities.node_entities import NodeRunResult
|
||||
from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData
|
||||
from core.workflow.nodes.document_extractor.node import (
|
||||
_extract_text_from_doc,
|
||||
_extract_text_from_docx,
|
||||
_extract_text_from_pdf,
|
||||
_extract_text_from_plain_text,
|
||||
)
|
||||
@@ -120,7 +120,7 @@ def test_run_extract_text(
|
||||
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract)
|
||||
elif mime_type.startswith("application/vnd.openxmlformats"):
|
||||
mock_docx_extract = Mock(return_value=expected_text[0])
|
||||
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_doc", mock_docx_extract)
|
||||
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_docx", mock_docx_extract)
|
||||
|
||||
result = document_extractor_node._run()
|
||||
|
||||
@@ -163,14 +163,14 @@ def test_extract_text_from_pdf(mock_pdf_document):
|
||||
|
||||
|
||||
@patch("docx.Document")
|
||||
def test_extract_text_from_doc(mock_document):
|
||||
def test_extract_text_from_docx(mock_document):
|
||||
mock_paragraph1 = Mock()
|
||||
mock_paragraph1.text = "Paragraph 1"
|
||||
mock_paragraph2 = Mock()
|
||||
mock_paragraph2.text = "Paragraph 2"
|
||||
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
|
||||
|
||||
text = _extract_text_from_doc(b"PK\x03\x04")
|
||||
text = _extract_text_from_docx(b"PK\x03\x04")
|
||||
assert text == "Paragraph 1\nParagraph 2"
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user