fix document extractor node incorrectly processing doc and ppt files (#12902)

This commit is contained in:
AugNSo
2025-02-12 18:04:28 +08:00
committed by GitHub
parent 6529240da6
commit 2b86465d4c
4 changed files with 59 additions and 14 deletions

View File

@@ -8,7 +8,7 @@ from core.variables.variables import StringVariable
from core.workflow.entities.node_entities import NodeRunResult
from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData
from core.workflow.nodes.document_extractor.node import (
-_extract_text_from_doc,
+_extract_text_from_docx,
_extract_text_from_pdf,
_extract_text_from_plain_text,
)
@@ -120,7 +120,7 @@ def test_run_extract_text(
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract)
elif mime_type.startswith("application/vnd.openxmlformats"):
mock_docx_extract = Mock(return_value=expected_text[0])
-monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_doc", mock_docx_extract)
+monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_docx", mock_docx_extract)
result = document_extractor_node._run()
@@ -163,14 +163,14 @@ def test_extract_text_from_pdf(mock_pdf_document):
@patch("docx.Document")
-def test_extract_text_from_doc(mock_document):
+def test_extract_text_from_docx(mock_document):
mock_paragraph1 = Mock()
mock_paragraph1.text = "Paragraph 1"
mock_paragraph2 = Mock()
mock_paragraph2.text = "Paragraph 2"
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
-text = _extract_text_from_doc(b"PK\x03\x04")
+text = _extract_text_from_docx(b"PK\x03\x04")
assert text == "Paragraph 1\nParagraph 2"