From ac635c70cd0a7de8cb959ed89be9cadf7420329f Mon Sep 17 00:00:00 2001
From: yihong <zouzou0208@gmail.com>
Date: Fri, 20 Dec 2024 17:19:46 +0800
Subject: [PATCH] fix: doc can not extract tables (#11879)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Co-authored-by: akinobu-i <akinobu-i@users.noreply.github.com>
---
 .../workflow/nodes/document_extractor/node.py | 38 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index 95d0ea3aa..6d82dbe6d 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -1,6 +1,7 @@
 import csv
 import io
 import json
+import logging
 import os
 import tempfile
 
@@ -22,6 +23,8 @@ from models.workflow import WorkflowNodeExecutionStatus
 from .entities import DocumentExtractorNodeData
 from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError
 
+logger = logging.getLogger(__name__)
+
 
 class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
     """
@@ -177,10 +180,43 @@ def _extract_text_from_pdf(file_content: bytes) -> str:
 
 
 def _extract_text_from_doc(file_content: bytes) -> str:
+    """
+    Extract text from a DOC/DOCX file.
+    For now support only paragraph and table add more if needed
+    """
     try:
         doc_file = io.BytesIO(file_content)
         doc = docx.Document(doc_file)
-        return "\n".join([paragraph.text for paragraph in doc.paragraphs])
+        text = []
+        # Process paragraphs
+        for paragraph in doc.paragraphs:
+            if paragraph.text.strip():
+                text.append(paragraph.text)
+
+        # Process tables
+        for table in doc.tables:
+            # Table header
+            try:
+                # table maybe cause errors so ignore it.
+                if len(table.rows) > 0 and table.rows[0].cells is not None:
+                    # Check if any cell in the table has text
+                    has_content = False
+                    for row in table.rows:
+                        if any(cell.text.strip() for cell in row.cells):
+                            has_content = True
+                            break
+
+                    if has_content:
+                        markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
+                        markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
+                        for row in table.rows[1:]:
+                            markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
+                        text.append(markdown_table)
+            except Exception as e:
+                logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
+                continue
+
+        return "\n".join(text)
     except Exception as e:
         raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e