add xls file support (#3321)

2025-12-16 06:16:53 +08:00 · 2024-04-12 14:53:44 +08:00
parent 42936fc917
commit ad65c891e7
5 changed files with 57 additions and 9 deletions
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@@ -2,6 +2,7 @@
 from typing import Optional

 import pandas as pd
+import xlrd

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -27,10 +28,37 @@ class ExcelExtractor(BaseExtractor):
        self._autodetect_encoding = autodetect_encoding

    def extract(self) -> list[Document]:
+        """Parse an Excel file, choosing the reader by (case-insensitive) extension."""
+        if self._file_path.lower().endswith('.xls'):
+            return self._extract4xls()
+        # Everything else (.xlsx included) goes to pandas, so a list is always returned.
+        return self._extract4xlsx()
+
+    def _extract4xls(self) -> list[Document]:
+        """Read a legacy .xls workbook with xlrd; one Document per data row."""
+        wb = xlrd.open_workbook(filename=self._file_path)
+        documents = []
+        for sheet in wb.sheets():
+            # The first non-blank row of each sheet is its header. Reset once per
+            # sheet, not per row, otherwise every row is consumed as a header.
+            row_header = None
+            for row in sheet.get_rows():
+                if self.is_blank_row(row):
+                    continue
+                if row_header is None:
+                    row_header = row
+                    continue
+                item_str = "\n".join(
+                    f'{header.value}:{cell.value}' for header, cell in zip(row_header, row)
+                )
+                document = Document(page_content=item_str, metadata={'source': self._file_path})
+                documents.append(document)
+        return documents
+
+    def _extract4xlsx(self) -> list[Document]:
        """Load from file path using Pandas."""
        data = []
-
-        # 使用 Pandas 读取 Excel 文件的每个工作表
+        # Read each worksheet of an Excel file using Pandas
        xls = pd.ExcelFile(self._file_path)
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)
@@ -43,5 +71,18 @@ class ExcelExtractor(BaseExtractor):
                item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
                document = Document(page_content=item, metadata={'source': self._file_path})
                data.append(document)
-
        return data
+
+    @staticmethod
+    def is_blank_row(row):
+        """
+        Tell whether a worksheet row carries no data.
+
+        :param row: iterable of cell objects exposing a ``value`` attribute.
+        :return: True if every cell value is None or '', False otherwise.
+        """
+        # A single cell holding anything other than None / '' makes the row non-blank.
+        has_content = any(
+            cell.value is not None and cell.value != '' for cell in row
+        )
+        return not has_content
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -84,7 +84,7 @@ class ExtractProcessor:
                etl_type = current_app.config['ETL_TYPE']
                unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
                if etl_type == 'Unstructured':
-                    if file_extension == '.xlsx':
+                    if file_extension == '.xlsx' or file_extension == '.xls':
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == '.pdf':
                        extractor = PdfExtractor(file_path)
@@ -114,7 +114,7 @@ class ExtractProcessor:
                        extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
                            else TextExtractor(file_path, autodetect_encoding=True)
                else:
-                    if file_extension == '.xlsx':
+                    if file_extension == '.xlsx' or file_extension == '.xls':
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == '.pdf':
                        extractor = PdfExtractor(file_path)