Feat/add unstructured support (#1780)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
Jyong
2023-12-18 23:24:06 +08:00
committed by GitHub
parent 2fd56cb01c
commit 5e34f938c1
15 changed files with 361 additions and 14 deletions

View File

@@ -0,0 +1,41 @@
import logging
import re
from typing import Optional, List, Tuple, cast
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document
logger = logging.getLogger(__name__)
class UnstructuredEmailLoader(BaseLoader):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str,
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
from unstructured.partition.email import partition_email
elements = partition_email(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,48 @@
import logging
from typing import List
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document
logger = logging.getLogger(__name__)
class UnstructuredMarkdownLoader(BaseLoader):
"""Load md files.
Args:
file_path: Path to the file to load.
remove_hyperlinks: Whether to remove hyperlinks from the text.
remove_images: Whether to remove images from the text.
encoding: File encoding to use. If `None`, the file will be loaded
with the default system encoding.
autodetect_encoding: Whether to try to autodetect the file encoding
if the specified encoding fails.
"""
def __init__(
self,
file_path: str,
api_url: str,
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
from unstructured.partition.md import partition_md
elements = partition_md(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document
logger = logging.getLogger(__name__)
class UnstructuredMsgLoader(BaseLoader):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
from unstructured.partition.msg import partition_msg
elements = partition_msg(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document
logger = logging.getLogger(__name__)
class UnstructuredPPTLoader(BaseLoader):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
from unstructured.partition.ppt import partition_ppt
elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document
logger = logging.getLogger(__name__)
class UnstructuredPPTXLoader(BaseLoader):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
from unstructured.partition.pptx import partition_pptx
elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document
logger = logging.getLogger(__name__)
class UnstructuredTextLoader(BaseLoader):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
from unstructured.partition.text import partition_text
elements = partition_text(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document
logger = logging.getLogger(__name__)
class UnstructuredXmlLoader(BaseLoader):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def load(self) -> List[Document]:
from unstructured.partition.xml import partition_xml
elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents