Add UNSTRUCTURED_API_KEY env support (#4369)

This commit is contained in:
majian
2024-05-20 13:14:17 +08:00
committed by GitHub
parent 3a51f2a778
commit b5204111da
4 changed files with 8 additions and 3 deletions

View File

@@ -96,6 +96,7 @@ class ExtractProcessor:
file_extension = input_file.suffix.lower()
etl_type = current_app.config['ETL_TYPE']
unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
unstructured_api_key = current_app.config['UNSTRUCTURED_API_KEY']
if etl_type == 'Unstructured':
if file_extension == '.xlsx' or file_extension == '.xls':
extractor = ExcelExtractor(file_path)
@@ -115,7 +116,7 @@ class ExtractProcessor:
elif file_extension == '.eml':
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url)
elif file_extension == '.ppt':
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url)
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == '.pptx':
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
elif file_extension == '.xml':

View File

@@ -17,16 +17,18 @@ class UnstructuredPPTExtractor(BaseExtractor):
def __init__(
self,
file_path: str,
api_url: str
api_url: str,
api_key: str
):
"""Initialize with file path, API URL, and API key."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url)
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
text_by_page = {}
for element in elements:
page = element.metadata.page_number