Mirror of http://112.124.100.131/huang.ze/ebiz-dify-ai.git (synced 2025-12-09 19:06:51 +08:00)
Optimize webscraper (#4392)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
from urllib.parse import unquote
|
||||
|
||||
import requests
|
||||
from flask import current_app
|
||||
@@ -55,6 +57,17 @@ class ExtractProcessor:
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
suffix = Path(url).suffix
|
||||
if not suffix and suffix != '.':
|
||||
# get content-type
|
||||
if response.headers.get('Content-Type'):
|
||||
suffix = '.' + response.headers.get('Content-Type').split('/')[-1]
|
||||
else:
|
||||
content_disposition = response.headers.get('Content-Disposition')
|
||||
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
|
||||
if filename_match:
|
||||
filename = unquote(filename_match.group(1))
|
||||
suffix = '.' + re.search(r'\.(\w+)$', filename).group(1)
|
||||
|
||||
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
|
||||
with open(file_path, 'wb') as file:
|
||||
file.write(response.content)
|
||||
|
||||
Reference in New Issue
Block a user