Optimize webscraper (#4392)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
Charlie.Wei
2024-05-15 15:23:16 +08:00
committed by GitHub
parent c0fe414e0a
commit 97b65f9b4b
4 changed files with 61 additions and 11 deletions

View File

@@ -1,6 +1,8 @@
import re
import tempfile
from pathlib import Path
from typing import Union
from urllib.parse import unquote
import requests
from flask import current_app
@@ -55,6 +57,17 @@ class ExtractProcessor:
with tempfile.TemporaryDirectory() as temp_dir:
    # Work out a file extension for the downloaded payload, then persist the
    # payload into the temporary directory under a random name.
    # `url` and `response` are bound before this fragment (not visible here);
    # presumably `response` is a `requests.Response` -- confirm against caller.
    suffix = Path(url).suffix
    if not suffix:
        # The URL path carries no extension: fall back to the response headers.
        content_type = response.headers.get('Content-Type')
        if content_type:
            # Drop media-type parameters such as "; charset=utf-8" so that
            # "text/html; charset=utf-8" yields ".html", not ".html; charset=utf-8".
            media_type = content_type.split(';', 1)[0].strip()
            suffix = '.' + media_type.split('/')[-1]
        else:
            # The Content-Disposition header may be absent; guard before
            # handing it to the regex engine (re.search(None) raises TypeError).
            content_disposition = response.headers.get('Content-Disposition')
            filename_match = None
            if content_disposition:
                filename_match = re.search(r'filename="([^"]+)"', content_disposition)
            if filename_match:
                filename = unquote(filename_match.group(1))
                # The advertised filename may have no extension at all; in that
                # case keep the empty suffix instead of crashing on `.group`.
                extension_match = re.search(r'\.(\w+)$', filename)
                if extension_match:
                    suffix = '.' + extension_match.group(1)
    # NOTE(review): `tempfile._get_candidate_names` is a private CPython helper;
    # kept as-is to preserve behavior, but a public alternative would be safer.
    file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
    with open(file_path, 'wb') as file:
        file.write(response.content)