fix: decoding of non-UTF-8 content, closes #10691 (#10698)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
yihong
2024-11-14 17:29:49 +08:00
committed by GitHub
parent fbb9c1c249
commit 722964667f
2 changed files with 15 additions and 4 deletions

View File

@@ -140,6 +140,17 @@ def test_extract_text_from_plain_text():
assert text == "Hello, world!"
def test_extract_text_from_plain_text_non_utf8():
    """Check plain-text extraction on input that is not valid UTF-8.

    The payload contains a lone 0xA9 byte: it encodes "©" in Latin-1 but is a
    stray continuation byte in UTF-8, so a strict UTF-8 decode would fail.

    The function name must start with ``test_`` for pytest to collect it;
    the earlier ``tet_`` spelling meant this case was never executed.
    """
    import tempfile

    non_utf8_content = b"Hello world\xa9."  # \xA9 represents © in Latin-1
    # Round-trip through a real temporary file so the bytes handed to the
    # extractor are exactly what would be read back from disk.
    with tempfile.NamedTemporaryFile(delete=True) as temp_file:
        temp_file.write(non_utf8_content)
        temp_file.seek(0)
        text = _extract_text_from_plain_text(temp_file.read())

    # The input has no comma, so the expected text must not have one either.
    # NOTE(review): this assumes the extractor drops bytes it cannot decode
    # (leaving "Hello world."); confirm against _extract_text_from_plain_text.
    assert text == "Hello world."
@patch("pypdfium2.PdfDocument")
def test_extract_text_from_pdf(mock_pdf_document):
mock_page = Mock()