From cef74d0d051ded2e95e06f81c42b2809bf39d826 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 23 Apr 2026 14:46:35 +0300 Subject: [PATCH 1/4] gh-148821: Always reject known multi-byte encodings in pyexpat The XML parser (pyexpat) now raises ValueError for known unsupported multi-byte encodings such as "ISO-2022-JP", "utf8" (without hyphen) or "raw-unicode-escape" instead of failing later, when encountering non-ASCII data. --- Include/codecs.h | 6 +++ Include/internal/pycore_codecs.h | 2 +- Lib/codecs.py | 5 +- Lib/encodings/big5.py | 1 + Lib/encodings/big5hkscs.py | 1 + Lib/encodings/cp932.py | 1 + Lib/encodings/cp949.py | 1 + Lib/encodings/cp950.py | 1 + Lib/encodings/euc_jis_2004.py | 1 + Lib/encodings/euc_jisx0213.py | 1 + Lib/encodings/euc_jp.py | 1 + Lib/encodings/euc_kr.py | 1 + Lib/encodings/gb18030.py | 1 + Lib/encodings/gb2312.py | 1 + Lib/encodings/gbk.py | 1 + Lib/encodings/hz.py | 1 + Lib/encodings/idna.py | 1 + Lib/encodings/iso2022_jp.py | 1 + Lib/encodings/iso2022_jp_1.py | 1 + Lib/encodings/iso2022_jp_2.py | 1 + Lib/encodings/iso2022_jp_2004.py | 1 + Lib/encodings/iso2022_jp_3.py | 1 + Lib/encodings/iso2022_jp_ext.py | 1 + Lib/encodings/iso2022_kr.py | 1 + Lib/encodings/johab.py | 1 + Lib/encodings/punycode.py | 1 + Lib/encodings/raw_unicode_escape.py | 1 + Lib/encodings/shift_jis.py | 1 + Lib/encodings/shift_jis_2004.py | 1 + Lib/encodings/shift_jisx0213.py | 1 + Lib/encodings/unicode_escape.py | 1 + Lib/encodings/utf_16.py | 1 + Lib/encodings/utf_16_be.py | 1 + Lib/encodings/utf_16_le.py | 1 + Lib/encodings/utf_32.py | 1 + Lib/encodings/utf_32_be.py | 1 + Lib/encodings/utf_32_le.py | 1 + Lib/encodings/utf_7.py | 1 + Lib/encodings/utf_8.py | 1 + Lib/encodings/utf_8_sig.py | 1 + Lib/test/test_codecs.py | 3 ++ Lib/test/test_pyexpat.py | 47 ++++++++++++++++++- ...-04-23-14-46-30.gh-issue-148821.cR4kMa.rst | 4 ++ Modules/pyexpat.c | 26 ++++++++++ Tools/unicode/gencjkcodecs.py | 1 + 45 files changed, 128 insertions(+), 3 deletions(-) create mode 
100644 Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst diff --git a/Include/codecs.h b/Include/codecs.h index 512a3c723eca18..d14f527dee75da 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -170,6 +170,12 @@ PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc); PyAPI_DATA(const char *) Py_hexdigits; #endif +#ifndef Py_LIMITED_API +PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding( + const char *encoding, + const char *alternate_command); +#endif + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_codecs.h b/Include/internal/pycore_codecs.h index 52dca1362592d6..bfa10eadf73573 100644 --- a/Include/internal/pycore_codecs.h +++ b/Include/internal/pycore_codecs.h @@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name); in Python 3.5+? */ -extern PyObject* _PyCodec_LookupTextEncoding( +PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding( const char *encoding, const char *alternate_command); diff --git a/Lib/codecs.py b/Lib/codecs.py index e4a8010aba90a5..e99460a670a516 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -93,7 +93,8 @@ class CodecInfo(tuple): def __new__(cls, encode, decode, streamreader=None, streamwriter=None, incrementalencoder=None, incrementaldecoder=None, name=None, - *, _is_text_encoding=None): + *, _is_text_encoding=None, + _is_single_byte=None): self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) self.name = name self.encode = encode @@ -104,6 +105,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None, self.streamreader = streamreader if _is_text_encoding is not None: self._is_text_encoding = _is_text_encoding + if _is_single_byte is not None: + self._is_single_byte = _is_single_byte return self def __repr__(self): diff --git a/Lib/encodings/big5.py b/Lib/encodings/big5.py index 7adeb0e1605274..8bed14b35c5899 100644 --- a/Lib/encodings/big5.py +++ b/Lib/encodings/big5.py @@ -36,4 +36,5 @@ def getregentry(): 
incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/big5hkscs.py b/Lib/encodings/big5hkscs.py index 350df37baaedaf..eeeb7865895190 100644 --- a/Lib/encodings/big5hkscs.py +++ b/Lib/encodings/big5hkscs.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/cp932.py b/Lib/encodings/cp932.py index e01f59b7190576..3671a4387f96b6 100644 --- a/Lib/encodings/cp932.py +++ b/Lib/encodings/cp932.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/cp949.py b/Lib/encodings/cp949.py index 627c87125e2aff..df998ba3bad75c 100644 --- a/Lib/encodings/cp949.py +++ b/Lib/encodings/cp949.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/cp950.py b/Lib/encodings/cp950.py index 39eec5ed0ddef9..12c7bbd8d226ad 100644 --- a/Lib/encodings/cp950.py +++ b/Lib/encodings/cp950.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_jis_2004.py b/Lib/encodings/euc_jis_2004.py index 72b87aea68862f..68604db3c30b2d 100644 --- a/Lib/encodings/euc_jis_2004.py +++ b/Lib/encodings/euc_jis_2004.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_jisx0213.py b/Lib/encodings/euc_jisx0213.py index cc47d04112a187..cd2808965a6edd 100644 --- a/Lib/encodings/euc_jisx0213.py +++ b/Lib/encodings/euc_jisx0213.py @@ -36,4 +36,5 @@ def getregentry(): 
incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_jp.py b/Lib/encodings/euc_jp.py index 7bcbe4147f2ad4..bcdd0582d71902 100644 --- a/Lib/encodings/euc_jp.py +++ b/Lib/encodings/euc_jp.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/euc_kr.py b/Lib/encodings/euc_kr.py index c1fb1260e879f0..8a81356d8f9980 100644 --- a/Lib/encodings/euc_kr.py +++ b/Lib/encodings/euc_kr.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/gb18030.py b/Lib/encodings/gb18030.py index 34fb6c366a7614..98df7d4cbeec3d 100644 --- a/Lib/encodings/gb18030.py +++ b/Lib/encodings/gb18030.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/gb2312.py b/Lib/encodings/gb2312.py index 3c3b837d618ecd..ba915a2500f21a 100644 --- a/Lib/encodings/gb2312.py +++ b/Lib/encodings/gb2312.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/gbk.py b/Lib/encodings/gbk.py index 1b45db89859cdf..d597c7bb77e93e 100644 --- a/Lib/encodings/gbk.py +++ b/Lib/encodings/gbk.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/hz.py b/Lib/encodings/hz.py index 383442a3c9ac9a..43ee36a9286426 100644 --- a/Lib/encodings/hz.py +++ b/Lib/encodings/hz.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, 
streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index d31ee07ab45b76..98bf9462e36fbf 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -385,4 +385,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp.py b/Lib/encodings/iso2022_jp.py index ab0406069356e4..27129ce67aa884 100644 --- a/Lib/encodings/iso2022_jp.py +++ b/Lib/encodings/iso2022_jp.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_1.py b/Lib/encodings/iso2022_jp_1.py index 997044dc378749..0f41dd95cd4332 100644 --- a/Lib/encodings/iso2022_jp_1.py +++ b/Lib/encodings/iso2022_jp_1.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_2.py b/Lib/encodings/iso2022_jp_2.py index 9106bf762512fd..25f625819f5ea0 100644 --- a/Lib/encodings/iso2022_jp_2.py +++ b/Lib/encodings/iso2022_jp_2.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_2004.py b/Lib/encodings/iso2022_jp_2004.py index 40198bf098570b..1f0bd1b7874472 100644 --- a/Lib/encodings/iso2022_jp_2004.py +++ b/Lib/encodings/iso2022_jp_2004.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_3.py b/Lib/encodings/iso2022_jp_3.py index 346e08beccbbaf..2acdb3a2cd9be3 100644 --- a/Lib/encodings/iso2022_jp_3.py +++ b/Lib/encodings/iso2022_jp_3.py @@ -36,4 +36,5 @@ def getregentry(): 
incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_jp_ext.py b/Lib/encodings/iso2022_jp_ext.py index 752bab9813a094..a32a533e8bdf00 100644 --- a/Lib/encodings/iso2022_jp_ext.py +++ b/Lib/encodings/iso2022_jp_ext.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/iso2022_kr.py b/Lib/encodings/iso2022_kr.py index bf7018763eae38..51dd4ab560422a 100644 --- a/Lib/encodings/iso2022_kr.py +++ b/Lib/encodings/iso2022_kr.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/johab.py b/Lib/encodings/johab.py index 512aeeb732b522..e58c50a06c4b96 100644 --- a/Lib/encodings/johab.py +++ b/Lib/encodings/johab.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index 268fccbd53974e..335acb87cb9b28 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -250,4 +250,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py index 46c8e070dd192e..5c5b41437a84b4 100644 --- a/Lib/encodings/raw_unicode_escape.py +++ b/Lib/encodings/raw_unicode_escape.py @@ -43,4 +43,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/shift_jis.py b/Lib/encodings/shift_jis.py index 83381172764dea..bf7fded09468c8 100644 --- a/Lib/encodings/shift_jis.py +++ 
b/Lib/encodings/shift_jis.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/shift_jis_2004.py b/Lib/encodings/shift_jis_2004.py index 161b1e86f9918a..ae40b684a010f2 100644 --- a/Lib/encodings/shift_jis_2004.py +++ b/Lib/encodings/shift_jis_2004.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/shift_jisx0213.py b/Lib/encodings/shift_jisx0213.py index cb653f53055e67..5af8565618b40e 100644 --- a/Lib/encodings/shift_jisx0213.py +++ b/Lib/encodings/shift_jisx0213.py @@ -36,4 +36,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py index 9b1ce99b339ae0..d896cefc9596be 100644 --- a/Lib/encodings/unicode_escape.py +++ b/Lib/encodings/unicode_escape.py @@ -43,4 +43,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py index d3b9980026666f..eac93bd17d07d1 100644 --- a/Lib/encodings/utf_16.py +++ b/Lib/encodings/utf_16.py @@ -152,4 +152,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py index 86b458eb9bcd96..d056cf9202a40f 100644 --- a/Lib/encodings/utf_16_be.py +++ b/Lib/encodings/utf_16_be.py @@ -39,4 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py index 
ec454142eedf25..2e07f76cc3f742 100644 --- a/Lib/encodings/utf_16_le.py +++ b/Lib/encodings/utf_16_le.py @@ -39,4 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py index 1924bedbb74c68..aebe145ec95e71 100644 --- a/Lib/encodings/utf_32.py +++ b/Lib/encodings/utf_32.py @@ -147,4 +147,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py index fe272b5fafec69..ee1b41a11aa35f 100644 --- a/Lib/encodings/utf_32_be.py +++ b/Lib/encodings/utf_32_be.py @@ -34,4 +34,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py index 9e48210928ee65..4ac786bb73349b 100644 --- a/Lib/encodings/utf_32_le.py +++ b/Lib/encodings/utf_32_le.py @@ -34,4 +34,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_7.py b/Lib/encodings/utf_7.py index 8e0567f2087d65..3127867fb5bff9 100644 --- a/Lib/encodings/utf_7.py +++ b/Lib/encodings/utf_7.py @@ -35,4 +35,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py index 1bf6336571547b..3801615ce34001 100644 --- a/Lib/encodings/utf_8.py +++ b/Lib/encodings/utf_8.py @@ -39,4 +39,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py index 
1bb479203f365d..b5e5c89f80b9eb 100644 --- a/Lib/encodings/utf_8_sig.py +++ b/Lib/encodings/utf_8_sig.py @@ -127,4 +127,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 79c8a7ef886482..03dd61a76db154 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1892,6 +1892,7 @@ def test_copy(self): self.assertIsNot(dup, orig) self.assertEqual(dup, orig) self.assertTrue(orig._is_text_encoding) + self.assertFalse(orig._is_single_byte) self.assertEqual(dup.encode, orig.encode) self.assertEqual(dup.name, orig.name) self.assertEqual(dup.incrementalencoder, orig.incrementalencoder) @@ -1912,6 +1913,7 @@ def test_deepcopy(self): self.assertIsNot(dup, orig) self.assertEqual(dup, orig) self.assertTrue(orig._is_text_encoding) + self.assertFalse(orig._is_single_byte) self.assertEqual(dup.encode, orig.encode) self.assertEqual(dup.name, orig.name) self.assertEqual(dup.incrementalencoder, orig.incrementalencoder) @@ -1940,6 +1942,7 @@ def test_pickle(self): unpickled_codec_info.incrementalencoder ) self.assertTrue(unpickled_codec_info._is_text_encoding) + self.assertFalse(unpickled_codec_info._is_single_byte) # Test a CodecInfo with _is_text_encoding equal to false. 
codec_info = codecs.lookup('base64') diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index aaa91aca36e3c4..0763bb19865167 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -227,7 +227,7 @@ def _verify_parse_output(self, operations): "Character data: '\xb5'", "End element: 'root'", ] - for operation, expected_operation in zip(operations, expected_operations): + for operation, expected_operation in zip(operations, expected_operations, strict=True): self.assertEqual(operation, expected_operation) def test_parse_bytes(self): @@ -276,6 +276,51 @@ def test_parse_again(self): self.assertEqual(expat.ErrorString(cm.exception.code), expat.errors.XML_ERROR_FINISHED) + @support.subTests('enc', ['UTF-8', 'utf-8', 'utf-16', 'koi8-u', + 'cp1125', 'cp1251', 'iso8859-5', + 'mac_cyrillic']) + def test_supportes_ecodings(self, enc): + out = self.Outputter() + parser = expat.ParserCreate() + self._hookup_callbacks(parser, out) + data = (f'\n' + '<корінь атрибут="значення">зміст').encode(enc) + parser.Parse(data, True) + self.assertEqual(out.out, [ + ('XML declaration', ('1.0', enc, -1)), + "Start element: 'корінь' {'атрибут': 'значення'}", + "Character data: 'зміст'", + "End element: 'корінь'", + ]) + + @support.subTests('enc', [ + 'UTF8', 'UTF-7', + "unicode-escape", "raw-unicode-escape", + "Big5-HKSCS", "Big5", + "cp932", "cp949", "cp950", + "EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR", + "GB18030", "GB2312", "GBK", + "HZ-GB-2312", + "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004", + "ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT", + "ISO-2022-KR", + "johab", + "Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213", + ]) + def test_unsupportes_ecodings(self, enc): + parser = expat.ParserCreate() + data = (f'\n' + '').encode(enc) + with self.assertRaises(ValueError): + parser.Parse(data, True) + + def test_unknown_ecoding(self): + parser = expat.ParserCreate() + data = b'\n' + with self.assertRaises(LookupError): + parser.Parse(data, True) 
+ + class NamespaceSeparatorTest(unittest.TestCase): def test_legal(self): # Tests that make sure we get errors when the namespace_separator value diff --git a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst new file mode 100644 index 00000000000000..5dd95047178938 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst @@ -0,0 +1,4 @@ +The :mod:`XML parser ` now raises :exc:`ValueError` for known +unsupported multi-byte encodings such us "UTF8", "ISO-2022-JP" or +"raw-unicode-escape" instead of failing later, when encounter non-ASCII +data. diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 0f0afe17513ef1..68c8ac0e4accef 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -4,6 +4,7 @@ #include "Python.h" #include "pycore_ceval.h" // _Py_EnterRecursiveCall() +#include "pycore_codecs.h" // _PyCodec_LookupTextEncoding() #include "pycore_import.h" // _PyImport_SetModule() #include "pycore_pyhash.h" // _Py_HashSecret #include "pycore_traceback.h" // _PyTraceback_Add() @@ -1465,6 +1466,31 @@ PyUnknownEncodingHandler(void *encodingHandlerData, if (PyErr_Occurred()) return XML_STATUS_ERROR; + PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL); + if (codec == NULL) { + return XML_STATUS_ERROR; + } + // if (!PyTuple_CheckExact(codec)) { + // PyObject *attr; + // if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) { + // Py_DECREF(codec); + // return XML_STATUS_ERROR; + // } + // if (attr != NULL) { + // int is_single_byte = PyObject_IsTrue(attr); + // Py_DECREF(attr); + // if (is_single_byte <= 0) { + // Py_DECREF(codec); + // if (is_single_byte == 0) { + // PyErr_SetString(PyExc_ValueError, + // "multi-byte encodings are not supported"); + // } + // return XML_STATUS_ERROR; + // } + // } + // } + Py_DECREF(codec); + u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace"); if (u == 
NULL) { Py_XDECREF(u); diff --git a/Tools/unicode/gencjkcodecs.py b/Tools/unicode/gencjkcodecs.py index 45866bf2f61062..eb04f67f2077eb 100644 --- a/Tools/unicode/gencjkcodecs.py +++ b/Tools/unicode/gencjkcodecs.py @@ -51,6 +51,7 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_single_byte=False, ) """) From 2e2df1ea095bf9263b3aedb6332a5a2ef6c6ed3f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 23 Apr 2026 15:47:18 +0300 Subject: [PATCH 2/4] Uncomment temporary commented out code. --- Modules/pyexpat.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 68c8ac0e4accef..e95dcb611a33e2 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1470,25 +1470,25 @@ PyUnknownEncodingHandler(void *encodingHandlerData, if (codec == NULL) { return XML_STATUS_ERROR; } - // if (!PyTuple_CheckExact(codec)) { - // PyObject *attr; - // if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) { - // Py_DECREF(codec); - // return XML_STATUS_ERROR; - // } - // if (attr != NULL) { - // int is_single_byte = PyObject_IsTrue(attr); - // Py_DECREF(attr); - // if (is_single_byte <= 0) { - // Py_DECREF(codec); - // if (is_single_byte == 0) { - // PyErr_SetString(PyExc_ValueError, - // "multi-byte encodings are not supported"); - // } - // return XML_STATUS_ERROR; - // } - // } - // } + if (!PyTuple_CheckExact(codec)) { + PyObject *attr; + if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) { + Py_DECREF(codec); + return XML_STATUS_ERROR; + } + if (attr != NULL) { + int is_single_byte = PyObject_IsTrue(attr); + Py_DECREF(attr); + if (is_single_byte <= 0) { + Py_DECREF(codec); + if (is_single_byte == 0) { + PyErr_SetString(PyExc_ValueError, + "multi-byte encodings are not supported"); + } + return XML_STATUS_ERROR; + } + } + } Py_DECREF(codec); u = PyUnicode_Decode((const 
char*) template_buffer, 256, name, "replace"); From 91ac15e21f2d81fc2803856f641d7ae5bbaba45a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 15:59:16 +0300 Subject: [PATCH 3/4] Fix the module reference. --- .../next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst index 5dd95047178938..119a465fcb200a 100644 --- a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst +++ b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst @@ -1,4 +1,4 @@ -The :mod:`XML parser ` now raises :exc:`ValueError` for known +The :mod:`XML parser ` now raises :exc:`ValueError` for known unsupported multi-byte encodings such us "UTF8", "ISO-2022-JP" or "raw-unicode-escape" instead of failing later, when encounter non-ASCII data. From 2177825c7729d03c92b9618b0f98f2aca0abb3b9 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 16:11:37 +0300 Subject: [PATCH 4/4] Fix ElementTree tests. 
--- Lib/test/test_xml_etree.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 51af46f124cac6..730456e7582adc 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1014,7 +1014,7 @@ def xml(encoding): def bxml(encoding): return xml(encoding).encode(encoding) supported_encodings = [ - 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le', + 'ascii', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le', 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5', 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10', 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16', @@ -1025,32 +1025,34 @@ def bxml(encoding): 'cp1256', 'cp1257', 'cp1258', 'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2', 'mac-roman', 'mac-turkish', - 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004', - 'iso2022-jp-3', 'iso2022-jp-ext', - 'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', - 'hz', 'ptcp154', + 'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154', ] for encoding in supported_encodings: - self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'') + with self.subTest(encoding=encoding): + self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'') unsupported_ascii_compatible_encodings = [ 'big5', 'big5hkscs', 'cp932', 'cp949', 'cp950', 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr', 'gb2312', 'gbk', 'gb18030', - 'iso2022-kr', 'johab', + 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004', + 'iso2022-jp-3', 'iso2022-jp-ext', + 'iso2022-kr', 'johab', 'hz', 'shift-jis', 'shift-jis-2004', 'shift-jisx0213', - 'utf-7', + 'utf-7', 'utf-8-sig', 'utf8', ] for encoding in unsupported_ascii_compatible_encodings: - self.assertRaises(ValueError, ET.XML, bxml(encoding)) + with self.subTest(encoding=encoding): + self.assertRaises(ValueError, ET.XML, bxml(encoding)) unsupported_ascii_incompatible_encodings = [ 'cp037', 'cp424', 
'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140', 'utf_32', 'utf_32_be', 'utf_32_le', ] for encoding in unsupported_ascii_incompatible_encodings: - self.assertRaises(ET.ParseError, ET.XML, bxml(encoding)) + with self.subTest(encoding=encoding): + self.assertRaises(ET.ParseError, ET.XML, bxml(encoding)) self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii')) self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))