From cef74d0d051ded2e95e06f81c42b2809bf39d826 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Thu, 23 Apr 2026 14:46:35 +0300
Subject: [PATCH 1/4] gh-148821: Always reject known multi-byte encodings in
 pyexpat

The XML parser (pyexpat) now raises ValueError for known unsupported
multi-byte encodings such as "ISO-2022-JP", "utf8" (without hyphen) or
"raw-unicode-escape" instead of failing later, upon encountering
non-ASCII data.
---
 Include/codecs.h                              |  6 +++
 Include/internal/pycore_codecs.h              |  2 +-
 Lib/codecs.py                                 |  5 +-
 Lib/encodings/big5.py                         |  1 +
 Lib/encodings/big5hkscs.py                    |  1 +
 Lib/encodings/cp932.py                        |  1 +
 Lib/encodings/cp949.py                        |  1 +
 Lib/encodings/cp950.py                        |  1 +
 Lib/encodings/euc_jis_2004.py                 |  1 +
 Lib/encodings/euc_jisx0213.py                 |  1 +
 Lib/encodings/euc_jp.py                       |  1 +
 Lib/encodings/euc_kr.py                       |  1 +
 Lib/encodings/gb18030.py                      |  1 +
 Lib/encodings/gb2312.py                       |  1 +
 Lib/encodings/gbk.py                          |  1 +
 Lib/encodings/hz.py                           |  1 +
 Lib/encodings/idna.py                         |  1 +
 Lib/encodings/iso2022_jp.py                   |  1 +
 Lib/encodings/iso2022_jp_1.py                 |  1 +
 Lib/encodings/iso2022_jp_2.py                 |  1 +
 Lib/encodings/iso2022_jp_2004.py              |  1 +
 Lib/encodings/iso2022_jp_3.py                 |  1 +
 Lib/encodings/iso2022_jp_ext.py               |  1 +
 Lib/encodings/iso2022_kr.py                   |  1 +
 Lib/encodings/johab.py                        |  1 +
 Lib/encodings/punycode.py                     |  1 +
 Lib/encodings/raw_unicode_escape.py           |  1 +
 Lib/encodings/shift_jis.py                    |  1 +
 Lib/encodings/shift_jis_2004.py               |  1 +
 Lib/encodings/shift_jisx0213.py               |  1 +
 Lib/encodings/unicode_escape.py               |  1 +
 Lib/encodings/utf_16.py                       |  1 +
 Lib/encodings/utf_16_be.py                    |  1 +
 Lib/encodings/utf_16_le.py                    |  1 +
 Lib/encodings/utf_32.py                       |  1 +
 Lib/encodings/utf_32_be.py                    |  1 +
 Lib/encodings/utf_32_le.py                    |  1 +
 Lib/encodings/utf_7.py                        |  1 +
 Lib/encodings/utf_8.py                        |  1 +
 Lib/encodings/utf_8_sig.py                    |  1 +
 Lib/test/test_codecs.py                       |  3 ++
 Lib/test/test_pyexpat.py                      | 47 ++++++++++++++++++-
 ...-04-23-14-46-30.gh-issue-148821.cR4kMa.rst |  4 ++
 Modules/pyexpat.c                             | 26 ++++++++++
 Tools/unicode/gencjkcodecs.py                 |  1 +
 45 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst

diff --git a/Include/codecs.h b/Include/codecs.h
index 512a3c723eca18..d14f527dee75da 100644
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -170,6 +170,12 @@ PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc);
 PyAPI_DATA(const char *) Py_hexdigits;
 #endif
 
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
+   const char *encoding,
+   const char *alternate_command);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Include/internal/pycore_codecs.h b/Include/internal/pycore_codecs.h
index 52dca1362592d6..bfa10eadf73573 100644
--- a/Include/internal/pycore_codecs.h
+++ b/Include/internal/pycore_codecs.h
@@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
    in Python 3.5+?
 
  */
-extern PyObject* _PyCodec_LookupTextEncoding(
+PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
    const char *encoding,
    const char *alternate_command);
 
diff --git a/Lib/codecs.py b/Lib/codecs.py
index e4a8010aba90a5..e99460a670a516 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -93,7 +93,8 @@ class CodecInfo(tuple):
 
     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
         incrementalencoder=None, incrementaldecoder=None, name=None,
-        *, _is_text_encoding=None):
+        *, _is_text_encoding=None,
+        _is_single_byte=None):
         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
         self.name = name
         self.encode = encode
@@ -104,6 +105,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
         self.streamreader = streamreader
         if _is_text_encoding is not None:
             self._is_text_encoding = _is_text_encoding
+        if _is_single_byte is not None:
+            self._is_single_byte = _is_single_byte
         return self
 
     def __repr__(self):
diff --git a/Lib/encodings/big5.py b/Lib/encodings/big5.py
index 7adeb0e1605274..8bed14b35c5899 100644
--- a/Lib/encodings/big5.py
+++ b/Lib/encodings/big5.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/big5hkscs.py b/Lib/encodings/big5hkscs.py
index 350df37baaedaf..eeeb7865895190 100644
--- a/Lib/encodings/big5hkscs.py
+++ b/Lib/encodings/big5hkscs.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/cp932.py b/Lib/encodings/cp932.py
index e01f59b7190576..3671a4387f96b6 100644
--- a/Lib/encodings/cp932.py
+++ b/Lib/encodings/cp932.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/cp949.py b/Lib/encodings/cp949.py
index 627c87125e2aff..df998ba3bad75c 100644
--- a/Lib/encodings/cp949.py
+++ b/Lib/encodings/cp949.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/cp950.py b/Lib/encodings/cp950.py
index 39eec5ed0ddef9..12c7bbd8d226ad 100644
--- a/Lib/encodings/cp950.py
+++ b/Lib/encodings/cp950.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/euc_jis_2004.py b/Lib/encodings/euc_jis_2004.py
index 72b87aea68862f..68604db3c30b2d 100644
--- a/Lib/encodings/euc_jis_2004.py
+++ b/Lib/encodings/euc_jis_2004.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/euc_jisx0213.py b/Lib/encodings/euc_jisx0213.py
index cc47d04112a187..cd2808965a6edd 100644
--- a/Lib/encodings/euc_jisx0213.py
+++ b/Lib/encodings/euc_jisx0213.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/euc_jp.py b/Lib/encodings/euc_jp.py
index 7bcbe4147f2ad4..bcdd0582d71902 100644
--- a/Lib/encodings/euc_jp.py
+++ b/Lib/encodings/euc_jp.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/euc_kr.py b/Lib/encodings/euc_kr.py
index c1fb1260e879f0..8a81356d8f9980 100644
--- a/Lib/encodings/euc_kr.py
+++ b/Lib/encodings/euc_kr.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/gb18030.py b/Lib/encodings/gb18030.py
index 34fb6c366a7614..98df7d4cbeec3d 100644
--- a/Lib/encodings/gb18030.py
+++ b/Lib/encodings/gb18030.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/gb2312.py b/Lib/encodings/gb2312.py
index 3c3b837d618ecd..ba915a2500f21a 100644
--- a/Lib/encodings/gb2312.py
+++ b/Lib/encodings/gb2312.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/gbk.py b/Lib/encodings/gbk.py
index 1b45db89859cdf..d597c7bb77e93e 100644
--- a/Lib/encodings/gbk.py
+++ b/Lib/encodings/gbk.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/hz.py b/Lib/encodings/hz.py
index 383442a3c9ac9a..43ee36a9286426 100644
--- a/Lib/encodings/hz.py
+++ b/Lib/encodings/hz.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index d31ee07ab45b76..98bf9462e36fbf 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -385,4 +385,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/iso2022_jp.py b/Lib/encodings/iso2022_jp.py
index ab0406069356e4..27129ce67aa884 100644
--- a/Lib/encodings/iso2022_jp.py
+++ b/Lib/encodings/iso2022_jp.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/iso2022_jp_1.py b/Lib/encodings/iso2022_jp_1.py
index 997044dc378749..0f41dd95cd4332 100644
--- a/Lib/encodings/iso2022_jp_1.py
+++ b/Lib/encodings/iso2022_jp_1.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/iso2022_jp_2.py b/Lib/encodings/iso2022_jp_2.py
index 9106bf762512fd..25f625819f5ea0 100644
--- a/Lib/encodings/iso2022_jp_2.py
+++ b/Lib/encodings/iso2022_jp_2.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/iso2022_jp_2004.py b/Lib/encodings/iso2022_jp_2004.py
index 40198bf098570b..1f0bd1b7874472 100644
--- a/Lib/encodings/iso2022_jp_2004.py
+++ b/Lib/encodings/iso2022_jp_2004.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/iso2022_jp_3.py b/Lib/encodings/iso2022_jp_3.py
index 346e08beccbbaf..2acdb3a2cd9be3 100644
--- a/Lib/encodings/iso2022_jp_3.py
+++ b/Lib/encodings/iso2022_jp_3.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/iso2022_jp_ext.py b/Lib/encodings/iso2022_jp_ext.py
index 752bab9813a094..a32a533e8bdf00 100644
--- a/Lib/encodings/iso2022_jp_ext.py
+++ b/Lib/encodings/iso2022_jp_ext.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/iso2022_kr.py b/Lib/encodings/iso2022_kr.py
index bf7018763eae38..51dd4ab560422a 100644
--- a/Lib/encodings/iso2022_kr.py
+++ b/Lib/encodings/iso2022_kr.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/johab.py b/Lib/encodings/johab.py
index 512aeeb732b522..e58c50a06c4b96 100644
--- a/Lib/encodings/johab.py
+++ b/Lib/encodings/johab.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
index 268fccbd53974e..335acb87cb9b28 100644
--- a/Lib/encodings/punycode.py
+++ b/Lib/encodings/punycode.py
@@ -250,4 +250,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py
index 46c8e070dd192e..5c5b41437a84b4 100644
--- a/Lib/encodings/raw_unicode_escape.py
+++ b/Lib/encodings/raw_unicode_escape.py
@@ -43,4 +43,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/shift_jis.py b/Lib/encodings/shift_jis.py
index 83381172764dea..bf7fded09468c8 100644
--- a/Lib/encodings/shift_jis.py
+++ b/Lib/encodings/shift_jis.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/shift_jis_2004.py b/Lib/encodings/shift_jis_2004.py
index 161b1e86f9918a..ae40b684a010f2 100644
--- a/Lib/encodings/shift_jis_2004.py
+++ b/Lib/encodings/shift_jis_2004.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/shift_jisx0213.py b/Lib/encodings/shift_jisx0213.py
index cb653f53055e67..5af8565618b40e 100644
--- a/Lib/encodings/shift_jisx0213.py
+++ b/Lib/encodings/shift_jisx0213.py
@@ -36,4 +36,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py
index 9b1ce99b339ae0..d896cefc9596be 100644
--- a/Lib/encodings/unicode_escape.py
+++ b/Lib/encodings/unicode_escape.py
@@ -43,4 +43,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index d3b9980026666f..eac93bd17d07d1 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -152,4 +152,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py
index 86b458eb9bcd96..d056cf9202a40f 100644
--- a/Lib/encodings/utf_16_be.py
+++ b/Lib/encodings/utf_16_be.py
@@ -39,4 +39,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py
index ec454142eedf25..2e07f76cc3f742 100644
--- a/Lib/encodings/utf_16_le.py
+++ b/Lib/encodings/utf_16_le.py
@@ -39,4 +39,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py
index 1924bedbb74c68..aebe145ec95e71 100644
--- a/Lib/encodings/utf_32.py
+++ b/Lib/encodings/utf_32.py
@@ -147,4 +147,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py
index fe272b5fafec69..ee1b41a11aa35f 100644
--- a/Lib/encodings/utf_32_be.py
+++ b/Lib/encodings/utf_32_be.py
@@ -34,4 +34,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py
index 9e48210928ee65..4ac786bb73349b 100644
--- a/Lib/encodings/utf_32_le.py
+++ b/Lib/encodings/utf_32_le.py
@@ -34,4 +34,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_7.py b/Lib/encodings/utf_7.py
index 8e0567f2087d65..3127867fb5bff9 100644
--- a/Lib/encodings/utf_7.py
+++ b/Lib/encodings/utf_7.py
@@ -35,4 +35,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py
index 1bf6336571547b..3801615ce34001 100644
--- a/Lib/encodings/utf_8.py
+++ b/Lib/encodings/utf_8.py
@@ -39,4 +39,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
index 1bb479203f365d..b5e5c89f80b9eb 100644
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -127,4 +127,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 79c8a7ef886482..03dd61a76db154 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1892,6 +1892,7 @@ def test_copy(self):
         self.assertIsNot(dup, orig)
         self.assertEqual(dup, orig)
         self.assertTrue(orig._is_text_encoding)
+        self.assertFalse(orig._is_single_byte)
         self.assertEqual(dup.encode, orig.encode)
         self.assertEqual(dup.name, orig.name)
         self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
@@ -1912,6 +1913,7 @@ def test_deepcopy(self):
         self.assertIsNot(dup, orig)
         self.assertEqual(dup, orig)
         self.assertTrue(orig._is_text_encoding)
+        self.assertFalse(orig._is_single_byte)
         self.assertEqual(dup.encode, orig.encode)
         self.assertEqual(dup.name, orig.name)
         self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
@@ -1940,6 +1942,7 @@ def test_pickle(self):
                      unpickled_codec_info.incrementalencoder
                 )
                 self.assertTrue(unpickled_codec_info._is_text_encoding)
+                self.assertFalse(unpickled_codec_info._is_single_byte)
 
         # Test a CodecInfo with _is_text_encoding equal to false.
         codec_info = codecs.lookup('base64')
diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py
index aaa91aca36e3c4..0763bb19865167 100644
--- a/Lib/test/test_pyexpat.py
+++ b/Lib/test/test_pyexpat.py
@@ -227,7 +227,7 @@ def _verify_parse_output(self, operations):
             "Character data: '\xb5'",
             "End element: 'root'",
         ]
-        for operation, expected_operation in zip(operations, expected_operations):
+        for operation, expected_operation in zip(operations, expected_operations, strict=True):
             self.assertEqual(operation, expected_operation)
 
     def test_parse_bytes(self):
@@ -276,6 +276,51 @@ def test_parse_again(self):
         self.assertEqual(expat.ErrorString(cm.exception.code),
                           expat.errors.XML_ERROR_FINISHED)
 
+    @support.subTests('enc', ['UTF-8', 'utf-8', 'utf-16', 'koi8-u',
+                              'cp1125', 'cp1251', 'iso8859-5',
+                              'mac_cyrillic'])
+    def test_supported_encodings(self, enc):
+        out = self.Outputter()
+        parser = expat.ParserCreate()
+        self._hookup_callbacks(parser, out)
+        data = (f'<?xml version="1.0" encoding="{enc}"?>\n'
+                '<корінь атрибут="значення">зміст</корінь>').encode(enc)
+        parser.Parse(data, True)
+        self.assertEqual(out.out, [
+            ('XML declaration', ('1.0', enc, -1)),
+            "Start element: 'корінь' {'атрибут': 'значення'}",
+            "Character data: 'зміст'",
+            "End element: 'корінь'",
+        ])
+
+    @support.subTests('enc', [
+        'UTF8', 'UTF-7',
+        "unicode-escape", "raw-unicode-escape",
+        "Big5-HKSCS", "Big5",
+        "cp932", "cp949", "cp950",
+        "EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR",
+        "GB18030", "GB2312", "GBK",
+        "HZ-GB-2312",
+        "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004",
+        "ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT",
+        "ISO-2022-KR",
+        "johab",
+        "Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213",
+    ])
+    def test_unsupported_encodings(self, enc):
+        parser = expat.ParserCreate()
+        data = (f'<?xml version="1.0" encoding="{enc}"?>\n'
+                '<root></root>').encode(enc)
+        with self.assertRaises(ValueError):
+            parser.Parse(data, True)
+
+    def test_unknown_encoding(self):
+        parser = expat.ParserCreate()
+        data = b'<?xml version="1.0" encoding="xyz"?>\n<root></root>'
+        with self.assertRaises(LookupError):
+            parser.Parse(data, True)
+
+
 class NamespaceSeparatorTest(unittest.TestCase):
     def test_legal(self):
         # Tests that make sure we get errors when the namespace_separator value
diff --git a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst
new file mode 100644
index 00000000000000..5dd95047178938
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst
@@ -0,0 +1,4 @@
+The :mod:`XML parser <pyexpat>` now raises :exc:`ValueError` for known
+unsupported multi-byte encodings such us "UTF8", "ISO-2022-JP" or
+"raw-unicode-escape" instead of failing later, when encounter non-ASCII
+data.
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 0f0afe17513ef1..68c8ac0e4accef 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -4,6 +4,7 @@
 
 #include "Python.h"
 #include "pycore_ceval.h"         // _Py_EnterRecursiveCall()
+#include "pycore_codecs.h"        // _PyCodec_LookupTextEncoding()
 #include "pycore_import.h"        // _PyImport_SetModule()
 #include "pycore_pyhash.h"        // _Py_HashSecret
 #include "pycore_traceback.h"     // _PyTraceback_Add()
@@ -1465,6 +1466,31 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
     if (PyErr_Occurred())
         return XML_STATUS_ERROR;
 
+    PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL);
+    if (codec == NULL) {
+        return XML_STATUS_ERROR;
+    }
+    // if (!PyTuple_CheckExact(codec)) {
+    //     PyObject *attr;
+    //     if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) {
+    //         Py_DECREF(codec);
+    //         return XML_STATUS_ERROR;
+    //     }
+    //     if (attr != NULL) {
+    //         int is_single_byte = PyObject_IsTrue(attr);
+    //         Py_DECREF(attr);
+    //         if (is_single_byte <= 0) {
+    //             Py_DECREF(codec);
+    //             if (is_single_byte == 0) {
+    //                 PyErr_SetString(PyExc_ValueError,
+    //                                 "multi-byte encodings are not supported");
+    //             }
+    //             return XML_STATUS_ERROR;
+    //         }
+    //     }
+    // }
+    Py_DECREF(codec);
+
     u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace");
     if (u == NULL) {
         Py_XDECREF(u);
diff --git a/Tools/unicode/gencjkcodecs.py b/Tools/unicode/gencjkcodecs.py
index 45866bf2f61062..eb04f67f2077eb 100644
--- a/Tools/unicode/gencjkcodecs.py
+++ b/Tools/unicode/gencjkcodecs.py
@@ -51,6 +51,7 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_single_byte=False,
     )
 """)
 

From 2e2df1ea095bf9263b3aedb6332a5a2ef6c6ed3f Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Thu, 23 Apr 2026 15:47:18 +0300
Subject: [PATCH 2/4] Uncomment temporarily commented-out code.

---
 Modules/pyexpat.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 68c8ac0e4accef..e95dcb611a33e2 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -1470,25 +1470,25 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
     if (codec == NULL) {
         return XML_STATUS_ERROR;
     }
-    // if (!PyTuple_CheckExact(codec)) {
-    //     PyObject *attr;
-    //     if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) {
-    //         Py_DECREF(codec);
-    //         return XML_STATUS_ERROR;
-    //     }
-    //     if (attr != NULL) {
-    //         int is_single_byte = PyObject_IsTrue(attr);
-    //         Py_DECREF(attr);
-    //         if (is_single_byte <= 0) {
-    //             Py_DECREF(codec);
-    //             if (is_single_byte == 0) {
-    //                 PyErr_SetString(PyExc_ValueError,
-    //                                 "multi-byte encodings are not supported");
-    //             }
-    //             return XML_STATUS_ERROR;
-    //         }
-    //     }
-    // }
+    if (!PyTuple_CheckExact(codec)) {
+        PyObject *attr;
+        if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) {
+            Py_DECREF(codec);
+            return XML_STATUS_ERROR;
+        }
+        if (attr != NULL) {
+            int is_single_byte = PyObject_IsTrue(attr);
+            Py_DECREF(attr);
+            if (is_single_byte <= 0) {
+                Py_DECREF(codec);
+                if (is_single_byte == 0) {
+                    PyErr_SetString(PyExc_ValueError,
+                                    "multi-byte encodings are not supported");
+                }
+                return XML_STATUS_ERROR;
+            }
+        }
+    }
     Py_DECREF(codec);
 
     u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace");

From 91ac15e21f2d81fc2803856f641d7ae5bbaba45a Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 25 Apr 2026 15:59:16 +0300
Subject: [PATCH 3/4] Fix the module reference.

---
 .../next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst
index 5dd95047178938..119a465fcb200a 100644
--- a/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst
+++ b/Misc/NEWS.d/next/Library/2026-04-23-14-46-30.gh-issue-148821.cR4kMa.rst
@@ -1,4 +1,4 @@
-The :mod:`XML parser <pyexpat>` now raises :exc:`ValueError` for known
+The :mod:`XML parser <xml.parsers.expat>` now raises :exc:`ValueError` for known
 unsupported multi-byte encodings such us "UTF8", "ISO-2022-JP" or
 "raw-unicode-escape" instead of failing later, when encounter non-ASCII
 data.

From 2177825c7729d03c92b9618b0f98f2aca0abb3b9 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 25 Apr 2026 16:11:37 +0300
Subject: [PATCH 4/4] Fix ElementTree tests.

---
 Lib/test/test_xml_etree.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 51af46f124cac6..730456e7582adc 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1014,7 +1014,7 @@ def xml(encoding):
         def bxml(encoding):
             return xml(encoding).encode(encoding)
         supported_encodings = [
-            'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+            'ascii', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
             'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
             'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
             'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
@@ -1025,32 +1025,34 @@ def bxml(encoding):
             'cp1256', 'cp1257', 'cp1258',
             'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
             'mac-roman', 'mac-turkish',
-            'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
-            'iso2022-jp-3', 'iso2022-jp-ext',
-            'koi8-r', 'koi8-t', 'koi8-u', 'kz1048',
-            'hz', 'ptcp154',
+            'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
         ]
         for encoding in supported_encodings:
-            self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+            with self.subTest(encoding=encoding):
+                self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
 
         unsupported_ascii_compatible_encodings = [
             'big5', 'big5hkscs',
             'cp932', 'cp949', 'cp950',
             'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
             'gb2312', 'gbk', 'gb18030',
-            'iso2022-kr', 'johab',
+            'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+            'iso2022-jp-3', 'iso2022-jp-ext',
+            'iso2022-kr', 'johab', 'hz',
             'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
-            'utf-7',
+            'utf-7', 'utf-8-sig', 'utf8',
         ]
         for encoding in unsupported_ascii_compatible_encodings:
-            self.assertRaises(ValueError, ET.XML, bxml(encoding))
+            with self.subTest(encoding=encoding):
+                self.assertRaises(ValueError, ET.XML, bxml(encoding))
 
         unsupported_ascii_incompatible_encodings = [
             'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
             'utf_32', 'utf_32_be', 'utf_32_le',
         ]
         for encoding in unsupported_ascii_incompatible_encodings:
-            self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+            with self.subTest(encoding=encoding):
+                self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
 
         self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
         self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))