From deb73e2bf2c10494b74738bfe63a495429253f34 Mon Sep 17 00:00:00 2001 From: tonghuaroot Date: Wed, 24 Jun 2026 15:20:42 +0800 Subject: [PATCH 1/3] gh-152052: Fix misleading json error for \uXXXX escape at end of input The C accelerator of json reported "Invalid \uXXXX escape" instead of "Unterminated string starting at" for a complete, valid \uXXXX escape whose last hex digit is the final character of the input, diverging from the pure-Python decoder. The bounds check used `>=` where `>` is correct: when end == len the four hex digits at indices len-4..len-1 are all in bounds, so the escape is valid and the string is merely unterminated. --- Lib/test/test_json/test_fail.py | 9 +++++++++ Lib/test/test_json/test_scanstring.py | 4 ++++ .../2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst | 3 +++ Modules/_json.c | 2 +- 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst diff --git a/Lib/test/test_json/test_fail.py b/Lib/test/test_json/test_fail.py index 79c44af2fbf0e1f..0cfc34c969d5a1a 100644 --- a/Lib/test/test_json/test_fail.py +++ b/Lib/test/test_json/test_fail.py @@ -144,6 +144,15 @@ def test_truncated_input(self): ('"', 'Unterminated string starting at', 0), ('"spam', 'Unterminated string starting at', 0), ] + # A \uXXXX escape whose final hex digit is the last character of + # the input forms a complete, valid escape, so the string is + # unterminated rather than containing an invalid escape. + test_cases += [ + (r'"\u0041', 'Unterminated string starting at', 0), + (r'"\ud834', 'Unterminated string starting at', 0), + (r'"\ud834\udd1e', 'Unterminated string starting at', 0), + (r'{"a": "\u0041', 'Unterminated string starting at', 6), + ] for data, msg, idx in test_cases: with self.assertRaises(self.JSONDecodeError) as cm: self.loads(data) diff --git a/Lib/test/test_json/test_scanstring.py b/Lib/test/test_json/test_scanstring.py index 9a6cdfe12d266c0..964798aa4ef584b 100644 --- a/Lib/test/test_json/test_scanstring.py +++ b/Lib/test/test_json/test_scanstring.py @@ -137,6 +137,10 @@ def test_bad_escapes(self): '"\\ud834\\u-123"', '"\\ud834\\u+123"', '"\\ud834\\u1_23"', + # A \uXXXX escape at the end of the input: too few hex digits + # before EOF, and the right number of non-hex characters. + '"\\u004', + '"\\uXYZW', ] for s in bad_escapes: with self.assertRaises(self.JSONDecodeError, msg=s): diff --git a/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst b/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst new file mode 100644 index 000000000000000..b56a945d5e5cb93 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst @@ -0,0 +1,3 @@ +The C accelerator of :mod:`json` now agrees with the pure-Python decoder and +raises ``Unterminated string starting at`` instead of ``Invalid \uXXXX +escape`` for a valid ``\uXXXX`` escape at the very end of the input. diff --git a/Modules/_json.c b/Modules/_json.c index b057b56b2f9f8d9..3a724a3e72b185b 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -576,7 +576,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next c = 0; next++; end = next + 4; - if (end >= len) { + if (end > len) { raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); goto bail; } From 8573a979225c25b8788584b0bf4d93875daa6fe4 Mon Sep 17 00:00:00 2001 From: tonghuaroot Date: Wed, 24 Jun 2026 15:27:26 +0800 Subject: [PATCH 2/3] gh-152052: trim test comments --- Lib/test/test_json/test_fail.py | 4 +--- Lib/test/test_json/test_scanstring.py | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_json/test_fail.py b/Lib/test/test_json/test_fail.py index 0cfc34c969d5a1a..7f99a6331d36251 100644 --- a/Lib/test/test_json/test_fail.py +++ b/Lib/test/test_json/test_fail.py @@ -144,9 +144,7 @@ def test_truncated_input(self): ('"', 'Unterminated string starting at', 0), ('"spam', 'Unterminated string starting at', 0), ] - # A \uXXXX escape whose final hex digit is the last character of - # the input forms a complete, valid escape, so the string is - # unterminated rather than containing an invalid escape. + # A complete \uXXXX escape at end of input leaves it unterminated. test_cases += [ (r'"\u0041', 'Unterminated string starting at', 0), (r'"\ud834', 'Unterminated string starting at', 0), diff --git a/Lib/test/test_json/test_scanstring.py b/Lib/test/test_json/test_scanstring.py index 964798aa4ef584b..a52b01c71130d8c 100644 --- a/Lib/test/test_json/test_scanstring.py +++ b/Lib/test/test_json/test_scanstring.py @@ -137,8 +137,7 @@ def test_bad_escapes(self): '"\\ud834\\u-123"', '"\\ud834\\u+123"', '"\\ud834\\u1_23"', - # A \uXXXX escape at the end of the input: too few hex digits - # before EOF, and the right number of non-hex characters. + # Truncated or non-hex \uXXXX escape at end of input. '"\\u004', '"\\uXYZW', ] From 288b1b5a22ebdba8f4a9a2c4c599c41601892fbd Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Fri, 26 Jun 2026 13:53:18 +0100 Subject: [PATCH 3/3] Shorten blurb --- .../Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst b/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst index b56a945d5e5cb93..f9ffbbd342fc81a 100644 --- a/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst +++ b/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152052.yBssDE.rst @@ -1,3 +1,2 @@ -The C accelerator of :mod:`json` now agrees with the pure-Python decoder and -raises ``Unterminated string starting at`` instead of ``Invalid \uXXXX -escape`` for a valid ``\uXXXX`` escape at the very end of the input. +The :mod:`json` C accelerator now correctly reports an unterminated string for a +``\uXXXX`` escape at the end of the input.