From 908f438e198a753d40d1166b5f8725e650a9ed6e Mon Sep 17 00:00:00 2001 From: Donghee Na Date: Fri, 26 Jun 2026 11:34:13 +0900 Subject: [PATCH 1/3] gh-152235: Defer GC tracking of set and frozenset to end of construction (gh-152237) --- .../2026-06-26-05-49-13.gh-issue-152235.YU20T9.rst | 2 ++ Objects/setobject.c | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-06-26-05-49-13.gh-issue-152235.YU20T9.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-06-26-05-49-13.gh-issue-152235.YU20T9.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-06-26-05-49-13.gh-issue-152235.YU20T9.rst new file mode 100644 index 00000000000000..8d386ad458dfff --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-06-26-05-49-13.gh-issue-152235.YU20T9.rst @@ -0,0 +1,2 @@ +Defer GC tracking of a :class:`set` or :class:`frozenset` to the end of its +construction from iterable. Patch by Donghee Na. diff --git a/Objects/setobject.c b/Objects/setobject.c index d198794849f0d1..883658b8bfd340 100644 --- a/Objects/setobject.c +++ b/Objects/setobject.c @@ -1351,7 +1351,9 @@ make_new_set(PyTypeObject *type, PyObject *iterable) assert(PyType_Check(type)); PySetObject *so; - so = (PySetObject *)type->tp_alloc(type, 0); + // Allocate untracked: the fill below runs user code, and a half-built + // set must not be reachable from another thread via gc.get_objects(). + so = (PySetObject *)_PyType_AllocNoTrack(type, 0); if (so == NULL) return NULL; @@ -1370,6 +1372,8 @@ make_new_set(PyTypeObject *type, PyObject *iterable) } } + // Track only once fully built. 
+ _PyObject_GC_TRACK(so); return (PyObject *)so; } @@ -2885,7 +2889,7 @@ PyTypeObject PySet_Type = { 0, /* tp_descr_set */ 0, /* tp_dictoffset */ set_init, /* tp_init */ - PyType_GenericAlloc, /* tp_alloc */ + _PyType_AllocNoTrack, /* tp_alloc */ set_new, /* tp_new */ PyObject_GC_Del, /* tp_free */ .tp_vectorcall = set_vectorcall, @@ -2977,7 +2981,7 @@ PyTypeObject PyFrozenSet_Type = { 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ - PyType_GenericAlloc, /* tp_alloc */ + _PyType_AllocNoTrack, /* tp_alloc */ frozenset_new, /* tp_new */ PyObject_GC_Del, /* tp_free */ .tp_vectorcall = frozenset_vectorcall, From 794b42ff8a75614898c98c56ab87090e9804c369 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 26 Jun 2026 07:33:33 +0300 Subject: [PATCH 2/3] gh-95555: Support Unicode property escapes \p{...} in regular expressions (GH-151969) Add support for \p{property} and \P{property} escapes in Unicode (str) regular expressions, for the properties the engine can resolve without the unicodedata database. They are matched as CATEGORY opcodes or as fixed sets of character ranges. Supported in this change: many General_Category values (the groups L, N, Z, C and the values Lu, Lt, Lm, Nd, Nl, No, Zs, Zl, Zp, Cc, Cf, Cs, Co and Cn); the binary properties Alphabetic, Lowercase, Uppercase, Numeric, Printable, XID_Start, XID_Continue, Cased and Case_Ignorable; the POSIX compatibility classes; the code-point classes ASCII, Any, Assigned, Noncharacter_Code_Point, Join_Control, Pattern_Syntax and Pattern_White_Space; and Regional_Indicator, ASCII_Hex_Digit and Hex_Digit. Property and value names use loose matching (UAX #44 UAX44-LM3), so a property may be spelled \p{Lu}, \p{gc=Lu} or \p{name=yes}. 
Co-Authored-By: Claude Opus 4.8 --- Doc/library/re.rst | 47 ++- Doc/whatsnew/3.16.rst | 7 + Lib/re/_constants.py | 64 +++- Lib/re/_parser.py | 32 +- Lib/re/_properties.py | 279 ++++++++++++++++++ Lib/test/test_re.py | 177 +++++++++++ ...6-06-22-12-00-00.gh-issue-95555.Pr0p18.rst | 4 + Modules/_sre/sre.c | 194 ++++++++++++ Modules/_sre/sre_constants.h | 52 +++- Tools/unicode/makeunicodedata.py | 5 +- 10 files changed, 854 insertions(+), 7 deletions(-) create mode 100644 Lib/re/_properties.py create mode 100644 Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 7c8c589b3f5dfc..617dc96f479926 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -613,7 +613,7 @@ character ``'$'``. Matches ``[0-9]`` if the :py:const:`~re.ASCII` flag is used. - __ https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G134153 + __ https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-4/#G124142 For 8-bit (bytes) patterns: Matches any decimal digit in the ASCII character set; @@ -680,6 +680,51 @@ character ``'$'``. matches characters which are neither alphanumeric in the current locale nor the underscore. +.. index:: single: \p; in regular expressions + single: \P; in regular expressions + +``\p{property=value}``, ``\p{value}`` + Matches any character with the given Unicode property + (see `Unicode Technical Standard #18 + <https://unicode.org/reports/tr18/>`_, requirement RL1.2 "Properties"). + Property and value names are matched loosely: + case, whitespace, ``'-'`` and ``'_'`` are ignored. + The following properties are supported: + + * The ``General_Category`` property (short name ``gc``), + spelled ``\p{Lu}``, ``\p{gc=Lu}`` or, for a one-letter group, ``\p{L}``. + The supported values are the groups ``L``, ``N``, ``Z`` and ``C`` and the + values ``Lu``, ``Lt``, ``Lm``, ``Nd``, ``Nl``, ``No``, ``Zs``, ``Zl``, + ``Zp``, ``Cc``, ``Cf``, ``Cs``, ``Co`` and ``Cn``.
+ * The binary properties ``XID_Start``, ``XID_Continue``, ``Alphabetic``, + ``Lowercase``, ``Uppercase``, ``Numeric``, ``Printable``, ``Cased`` and + ``Case_Ignorable``. A binary property may also be spelled + ``\p{name=yes}`` or ``\p{name=no}``. + * The POSIX compatibility classes ``alpha``, ``alnum``, ``blank``, + ``cntrl``, ``digit``, ``graph``, ``lower``, ``print``, ``space``, + ``upper``, ``word`` and ``xdigit``. + * The properties ``ASCII``, ``Any``, ``Assigned``, + ``Noncharacter_Code_Point``, ``Join_Control``, ``Regional_Indicator``, + ``ASCII_Hex_Digit``, ``Hex_Digit``, ``Pattern_Syntax`` and + ``Pattern_White_Space``. + + Where a supported property corresponds to a :mod:`unicodedata` accessor or + :class:`str` method, the set of characters it matches is exactly the one + they report. For consistency with these, ``space`` follows + :py:meth:`str.isspace` (like ``\s``) and ``xdigit`` matches only the ASCII + hexadecimal digits. + + This is only recognized in Unicode (str) patterns. + In bytes patterns it is an error. + + .. versionadded:: next + +``\P{...}`` + Matches any character which does *not* have the given Unicode property. + This is the opposite of ``\p``. + + .. versionadded:: next + .. index:: single: \z; in regular expressions single: \Z; in regular expressions diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index 32962a9520fa69..cb6d5434fb3ce7 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -192,6 +192,13 @@ re matches an ASCII lowercase consonant. (Contributed by Serhiy Storchaka in :gh:`152100`.) +* Regular expressions now support Unicode property escapes ``\p{...}`` and + ``\P{...}``, which match a character by a Unicode property -- for example + ``\p{Lu}`` (an uppercase letter), ``\p{Cased}`` or ``\p{ASCII}``. See + :ref:`the regular expression syntax <re-syntax>` for the supported + properties. + (Contributed by Serhiy Storchaka in :gh:`95555`.)
+ shlex ----- diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index d6f32302d37b2d..6e99dae5350151 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20230612 +MAGIC = 20260622 from _sre import MAXREPEAT, MAXGROUPS # noqa: F401 @@ -150,6 +150,35 @@ def _makecodes(*names): 'CATEGORY_UNI_SPACE', 'CATEGORY_UNI_NOT_SPACE', 'CATEGORY_UNI_WORD', 'CATEGORY_UNI_NOT_WORD', 'CATEGORY_UNI_LINEBREAK', 'CATEGORY_UNI_NOT_LINEBREAK', + + # Unicode property categories. These are not affected by the ASCII, + # LOCALE or UNICODE flags. + 'CATEGORY_ALPHA', 'CATEGORY_NOT_ALPHA', + 'CATEGORY_LOWER', 'CATEGORY_NOT_LOWER', + 'CATEGORY_UPPER', 'CATEGORY_NOT_UPPER', + 'CATEGORY_NUMERIC', 'CATEGORY_NOT_NUMERIC', + 'CATEGORY_PRINTABLE', 'CATEGORY_NOT_PRINTABLE', + 'CATEGORY_ALNUM', 'CATEGORY_NOT_ALNUM', + 'CATEGORY_XID_START', 'CATEGORY_NOT_XID_START', + 'CATEGORY_XID_CONTINUE', 'CATEGORY_NOT_XID_CONTINUE', + 'CATEGORY_TITLE', 'CATEGORY_NOT_TITLE', + 'CATEGORY_CASED', 'CATEGORY_NOT_CASED', + 'CATEGORY_CASE_IGNORABLE', 'CATEGORY_NOT_CASE_IGNORABLE', + # Compound categories: Lu = uppercase letter, N = number. + 'CATEGORY_LU', 'CATEGORY_NOT_LU', + 'CATEGORY_N', 'CATEGORY_NOT_N', + 'CATEGORY_LM', 'CATEGORY_NOT_LM', + 'CATEGORY_NL', 'CATEGORY_NOT_NL', + 'CATEGORY_NO', 'CATEGORY_NOT_NO', + 'CATEGORY_CF', 'CATEGORY_NOT_CF', + 'CATEGORY_Z', 'CATEGORY_NOT_Z', + 'CATEGORY_ZS', 'CATEGORY_NOT_ZS', + 'CATEGORY_C', 'CATEGORY_NOT_C', + 'CATEGORY_CN', 'CATEGORY_NOT_CN', + 'CATEGORY_ASSIGNED', 'CATEGORY_NOT_ASSIGNED', + 'CATEGORY_BLANK', 'CATEGORY_NOT_BLANK', + 'CATEGORY_GRAPH', 'CATEGORY_NOT_GRAPH', + 'CATEGORY_PRINT', 'CATEGORY_NOT_PRINT', ) @@ -206,6 +235,39 @@ def _makecodes(*names): CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK } +# The Unicode property categories are the same regardless of the flags. 
+CH_PROPERTY = ( + CATEGORY_ALPHA, CATEGORY_NOT_ALPHA, + CATEGORY_LOWER, CATEGORY_NOT_LOWER, + CATEGORY_UPPER, CATEGORY_NOT_UPPER, + CATEGORY_NUMERIC, CATEGORY_NOT_NUMERIC, + CATEGORY_PRINTABLE, CATEGORY_NOT_PRINTABLE, + CATEGORY_ALNUM, CATEGORY_NOT_ALNUM, + CATEGORY_XID_START, CATEGORY_NOT_XID_START, + CATEGORY_XID_CONTINUE, CATEGORY_NOT_XID_CONTINUE, + CATEGORY_TITLE, CATEGORY_NOT_TITLE, + CATEGORY_CASED, CATEGORY_NOT_CASED, + CATEGORY_CASE_IGNORABLE, CATEGORY_NOT_CASE_IGNORABLE, + CATEGORY_LU, CATEGORY_NOT_LU, + CATEGORY_N, CATEGORY_NOT_N, + CATEGORY_LM, CATEGORY_NOT_LM, + CATEGORY_NL, CATEGORY_NOT_NL, + CATEGORY_NO, CATEGORY_NOT_NO, + CATEGORY_CF, CATEGORY_NOT_CF, + CATEGORY_Z, CATEGORY_NOT_Z, + CATEGORY_ZS, CATEGORY_NOT_ZS, + CATEGORY_C, CATEGORY_NOT_C, + CATEGORY_CN, CATEGORY_NOT_CN, + CATEGORY_ASSIGNED, CATEGORY_NOT_ASSIGNED, + CATEGORY_BLANK, CATEGORY_NOT_BLANK, + CATEGORY_GRAPH, CATEGORY_NOT_GRAPH, + CATEGORY_PRINT, CATEGORY_NOT_PRINT, +) +for _cat in CH_PROPERTY: + CH_LOCALE[_cat] = _cat + CH_UNICODE[_cat] = _cat +del _cat + CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2])) # flags diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index cc2b66c54b6681..3f3efb5d4d4008 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -310,6 +310,22 @@ def checkgroupname(self, name, offset): msg = "bad character in group name %r" % name raise self.error(msg, len(name) + offset) +def _property_escape(source, escape, in_set=False): + # handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax") + from . import _properties + if not source.match('{'): + raise source.error("missing {, expected property name") + name = source.getuntil('}', 'property name') + code = _properties.parse_property(name, escape[1] == 'P') + if code is None: + raise source.error("unknown property name %r" % name, + len(name) + len(r'\p{}')) + if in_set and code[1][0] == (NEGATE, None): + # A negated multi-range property cannot be a member of a set. 
+ raise source.error("bad escape %s in character class" % escape, + len(name) + len(r'\p{}')) + return code + def _class_escape(source, escape): # handle escape code inside character class code = ESCAPES.get(escape) @@ -352,6 +368,8 @@ def _class_escape(source, escape): raise source.error("undefined character name %r" % charname, len(charname) + len(r'\N{}')) from None return LITERAL, c + elif c in "pP" and source.istext: + return _property_escape(source, escape, in_set=True) elif c in OCTDIGITS: # octal escape (up to three digits) escape += source.getwhile(2, OCTDIGITS) @@ -412,6 +430,8 @@ def _escape(source, escape, state): raise source.error("undefined character name %r" % charname, len(charname) + len(r'\N{}')) from None return LITERAL, c + elif c in "pP" and source.istext: + return _property_escape(source, escape) elif c == "0": # octal escape escape += source.getwhile(2, OCTDIGITS) @@ -566,6 +586,12 @@ def _parse_operand(source, state, nested, here, allow_nested): sourcematch = source.match set = [] setappend = set.append + def addmember(code): + # Flatten a \p{...} property's IN into the member set. + if code[0] is IN: + set.extend(code[1]) + else: + setappend(code) compound = None # elements of a standalone nested-set operand if allow_nested and sourcematch("["): # A nested set after an operator is the whole operand, used as-is (not @@ -608,13 +634,13 @@ def _parse_operand(source, state, nested, here, allow_nested): source.tell() - here) if that == "]": # A trailing '-' is a literal. - setappend(code1) + addmember(code1) setappend((LITERAL, _ord("-"))) return [_charset_node(_uniq(set))], None if that == "-": # 'X--': difference, not a range. '--' after a single member # lands here because the range probe consumed the first '-'. 
- setappend(code1) + addmember(code1) return [_charset_node(_uniq(set))], "--" if that[0] == "\\": code2 = _class_escape(source, that) @@ -630,7 +656,7 @@ def _parse_operand(source, state, nested, here, allow_nested): raise source.error(msg, len(this) + 1 + len(that)) setappend((RANGE, (lo, hi))) else: - setappend(code1) + addmember(code1) def _complement(elements, state): # The complement of `elements` (a single matcher, or a set operation as a diff --git a/Lib/re/_properties.py b/Lib/re/_properties.py new file mode 100644 index 00000000000000..6310aa7fa88f95 --- /dev/null +++ b/Lib/re/_properties.py @@ -0,0 +1,279 @@ +# +# Secret Labs' Regular Expression Engine +# +# support for Unicode property escapes \p{...} and \P{...} +# +# See https://unicode.org/reports/tr18/ "Unicode Regular Expressions", +# requirement RL1.2 "Properties". +# +# The supported properties are matched either as CATEGORY opcodes, or as fixed +# sets of character ranges: +# +# * Properties emitted as CATEGORY opcodes (see _CATEGORY_PROPERTIES): \d, \s +# and \w (as digit, space and word, honouring the ASCII/LOCALE/UNICODE +# flags), the binary properties Alphabetic, Lowercase, Uppercase, Numeric, +# Printable, alnum, XID_Start, XID_Continue, Cased and Case_Ignorable, and +# the POSIX classes blank, graph, print and assigned. +# +# * General_Category values (see _GC_CATEGORY): L, Lt, Nd, Lu, N, Lm, Nl, No, +# Cf, Z, Zs, C and Cn (combinations of the simple predicates), plus Cc, Cs, +# Co, Zl and Zp as fixed ranges (see _GC_ANALYTIC). +# +# * Code-point classes given by fixed ranges (see _analytic_ranges): ASCII, +# Any, Noncharacter_Code_Point, Join_Control, Regional_Indicator, xdigit, +# ASCII_Hex_Digit, Hex_Digit, cntrl, and the immutable Pattern_Syntax and +# Pattern_White_Space. 
+# + +from ._constants import ( + IN, CATEGORY, NEGATE, RANGE, LITERAL, + CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, + CATEGORY_SPACE, CATEGORY_NOT_SPACE, + CATEGORY_WORD, CATEGORY_NOT_WORD, + CATEGORY_ALPHA, CATEGORY_NOT_ALPHA, + CATEGORY_LOWER, CATEGORY_NOT_LOWER, + CATEGORY_UPPER, CATEGORY_NOT_UPPER, + CATEGORY_NUMERIC, CATEGORY_NOT_NUMERIC, + CATEGORY_PRINTABLE, CATEGORY_NOT_PRINTABLE, + CATEGORY_ALNUM, CATEGORY_NOT_ALNUM, + CATEGORY_XID_START, CATEGORY_NOT_XID_START, + CATEGORY_XID_CONTINUE, CATEGORY_NOT_XID_CONTINUE, + CATEGORY_TITLE, CATEGORY_NOT_TITLE, + CATEGORY_CASED, CATEGORY_NOT_CASED, + CATEGORY_CASE_IGNORABLE, CATEGORY_NOT_CASE_IGNORABLE, + CATEGORY_LU, CATEGORY_NOT_LU, + CATEGORY_N, CATEGORY_NOT_N, + CATEGORY_LM, CATEGORY_NOT_LM, + CATEGORY_NL, CATEGORY_NOT_NL, + CATEGORY_NO, CATEGORY_NOT_NO, + CATEGORY_CF, CATEGORY_NOT_CF, + CATEGORY_Z, CATEGORY_NOT_Z, + CATEGORY_ZS, CATEGORY_NOT_ZS, + CATEGORY_C, CATEGORY_NOT_C, + CATEGORY_CN, CATEGORY_NOT_CN, + CATEGORY_ASSIGNED, CATEGORY_NOT_ASSIGNED, + CATEGORY_BLANK, CATEGORY_NOT_BLANK, + CATEGORY_GRAPH, CATEGORY_NOT_GRAPH, + CATEGORY_PRINT, CATEGORY_NOT_PRINT, +) + +MAXUNICODE = 0x10FFFF + +# Properties implemented directly by the engine as (positive, negative) +# CATEGORY codes. The keys are normalised (see _normalize). digit, space and +# word reuse the \d, \s and \w categories and so are affected by the ASCII, +# LOCALE and UNICODE flags; the rest are plain Unicode properties and are not. +_CATEGORY_PROPERTIES = { + "digit": (CATEGORY_DIGIT, CATEGORY_NOT_DIGIT), # same as \d + "space": (CATEGORY_SPACE, CATEGORY_NOT_SPACE), # same as \s + # \p{White_Space} is approximated by \s (str.isspace), which also matches + # the information separators U+001C..U+001F. 
+ "whitespace": (CATEGORY_SPACE, CATEGORY_NOT_SPACE), + "wspace": (CATEGORY_SPACE, CATEGORY_NOT_SPACE), + "word": (CATEGORY_WORD, CATEGORY_NOT_WORD), # same as \w + + "alphabetic": (CATEGORY_ALPHA, CATEGORY_NOT_ALPHA), + "alpha": (CATEGORY_ALPHA, CATEGORY_NOT_ALPHA), # POSIX + "lowercase": (CATEGORY_LOWER, CATEGORY_NOT_LOWER), + "lower": (CATEGORY_LOWER, CATEGORY_NOT_LOWER), # POSIX + "uppercase": (CATEGORY_UPPER, CATEGORY_NOT_UPPER), + "upper": (CATEGORY_UPPER, CATEGORY_NOT_UPPER), # POSIX + "numeric": (CATEGORY_NUMERIC, CATEGORY_NOT_NUMERIC), + "printable": (CATEGORY_PRINTABLE, CATEGORY_NOT_PRINTABLE), + "cased": (CATEGORY_CASED, CATEGORY_NOT_CASED), + "caseignorable": (CATEGORY_CASE_IGNORABLE, CATEGORY_NOT_CASE_IGNORABLE), + # POSIX classes, the compatibility properties of UTS #18 Annex C (see the + # compound predicates in sre.c). + "blank": (CATEGORY_BLANK, CATEGORY_NOT_BLANK), + "graph": (CATEGORY_GRAPH, CATEGORY_NOT_GRAPH), + "print": (CATEGORY_PRINT, CATEGORY_NOT_PRINT), + "assigned": (CATEGORY_ASSIGNED, CATEGORY_NOT_ASSIGNED), + "alnum": (CATEGORY_ALNUM, CATEGORY_NOT_ALNUM), # POSIX + "xidstart": (CATEGORY_XID_START, CATEGORY_NOT_XID_START), + "xids": (CATEGORY_XID_START, CATEGORY_NOT_XID_START), + "xidcontinue": (CATEGORY_XID_CONTINUE, CATEGORY_NOT_XID_CONTINUE), + "xidc": (CATEGORY_XID_CONTINUE, CATEGORY_NOT_XID_CONTINUE), +} + +# General_Category values matched by an engine category. CATEGORY_ALPHA +# matches exactly the L group, and CATEGORY_TITLE the Lt category; +# CATEGORY_DIGIT matches Nd (but, like \d, is restricted to ASCII under the +# ASCII flag). 
The gc group memberships (L = Lu|Ll|Lt|Lm|Lo, N = Nd|Nl|No) +# are given by the Unicode Standard 4.5, Table 4-4 "General_Category Values" +# (https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-4/#G124142) +# and listed in +# https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt +# The compound categories Lu, N, Lm, Nl, No, Cf, Z, Zs, C and Cn are +# combinations of the simple predicates (see sre.c) that reproduce the +# canonical gc partition; they are not Unicode-published identities. +_GC_CATEGORY = { + "l": (CATEGORY_ALPHA, CATEGORY_NOT_ALPHA), + "lt": (CATEGORY_TITLE, CATEGORY_NOT_TITLE), + "nd": (CATEGORY_DIGIT, CATEGORY_NOT_DIGIT), + "lu": (CATEGORY_LU, CATEGORY_NOT_LU), + "n": (CATEGORY_N, CATEGORY_NOT_N), + "lm": (CATEGORY_LM, CATEGORY_NOT_LM), + "nl": (CATEGORY_NL, CATEGORY_NOT_NL), + "no": (CATEGORY_NO, CATEGORY_NOT_NO), + "cf": (CATEGORY_CF, CATEGORY_NOT_CF), + "z": (CATEGORY_Z, CATEGORY_NOT_Z), + "zs": (CATEGORY_ZS, CATEGORY_NOT_ZS), + "c": (CATEGORY_C, CATEGORY_NOT_C), + "cn": (CATEGORY_CN, CATEGORY_NOT_CN), +} + +# General_Category values whose members are fixed in every Unicode version, +# so they need no table: Cc (control, = POSIX cntrl), Cs (surrogates), Co +# (private use) and the single code points Zl and Zp. 
Cc, Cs and Co are the +# control codes, surrogate and private-use areas, fixed by the Unicode +# Standard 23.1, 23.6 and 23.5: +# https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/ +# All five are listed in +# https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt +_CC_RANGES = [(0x00, 0x1F), (0x7F, 0x9F)] +_CS_RANGES = [(0xD800, 0xDFFF)] +_CO_RANGES = [(0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD)] +_GC_ANALYTIC = { + "cc": _CC_RANGES, + "cs": _CS_RANGES, + "co": _CO_RANGES, + "zl": [(0x2028, 0x2028)], + "zp": [(0x2029, 0x2029)], +} + +# Pattern_Syntax and Pattern_White_Space are guaranteed immutable by the +# Unicode stability policy, so their members can be hardcoded. +# UAX #31 1.1, "Stability": https://www.unicode.org/reports/tr31/ +# Members listed in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +_PATTERN_WHITE_SPACE_RANGES = [ + (0x0009, 0x000D), (0x0020, 0x0020), (0x0085, 0x0085), (0x200E, 0x200F), + (0x2028, 0x2029), +] +_PATTERN_SYNTAX_RANGES = [ + (0x0021, 0x002F), (0x003A, 0x0040), (0x005B, 0x005E), (0x0060, 0x0060), + (0x007B, 0x007E), (0x00A1, 0x00A7), (0x00A9, 0x00A9), (0x00AB, 0x00AC), + (0x00AE, 0x00AE), (0x00B0, 0x00B1), (0x00B6, 0x00B6), (0x00BB, 0x00BB), + (0x00BF, 0x00BF), (0x00D7, 0x00D7), (0x00F7, 0x00F7), (0x2010, 0x2027), + (0x2030, 0x203E), (0x2041, 0x2053), (0x2055, 0x205E), (0x2190, 0x245F), + (0x2500, 0x2775), (0x2794, 0x2BFF), (0x2E00, 0x2E7F), (0x3001, 0x3003), + (0x3008, 0x3020), (0x3030, 0x3030), (0xFD3E, 0xFD3F), (0xFE45, 0xFE46), +] + +# Normalised property names that introduce a General_Category value. A bare +# \p{Lu} is shorthand for \p{gc=Lu} (UTS #18 1.2.4, "Property Syntax"). +_GC_KEYS = frozenset({"gc", "generalcategory"}) + +# Normalised value names for the truth value of a binary property; Yes/No and +# True/False are the binary value aliases of PropertyValueAliases.txt. 
+_TRUE_VALUES = frozenset({"yes", "y", "true", "t"}) +_FALSE_VALUES = frozenset({"no", "n", "false", "f"}) + + +def _analytic_ranges(): + # Properties whose members follow directly from the code point. Keys are + # normalised. + # Noncharacter_Code_Point: U+FDD0..FDEF and the last two of every plane, + # permanently reserved (the Unicode Standard 23.7, "Noncharacters": + # https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/). + noncharacter = [(0xFDD0, 0xFDEF)] + noncharacter += [(plane | 0xFFFE, plane | 0xFFFF) + for plane in range(0, MAXUNICODE + 1, 0x10000)] + # Regional_Indicator (RI): the 26 enclosed symbols A..Z, a complete fixed + # block (PropList.txt binary property). + regional_indicator = [(0x1F1E6, 0x1F1FF)] + # ASCII_Hex_Digit (= POSIX xdigit) and Hex_Digit, which adds the fullwidth + # forms. Both are complete, fixed sets (PropList.txt binary properties). + ascii_hex = [(0x30, 0x39), (0x41, 0x46), (0x61, 0x66)] + hex_digit = ascii_hex + [(0xFF10, 0xFF19), (0xFF21, 0xFF26), (0xFF41, 0xFF46)] + return { + "ascii": [(0, 0x7F)], + "any": [(0, MAXUNICODE)], + # Join_Control (U+200C ZWNJ, U+200D ZWJ; the Unicode Standard 23.2, + # "Layout Controls"), a PropList.txt binary property. + "joincontrol": [(0x200C, 0x200D)], + "regionalindicator": regional_indicator, + "ri": regional_indicator, + "noncharactercodepoint": noncharacter, + "xdigit": ascii_hex, # POSIX, ASCII only + "asciihexdigit": ascii_hex, + "ahex": ascii_hex, + "hexdigit": hex_digit, + "hex": hex_digit, + # POSIX cntrl is the General_Category Cc, a fixed set of code points. 
+ "cntrl": _CC_RANGES, + "patternwhitespace": _PATTERN_WHITE_SPACE_RANGES, + "patws": _PATTERN_WHITE_SPACE_RANGES, + "patternsyntax": _PATTERN_SYNTAX_RANGES, + "patsyn": _PATTERN_SYNTAX_RANGES, + } + + +def _normalize(name): + # Unicode property and value names are matched loosely: case, spaces, + # hyphens and underscores are not significant, and an initial "is" prefix + # is ignored (UAX #44 5.9, "Matching Rules", UAX44-LM3; + # https://www.unicode.org/reports/tr44/). + name = name.lower().replace("_", "").replace("-", "").replace(" ", "") + # Strip a leading "is", unless "is" is the whole name and so not a prefix + # (e.g. the Line_Break value lb=IS). + if name != "is": + name = name.removeprefix("is") + return name + + +def _from_ranges(ranges, negate): + if ranges is None: + return None + items = [(LITERAL, lo) if lo == hi else (RANGE, (lo, hi)) + for lo, hi in ranges] + if negate: + items.insert(0, (NEGATE, None)) + return (IN, items) + + +def _general_category(value, negate): + # Resolve a General_Category value to a subpattern using an engine category + # or a fixed range set; unsupported values return None. + cat = _GC_CATEGORY.get(value) + if cat is not None: + return (IN, [(CATEGORY, cat[1] if negate else cat[0])]) + return _from_ranges(_GC_ANALYTIC.get(value), negate) + + +def _truth(value): + value = _normalize(value) + if value in _TRUE_VALUES: + return True + if value in _FALSE_VALUES: + return False + return None + + +def parse_property(name, negate): + """Parse the text inside \\p{...} / \\P{...}. + + Return an (IN, items) subpattern, or None if the property is unknown. + """ + prop, sep, value = name.partition("=") + if sep: + key = _normalize(prop) + if key in _GC_KEYS: + return _general_category(_normalize(value), negate) + # A binary property spelled name=yes or name=no. 
+ truth = _truth(value) + if truth is None: + return None + negate ^= not truth + cat = _CATEGORY_PROPERTIES.get(key) + if cat is not None: + return (IN, [(CATEGORY, cat[1] if negate else cat[0])]) + return _from_ranges(_analytic_ranges().get(key), negate) + + key = _normalize(name) + cat = _CATEGORY_PROPERTIES.get(key) + if cat is not None: + return (IN, [(CATEGORY, cat[1] if negate else cat[0])]) + ranges = _analytic_ranges().get(key) + if ranges is not None: + return _from_ranges(ranges, negate) + return _general_category(key, negate) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 4ab615b150002c..7e8ed0e02833e8 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -900,6 +900,183 @@ def test_named_unicode_escapes(self): self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0) self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1) + def test_property_escapes(self): + import unicodedata + # Properties that reuse the engine categories behave exactly like + # \d, \s and \w, and honour the ASCII/UNICODE flags. + self.assertTrue(re.fullmatch(r'\p{digit}+', '0123456789')) + self.assertTrue(re.fullmatch(r'\p{word}+', 'foo_bar123')) + self.assertTrue(re.fullmatch(r'\p{space}+', ' \t\n\r\f\v')) + self.assertTrue(re.fullmatch(r'\p{whitespace}+', ' \t\n')) + self.assertTrue(re.match(r'\P{digit}', 'a')) + self.assertIsNone(re.match(r'\P{digit}', '5')) + # Arabic-Indic digit five is a digit only in Unicode mode. + self.assertTrue(re.fullmatch(r'\p{digit}', '٥')) + self.assertIsNone(re.fullmatch(r'(?a)\p{digit}', '٥')) + for prop, esc in [('digit', r'\d'), ('space', r'\s'), ('word', r'\w')]: + with self.subTest(prop=prop): + self.assertEqual(re.fullmatch(r'\p{%s}' % prop, '٥') is None, + re.fullmatch(esc, '٥') is None) + + # General_Category values; L, Lu, Nd are engine categories. 
+ self.assertTrue(re.fullmatch(r'\p{Lu}+', 'ABC')) + self.assertIsNone(re.fullmatch(r'\p{Lu}+', 'abc')) + self.assertTrue(re.fullmatch(r'\p{L}+', 'fo\xf6Д日')) + self.assertTrue(re.fullmatch(r'\p{Nd}+', '12٥')) + self.assertTrue(re.fullmatch(r'\P{L}+', '123 .,')) + # gc= spelling and loose matching of names. + self.assertTrue(re.fullmatch(r'\p{gc=Lu}+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{General_Category=Lu}+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{ lu }+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{LU}+', 'ABC')) + # An initial "is" prefix is ignored (UAX44-LM3), on the property name + # and on a gc value; "is" alone is not a prefix (cf. lb=IS). + self.assertTrue(re.fullmatch(r'\p{isLu}+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{Is_Lu}+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{gc=isLu}+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{isUppercase}+', 'ABC')) + # Engine categories L, Lt, Nd, Lu, N, Lm, Nl, No, Cf, Z, Zs and the + # fixed ranges Cc, Cs, Co, Zl, Zp. + self.assertTrue(re.fullmatch(r'\p{Lt}+', '\u01c5\u01c8\u01cb')) + self.assertIsNone(re.fullmatch(r'\p{Lt}', 'A')) + self.assertTrue(re.fullmatch(r'\p{Cc}+', '\x00\x1f\x7f\x9f')) + self.assertTrue(re.fullmatch(r'\p{Co}+', '\U0010fffd')) + # Cn (unassigned) and the C group are also engine categories. + self.assertTrue(re.fullmatch(r'\p{Cn}+', '\U00040000\U000e0fff')) + self.assertIsNone(re.fullmatch(r'\p{Cn}', 'a')) + self.assertTrue(re.fullmatch(r'\p{C}+', '\x00\u200b\U00040000')) # Cc Cf Cn + self.assertTrue(re.fullmatch(r'\p{assigned}+', 'a\u0410!')) + self.assertIsNone(re.fullmatch(r'\p{assigned}', '\U00040000')) + self.assertTrue(re.fullmatch(r'[\P{Lt}]+', 'aA1')) # category negation + self.assertTrue(re.fullmatch(r'\p{Lu}+', 'ABC\xc0')) + self.assertIsNone(re.fullmatch(r'\p{Lu}', 'a')) + # N includes Nd, Nl (Roman numerals) and No (superscripts/fractions).
+ self.assertTrue(re.fullmatch(r'\p{N}+', '12\u0665\u2167\u216b\u00b2\u00bd')) + self.assertIsNone(re.fullmatch(r'\p{N}', 'A')) + self.assertTrue(re.fullmatch(r'[\P{Lu}\p{N}]+', 'ab12')) + # More compound/analytic categories: Lm, Nl, No, Cf, Z, Zs, Zl, Zp. + self.assertTrue(re.fullmatch(r'\p{Lm}+', '\u02b0\u02b1\u02c6')) # modifiers + self.assertTrue(re.fullmatch(r'\p{Nl}+', '\u2167\u216b')) # Roman + self.assertTrue(re.fullmatch(r'\p{No}+', '\u00b2\u00bd\u00be')) # super/frac + self.assertTrue(re.fullmatch(r'\p{Cf}+', '\u200b\u00ad\u2060')) # format + self.assertIsNone(re.fullmatch(r'\p{Cf}', 'a')) + self.assertTrue(re.fullmatch(r'\p{Z}+', ' \xa0\u2028\u2029')) + self.assertTrue(re.fullmatch(r'\p{Zs}+', ' \xa0 ')) + self.assertIsNone(re.fullmatch(r'\p{Zs}', '\u2028')) + self.assertTrue(re.fullmatch(r'\p{Zl}', '\u2028')) + self.assertTrue(re.fullmatch(r'\p{Zp}', '\u2029')) + self.assertTrue(re.fullmatch(r'[\P{Cf}\p{Lm}\p{No}]+', 'a\u02b0\u00bd')) + # \p{Nd} reuses the \d category and so follows the ASCII flag, + # while \p{L} stays a Unicode property. + self.assertIsNone(re.fullmatch(r'(?a)\p{Nd}', '٥')) + self.assertTrue(re.fullmatch(r'(?a)\p{L}+', 'abД')) + + # Properties inside a character class. + self.assertTrue(re.fullmatch(r'[\p{digit}x]+', '12x34')) + self.assertTrue(re.fullmatch(r'[\P{digit}]+', 'abc')) + self.assertTrue(re.fullmatch(r'[\p{Lu}\p{Nd}]+', 'AB12')) + self.assertIsNone(re.fullmatch(r'[\p{Lu}\p{Nd}]+', 'ab')) + + # XID_Start and XID_Continue. + self.assertTrue(re.fullmatch(r'\p{XID_Start}+', 'fo\xf6Д')) + self.assertIsNone(re.fullmatch(r'\p{XID_Start}', '1')) + self.assertTrue(re.fullmatch(r'\p{XID_Continue}+', 'foo_123')) + self.assertTrue(re.fullmatch(r'\p{XIDS}+', 'abc')) + self.assertTrue(re.fullmatch(r'\p{XID_Start=Yes}+', 'abc')) + self.assertTrue(re.fullmatch(r'\p{XID_Start=No}+', '123 ')) + self.assertTrue(re.fullmatch(r'\P{XID_Start}+', '123 ')) + + # Binary properties from str predicates. 
+ self.assertTrue(re.fullmatch(r'\p{Alphabetic}+', 'fo\xf6Д日')) + self.assertTrue(re.fullmatch(r'\p{Lowercase}+', 'abc')) + self.assertTrue(re.fullmatch(r'\p{Uppercase}+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{Numeric}+', '12½')) # ½ + self.assertTrue(re.fullmatch(r'\p{Printable}+', 'a b!')) + self.assertIsNone(re.fullmatch(r'\p{Printable}', '\n')) + # Cased == Lowercase | Uppercase | Lt (via _PyUnicode_IsCased). + self.assertTrue(re.fullmatch(r'\p{Cased}+', 'aADž')) + self.assertTrue(re.fullmatch(r'\P{Cased}+', '123 .')) + # Case_Ignorable == gc in {Mn,Me,Cf,Lm,Sk} plus the Word_Break + # MidLetter/MidNumLet/Single_Quote characters (via + # _PyUnicode_IsCaseIgnorable). + word_break = {'\u0027', '\u002e', '\u003a', '\u00b7', '\u0387', + '\u055f', '\u05f4', '\u2018', '\u2019', '\u2024', + '\u2027', '\ufe13', '\ufe52', '\ufe55', '\uff07', + '\uff0e', '\uff1a'} + ci = re.compile(r'\p{Case_Ignorable}') + for c in [chr(i) for i in range(0x100)] + ['\u02b0', '\u0301']: + expect = (unicodedata.category(c) in ('Mn','Me','Cf','Lm','Sk') + or c in word_break) + with self.subTest(char=c): + self.assertEqual(bool(ci.fullmatch(c)), expect) + self.assertTrue(re.fullmatch(r'\p{Alphabetic=No}+', '123 ')) + # These are engine categories, so (unlike \P of a multi-range + # property) they can be negated inside a character class. + self.assertTrue(re.fullmatch(r'[\P{Alphabetic}]+', '123 .')) + self.assertTrue(re.fullmatch(r'[\p{XID_Start}_]+', 'foo_bar')) + + # POSIX / UTS #18 Annex C compatibility classes. 
+ self.assertTrue(re.fullmatch(r'\p{alpha}+', 'abcД')) + self.assertTrue(re.fullmatch(r'\p{alnum}+', 'abc123')) + self.assertTrue(re.fullmatch(r'\p{upper}+', 'ABC')) + self.assertTrue(re.fullmatch(r'\p{lower}+', 'abc')) + self.assertTrue(re.fullmatch(r'\p{blank}+', ' \t')) + self.assertIsNone(re.fullmatch(r'\p{blank}', '\n')) + self.assertTrue(re.fullmatch(r'\p{cntrl}+', '\x00\x1f\x7f')) + self.assertTrue(re.fullmatch(r'\p{graph}+', 'a!~')) + self.assertIsNone(re.fullmatch(r'\p{graph}', ' ')) + self.assertTrue(re.fullmatch(r'\p{print}+', 'a b!')) + self.assertTrue(re.fullmatch(r'\p{xdigit}+', '0123456789abcdefABCDEF')) + self.assertIsNone(re.fullmatch(r'\p{xdigit}', 'g')) + + # Pattern_Syntax and Pattern_White_Space (immutable, fixed ranges). + self.assertTrue(re.fullmatch(r'\p{Pattern_Syntax}+', '+-*/=<>!@#~')) + self.assertIsNone(re.fullmatch(r'\p{Pattern_Syntax}', 'a')) + self.assertTrue(re.fullmatch(r'\p{Pat_Syn}+', '()[]{}')) + self.assertTrue(re.fullmatch(r'\p{Pattern_White_Space}+', + ' \t\n\r\x0b\x0c\x85\u200e\u2028')) + self.assertTrue(re.fullmatch(r'\p{Pat_WS}+', '\u200f\u2029')) + self.assertIsNone(re.fullmatch(r'\p{Pattern_White_Space}', '\xa0')) + self.assertTrue(re.fullmatch(r'\P{Pattern_Syntax}+', 'abc123')) + + # Properties derivable from the code point alone. 
+ self.assertTrue(re.fullmatch(r'\p{ASCII}+', 'AZ09~\x7f')) + self.assertIsNone(re.fullmatch(r'\p{ASCII}', '\x80')) + self.assertTrue(re.fullmatch(r'\P{ASCII}+', 'Дé日')) + self.assertTrue(re.fullmatch(r'\p{Any}', '\U0010ffff')) + self.assertTrue(re.fullmatch(r'\p{Assigned}+', 'Aд')) + self.assertIsNone(re.fullmatch(r'\p{Assigned}', '\U000e0fff')) + self.assertTrue(re.fullmatch(r'\p{Noncharacter_Code_Point}+', + '\uFDD0\uFFFE\U0010FFFF')) + self.assertTrue(re.fullmatch(r'\p{Join_Control}+', '\u200C\u200D')) + self.assertTrue(re.fullmatch(r'\p{Regional_Indicator}+', + '\U0001F1E6\U0001F1FF')) + self.assertTrue(re.fullmatch(r'\p{RI}', '\U0001F1FA')) # symbol U + self.assertIsNone(re.fullmatch(r'\p{RI}', 'U')) + # Hex_Digit (ASCII hex plus fullwidth) and ASCII_Hex_Digit (= xdigit). + self.assertTrue(re.fullmatch(r'\p{Hex_Digit}+', '0123456789abcdefABCDEF')) + self.assertTrue(re.fullmatch(r'\p{Hex}+', '０Ａｆ')) # fullwidth + self.assertTrue(re.fullmatch(r'\p{ASCII_Hex_Digit}+', '0aF')) + self.assertTrue(re.fullmatch(r'\p{AHex}+', '0aF')) + self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '０')) + self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g')) + + # Errors. + self.checkPatternError(r'\p', 'missing {, expected property name', 2) + self.checkPatternError(r'[\p]', 'missing {, expected property name', 3) + self.checkPatternError(r'\p{}', 'missing property name', 3) + self.checkPatternError(r'\p{Spam}', "unknown property name 'Spam'", 0) + # "is" by itself is not an ignorable prefix, so it stays unknown. + self.checkPatternError(r'\p{is}', "unknown property name 'is'", 0) + self.checkPatternError(r'\p{Lu', 'missing }, unterminated name', 3) + # \p is not special in bytes patterns. + self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0) + self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0) + # A negated multi-range property (one not backed by an engine + # category) cannot be a set member. 
+ self.checkPatternError(r'[\P{ASCII}]', + r'bad escape \P in character class', 1) + def test_word_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), "abc") diff --git a/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst b/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst new file mode 100644 index 00000000000000..fa792cae5ec076 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst @@ -0,0 +1,4 @@ +Regular expressions now support Unicode property escapes ``\p{...}`` and +``\P{...}`` for properties that the engine can resolve without the unicodedata +database: many ``General_Category`` values, a number of binary properties, the +POSIX compatibility classes, and properties derivable from the code point. diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 9964532a7f401c..7d03b909226f24 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -46,6 +46,7 @@ static const char copyright[] = #include "pycore_moduleobject.h" // _PyModule_GetState() #include "pycore_tuple.h" // _PyTuple_FromPairSteal #include "pycore_unicodeobject.h" // _PyUnicode_Copy +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart() #include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "sre.h" // SRE_CODE @@ -170,6 +171,48 @@ static unsigned int sre_upper_locale(unsigned int ch) #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch) #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch) #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_') +#define SRE_UNI_IS_ALPHA(ch) Py_UNICODE_ISALPHA(ch) +#define SRE_UNI_IS_LOWER(ch) Py_UNICODE_ISLOWER(ch) +#define SRE_UNI_IS_UPPER(ch) Py_UNICODE_ISUPPER(ch) +#define SRE_UNI_IS_NUMERIC(ch) Py_UNICODE_ISNUMERIC(ch) +#define SRE_UNI_IS_PRINTABLE(ch) Py_UNICODE_ISPRINTABLE(ch) +#define SRE_UNI_IS_XID_START(ch) _PyUnicode_IsXidStart(ch) +#define SRE_UNI_IS_XID_CONTINUE(ch) 
_PyUnicode_IsXidContinue(ch) +#define SRE_UNI_IS_TITLE(ch) Py_UNICODE_ISTITLE(ch) +#define SRE_UNI_IS_CASED(ch) _PyUnicode_IsCased(ch) +#define SRE_UNI_IS_CASE_IGNORABLE(ch) _PyUnicode_IsCaseIgnorable(ch) +/* General_Category values, here re-expressed as combinations of the simple + predicates; the combinations reproduce the canonical General_Category + partition (the Unicode Standard 4.5, Table 4-4 "General_Category Values"; + they are not Unicode-published identities). SRE_IS_CC/CS/CO are the fixed + categories Cc, Cs (surrogates) and Co (private use). Verify against + https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt */ +#define SRE_IS_CC(ch) ((ch) <= 0x1F || (0x7F <= (ch) && (ch) <= 0x9F)) +#define SRE_IS_CS(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) +#define SRE_IS_CO(ch) ((0xE000 <= (ch) && (ch) <= 0xF8FF) || \ + (0xF0000 <= (ch) && (ch) <= 0xFFFFD) || \ + (0x100000 <= (ch) && (ch) <= 0x10FFFD)) +#define SRE_UNI_IS_LU(ch) (SRE_UNI_IS_UPPER(ch) && SRE_UNI_IS_ALPHA(ch)) +#define SRE_UNI_IS_N(ch) (SRE_UNI_IS_ALNUM(ch) && !SRE_UNI_IS_ALPHA(ch)) +#define SRE_UNI_IS_LM(ch) (SRE_UNI_IS_ALPHA(ch) && SRE_UNI_IS_CASE_IGNORABLE(ch)) +#define SRE_UNI_IS_NL(ch) (SRE_UNI_IS_N(ch) && SRE_UNI_IS_XID_START(ch)) +#define SRE_UNI_IS_NO(ch) (SRE_UNI_IS_N(ch) && !SRE_UNI_IS_DIGIT(ch) && \ + !SRE_UNI_IS_XID_START(ch)) +#define SRE_UNI_IS_CF(ch) (SRE_UNI_IS_CASE_IGNORABLE(ch) && !SRE_UNI_IS_PRINTABLE(ch)) +#define SRE_UNI_IS_Z(ch) (SRE_UNI_IS_SPACE(ch) && !SRE_IS_CC(ch)) +#define SRE_UNI_IS_ZS(ch) (SRE_UNI_IS_Z(ch) && (ch) != 0x2028 && (ch) != 0x2029) +/* Other (C) = not printable and not a separator; Cn (unassigned) = an Other + that is none of Cc, Cf, Cs, Co. Hence the POSIX classes, the compatibility + properties of UTS #18 Annex C. 
*/ +#define SRE_UNI_IS_C(ch) (!SRE_UNI_IS_PRINTABLE(ch) && !SRE_UNI_IS_Z(ch)) +#define SRE_UNI_IS_CN(ch) (SRE_UNI_IS_C(ch) && !SRE_IS_CC(ch) && \ + !SRE_IS_CS(ch) && !SRE_IS_CO(ch) && !SRE_UNI_IS_CASE_IGNORABLE(ch)) +#define SRE_UNI_IS_ASSIGNED(ch) (!SRE_UNI_IS_CN(ch)) +#define SRE_UNI_IS_BLANK(ch) (SRE_UNI_IS_ZS(ch) || (ch) == 0x09) +#define SRE_UNI_IS_GRAPH(ch) (!SRE_UNI_IS_SPACE(ch) && !SRE_IS_CC(ch) && \ + !SRE_IS_CS(ch) && !SRE_UNI_IS_CN(ch)) +#define SRE_UNI_IS_PRINT(ch) ((SRE_UNI_IS_GRAPH(ch) || SRE_UNI_IS_BLANK(ch)) && \ + !SRE_IS_CC(ch)) static unsigned int sre_lower_unicode(unsigned int ch) { @@ -224,6 +267,107 @@ sre_category(SRE_CODE category, unsigned int ch) return SRE_UNI_IS_LINEBREAK(ch); case SRE_CATEGORY_UNI_NOT_LINEBREAK: return !SRE_UNI_IS_LINEBREAK(ch); + + case SRE_CATEGORY_ALPHA: + return SRE_UNI_IS_ALPHA(ch); + case SRE_CATEGORY_NOT_ALPHA: + return !SRE_UNI_IS_ALPHA(ch); + case SRE_CATEGORY_LOWER: + return SRE_UNI_IS_LOWER(ch); + case SRE_CATEGORY_NOT_LOWER: + return !SRE_UNI_IS_LOWER(ch); + case SRE_CATEGORY_UPPER: + return SRE_UNI_IS_UPPER(ch); + case SRE_CATEGORY_NOT_UPPER: + return !SRE_UNI_IS_UPPER(ch); + case SRE_CATEGORY_NUMERIC: + return SRE_UNI_IS_NUMERIC(ch); + case SRE_CATEGORY_NOT_NUMERIC: + return !SRE_UNI_IS_NUMERIC(ch); + case SRE_CATEGORY_PRINTABLE: + return SRE_UNI_IS_PRINTABLE(ch); + case SRE_CATEGORY_NOT_PRINTABLE: + return !SRE_UNI_IS_PRINTABLE(ch); + case SRE_CATEGORY_ALNUM: + return SRE_UNI_IS_ALNUM(ch); + case SRE_CATEGORY_NOT_ALNUM: + return !SRE_UNI_IS_ALNUM(ch); + case SRE_CATEGORY_XID_START: + return SRE_UNI_IS_XID_START(ch); + case SRE_CATEGORY_NOT_XID_START: + return !SRE_UNI_IS_XID_START(ch); + case SRE_CATEGORY_XID_CONTINUE: + return SRE_UNI_IS_XID_CONTINUE(ch); + case SRE_CATEGORY_NOT_XID_CONTINUE: + return !SRE_UNI_IS_XID_CONTINUE(ch); + case SRE_CATEGORY_TITLE: + return SRE_UNI_IS_TITLE(ch); + case SRE_CATEGORY_NOT_TITLE: + return !SRE_UNI_IS_TITLE(ch); + case SRE_CATEGORY_CASED: + return 
SRE_UNI_IS_CASED(ch); + case SRE_CATEGORY_NOT_CASED: + return !SRE_UNI_IS_CASED(ch); + case SRE_CATEGORY_CASE_IGNORABLE: + return SRE_UNI_IS_CASE_IGNORABLE(ch); + case SRE_CATEGORY_NOT_CASE_IGNORABLE: + return !SRE_UNI_IS_CASE_IGNORABLE(ch); + case SRE_CATEGORY_LU: + return SRE_UNI_IS_LU(ch); + case SRE_CATEGORY_NOT_LU: + return !SRE_UNI_IS_LU(ch); + case SRE_CATEGORY_N: + return SRE_UNI_IS_N(ch); + case SRE_CATEGORY_NOT_N: + return !SRE_UNI_IS_N(ch); + case SRE_CATEGORY_LM: + return SRE_UNI_IS_LM(ch); + case SRE_CATEGORY_NOT_LM: + return !SRE_UNI_IS_LM(ch); + case SRE_CATEGORY_NL: + return SRE_UNI_IS_NL(ch); + case SRE_CATEGORY_NOT_NL: + return !SRE_UNI_IS_NL(ch); + case SRE_CATEGORY_NO: + return SRE_UNI_IS_NO(ch); + case SRE_CATEGORY_NOT_NO: + return !SRE_UNI_IS_NO(ch); + case SRE_CATEGORY_CF: + return SRE_UNI_IS_CF(ch); + case SRE_CATEGORY_NOT_CF: + return !SRE_UNI_IS_CF(ch); + case SRE_CATEGORY_Z: + return SRE_UNI_IS_Z(ch); + case SRE_CATEGORY_NOT_Z: + return !SRE_UNI_IS_Z(ch); + case SRE_CATEGORY_ZS: + return SRE_UNI_IS_ZS(ch); + case SRE_CATEGORY_NOT_ZS: + return !SRE_UNI_IS_ZS(ch); + case SRE_CATEGORY_C: + return SRE_UNI_IS_C(ch); + case SRE_CATEGORY_NOT_C: + return !SRE_UNI_IS_C(ch); + case SRE_CATEGORY_CN: + return SRE_UNI_IS_CN(ch); + case SRE_CATEGORY_NOT_CN: + return !SRE_UNI_IS_CN(ch); + case SRE_CATEGORY_ASSIGNED: + return SRE_UNI_IS_ASSIGNED(ch); + case SRE_CATEGORY_NOT_ASSIGNED: + return !SRE_UNI_IS_ASSIGNED(ch); + case SRE_CATEGORY_BLANK: + return SRE_UNI_IS_BLANK(ch); + case SRE_CATEGORY_NOT_BLANK: + return !SRE_UNI_IS_BLANK(ch); + case SRE_CATEGORY_GRAPH: + return SRE_UNI_IS_GRAPH(ch); + case SRE_CATEGORY_NOT_GRAPH: + return !SRE_UNI_IS_GRAPH(ch); + case SRE_CATEGORY_PRINT: + return SRE_UNI_IS_PRINT(ch); + case SRE_CATEGORY_NOT_PRINT: + return !SRE_UNI_IS_PRINT(ch); } return 0; } @@ -1864,6 +2008,56 @@ _validate_category(SRE_CODE arg) case SRE_CATEGORY_UNI_NOT_WORD: case SRE_CATEGORY_UNI_LINEBREAK: case SRE_CATEGORY_UNI_NOT_LINEBREAK: + case 
SRE_CATEGORY_ALPHA: + case SRE_CATEGORY_NOT_ALPHA: + case SRE_CATEGORY_LOWER: + case SRE_CATEGORY_NOT_LOWER: + case SRE_CATEGORY_UPPER: + case SRE_CATEGORY_NOT_UPPER: + case SRE_CATEGORY_NUMERIC: + case SRE_CATEGORY_NOT_NUMERIC: + case SRE_CATEGORY_PRINTABLE: + case SRE_CATEGORY_NOT_PRINTABLE: + case SRE_CATEGORY_ALNUM: + case SRE_CATEGORY_NOT_ALNUM: + case SRE_CATEGORY_XID_START: + case SRE_CATEGORY_NOT_XID_START: + case SRE_CATEGORY_XID_CONTINUE: + case SRE_CATEGORY_NOT_XID_CONTINUE: + case SRE_CATEGORY_TITLE: + case SRE_CATEGORY_NOT_TITLE: + case SRE_CATEGORY_CASED: + case SRE_CATEGORY_NOT_CASED: + case SRE_CATEGORY_CASE_IGNORABLE: + case SRE_CATEGORY_NOT_CASE_IGNORABLE: + case SRE_CATEGORY_LU: + case SRE_CATEGORY_NOT_LU: + case SRE_CATEGORY_N: + case SRE_CATEGORY_NOT_N: + case SRE_CATEGORY_LM: + case SRE_CATEGORY_NOT_LM: + case SRE_CATEGORY_NL: + case SRE_CATEGORY_NOT_NL: + case SRE_CATEGORY_NO: + case SRE_CATEGORY_NOT_NO: + case SRE_CATEGORY_CF: + case SRE_CATEGORY_NOT_CF: + case SRE_CATEGORY_Z: + case SRE_CATEGORY_NOT_Z: + case SRE_CATEGORY_ZS: + case SRE_CATEGORY_NOT_ZS: + case SRE_CATEGORY_C: + case SRE_CATEGORY_NOT_C: + case SRE_CATEGORY_CN: + case SRE_CATEGORY_NOT_CN: + case SRE_CATEGORY_ASSIGNED: + case SRE_CATEGORY_NOT_ASSIGNED: + case SRE_CATEGORY_BLANK: + case SRE_CATEGORY_NOT_BLANK: + case SRE_CATEGORY_GRAPH: + case SRE_CATEGORY_NOT_GRAPH: + case SRE_CATEGORY_PRINT: + case SRE_CATEGORY_NOT_PRINT: return 1; default: return 0; diff --git a/Modules/_sre/sre_constants.h b/Modules/_sre/sre_constants.h index bd611b33614509..d5ecefb3764755 100644 --- a/Modules/_sre/sre_constants.h +++ b/Modules/_sre/sre_constants.h @@ -11,7 +11,7 @@ * See the sre.c file for information on usage and redistribution. 
*/ -#define SRE_MAGIC 20230612 +#define SRE_MAGIC 20260622 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -85,6 +85,56 @@ #define SRE_CATEGORY_UNI_NOT_WORD 15 #define SRE_CATEGORY_UNI_LINEBREAK 16 #define SRE_CATEGORY_UNI_NOT_LINEBREAK 17 +#define SRE_CATEGORY_ALPHA 18 +#define SRE_CATEGORY_NOT_ALPHA 19 +#define SRE_CATEGORY_LOWER 20 +#define SRE_CATEGORY_NOT_LOWER 21 +#define SRE_CATEGORY_UPPER 22 +#define SRE_CATEGORY_NOT_UPPER 23 +#define SRE_CATEGORY_NUMERIC 24 +#define SRE_CATEGORY_NOT_NUMERIC 25 +#define SRE_CATEGORY_PRINTABLE 26 +#define SRE_CATEGORY_NOT_PRINTABLE 27 +#define SRE_CATEGORY_ALNUM 28 +#define SRE_CATEGORY_NOT_ALNUM 29 +#define SRE_CATEGORY_XID_START 30 +#define SRE_CATEGORY_NOT_XID_START 31 +#define SRE_CATEGORY_XID_CONTINUE 32 +#define SRE_CATEGORY_NOT_XID_CONTINUE 33 +#define SRE_CATEGORY_TITLE 34 +#define SRE_CATEGORY_NOT_TITLE 35 +#define SRE_CATEGORY_CASED 36 +#define SRE_CATEGORY_NOT_CASED 37 +#define SRE_CATEGORY_CASE_IGNORABLE 38 +#define SRE_CATEGORY_NOT_CASE_IGNORABLE 39 +#define SRE_CATEGORY_LU 40 +#define SRE_CATEGORY_NOT_LU 41 +#define SRE_CATEGORY_N 42 +#define SRE_CATEGORY_NOT_N 43 +#define SRE_CATEGORY_LM 44 +#define SRE_CATEGORY_NOT_LM 45 +#define SRE_CATEGORY_NL 46 +#define SRE_CATEGORY_NOT_NL 47 +#define SRE_CATEGORY_NO 48 +#define SRE_CATEGORY_NOT_NO 49 +#define SRE_CATEGORY_CF 50 +#define SRE_CATEGORY_NOT_CF 51 +#define SRE_CATEGORY_Z 52 +#define SRE_CATEGORY_NOT_Z 53 +#define SRE_CATEGORY_ZS 54 +#define SRE_CATEGORY_NOT_ZS 55 +#define SRE_CATEGORY_C 56 +#define SRE_CATEGORY_NOT_C 57 +#define SRE_CATEGORY_CN 58 +#define SRE_CATEGORY_NOT_CN 59 +#define SRE_CATEGORY_ASSIGNED 60 +#define SRE_CATEGORY_NOT_ASSIGNED 61 +#define SRE_CATEGORY_BLANK 62 +#define SRE_CATEGORY_NOT_BLANK 63 +#define SRE_CATEGORY_GRAPH 64 +#define SRE_CATEGORY_NOT_GRAPH 65 +#define SRE_CATEGORY_PRINT 66 +#define SRE_CATEGORY_NOT_PRINT 67 #define SRE_FLAG_IGNORECASE 2 #define SRE_FLAG_LOCALE 4 #define SRE_FLAG_MULTILINE 8 diff 
--git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 76283d6b794a0b..b91bc539616f41 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -42,9 +42,12 @@ # The Unicode Database # -------------------- # When changing UCD version please update -# * Doc/library/stdtypes.rst, and +# * Doc/library/stdtypes.rst (four occurrences) # * Doc/library/unicodedata.rst +# * Doc/library/re.rst # * Doc/reference/lexical_analysis.rst (three occurrences) +# and optionally (comments) +# * Lib/re/_properties.py (three occurrences) UNIDATA_VERSION = "17.0.0" UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" From 285d96dd784620fe8afb02688abe72a916555061 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 26 Jun 2026 07:48:53 +0300 Subject: [PATCH 3/3] gh-152219: Add curses window attribute get/set methods and WA_* constants (GH-152221) Add the window methods attr_get(), attr_set(), attr_on(), attr_off() and color_set(), wrapping wattr_get(), wattr_set(), wattr_on(), wattr_off() and wcolor_set(). Unlike the legacy attron()/attroff()/attrset() methods, these pass the color pair as a separate argument instead of packing it into the attribute value. Also add the corresponding WA_* attribute constants. Add an attr_converter that range-checks the attr_t attribute argument and raises OverflowError instead of silently truncating it; apply it to attr_set(), attr_on(), attr_off() and chgat(). 
Co-authored-by: Claude Opus 4.8 --- Doc/library/curses.rst | 67 ++++++ Doc/whatsnew/3.16.rst | 7 + Lib/test/test_curses.py | 45 +++- ...-06-25-19-44-16.gh-issue-152219.ndj4Ib.rst | 5 + Modules/_cursesmodule.c | 223 +++++++++++++++++- Modules/clinic/_cursesmodule.c.h | 158 ++++++++++++- 6 files changed, 494 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-25-19-44-16.gh-issue-152219.ndj4Ib.rst diff --git a/Doc/library/curses.rst b/Doc/library/curses.rst index a8a9680d0703ec..d7c2905ec7347d 100644 --- a/Doc/library/curses.rst +++ b/Doc/library/curses.rst @@ -989,6 +989,55 @@ Window objects ``0`` (no attributes). +.. method:: window.attr_get() + + Return the window's current rendition as a ``(attrs, pair)`` tuple, + where *attrs* is the set of attributes and *pair* is the color pair number. + + Unlike :meth:`attron` and friends, which take packed ``A_*`` attributes, + this method and the other ``attr_*`` methods work with the + :ref:`WA_* attributes <curses-wa-constants>` and keep the color pair as a + separate number, which lets them use color pairs that do not fit alongside + the attributes in a single value. + + .. versionadded:: next + + +.. method:: window.attr_set(attr, pair=0) + + Set the window's rendition to the attributes *attr* and the color pair *pair*. + + .. versionadded:: next + + +.. method:: window.attr_on(attr) + + Turn on the attributes *attr* without affecting any others. + + .. versionadded:: next + + +.. method:: window.attr_off(attr) + + Turn off the attributes *attr* without affecting any others. + + .. versionadded:: next + + +.. method:: window.color_set(pair) + + Set the window's color pair to *pair*, leaving the other attributes unchanged. + + .. versionadded:: next + + +.. method:: window.getattrs() + + Return the window's current attributes. + + .. versionadded:: next + + .. 
method:: window.bkgd(ch[, attr]) Set the background property of the window to the character *ch*, with @@ -1888,6 +1937,24 @@ The exact constants available are system dependent. .. versionadded:: 3.7 ``A_ITALIC`` was added. +.. _curses-wa-constants: + +The :meth:`~window.attr_get`, :meth:`~window.attr_set`, :meth:`~window.attr_on` +and :meth:`~window.attr_off` methods use a parallel set of ``WA_*`` constants. +These have the same meaning as the corresponding ``A_*`` attributes above +(``WA_BOLD`` like :const:`A_BOLD`, and so on), but belong to the ``attr_t`` type +rather than being packed into a character. In ncurses the two sets share the +same values, but other curses implementations may give them different ones, so +use the ``WA_*`` constants with the ``attr_*`` methods. The available names are +``WA_ATTRIBUTES``, ``WA_NORMAL``, ``WA_STANDOUT``, ``WA_UNDERLINE``, +``WA_REVERSE``, ``WA_BLINK``, ``WA_DIM``, ``WA_BOLD``, ``WA_ALTCHARSET``, +``WA_INVIS``, ``WA_PROTECT``, ``WA_HORIZONTAL``, ``WA_LEFT``, ``WA_LOW``, +``WA_RIGHT``, ``WA_TOP``, ``WA_VERTICAL`` and ``WA_ITALIC`` (each available only +where the platform defines it). + +.. versionadded:: next + The ``WA_*`` constants were added. + Several constants are available to extract corresponding attributes returned by some methods. diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index cb6d5434fb3ce7..5123671db0c07e 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -131,6 +131,13 @@ curses available when built against an ncurses with ``NCURSES_EXT_FUNCS``. (Contributed by Serhiy Storchaka in :gh:`151776`.) +* Add the :mod:`curses` window methods :meth:`~curses.window.attr_get`, + :meth:`~curses.window.attr_set`, :meth:`~curses.window.attr_on`, + :meth:`~curses.window.attr_off` and :meth:`~curses.window.color_set`, which + pass the color pair as a separate argument instead of packing it into the + attribute value, and the corresponding ``WA_*`` attribute constants. 
+ (Contributed by Serhiy Storchaka in :gh:`152219`.) + gzip ---- diff --git a/Lib/test/test_curses.py b/Lib/test/test_curses.py index 721cde39861ce9..75e6d2bd62e887 100644 --- a/Lib/test/test_curses.py +++ b/Lib/test/test_curses.py @@ -651,7 +651,6 @@ def test_scroll(self): win.scrollok(False) def test_attributes(self): - # TODO: attr_get(), attr_set(), ... win = curses.newwin(5, 15, 5, 2) win.attron(curses.A_BOLD) win.attroff(curses.A_BOLD) @@ -660,6 +659,45 @@ def test_attributes(self): win.standout() win.standend() + # The attr_*() family works on attr_t attributes paired with a color + # pair, unlike the chtype-based attron()/attroff()/attrset(). + win.attr_set(curses.A_BOLD | curses.A_UNDERLINE) + attrs, pair = win.attr_get() + self.assertTrue(attrs & curses.A_BOLD) + self.assertTrue(attrs & curses.A_UNDERLINE) + self.assertEqual(pair, 0) + self.assertEqual(win.getattrs(), attrs) + + win.attr_on(curses.A_REVERSE) + self.assertTrue(win.attr_get()[0] & curses.A_REVERSE) + win.attr_off(curses.A_REVERSE) + self.assertFalse(win.attr_get()[0] & curses.A_REVERSE) + + # color_set() with a real pair needs start_color(); see + # test_attr_color_pair. Here only the argument validation is checked, + # which fails before wcolor_set() is reached. + self.assertRaises(TypeError, win.attr_set, 'x') + self.assertRaises(TypeError, win.attr_set, curses.A_BOLD, 'x') + self.assertRaises(TypeError, win.attr_on, 'x') + self.assertRaises(TypeError, win.color_set, 'x') + self.assertRaises(ValueError, win.color_set, -1) + self.assertRaises(ValueError, win.attr_set, curses.A_BOLD, -1) + # attr_t is unsigned: a negative or too-large attribute overflows. 
+ self.assertRaises(OverflowError, win.attr_set, -1) + self.assertRaises(OverflowError, win.attr_on, -1) + self.assertRaises(OverflowError, win.attr_set, 1 << 64) + + @requires_colors + def test_attr_color_pair(self): + win = curses.newwin(5, 15, 5, 2) + curses.init_pair(1, curses.COLOR_RED, curses.COLOR_BLACK) + win.attr_set(curses.A_BOLD, 1) + attrs, pair = win.attr_get() + self.assertTrue(attrs & curses.A_BOLD) + self.assertEqual(pair, 1) + win.color_set(0) + self.assertEqual(win.attr_get()[1], 0) + @requires_curses_window_meth('chgat') def test_chgat(self): win = curses.newwin(5, 15, 5, 2) @@ -691,6 +729,11 @@ def test_chgat(self): self.assertEqual(win.inch(3, 11), b'm'[0] | curses.A_UNDERLINE) self.assertEqual(win.inch(3, 14), b' '[0] | curses.A_UNDERLINE) + # attr_t is unsigned: a negative or too-large attribute overflows. + self.assertRaises(TypeError, win.chgat, 'x') + self.assertRaises(OverflowError, win.chgat, -1) + self.assertRaises(OverflowError, win.chgat, 1 << 64) + def test_background(self): win = curses.newwin(5, 15, 5, 2) win.addstr(0, 0, 'Lorem ipsum') diff --git a/Misc/NEWS.d/next/Library/2026-06-25-19-44-16.gh-issue-152219.ndj4Ib.rst b/Misc/NEWS.d/next/Library/2026-06-25-19-44-16.gh-issue-152219.ndj4Ib.rst new file mode 100644 index 00000000000000..e7ea6011cb7b50 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-25-19-44-16.gh-issue-152219.ndj4Ib.rst @@ -0,0 +1,5 @@ +Add the :mod:`curses` window methods :meth:`~curses.window.attr_get`, +:meth:`~curses.window.attr_set`, :meth:`~curses.window.attr_on`, +:meth:`~curses.window.attr_off` and :meth:`~curses.window.color_set`, which use +a separate color pair argument instead of packing it into the attribute value, +and the corresponding ``WA_*`` attribute constants. 
diff --git a/Modules/_cursesmodule.c b/Modules/_cursesmodule.c index 3377367d4ce45d..0306c4af3288dc 100644 --- a/Modules/_cursesmodule.c +++ b/Modules/_cursesmodule.c @@ -799,6 +799,32 @@ class component_converter(CConverter): [python start generated code]*/ /*[python end generated code: output=da39a3ee5e6b4b0d input=38e9be01d33927fb]*/ +static int +attr_converter(PyObject *arg, void *ptr) +{ + /* attr_t is unsigned and at least as wide as chtype, so an attribute + value must be a non-negative integer that fits in attr_t. */ + unsigned long attr = PyLong_AsUnsignedLong(arg); + if (attr == (unsigned long)-1 && PyErr_Occurred()) { + return 0; + } + if (attr > (unsigned long)(attr_t)-1) { + PyErr_Format(PyExc_OverflowError, + "attribute value is greater than maximum (%lu)", + (unsigned long)(attr_t)-1); + return 0; + } + *(attr_t *)ptr = (attr_t)attr; + return 1; +} + +/*[python input] +class attr_converter(CConverter): + type = 'attr_t' + converter = 'attr_converter' +[python start generated code]*/ +/*[python end generated code: output=da39a3ee5e6b4b0d input=6132d3d99d3ec25a]*/ + /***************************************************************************** The Window Object ******************************************************************************/ @@ -1450,6 +1476,125 @@ _curses_window_attrset_impl(PyCursesWindowObject *self, long attr) return curses_window_check_err(self, rtn, "wattrset", "attrset"); } +/*[clinic input] +_curses.window.attr_get + +Return the window's attributes and color pair as (attrs, pair). 
+[clinic start generated code]*/ + +static PyObject * +_curses_window_attr_get_impl(PyCursesWindowObject *self) +/*[clinic end generated code: output=74b3f805a2958fb8 input=1efd3c450a1373ef]*/ +{ + attr_t attrs; + int rtn; +#if _NCURSES_EXTENDED_COLOR_FUNCS + int pair; + short legacy_pair; + rtn = wattr_get(self->win, &attrs, &legacy_pair, &pair); +#else + short pair; + rtn = wattr_get(self->win, &attrs, &pair, NULL); +#endif + if (curses_window_check_err(self, rtn, "wattr_get", "attr_get") == NULL) { + return NULL; + } + return Py_BuildValue("(ki)", (unsigned long)attrs, (int)pair); +} + +/*[clinic input] +_curses.window.attr_set + + attr: attr + pair: pair = 0 + / + +Set the window's attributes and color pair. +[clinic start generated code]*/ + +static PyObject * +_curses_window_attr_set_impl(PyCursesWindowObject *self, attr_t attr, + int pair) +/*[clinic end generated code: output=756416e0d6345d4a input=b7936bd6b73eb3f2]*/ +{ + int rtn; +#if _NCURSES_EXTENDED_COLOR_FUNCS + rtn = wattr_set(self->win, attr, 0, &pair); +#else + rtn = wattr_set(self->win, attr, (short)pair, NULL); +#endif + return curses_window_check_err(self, rtn, "wattr_set", "attr_set"); +} + +/*[clinic input] +_curses.window.attr_on + + attr: attr + / + +Turn on the given attributes without affecting any others. +[clinic start generated code]*/ + +static PyObject * +_curses_window_attr_on_impl(PyCursesWindowObject *self, attr_t attr) +/*[clinic end generated code: output=712f13a558c5a6cb input=6a51a3d597ddca4b]*/ +{ + int rtn = wattr_on(self->win, attr, NULL); + return curses_window_check_err(self, rtn, "wattr_on", "attr_on"); +} + +/*[clinic input] +_curses.window.attr_off + + attr: attr + / + +Turn off the given attributes without affecting any others. 
+[clinic start generated code]*/ + +static PyObject * +_curses_window_attr_off_impl(PyCursesWindowObject *self, attr_t attr) +/*[clinic end generated code: output=ac680aead16f74fa input=c5d778b84030d388]*/ +{ + int rtn = wattr_off(self->win, attr, NULL); + return curses_window_check_err(self, rtn, "wattr_off", "attr_off"); +} + +/*[clinic input] +_curses.window.color_set + + pair: pair + / + +Set the window's color pair attribute. +[clinic start generated code]*/ + +static PyObject * +_curses_window_color_set_impl(PyCursesWindowObject *self, int pair) +/*[clinic end generated code: output=5e9e83f02a29bf1c input=70026f6d411db130]*/ +{ + int rtn; +#if _NCURSES_EXTENDED_COLOR_FUNCS + rtn = wcolor_set(self->win, 0, &pair); +#else + rtn = wcolor_set(self->win, (short)pair, NULL); +#endif + return curses_window_check_err(self, rtn, "wcolor_set", "color_set"); +} + +/*[clinic input] +_curses.window.getattrs + +Return the window's current attributes. +[clinic start generated code]*/ + +static PyObject * +_curses_window_getattrs_impl(PyCursesWindowObject *self) +/*[clinic end generated code: output=835f499205204ec4 input=bf56a0af5b730bd1]*/ +{ + return PyLong_FromUnsignedLong((unsigned long)(attr_t)getattrs(self->win)); +} + /*[clinic input] _curses.window.bkgdset @@ -1697,30 +1842,25 @@ PyCursesWindow_ChgAt(PyObject *op, PyObject *args) int num = -1; short color; attr_t attr = A_NORMAL; - long lattr; int use_xy = FALSE; switch (PyTuple_Size(args)) { case 1: - if (!PyArg_ParseTuple(args,"l;attr", &lattr)) + if (!PyArg_ParseTuple(args,"O&;attr", attr_converter, &attr)) return NULL; - attr = lattr; break; case 2: - if (!PyArg_ParseTuple(args,"il;n,attr", &num, &lattr)) + if (!PyArg_ParseTuple(args,"iO&;n,attr", &num, attr_converter, &attr)) return NULL; - attr = lattr; break; case 3: - if (!PyArg_ParseTuple(args,"iil;y,x,attr", &y, &x, &lattr)) + if (!PyArg_ParseTuple(args,"iiO&;y,x,attr", &y, &x, attr_converter, &attr)) return NULL; - attr = lattr; use_xy = TRUE; break; case 
4: - if (!PyArg_ParseTuple(args,"iiil;y,x,n,attr", &y, &x, &num, &lattr)) + if (!PyArg_ParseTuple(args,"iiiO&;y,x,n,attr", &y, &x, &num, attr_converter, &attr)) return NULL; - attr = lattr; use_xy = TRUE; break; default: @@ -3377,6 +3517,12 @@ static PyMethodDef PyCursesWindow_methods[] = { _CURSES_WINDOW_ATTROFF_METHODDEF _CURSES_WINDOW_ATTRON_METHODDEF _CURSES_WINDOW_ATTRSET_METHODDEF + _CURSES_WINDOW_ATTR_GET_METHODDEF + _CURSES_WINDOW_ATTR_SET_METHODDEF + _CURSES_WINDOW_ATTR_ON_METHODDEF + _CURSES_WINDOW_ATTR_OFF_METHODDEF + _CURSES_WINDOW_COLOR_SET_METHODDEF + _CURSES_WINDOW_GETATTRS_METHODDEF _CURSES_WINDOW_BKGD_METHODDEF #ifdef HAVE_CURSES_WCHGAT { @@ -6882,6 +7028,65 @@ cursesmodule_exec(PyObject *module) SetDictInt("A_ITALIC", A_ITALIC); #endif + /* The WA_* attributes are used by the attr_t-based functions + (attr_get, attr_set, ...). ncurses defines them bit-identically to the + matching A_* constants, but X/Open keeps the two sets distinct, so other + implementations (such as NetBSD curses) may give them different values. 
*/ +#ifdef WA_ATTRIBUTES + SetDictInt("WA_ATTRIBUTES", WA_ATTRIBUTES); +#endif +#ifdef WA_NORMAL + SetDictInt("WA_NORMAL", WA_NORMAL); +#endif +#ifdef WA_STANDOUT + SetDictInt("WA_STANDOUT", WA_STANDOUT); +#endif +#ifdef WA_UNDERLINE + SetDictInt("WA_UNDERLINE", WA_UNDERLINE); +#endif +#ifdef WA_REVERSE + SetDictInt("WA_REVERSE", WA_REVERSE); +#endif +#ifdef WA_BLINK + SetDictInt("WA_BLINK", WA_BLINK); +#endif +#ifdef WA_DIM + SetDictInt("WA_DIM", WA_DIM); +#endif +#ifdef WA_BOLD + SetDictInt("WA_BOLD", WA_BOLD); +#endif +#ifdef WA_ALTCHARSET + SetDictInt("WA_ALTCHARSET", WA_ALTCHARSET); +#endif +#ifdef WA_INVIS + SetDictInt("WA_INVIS", WA_INVIS); +#endif +#ifdef WA_PROTECT + SetDictInt("WA_PROTECT", WA_PROTECT); +#endif +#ifdef WA_HORIZONTAL + SetDictInt("WA_HORIZONTAL", WA_HORIZONTAL); +#endif +#ifdef WA_LEFT + SetDictInt("WA_LEFT", WA_LEFT); +#endif +#ifdef WA_LOW + SetDictInt("WA_LOW", WA_LOW); +#endif +#ifdef WA_RIGHT + SetDictInt("WA_RIGHT", WA_RIGHT); +#endif +#ifdef WA_TOP + SetDictInt("WA_TOP", WA_TOP); +#endif +#ifdef WA_VERTICAL + SetDictInt("WA_VERTICAL", WA_VERTICAL); +#endif +#ifdef WA_ITALIC + SetDictInt("WA_ITALIC", WA_ITALIC); +#endif + SetDictInt("COLOR_BLACK", COLOR_BLACK); SetDictInt("COLOR_RED", COLOR_RED); SetDictInt("COLOR_GREEN", COLOR_GREEN); diff --git a/Modules/clinic/_cursesmodule.c.h b/Modules/clinic/_cursesmodule.c.h index 49f30a35656b48..d4d6e4eeef0158 100644 --- a/Modules/clinic/_cursesmodule.c.h +++ b/Modules/clinic/_cursesmodule.c.h @@ -353,6 +353,162 @@ _curses_window_attrset(PyObject *self, PyObject *arg) return return_value; } +PyDoc_STRVAR(_curses_window_attr_get__doc__, +"attr_get($self, /)\n" +"--\n" +"\n" +"Return the window\'s attributes and color pair as (attrs, pair)."); + +#define _CURSES_WINDOW_ATTR_GET_METHODDEF \ + {"attr_get", (PyCFunction)_curses_window_attr_get, METH_NOARGS, _curses_window_attr_get__doc__}, + +static PyObject * +_curses_window_attr_get_impl(PyCursesWindowObject *self); + +static PyObject * 
+_curses_window_attr_get(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _curses_window_attr_get_impl((PyCursesWindowObject *)self); +} + +PyDoc_STRVAR(_curses_window_attr_set__doc__, +"attr_set($self, attr, pair=0, /)\n" +"--\n" +"\n" +"Set the window\'s attributes and color pair."); + +#define _CURSES_WINDOW_ATTR_SET_METHODDEF \ + {"attr_set", _PyCFunction_CAST(_curses_window_attr_set), METH_FASTCALL, _curses_window_attr_set__doc__}, + +static PyObject * +_curses_window_attr_set_impl(PyCursesWindowObject *self, attr_t attr, + int pair); + +static PyObject * +_curses_window_attr_set(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + attr_t attr; + int pair = 0; + + if (!_PyArg_CheckPositional("attr_set", nargs, 1, 2)) { + goto exit; + } + if (!attr_converter(args[0], &attr)) { + goto exit; + } + if (nargs < 2) { + goto skip_optional; + } + if (!pair_converter(args[1], &pair)) { + goto exit; + } +skip_optional: + return_value = _curses_window_attr_set_impl((PyCursesWindowObject *)self, attr, pair); + +exit: + return return_value; +} + +PyDoc_STRVAR(_curses_window_attr_on__doc__, +"attr_on($self, attr, /)\n" +"--\n" +"\n" +"Turn on the given attributes without affecting any others."); + +#define _CURSES_WINDOW_ATTR_ON_METHODDEF \ + {"attr_on", (PyCFunction)_curses_window_attr_on, METH_O, _curses_window_attr_on__doc__}, + +static PyObject * +_curses_window_attr_on_impl(PyCursesWindowObject *self, attr_t attr); + +static PyObject * +_curses_window_attr_on(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + attr_t attr; + + if (!attr_converter(arg, &attr)) { + goto exit; + } + return_value = _curses_window_attr_on_impl((PyCursesWindowObject *)self, attr); + +exit: + return return_value; +} + +PyDoc_STRVAR(_curses_window_attr_off__doc__, +"attr_off($self, attr, /)\n" +"--\n" +"\n" +"Turn off the given attributes without affecting any others."); + +#define _CURSES_WINDOW_ATTR_OFF_METHODDEF \ + 
{"attr_off", (PyCFunction)_curses_window_attr_off, METH_O, _curses_window_attr_off__doc__}, + +static PyObject * +_curses_window_attr_off_impl(PyCursesWindowObject *self, attr_t attr); + +static PyObject * +_curses_window_attr_off(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + attr_t attr; + + if (!attr_converter(arg, &attr)) { + goto exit; + } + return_value = _curses_window_attr_off_impl((PyCursesWindowObject *)self, attr); + +exit: + return return_value; +} + +PyDoc_STRVAR(_curses_window_color_set__doc__, +"color_set($self, pair, /)\n" +"--\n" +"\n" +"Set the window\'s color pair attribute."); + +#define _CURSES_WINDOW_COLOR_SET_METHODDEF \ + {"color_set", (PyCFunction)_curses_window_color_set, METH_O, _curses_window_color_set__doc__}, + +static PyObject * +_curses_window_color_set_impl(PyCursesWindowObject *self, int pair); + +static PyObject * +_curses_window_color_set(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + int pair; + + if (!pair_converter(arg, &pair)) { + goto exit; + } + return_value = _curses_window_color_set_impl((PyCursesWindowObject *)self, pair); + +exit: + return return_value; +} + +PyDoc_STRVAR(_curses_window_getattrs__doc__, +"getattrs($self, /)\n" +"--\n" +"\n" +"Return the window\'s current attributes."); + +#define _CURSES_WINDOW_GETATTRS_METHODDEF \ + {"getattrs", (PyCFunction)_curses_window_getattrs, METH_NOARGS, _curses_window_getattrs__doc__}, + +static PyObject * +_curses_window_getattrs_impl(PyCursesWindowObject *self); + +static PyObject * +_curses_window_getattrs(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _curses_window_getattrs_impl((PyCursesWindowObject *)self); +} + PyDoc_STRVAR(_curses_window_bkgdset__doc__, "bkgdset($self, ch, attr=_curses.A_NORMAL, /)\n" "--\n" @@ -4966,4 +5122,4 @@ _curses_has_extended_color_support(PyObject *module, PyObject *Py_UNUSED(ignored #ifndef _CURSES_ASSUME_DEFAULT_COLORS_METHODDEF #define _CURSES_ASSUME_DEFAULT_COLORS_METHODDEF #endif /* 
!defined(_CURSES_ASSUME_DEFAULT_COLORS_METHODDEF) */ -/*[clinic end generated code: output=fd0f4e65dc594a65 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=3d8d59f44ded2226 input=a9049054013a1b77]*/