From 723360672db2142c4694db2ebba9bb2d3b70944c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 24 Jun 2026 18:53:38 +0300 Subject: [PATCH] gh-152100: Fuse set-operation character classes into a single charset Add a compile-time optimization pass (Lib/re/_optimizer.py) that rewrites set-operation character classes into a single character set where the engine's charset() representation allows it. charset() treats every NEGATE as a polarity toggle, so a mid-list NEGATE expresses set difference and a flat run expresses union. Set difference -- [A--B], emitted by the parser as A(? --- Lib/re/_compiler.py | 6 ++- Lib/re/_optimizer.py | 116 ++++++++++++++++++++++++++++++++++++++++++- Lib/re/_parser.py | 15 ++++-- 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index fb0c8d35f6f89a..58a24964c3b374 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -16,7 +16,7 @@ from ._casefix import _EXTRA_CASES from ._optimizer import ( _combine_flags, _compile_charset, _optimize_charset, _compile_info, - _simple, _CHARSET_ALL, _CODEBITS, MAXCODE, + _simple, _CHARSET_ALL, _CODEBITS, MAXCODE, optimize, ) assert _sre.MAGIC == MAGIC, "SRE module mismatch" @@ -219,6 +219,10 @@ def isstring(obj): def _code(p, flags): flags = p.state.flags | flags + + # run the optimizer passes over the parsed pattern + optimize(p) + code = [] # compile info block diff --git a/Lib/re/_optimizer.py b/Lib/re/_optimizer.py index 5e3892583a64c9..6a0bb5a2973eae 100644 --- a/Lib/re/_optimizer.py +++ b/Lib/re/_optimizer.py @@ -56,7 +56,39 @@ def _compile_charset(charset, flags, code): emit(FAILURE) def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): - # internal: optimize character set + # internal: optimize character set. + # + # The engine's charset() walk toggles polarity on every NEGATE (see + # Modules/_sre/sre_lib.h), so NEGATE markers split the set into + # alternating-polarity segments: a leading NEGATE is a complemented class + # [^...], an interior one is set difference (RL1.3). Each segment is a + # plain union, optimized on its own with the NEGATE boundaries kept in place. + negates = [i for i, (op, _av) in enumerate(charset) if op is NEGATE] + if not negates or negates == [0]: + # Fast path: a plain union, optionally complemented as a whole -- every + # charset the parser produces today, optimized as before. + return _optimize_charset_segment(charset, iscased, fixup, fixes) + + # Optimize each NEGATE-delimited run on its own. _allow_anyall is off: the + # [\s\S] -> ANY_ALL / [^\s\S] -> empty shortcuts rewrite a whole set and + # would inject or drop a NEGATE mid-segment. + out = [] + hascased = False + start = 0 + for i in negates + [len(charset)]: + if i > start: # skip an empty run (e.g. a leading NEGATE) + opt, cased = _optimize_charset_segment( + charset[start:i], iscased, fixup, fixes, _allow_anyall=False) + out.extend(opt) + hascased |= cased + if i < len(charset): + out.append((NEGATE, None)) + start = i + 1 + return out, hascased + +def _optimize_charset_segment(charset, iscased=None, fixup=None, fixes=None, + _allow_anyall=True): + # internal: optimize one NEGATE-free union of character-set members out = [] tail = [] charmap = bytearray(256) @@ -94,7 +126,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): charmap[i] = 1 elif op is NEGATE: out.append((op, av)) - elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail: + elif op is CATEGORY and _allow_anyall and tail and (CATEGORY, CH_NEGATE[av]) in tail: # Optimize [\s\S] etc. out = [] if out else _CHARSET_ALL return out, False @@ -395,3 +427,83 @@ def _compile_info(code, pattern, flags): elif charset: _compile_charset(charset, flags, code) code[skip] = len(code) - skip + +# Difference-fusion peephole: rewrite [A--B]-style A(? (? A (? A (?<=[B]) intersection # [A||B] -> [AB] or (?:A|B) union + # A flat-operand difference [A--B] is later fused back into a single charset + # by Lib/re/_optimizer.py (see that module). # Operators chain left-to-right with no precedence. A leading '^' negates by # De Morgan, pushing the negation into the operands (no lookahead needed): # [^A--B] -> [^A] | B ; [^A&&B] -> [^A] | [^B] ; [^A||B] -> [^A] && [^B]