diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 3f3efb5d4d4008..832f90b4a05706 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -310,7 +310,7 @@ def checkgroupname(self, name, offset): msg = "bad character in group name %r" % name raise self.error(msg, len(name) + offset) -def _property_escape(source, escape, in_set=False): +def _property_escape(source, escape): # handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax") from . import _properties if not source.match('{'): @@ -320,10 +320,6 @@ def _property_escape(source, escape, in_set=False): if code is None: raise source.error("unknown property name %r" % name, len(name) + len(r'\p{}')) - if in_set and code[1][0] == (NEGATE, None): - # A negated multi-range property cannot be a member of a set. - raise source.error("bad escape %s in character class" % escape, - len(name) + len(r'\p{}')) return code def _class_escape(source, escape): @@ -369,7 +365,7 @@ def _class_escape(source, escape): len(charname) + len(r'\N{}')) from None return LITERAL, c elif c in "pP" and source.istext: - return _property_escape(source, escape, in_set=True) + return _property_escape(source, escape) elif c in OCTDIGITS: # octal escape (up to three digits) escape += source.getwhile(2, OCTDIGITS) @@ -569,11 +565,15 @@ def _difference(left, right, state): # with the next operand. _SETOPS = {'||': _union, '&&': _intersect, '--': _difference} -def _operand_elements(set, compound): - # The operand's elements: a standalone nested set, else the member union. +def _operand_elements(set, compound, negated, state): + # The operand's elements: a standalone nested set, else the member union, + # with any negated-property members alternated in (see addmember). if compound is not None: return compound - return [_charset_node(_uniq(set))] + result = [_charset_node(_uniq(set))] if set or not negated else None + for neg in negated: + result = [neg] if result is None else _union(result, [neg], state) + return result def _parse_operand(source, state, nested, here, allow_nested): # Read one operand, stopping at a set operator or the closing ']'. An @@ -586,10 +586,15 @@ def _parse_operand(source, state, nested, here, allow_nested): sourcematch = source.match set = [] setappend = set.append + negated = [] # \P{...} negated-range props, alternated in at the end def addmember(code): - # Flatten a \p{...} property's IN into the member set. + # Flatten a \p{...} property's IN into the member set; a negated one is a + # complemented charset, set aside to _union in (it can't join the union). if code[0] is IN: - set.extend(code[1]) + if code[1][0][0] is NEGATE: + negated.append(code) + else: + set.extend(code[1]) else: setappend(code) compound = None # elements of a standalone nested-set operand @@ -602,9 +607,9 @@ def addmember(code): if this is None: raise source.error("unterminated character set", source.tell() - here) - if set or compound is not None: + if set or compound is not None or negated: if this == "]": - return _operand_elements(set, compound), None + return _operand_elements(set, compound, negated, state), None if this in '-&|~' and source.next == this: if this == '~': import warnings @@ -616,7 +621,7 @@ def addmember(code): else: # '--', '&&' or '||' ends this operand and starts the next. sourceget() # consume the second operator character - return _operand_elements(set, compound), this + this + return _operand_elements(set, compound, negated, state), this + this if this[0] == "\\": code1 = _class_escape(source, this) else: @@ -636,12 +641,12 @@ def addmember(code): # A trailing '-' is a literal. addmember(code1) setappend((LITERAL, _ord("-"))) - return [_charset_node(_uniq(set))], None + return _operand_elements(set, None, negated, state), None if that == "-": # 'X--': difference, not a range. '--' after a single member # lands here because the range probe consumed the first '-'. addmember(code1) - return [_charset_node(_uniq(set))], "--" + return _operand_elements(set, None, negated, state), "--" if that[0] == "\\": code2 = _class_escape(source, that) else: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 7e8ed0e02833e8..af6e4612dcfaef 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1061,6 +1061,19 @@ def test_property_escapes(self): self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '0')) self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g')) + # A negated multi-range property (not backed by an engine category) can + # be a set member; it is alternated in with the other members. + self.assertIsNone(re.fullmatch(r'[\P{ASCII}]', 'a')) + self.assertTrue(re.fullmatch(r'[\P{ASCII}]', 'ä')) + self.assertTrue(re.fullmatch(r'[\P{ASCII}abc]+', 'abäc日')) + self.assertIsNone(re.fullmatch(r'[\P{ASCII}abc]', 'd')) + self.assertTrue(re.fullmatch(r'[abc\P{ASCII}]+', 'abäc日')) + self.assertTrue(re.fullmatch(r'[^\P{ASCII}]+', 'AZ09~')) # = ASCII + self.assertIsNone(re.fullmatch(r'[^\P{ASCII}]', 'ä')) + # Composes with set operations. + self.assertTrue(re.fullmatch(r'[\w--\P{ASCII}]+', 'AZ09_')) # \w and ASCII + self.assertIsNone(re.fullmatch(r'[\w--\P{ASCII}]', 'д')) + # Errors. self.checkPatternError(r'\p', 'missing {, expected property name', 2) self.checkPatternError(r'[\p]', 'missing {, expected property name', 3) @@ -1072,10 +1085,6 @@ def test_property_escapes(self): # \p is not special in bytes patterns. self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0) self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0) - # A negated multi-range property (one not backed by an engine - # category) cannot be a set member. - self.checkPatternError(r'[\P{ASCII}]', - r'bad escape \P in character class', 1) def test_word_boundaries(self): # See http://bugs.python.org/issue10713