Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 21 additions & 16 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def checkgroupname(self, name, offset):
msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)

def _property_escape(source, escape, in_set=False):
def _property_escape(source, escape):
# handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax")
from . import _properties
if not source.match('{'):
Expand All @@ -320,10 +320,6 @@ def _property_escape(source, escape, in_set=False):
if code is None:
raise source.error("unknown property name %r" % name,
len(name) + len(r'\p{}'))
if in_set and code[1][0] == (NEGATE, None):
# A negated multi-range property cannot be a member of a set.
raise source.error("bad escape %s in character class" % escape,
len(name) + len(r'\p{}'))
return code

def _class_escape(source, escape):
Expand Down Expand Up @@ -369,7 +365,7 @@ def _class_escape(source, escape):
len(charname) + len(r'\N{}')) from None
return LITERAL, c
elif c in "pP" and source.istext:
return _property_escape(source, escape, in_set=True)
return _property_escape(source, escape)
elif c in OCTDIGITS:
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
Expand Down Expand Up @@ -569,11 +565,15 @@ def _difference(left, right, state):
# with the next operand.
_SETOPS = {'||': _union, '&&': _intersect, '--': _difference}

def _operand_elements(set, compound):
# The operand's elements: a standalone nested set, else the member union.
def _operand_elements(set, compound, negated, state):
# The operand's elements: the standalone nested set if there is one, else the
# charset of the plain members, with each negated-property member combined in
# through _union (see addmember).
if compound is not None:
return compound
return [_charset_node(_uniq(set))]
result = [_charset_node(_uniq(set))] if set or not negated else None
for neg in negated:
result = [neg] if result is None else _union(result, [neg], state)
return result

def _parse_operand(source, state, nested, here, allow_nested):
# Read one operand, stopping at a set operator or the closing ']'. An
Expand All @@ -586,10 +586,15 @@ def _parse_operand(source, state, nested, here, allow_nested):
sourcematch = source.match
set = []
setappend = set.append
negated = [] # \P{...} negated-range props, alternated in at the end
def addmember(code):
# Flatten a \p{...} property's IN into the member set.
# Flatten a \p{...} property's IN into the member set. A negated one is a
# complemented charset that cannot be flattened into the member list, so it
# is set aside in `negated` and merged in later with _union.
if code[0] is IN:
set.extend(code[1])
if code[1][0][0] is NEGATE:
negated.append(code)
else:
set.extend(code[1])
else:
setappend(code)
compound = None # elements of a standalone nested-set operand
Expand All @@ -602,9 +607,9 @@ def addmember(code):
if this is None:
raise source.error("unterminated character set",
source.tell() - here)
if set or compound is not None:
if set or compound is not None or negated:
if this == "]":
return _operand_elements(set, compound), None
return _operand_elements(set, compound, negated, state), None
if this in '-&|~' and source.next == this:
if this == '~':
import warnings
Expand All @@ -616,7 +621,7 @@ def addmember(code):
else:
# '--', '&&' or '||' ends this operand and starts the next.
sourceget() # consume the second operator character
return _operand_elements(set, compound), this + this
return _operand_elements(set, compound, negated, state), this + this
if this[0] == "\\":
code1 = _class_escape(source, this)
else:
Expand All @@ -636,12 +641,12 @@ def addmember(code):
# A trailing '-' is a literal.
addmember(code1)
setappend((LITERAL, _ord("-")))
return [_charset_node(_uniq(set))], None
return _operand_elements(set, None, negated, state), None
if that == "-":
# 'X--': difference, not a range. '--' after a single member
# lands here because the range probe consumed the first '-'.
addmember(code1)
return [_charset_node(_uniq(set))], "--"
return _operand_elements(set, None, negated, state), "--"
if that[0] == "\\":
code2 = _class_escape(source, that)
else:
Expand Down
17 changes: 13 additions & 4 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,19 @@ def test_property_escapes(self):
self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '0'))
self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g'))

# A negated multi-range property (not backed by an engine category) can
# be a set member; it is alternated in with the other members.
self.assertIsNone(re.fullmatch(r'[\P{ASCII}]', 'a'))
self.assertTrue(re.fullmatch(r'[\P{ASCII}]', 'ä'))
self.assertTrue(re.fullmatch(r'[\P{ASCII}abc]+', 'abäc日'))
self.assertIsNone(re.fullmatch(r'[\P{ASCII}abc]', 'd'))
self.assertTrue(re.fullmatch(r'[abc\P{ASCII}]+', 'abäc日'))
self.assertTrue(re.fullmatch(r'[^\P{ASCII}]+', 'AZ09~')) # = ASCII
self.assertIsNone(re.fullmatch(r'[^\P{ASCII}]', 'ä'))
# Composes with set operations.
self.assertTrue(re.fullmatch(r'[\w--\P{ASCII}]+', 'AZ09_')) # \w and ASCII
self.assertIsNone(re.fullmatch(r'[\w--\P{ASCII}]', 'д'))

# Errors.
self.checkPatternError(r'\p', 'missing {, expected property name', 2)
self.checkPatternError(r'[\p]', 'missing {, expected property name', 3)
Expand All @@ -1072,10 +1085,6 @@ def test_property_escapes(self):
# \p is not special in bytes patterns.
self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0)
self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0)
# A negated multi-range property (one not backed by an engine
# category) cannot be a set member.
self.checkPatternError(r'[\P{ASCII}]',
r'bad escape \P in character class', 1)

def test_word_boundaries(self):
# See http://bugs.python.org/issue10713
Expand Down
Loading