Skip to content

Commit

Permalink
pythongh-91760: Deprecate group names and numbers which will be inval…
Browse files Browse the repository at this point in the history
…id in future (pythonGH-91794)

Only a sequence of ASCII digits will be accepted as a numerical reference.
The group name in bytes patterns and replacement strings will only be
allowed to contain ASCII letters, digits and underscores.
  • Loading branch information
serhiy-storchaka authored Apr 30, 2022
1 parent 6d0d547 commit 19dca04
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 7 deletions.
10 changes: 10 additions & 0 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,9 @@ The special characters are:
| | * ``\1`` |
+---------------------------------------+----------------------------------+

.. deprecated:: 3.11
Group names containing non-ASCII characters in bytes patterns.

.. index:: single: (?P=; in regular expressions

``(?P=name)``
Expand Down Expand Up @@ -486,6 +489,9 @@ The special characters are:
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
not with ``'<user@host.com'`` nor ``'user@host.com>'``.

.. deprecated:: 3.11
Group *id* containing anything except ASCII digits.


The special sequences consist of ``'\'`` and a character from the list below.
If the ordinary character is not an ASCII digit or an ASCII letter, then the
Expand Down Expand Up @@ -995,6 +1001,10 @@ form.
Empty matches for the pattern are replaced when adjacent to a previous
non-empty match.

.. deprecated:: 3.11
Group *id* containing anything except ASCII digits.
Group names containing non-ASCII characters in bytes replacement strings.


.. function:: subn(pattern, repl, string, count=0, flags=0)

Expand Down
8 changes: 8 additions & 0 deletions Doc/whatsnew/3.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,14 @@ Deprecated
(Contributed by Brett Cannon in :issue:`47061` and Victor Stinner in
:gh:`68966`.)

* Stricter rules will be applied to numerical group references
and group names in regular expressions in future Python versions.
Only a sequence of ASCII digits will be accepted as a numerical reference.
The group name in bytes patterns and replacement strings will only be
allowed to contain ASCII letters, digits and underscores.
For now, a deprecation warning is raised for such syntax.
(Contributed by Serhiy Storchaka in :gh:`91760`.)


Removed
=======
Expand Down
41 changes: 34 additions & 7 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,22 @@ def seek(self, index):
self.__next()

def error(self, msg, offset=0):
# Build (and return, not raise) an `error` exception for this source.
# Callers are expected to write `raise self.error(...)` themselves.
#
# msg    -- human-readable description of the problem.
# offset -- how far back from the current position (self.tell()) the
#           problem starts; the reported position is tell() - offset.
if not self.istext:
# For non-text (bytes) patterns, rewrite any non-ASCII character in the
# message as a backslash escape so the final message is pure ASCII.
msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
return error(msg, self.string, self.tell() - offset)

def checkgroupname(self, name, offset, nested):
# Validate a group name that has just been read from the source.
#
# name   -- the group name text.
# offset -- number of characters between the end of the name and the
#           current position; used to compute the reported position.
# nested -- added to the warning's stacklevel (see below).
#
# Raises the exception built by self.error() when `name` is not a valid
# identifier.  Emits a DeprecationWarning (but does not fail) when the
# source is not text and the name contains non-ASCII characters.
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)
if not (self.istext or name.isascii()):
# Imported only when a warning is actually issued.
import warnings
# %a renders the name with non-ASCII characters escaped.  The reported
# position is where the name starts: tell() - len(name) - offset.
# NOTE(review): stacklevel=nested + 7 presumably makes the warning point
# at the code calling into the re API rather than at parser internals;
# it depends on the exact call depth, so re-check it if frames are added
# or removed between the public entry point and this method.
warnings.warn(
"bad character in group name %a at position %d" %
(name, self.tell() - len(name) - offset),
DeprecationWarning, stacklevel=nested + 7
)

def _class_escape(source, escape):
# handle escape code inside character class
code = ESCAPES.get(escape)
Expand Down Expand Up @@ -703,15 +717,11 @@ def _parse(source, state, verbose, nested, first=False):
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
source.checkgroupname(name, 1, nested)
elif sourcematch("="):
# named backreference
name = source.getuntil(")", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
source.checkgroupname(name, 1, nested)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name %r" % name
Expand Down Expand Up @@ -773,6 +783,7 @@ def _parse(source, state, verbose, nested, first=False):
# conditional backreference group
condname = source.getuntil(")", "group name")
if condname.isidentifier():
source.checkgroupname(condname, 1, nested)
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
Expand All @@ -795,6 +806,14 @@ def _parse(source, state, verbose, nested, first=False):
state.grouprefpos[condgroup] = (
source.tell() - len(condname) - 1
)
if not (condname.isdecimal() and condname.isascii()):
import warnings
warnings.warn(
"bad character in group name %s at position %d" %
(repr(condname) if source.istext else ascii(condname),
source.tell() - len(condname) - 1),
DeprecationWarning, stacklevel=nested + 6
)
state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose, nested + 1)
if source.match("|"):
Expand Down Expand Up @@ -1000,11 +1019,11 @@ def addgroup(index, pos):
# group
c = this[1]
if c == "g":
name = ""
if not s.match("<"):
raise s.error("missing <")
name = s.getuntil(">", "group name")
if name.isidentifier():
s.checkgroupname(name, 1, -1)
try:
index = groupindex[name]
except KeyError:
Expand All @@ -1020,6 +1039,14 @@ def addgroup(index, pos):
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)
if not (name.isdecimal() and name.isascii()):
import warnings
warnings.warn(
"bad character in group name %s at position %d" %
(repr(name) if s.istext else ascii(name),
s.tell() - len(name) - 1),
DeprecationWarning, stacklevel=5
)
addgroup(index, len(name) + 1)
elif c == "0":
if s.next in OCTDIGITS:
Expand Down
56 changes: 56 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def test_basic_re_sub(self):
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')

self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
Expand Down Expand Up @@ -274,6 +275,21 @@ def test_symbolic_groups_errors(self):
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 4") as w:
re.compile(b'(?P<\xc2\xb5>x)')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 4"):
self.checkPatternError(b'(?P=\xc2\xb5)',
r"unknown group name '\xc2\xb5'", 4)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 3"):
self.checkPatternError(b'(?(\xc2\xb5)y)',
r"unknown group name '\xc2\xb5'", 3)

def test_symbolic_refs(self):
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
Expand Down Expand Up @@ -306,12 +322,35 @@ def test_symbolic_refs_errors(self):
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
"bad character in group name '-1'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\+1' "
r"at position 3") as w:
re.sub('(?P<a>x)', r'\g<+1>', 'xx')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '1_0' "
r"at position 3"):
re.sub('()'*10, r'\g<1_0>', 'xx')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 3"):
re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
"bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 3") as w:
with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
self.assertEqual(w.filename, __file__)
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
"bad character in group name '㊀'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
"bad character in group name '¹'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '१' "
r"at position 3"):
re.sub('(?P<a>x)', r'\g<१>', 'xx')

def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
Expand Down Expand Up @@ -577,10 +616,27 @@ def test_re_groupref_exists_errors(self):
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
self.checkPatternError(r'()(?(-1)a|b)',
"bad character in group name '-1'", 5)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\+1' "
r"at position 5") as w:
re.compile(r'()(?(+1)a|b)')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '1_0' "
r"at position 23"):
re.compile(r'()'*10 + r'(?(1_0)a|b)')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 5"):
re.compile(r'()(?( 1 )a|b)')
self.checkPatternError(r'()(?(㊀)a|b)',
"bad character in group name '㊀'", 5)
self.checkPatternError(r'()(?(¹)a|b)',
"bad character in group name '¹'", 5)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '१' "
r"at position 5"):
re.compile(r'()(?(१)a|b)')
self.checkPatternError(r'()(?(1',
"missing ), unterminated name", 5)
self.checkPatternError(r'()(?(1)a',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
More strict rules will be applied for numerical group references and group
names in regular expressions. For now, a deprecation warning is emitted for
group references and group names which will be errors in future Python
versions.

0 comments on commit 19dca04

Please sign in to comment.