Skip to content

Commit

Permalink
pythongh-91524: Speed up the regular expression substitution (python#91525)
Browse files Browse the repository at this point in the history

The functions re.sub() and re.subn(), and the corresponding re.Pattern methods,
are now 2-3 times faster for replacement strings containing group references.

Closes python#91524

Primarily authored by serhiy-storchaka Serhiy Storchaka
Minor-cleanups-by: Gregory P. Smith [Google] <[email protected]>
  • Loading branch information
serhiy-storchaka authored Oct 23, 2022
1 parent 176b6c5 commit 75a6fad
Show file tree
Hide file tree
Showing 9 changed files with 358 additions and 91 deletions.
5 changes: 5 additions & 0 deletions Doc/whatsnew/3.12.rst
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,11 @@ Optimizations
process, which improves performance by 1-5%.
(Contributed by Kevin Modzelewski in :gh:`90536`.)

* Speed up the regular expression substitution (functions :func:`re.sub` and
:func:`re.subn` and corresponding :class:`re.Pattern` methods) for
replacement strings containing group references by 2--3 times.
(Contributed by Serhiy Storchaka in :gh:`91524`.)


CPython bytecode changes
========================
Expand Down
22 changes: 4 additions & 18 deletions Lib/re/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@
import enum
from . import _compiler, _parser
import functools
import _sre


# public symbols
Expand Down Expand Up @@ -230,7 +231,7 @@ def purge():
"Clear the regular expression caches"
_cache.clear()
_cache2.clear()
_compile_repl.cache_clear()
_compile_template.cache_clear()

def template(pattern, flags=0):
"Compile a template pattern, returning a Pattern object, deprecated"
Expand Down Expand Up @@ -328,24 +329,9 @@ def _compile(pattern, flags):
return p

@functools.lru_cache(_MAXCACHE)
def _compile_repl(repl, pattern):
def _compile_template(pattern, repl):
# internal: compile replacement pattern
return _parser.parse_template(repl, pattern)

def _expand(pattern, match, template):
# internal: Match.expand implementation hook
template = _parser.parse_template(template, pattern)
return _parser.expand_template(template, match)

def _subx(pattern, template):
# internal: Pattern.sub/subn implementation helper
template = _compile_repl(template, pattern)
if not template[0] and len(template[1]) == 1:
# literal replacement
return template[1][0]
def filter(match, template=template):
return _parser.expand_template(template, match)
return filter
return _sre.template(pattern, _parser.parse_template(repl, pattern))

# register myself for pickling

Expand Down
2 changes: 1 addition & 1 deletion Lib/re/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

# update when constants are added or removed

MAGIC = 20220615
MAGIC = 20221023

from _sre import MAXREPEAT, MAXGROUPS

Expand Down
45 changes: 16 additions & 29 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):

return p

def parse_template(source, state):
def parse_template(source, pattern):
# parse 're' replacement string into list of literals and
# group references
s = Tokenizer(source)
sget = s.get
groups = []
literals = []
result = []
literal = []
lappend = literal.append
def addliteral():
if s.istext:
result.append(''.join(literal))
else:
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
result.append(''.join(literal).encode('latin-1'))
del literal[:]
def addgroup(index, pos):
if index > state.groups:
if index > pattern.groups:
raise s.error("invalid group reference %d" % index, pos)
if literal:
literals.append(''.join(literal))
del literal[:]
groups.append((len(literals), index))
literals.append(None)
groupindex = state.groupindex
addliteral()
result.append(index)
groupindex = pattern.groupindex
while True:
this = sget()
if this is None:
Expand Down Expand Up @@ -1063,22 +1067,5 @@ def addgroup(index, pos):
lappend(this)
else:
lappend(this)
if literal:
literals.append(''.join(literal))
if not isinstance(source, str):
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
literals = [None if s is None else s.encode('latin-1') for s in literals]
return groups, literals

def expand_template(template, match):
g = match.group
empty = match.string[:0]
groups, literals = template
literals = literals[:]
try:
for index, group in groups:
literals[index] = g(group) or empty
except IndexError:
raise error("invalid group reference %d" % index) from None
return empty.join(literals)
addliteral()
return result
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Speed up the regular expression substitution (functions :func:`re.sub` and
:func:`re.subn` and corresponding :class:`re.Pattern` methods) for
replacement strings containing group references by 2--3 times.
41 changes: 40 additions & 1 deletion Modules/_sre/clinic/sre.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 75a6fad

Please sign in to comment.