Skip to content

Commit

Permalink
bpo-30215: Make re.compile() locale agnostic. (python#1361)
Browse files Browse the repository at this point in the history
Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.
  • Loading branch information
serhiy-storchaka authored May 5, 2017
1 parent 647c3d3 commit 898ff03
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 23 deletions.
5 changes: 5 additions & 0 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,11 @@ form.
:const:`re.LOCALE` can be used only with bytes patterns and is
not compatible with :const:`re.ASCII`.

.. versionchanged:: 3.7
Compiled regular expression objects with the :const:`re.LOCALE` flag no
longer depend on the locale at compile time. Only the locale at
matching time affects the result of matching.


.. data:: M
MULTILINE
Expand Down
12 changes: 2 additions & 10 deletions Lib/re.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,7 @@ def escape(pattern):
def _compile(pattern, flags):
# internal: compile pattern
try:
p, loc = _cache[type(pattern), pattern, flags]
if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
return p
return _cache[type(pattern), pattern, flags]
except KeyError:
pass
if isinstance(pattern, _pattern_type):
Expand All @@ -284,13 +282,7 @@ def _compile(pattern, flags):
if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE:
_cache.clear()
if p.flags & LOCALE:
if not _locale:
return p
loc = _locale.setlocale(_locale.LC_CTYPE)
else:
loc = None
_cache[type(pattern), pattern, flags] = p, loc
_cache[type(pattern), pattern, flags] = p
return p

@functools.lru_cache(_MAXCACHE)
Expand Down
24 changes: 15 additions & 9 deletions Lib/sre_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
fixes = None
for op, av in pattern:
if op in LITERAL_CODES:
if flags & SRE_FLAG_IGNORECASE:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
emit(av)
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOC_IGNORE[op])
emit(av)
else:
lo = _sre.getlower(av, flags)
if fixes and lo in fixes:
emit(IN_IGNORE)
Expand All @@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
else:
emit(OP_IGNORE[op])
emit(lo)
else:
emit(op)
emit(av)
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
emit(OP_IGNORE[op])
def fixup(literal, flags=flags):
return _sre.getlower(literal, flags)
else:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
fixup = None
elif flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
fixup = None
else:
emit(IN_IGNORE)
def fixup(literal, flags=flags):
return _sre.getlower(literal, flags)
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup, fixes)
code[skip] = _len(code) - skip
Expand Down
10 changes: 9 additions & 1 deletion Lib/sre_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

# update when constants are added or removed

MAGIC = 20140917
MAGIC = 20170530

from _sre import MAXREPEAT, MAXGROUPS

Expand Down Expand Up @@ -87,6 +87,9 @@ def _makecodes(names):
SUBPATTERN
MIN_REPEAT_ONE
RANGE_IGNORE
LITERAL_LOC_IGNORE
NOT_LITERAL_LOC_IGNORE
IN_LOC_IGNORE
MIN_REPEAT MAX_REPEAT
""")
Expand Down Expand Up @@ -124,6 +127,11 @@ def _makecodes(names):
RANGE: RANGE_IGNORE,
}

# Maps each literal opcode to its locale-aware, case-insensitive counterpart.
# Only LITERAL and NOT_LITERAL have entries here.
OP_LOC_IGNORE = {
LITERAL: LITERAL_LOC_IGNORE,
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}

AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
Expand Down
32 changes: 32 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1730,6 +1730,38 @@ def check_en_US_utf8(self):
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))

def test_locale_compiled(self):
# Patterns are compiled once with re.L|re.I under en_US.iso88591 and then
# matched under two different LC_CTYPE locales; the expected results differ
# per locale, so matching must follow the locale in effect at match time.
oldlocale = locale.setlocale(locale.LC_CTYPE)
# Restore the original LC_CTYPE setting when the test finishes.
self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
# Probe both required locales up front; skip the test if either is missing.
for loc in 'en_US.iso88591', 'en_US.utf8':
try:
locale.setlocale(locale.LC_CTYPE, loc)
except locale.Error:
# Unsupported locale on this system
self.skipTest('test needs %s locale' % loc)

# All four patterns are compiled while en_US.iso88591 is active.
locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
p1 = re.compile(b'\xc5\xe5', re.L|re.I)
p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
# Under iso88591 the bytes \xc5 and \xe5 are expected to match each other
# case-insensitively, so every combination matches p1-p3 and none matches
# the negated sets in p4.
for p in p1, p2, p3:
self.assertTrue(p.match(b'\xc5\xe5'))
self.assertTrue(p.match(b'\xe5\xe5'))
self.assertTrue(p.match(b'\xc5\xc5'))
self.assertIsNone(p4.match(b'\xe5\xc5'))
self.assertIsNone(p4.match(b'\xe5\xe5'))
self.assertIsNone(p4.match(b'\xc5\xc5'))

# Same compiled objects, now matched under en_US.utf8: \xc5 and \xe5 are
# expected to match only themselves, so only the exact byte sequence matches
# p1-p3, and p4 matches only when both positions differ from the excluded
# bytes.
locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
for p in p1, p2, p3:
self.assertTrue(p.match(b'\xc5\xe5'))
self.assertIsNone(p.match(b'\xe5\xe5'))
self.assertIsNone(p.match(b'\xc5\xc5'))
self.assertTrue(p4.match(b'\xe5\xc5'))
self.assertIsNone(p4.match(b'\xe5\xe5'))
self.assertIsNone(p4.match(b'\xc5\xc5'))

def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')
Expand Down
4 changes: 4 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,10 @@ Extension Modules
Library
-------

- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no
longer depend on the locale at compile time. Only the locale at matching
time affects the result of matching.

- bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process
when Ctrl-C is received.

Expand Down
3 changes: 3 additions & 0 deletions Modules/_sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_NOT_LITERAL:
case SRE_OP_LITERAL_IGNORE:
case SRE_OP_NOT_LITERAL_IGNORE:
case SRE_OP_LITERAL_LOC_IGNORE:
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
GET_ARG;
/* The arg is just a character, nothing to check */
break;
Expand Down Expand Up @@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)

case SRE_OP_IN:
case SRE_OP_IN_IGNORE:
case SRE_OP_IN_LOC_IGNORE:
GET_SKIP;
/* Stop 1 before the end; we check the FAILURE below */
if (!_validate_charset(code, code+skip-2))
Expand Down
5 changes: 4 additions & 1 deletion Modules/sre_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/

#define SRE_MAGIC 20140917
#define SRE_MAGIC 20170530
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
Expand Down Expand Up @@ -45,6 +45,9 @@
#define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_OP_RANGE_IGNORE 32
#define SRE_OP_LITERAL_LOC_IGNORE 33
#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
#define SRE_OP_IN_LOC_IGNORE 35
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
Expand Down
69 changes: 67 additions & 2 deletions Modules/sre_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
return 0;
}

/* Case-insensitive comparison of a single subject character against a
 * pattern character, using the state's lower/upper callbacks.
 *
 * Returns nonzero when `ch` equals `pattern` directly, or when either
 * state->lower(ch) or state->upper(ch) equals `pattern`.  The pattern
 * character is compared as stored; only `ch` is case-converted here. */
LOCAL(int)
SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
{
return ch == pattern
|| (SRE_CODE) state->lower(ch) == pattern
|| (SRE_CODE) state->upper(ch) == pattern;
}

LOCAL(int)
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
Expand Down Expand Up @@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
}
}

/* Case-insensitive membership test of `ch` in the character set `set`,
 * using the state's lower/upper callbacks.
 *
 * Returns nonzero when either state->lower(ch) or state->upper(ch) is
 * accepted by SRE(charset) for `set`. */
LOCAL(int)
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
SRE_CODE lo, up;
/* Try the lowercase form first. */
lo = state->lower(ch);
if (SRE(charset)(state, set, lo))
return 1;

/* Only probe the set again if the uppercase form is a different code. */
up = state->upper(ch);
return up != lo && SRE(charset)(state, set, up);
}

LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);

LOCAL(Py_ssize_t)
Expand Down Expand Up @@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;

case SRE_OP_LITERAL_LOC_IGNORE:
/* repeated literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
ptr++;
break;

case SRE_OP_NOT_LITERAL:
/* repeated non-literal */
chr = pattern[1];
Expand All @@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;

case SRE_OP_NOT_LITERAL_LOC_IGNORE:
/* repeated non-literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
ptr++;
break;

default:
/* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Expand Down Expand Up @@ -651,7 +687,17 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end ||
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
state->lower(*ctx->ptr) != *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;

case SRE_OP_LITERAL_LOC_IGNORE:
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end
|| !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
Expand All @@ -661,7 +707,17 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end ||
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
state->lower(*ctx->ptr) == *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;

case SRE_OP_NOT_LITERAL_LOC_IGNORE:
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end
|| SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
Expand All @@ -677,6 +733,15 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
ctx->ptr++;
break;

case SRE_OP_IN_LOC_IGNORE:
TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end
|| !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern += ctx->pattern[0];
ctx->ptr++;
break;

case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */
Expand Down

0 comments on commit 898ff03

Please sign in to comment.