Skip to content

Commit

Permalink
pythongh-70278: Fix PyUnicode_FromFormat() with precision for %s and …
Browse files Browse the repository at this point in the history
…%V (pythonGH-120365)

PyUnicode_FromFormat() no longer produces a trailing \ufffd
character for a truncated C string when a precision is used with %s or %V.
It now truncates the string just before the start of a truncated multibyte sequence.
  • Loading branch information
serhiy-storchaka authored and mrahtz committed Jun 30, 2024
1 parent 08ae68c commit 210b9e7
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 4 deletions.
46 changes: 44 additions & 2 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,8 +419,29 @@ def check_format(expected, format, *args):
# truncated string
check_format('abc',
b'%.3s', b'abcdef')
check_format('abc[',
b'%.6s', 'abc[\u20ac]'.encode('utf8'))
check_format('abc[\u20ac',
b'%.7s', 'abc[\u20ac]'.encode('utf8'))
check_format('abc[\ufffd',
b'%.5s', 'abc[\u20ac]'.encode('utf8'))
b'%.5s', b'abc[\xff]')
check_format('abc[',
b'%.6s', b'abc[\xe2\x82]')
check_format('abc[\ufffd]',
b'%.7s', b'abc[\xe2\x82]')
check_format('abc[\ufffd',
b'%.7s', b'abc[\xe2\x82\0')
check_format(' abc[',
b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
check_format(' abc[\u20ac',
b'%10.7s', 'abc[\u20ac]'.encode('utf8'))
check_format(' abc[\ufffd',
b'%10.5s', b'abc[\xff]')
check_format(' abc[',
b'%10.6s', b'abc[\xe2\x82]')
check_format(' abc[\ufffd]',
b'%10.7s', b'abc[\xe2\x82]')

check_format("'\\u20acABC'",
b'%A', '\u20acABC')
check_format("'\\u20",
Expand All @@ -433,10 +454,31 @@ def check_format(expected, format, *args):
b'%.3S', '\u20acABCDEF')
check_format('\u20acAB',
b'%.3U', '\u20acABCDEF')

check_format('\u20acAB',
b'%.3V', '\u20acABCDEF', None)
check_format('abc[',
b'%.6V', None, 'abc[\u20ac]'.encode('utf8'))
check_format('abc[\u20ac',
b'%.7V', None, 'abc[\u20ac]'.encode('utf8'))
check_format('abc[\ufffd',
b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
b'%.5V', None, b'abc[\xff]')
check_format('abc[',
b'%.6V', None, b'abc[\xe2\x82]')
check_format('abc[\ufffd]',
b'%.7V', None, b'abc[\xe2\x82]')
check_format(' abc[',
b'%10.6V', None, 'abc[\u20ac]'.encode('utf8'))
check_format(' abc[\u20ac',
b'%10.7V', None, 'abc[\u20ac]'.encode('utf8'))
check_format(' abc[\ufffd',
b'%10.5V', None, b'abc[\xff]')
check_format(' abc[',
b'%10.6V', None, b'abc[\xe2\x82]')
check_format(' abc[\ufffd]',
b'%10.7V', None, b'abc[\xe2\x82]')
check_format(' abc[\ufffd',
b'%10.7V', None, b'abc[\xe2\x82\0')

# the following tests come from #7330
# test width modifier and precision modifier with %S
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
:c:func:`PyUnicode_FromFormat` no longer produces a trailing ``\ufffd``
character for a truncated C string when a precision is used with ``%s`` or
``%V``. It now truncates the string just before the start of a truncated
multibyte sequence.
13 changes: 11 additions & 2 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2581,6 +2581,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
Py_ssize_t width, Py_ssize_t precision, int flags)
{
/* UTF-8 */
Py_ssize_t *pconsumed = NULL;
Py_ssize_t length;
if (precision == -1) {
length = strlen(str);
Expand All @@ -2590,15 +2591,23 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
while (length < precision && str[length]) {
length++;
}
if (length == precision) {
/* No NUL byte was found within the first `precision` bytes, so the
 * input may have been cut in the middle of a multibyte character. If
 * it ends with an incomplete UTF-8 sequence, truncate the string just
 * before that sequence. Incomplete sequences in the middle and
 * sequences which cannot be valid prefixes are still treated as errors
 * and replaced with U+FFFD. */
pconsumed = &length;
}
}

if (width < 0) {
return unicode_decode_utf8_writer(writer, str, length,
_Py_ERROR_REPLACE, "replace", NULL);
_Py_ERROR_REPLACE, "replace", pconsumed);
}

PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
"replace", NULL);
"replace", pconsumed);
if (unicode == NULL)
return -1;

Expand Down

0 comments on commit 210b9e7

Please sign in to comment.