Skip to content

Commit

Permalink
Implement names for CJK unified ideographs. Add name to KeyError output.
Browse files Browse the repository at this point in the history
Verify that the lookup for an existing name succeeds.
  • Loading branch information
loewis committed Nov 23, 2002
1 parent 8579efc commit ef7fe2e
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 8 deletions.
5 changes: 3 additions & 2 deletions Lib/test/output/test_ucn
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ test_ucn
Testing General Unicode Character Name, and case insensitivity... done.
Testing name to code mapping.... done.
Testing hangul syllable names.... done.
Testing code to name mapping for all characters.... done.
Found 22728 characters in the unicode name database
Testing names of CJK unified ideographs.... done.
Testing code to name mapping for all BMP characters.... done.
Found 50212 characters in the unicode name database
Testing misc. symbols for unicode character name expansion.... done.
Testing unicode character name expansion strict error handling.... done.
20 changes: 16 additions & 4 deletions Lib/test/test_ucn.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,28 @@
raise AssertionError, "Found name for U+D7A4"
print "done."

print "Testing code to name mapping for all characters....",
print "Testing names of CJK unified ideographs....",
exec r"""
verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
"""
print "done."

print "Testing code to name mapping for all BMP characters....",
count = 0
for code in range(65536):
for code in range(0x10000):
try:
char = unichr(code)
name = unicodedata.name(char)
verify(unicodedata.lookup(name) == char)
count += 1
except (KeyError, ValueError):
pass
else:
verify(unicodedata.lookup(name) == char)
count += 1
print "done."

print "Found", count, "characters in the unicode name database"
Expand Down
2 changes: 1 addition & 1 deletion Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ Extension modules
is now named bsddb185.

- unicodedata was updated to Unicode 3.2. It now also supports names
for Hangul syllables.
for Hangul syllables and CJK unified ideographs.

- resource.getrlimit() now returns longs instead of ints.

Expand Down
40 changes: 39 additions & 1 deletion Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,16 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
return 1;
}

if ((0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */
if (buflen < 28)
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
return 0;
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
return 1;
}

if (code >= 0x110000)
return 0;

Expand Down Expand Up @@ -449,6 +459,30 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
*code = SBase + (L*VCount+V)*TCount + T;
return 1;
}
/* Otherwise, it's an illegal syllable name. */
return 0;
}

/* Check for unified ideographs. */
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
v = 0;
name += 22;
namelen -= 22;
if (namelen != 4 && namelen != 5)
return 0;
while (namelen--) {
v *= 16;
if (*name >= '0' && *name <= '9')
v += *name - '0';
else if (*name >= 'A' && *name <= 'F')
v += *name - 'A' + 10;
else
return 0;
name++;
}
*code = v;
return 1;
}

/* the following is the same as python's dictionary lookup, with
Expand Down Expand Up @@ -535,7 +569,11 @@ unicodedata_lookup(PyObject* self, PyObject* args)
return NULL;

if (!_getcode(name, namelen, &code)) {
PyErr_SetString(PyExc_KeyError, "undefined character name");
char fmt[] = "undefined character name '%s'";
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
sprintf(buf, fmt, name);
PyErr_SetString(PyExc_KeyError, buf);
PyMem_FREE(buf);
return NULL;
}

Expand Down

0 comments on commit ef7fe2e

Please sign in to comment.