-
-
Notifications
You must be signed in to change notification settings - Fork 30.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
Showing
15 changed files
with
51,015 additions
and
3 deletions.
There are no files selected for viewing
1 change: 1 addition & 0 deletions
1
Misc/NEWS.d/next/Core and Builtins/2020-04-19-22-23-32.bpo-40328.gWJ53f.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Add tools for generating mappings headers for CJKCodecs. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
# | ||
# genmap_ja_codecs.py: Japanese Codecs Map Generator | ||
# | ||
# Original Author: Hye-Shik Chang <[email protected]> | ||
# Modified Author: Dong-hee Na <[email protected]> | ||
# | ||
import os | ||
|
||
from genmap_support import * | ||
|
||
# (low, high) pairs handed to DecodeMapWriter.update_decode_map() in main().
# NOTE(review): presumably the *_C1 pair bounds the first byte and the *_C2
# pair bounds the second byte of a double-byte code -- confirm against
# genmap_support.DecodeMapWriter.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)
JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)
JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)
CP932P0_C1 = (0x81, 0x81)  # patches between shift-jis and cp932
CP932P0_C2 = (0x5f, 0xca)
CP932P1_C1 = (0x87, 0x87)  # CP932 P1
CP932P1_C2 = (0x40, 0x9c)
CP932P2_C1 = (0xed, 0xfc)  # CP932 P2
CP932P2_C2 = (0x40, 0xfc)

# Upstream locations of the mapping tables.  main() passes each one to
# open_mapping_file() together with the matching local path under
# python-mappings/.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
|
||
|
||
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into per-level decode maps.

    Every usable line consists of at least two whitespace-separated fields,
    ``<level>-<hex code>`` and ``U+<hex>`` (single code point) or
    ``U+<hex4>+<hex4>`` (body + modifier pair).  Text after ``#`` is ignored
    and lines with fewer than two fields are skipped.

    Args:
        fo: Iterable of text lines, e.g. an open mapping file.

    Returns:
        The tuple ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``.
        The first four are ``{code >> 8: {code & 0xff: value}}`` dicts for
        level 3 / level 4; the ``*_2`` maps hold code points in
        U+20000..U+2FFFF with 0x20000 subtracted.  ``decmap3_pair`` is keyed
        by the pair's body code point and maps to the same nested layout
        with the modifier as value.

    Raises:
        ValueError: If a field is not valid hexadecimal/decimal, if a single
            code point is outside the BMP and U+2xxxx, if the level is
            neither 3 nor 4, or if a pair appears on a level other than 3.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # The fields come from a data file: parse them as numbers instead of
        # eval()-ing them, so malformed input cannot execute code.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])
        m = None
        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_maps.get(level)
        else:  # pair
            if level != 3:
                raise ValueError("invalid map")
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            m = decmap3_pair.setdefault(uniprefix, {})

        # Check before touching the map, so an unsupported level or code
        # point is reported as ValueError rather than AttributeError on None.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
|
||
|
||
def main():
    """Generate the Japanese CJKCodecs mapping headers.

    Loads the JIS X 0208, JIS X 0212, CP932 and JIS X 0213 mapping tables,
    derives the encode maps from the decode maps and writes
    ``mappings_jp.h`` and ``mappings_jisx0213_pair.h`` (relative paths, so
    into the current working directory).

    Raises:
        SystemExit: If the JIS X 0213 plane 1 table does not map 0x2124 to
            U+FF0C.
        ValueError: If a JIS X 0213 plane 1 code that also exists in
            JIS X 0208 maps to a different code point and that code point is
            not the body of a pair.
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # JIS0208.TXT is read twice with different native-code columns.
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    # Encode maps are keyed by the high byte, then the low byte, of the
    # Unicode code point and hold the two-byte native code.
    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            row = cp932encmap.setdefault(code >> 8, {})
            # The first native code seen for a code point wins.
            if (code & 0xff) not in row:
                row[code & 0xff] = c1 << 8 | c2
    # Keep only the entries where CP932 differs from Shift-JIS.  Iterate over
    # copies because entries are deleted while walking the map.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    # Pair decode values pack the body in the upper 16 bits and the modifier
    # in the lower 16 bits.
    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})[c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            row = jisx0208_0212encmap.setdefault(code >> 8, {})
            # A collision is only reported; the JIS X 0212 entry then
            # overwrites the existing one.
            if (code & 0xff) in row:
                print("OOPS!!!", (code))
            row[code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Iterate over copies: entries identical to JIS X 0208 are removed from
    # jis3decmap inside the loop.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # Create the row on demand (as the branch below does);
                    # indexing it directly raised KeyError whenever no
                    # earlier entry had populated this high byte yet.
                    jisx0213bmpencmap.setdefault(code >> 8, {})[code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                row = jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    row[code & 0xff] = c1 << 8 | c2
                else:
                    row[code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Plane 2 entries are flagged with the 0x8000 bit.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})[code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})[code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Emit the (body, modifier, jis) triples in sorted order.
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# | ||
# genmap_korean.py: Korean Codecs Map Generator | ||
# | ||
# Original Author: Hye-Shik Chang <[email protected]> | ||
# Modified Author: Dong-hee Na <[email protected]> | ||
# | ||
import os | ||
|
||
from genmap_support import * | ||
|
||
|
||
# (low, high) pairs handed to DecodeMapWriter.update_decode_map() in main().
# NOTE(review): presumably the *_C1 pair bounds the first byte and the *_C2
# pair bounds the second byte of a double-byte code -- confirm against
# genmap_support.DecodeMapWriter.
KSX1001_C1 = (0x21, 0x7e)
KSX1001_C2 = (0x21, 0x7e)
UHCL1_C1 = (0x81, 0xa0)
UHCL1_C2 = (0x41, 0xfe)
UHCL2_C1 = (0xa1, 0xfe)
UHCL2_C2 = (0x41, 0xa0)
# Upstream location of the CP949 table; main() passes it to
# open_mapping_file() together with the local python-mappings/ path.
MAPPINGS_CP949 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT'
|
||
|
||
def main():
    """Generate ``mappings_kr.h`` from the CP949 mapping table.

    The CP949 decode map is split in two: codes whose bytes are both
    >= 0xa1 go to the KS X 1001 map (stored with the top bit of each byte
    cleared), everything else goes to the UHC extension map.  A single
    CP949 encode map covers both groups.
    """
    source = open_mapping_file('python-mappings/CP949.TXT', MAPPINGS_CP949)
    print("Loading Mapping File...")
    full_decmap = loadmap(source)

    uhcdecmap = {}
    ksx1001decmap = {}
    cp949encmap = {}
    for lead, trail_map in full_decmap.items():
        for trail, ucs in trail_map.items():
            enc_row = cp949encmap.setdefault(ucs >> 8, {})
            if lead < 0xa1 or trail < 0xa1:
                # UHC extension: the native code is stored unchanged.
                uhcdecmap.setdefault(lead, {})[trail] = ucs
                enc_row[ucs & 0xFF] = lead << 8 | trail
                continue
            # KS X 1001: strip the top bit of both bytes.
            ksx1001decmap.setdefault(lead & 0x7f, {})[trail & 0x7f] = ucs
            enc_row[ucs & 0xFF] = (lead << 8 | trail) & 0x7f7f

    # (progress message, map name, decode map, byte ranges to register)
    decode_jobs = (
        ("Generating KS X 1001 decode map...", "ksx1001", ksx1001decmap,
         ((KSX1001_C1, KSX1001_C2),)),
        ("Generating UHC decode map...", "cp949ext", uhcdecmap,
         ((UHCL1_C1, UHCL1_C2), (UHCL2_C1, UHCL2_C2))),
    )

    with open('mappings_kr.h', 'w') as header:
        print_autogen(header, os.path.basename(__file__))

        for banner, map_name, table, byte_ranges in decode_jobs:
            print(banner)
            dec_writer = DecodeMapWriter(header, map_name, table)
            for c1_range, c2_range in byte_ranges:
                dec_writer.update_decode_map(c1_range, c2_range)
            dec_writer.generate()

        print("Generating CP949 (includes KS X 1001) encode map...")
        enc_writer = EncodeMapWriter(header, "cp949", cp949encmap)
        enc_writer.generate()

    print("Done!")


if __name__ == '__main__':
    main()
Oops, something went wrong.