Skip to content

Commit

Permalink
deps: ICU 60 bump
Browse files Browse the repository at this point in the history
- Update to released ICU 60.1, including:
  - CLDR 32 (many new languages and data improvements)
  - Unicode 10 (8,518 new characters, including four new scripts,
  7,494 new Han characters, and 56 new emoji characters)
  - UTF-8 malformed bytes now handled according to W3C/WHATWG spec

Fixes: nodejs#15540
PR-URL: nodejs#16876
Reviewed-By: James M Snell <[email protected]>
Reviewed-By: Michael Dawson <[email protected]>
  • Loading branch information
srl295 committed Nov 10, 2017
1 parent 3b3ceaf commit 44d3e17
Show file tree
Hide file tree
Showing 254 changed files with 23,876 additions and 10,365 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ The externally maintained libraries used by Node.js are:
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyrighy (c) 1999 TaBE Project.
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
Expand Down
4 changes: 2 additions & 2 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -1092,8 +1092,8 @@ def glob_to_var(dir_base, dir_sub, patch_dir):
def configure_intl(o):
icus = [
{
'url': 'https://ssl.icu-project.org/files/icu4c/59.1/icu4c-59_1-src.zip',
'md5': '29a41f9bb576b06c7eef0487a84a7674',
'url': 'https://ssl.icu-project.org/files/icu4c/60.1/icu4c-60_1-src.zip',
'md5': 'e6cb990ac2a3161d31a3def8435f80cb',
},
]
def icu_download(path):
Expand Down
2 changes: 1 addition & 1 deletion deps/icu-small/LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ property of their respective owners.
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyrighy (c) 1999 TaBE Project.
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
Expand Down
4 changes: 2 additions & 2 deletions deps/icu-small/README-SMALL-ICU.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Small ICU sources - auto generated by shrink-icu-src.py

This directory contains the ICU subset used by --with-intl=small-icu (the default)
It is a strict subset of ICU 59 source files with the following exception(s):
* deps/icu-small/source/data/in/icudt59l.dat : Reduced-size data file
It is a strict subset of ICU 60 source files with the following exception(s):
* deps/icu-small/source/data/in/icudt60l.dat : Reduced-size data file


To rebuild this directory, see ../../tools/icu/README.md
108 changes: 61 additions & 47 deletions deps/icu-small/source/common/bmpset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN

BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list(parentList), listLength(parentListLength) {
uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
uprv_memset(table7FF, 0, sizeof(table7FF));
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));

Expand All @@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
}
list4kStarts[0x11]=listLength-1;
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);

initBits();
overrideIllegal();
}

BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
containsFFFD(otherBMPSet.containsFFFD),
list(newParentList), listLength(newParentListLength) {
uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
Expand Down Expand Up @@ -120,21 +122,38 @@ void BMPSet::initBits() {
UChar32 start, limit;
int32_t listIndex=0;

// Set asciiBytes[].
// Set latin1Contains[].
do {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(start>=0x80) {
if(start>=0x100) {
break;
}
do {
asciiBytes[start++]=1;
} while(start<limit && start<0x80);
} while(limit<=0x80);
latin1Contains[start++]=1;
} while(start<limit && start<0x100);
} while(limit<=0x100);

// Find the first range overlapping with (or after) 80..FF again,
// to include them in table7FF as well.
for(listIndex=0;;) {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(limit>0x80) {
if(start<0x80) {
start=0x80;
}
break;
}
}

// Set table7FF[].
while(start<0x800) {
Expand Down Expand Up @@ -204,19 +223,14 @@ void BMPSet::initBits() {
* for faster validity checking at runtime.
* No need to set 0 values where they were reset to 0 in the constructor
* and not modified by initBits().
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* Need to set 0 values for surrogates D800..DFFF.
*/
void BMPSet::overrideIllegal() {
uint32_t bits, mask;
int32_t i;

if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
// contains(FFFD)==TRUE
for(i=0x80; i<0xc0; ++i) {
asciiBytes[i]=1;
}

if(containsFFFD) {
bits=3; // Lead bytes 0xC0 and 0xC1.
for(i=0; i<64; ++i) {
table7FF[i]|=bits;
Expand All @@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
}
} else {
// contains(FFFD)==FALSE
mask=~(0x10001<<0xd); // Lead byte 0xED.
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]&=mask;
Expand Down Expand Up @@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {

UBool
BMPSet::contains(UChar32 c) const {
if((uint32_t)c<=0x7f) {
return (UBool)asciiBytes[c];
if((uint32_t)c<=0xff) {
return (UBool)latin1Contains[c];
} else if((uint32_t)c<=0x7ff) {
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
Expand Down Expand Up @@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span
do {
c=*s;
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
Expand Down Expand Up @@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span not
do {
c=*s;
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
Expand Down Expand Up @@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
Expand Down Expand Up @@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span not
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
Expand Down Expand Up @@ -497,22 +510,22 @@ const uint8_t *
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// Initial all-ASCII span.
if(spanCondition) {
do {
if(!asciiBytes[b] || ++s==limit) {
if(!latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b] || ++s==limit) {
if(latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
length=(int32_t)(limit-s);
}
Expand Down Expand Up @@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
Expand All @@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi

while(s<limit) {
b=*s;
if(b<0xc0) {
// ASCII; or trail bytes with the result of contains(FFFD).
if(U8_IS_SINGLE(b)) {
// ASCII
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
}
}
++s; // Advance past the lead byte.
Expand Down Expand Up @@ -619,16 +632,17 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
asciiBytes[0x80]
containsFFFD
) != spanCondition
) {
return s-1;
}
s+=3;
continue;
}
} else /* 0xc0<=b<0xe0 */ {
} else {
if( /* handle U+0000..U+07FF inline */
b>=0xc0 &&
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
Expand All @@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
return s-1;
}
}
Expand All @@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon

do {
b=s[--length];
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// ASCII sub-span
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
}

Expand Down
15 changes: 8 additions & 7 deletions deps/icu-small/source/common/bmpset.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
* Helper class for frozen UnicodeSets, implements contains() and span()
* optimized for BMP code points. Structured to be UTF-8-friendly.
*
* ASCII: Look up bytes.
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
* with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
* Supplementary characters: Binary search over
* the supplementary part of the parent set's inversion list.
*/
class BMPSet : public UMemory {
public:
Expand Down Expand Up @@ -96,12 +97,12 @@ class BMPSet : public UMemory {
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;

/*
* One byte per ASCII character, or trail byte in lead position.
* 0 or 1 for ASCII characters.
* The value for trail bytes is the result of contains(FFFD)
* for faster validity checking at runtime.
* One byte 0 or 1 per Latin-1 character.
*/
UBool asciiBytes[0xc0];
UBool latin1Contains[0x100];

/* TRUE if contains(U+FFFD). */
UBool containsFFFD;

/*
* One bit per code point from U+0000..U+07FF.
Expand Down
Loading

0 comments on commit 44d3e17

Please sign in to comment.