-
Notifications
You must be signed in to change notification settings - Fork 933
/
utf8stream.hpp
200 lines (158 loc) · 5.1 KB
/
utf8stream.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#ifndef MISC_UTF8ITER_HPP
#define MISC_UTF8ITER_HPP
#include <cstdint>
#include <cstring>
#include <string>
#include <string_view>
#include <tuple>
/// Forward-only reader that decodes a UTF-8 encoded byte range one code point at a time,
/// bundled with a few static helpers (decoding, limited lower-casing, lead-byte inspection).
///
/// The stream does not own the bytes it reads: the range handed to a constructor must stay
/// valid for as long as the stream is used.
class Utf8Stream
{
public:
    using UnicodeChar = uint32_t;
    using Point = unsigned char const*;

    // static const unicode_char sBadChar = 0xFFFFFFFF; gcc can't handle this
    /// Sentinel returned whenever a code point cannot be decoded.
    static UnicodeChar sBadChar() { return UnicodeChar(0xFFFFFFFF); }

    /// Reads the bytes in [begin, end).
    Utf8Stream(Point begin, Point end)
        : cur(begin)
        , nxt(begin)
        , end(end)
        , val(Utf8Stream::sBadChar())
    {
    }

    /// Reads a NUL-terminated string; the terminator itself is not part of the stream.
    Utf8Stream(const char* str)
        : Utf8Stream(reinterpret_cast<Point>(str), reinterpret_cast<Point>(str) + std::strlen(str))
    {
    }

    /// Reads the bytes in [range.first, range.second).
    Utf8Stream(std::pair<Point, Point> range)
        : Utf8Stream(range.first, range.second)
    {
    }

    /// Reads the bytes viewed by \a str.
    Utf8Stream(std::string_view str)
        : Utf8Stream(reinterpret_cast<Point>(str.data()), reinterpret_cast<Point>(str.data() + str.size()))
    {
    }

    /// True once every byte of the range has been consumed.
    bool eof() const { return cur == end; }

    /// Position of the first byte that has not been consumed yet.
    Point current() const { return cur; }

    /// Returns the next code point without consuming it.
    /// Yields sBadChar() for a malformed sequence or when the stream is at its end.
    UnicodeChar peek()
    {
        // cur == nxt means nothing has been decoded for the current position yet.
        if (cur == nxt)
            next();
        return val;
    }

    /// Returns the next code point and moves past it.
    /// Yields sBadChar() for a malformed sequence or when the stream is at its end.
    UnicodeChar consume()
    {
        if (cur == nxt)
            next();
        cur = nxt;
        return val;
    }

    /// True when \a value is a single-byte (7-bit) character.
    static bool isAscii(unsigned char value) { return (value & 0x80) == 0; }

    /// Decodes the code point starting at \a cur, never reading at or beyond \a end.
    /// \return the decoded value (or sBadChar()) and the position where decoding stopped.
    ///         For an empty range the position is returned unchanged.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end)
    {
        // Nothing left to read: dereferencing cur here would run past the buffer.
        if (cur >= end)
            return std::make_pair(sBadChar(), cur);

        if (isAscii(*cur))
        {
            UnicodeChar chr = *cur++;
            return std::make_pair(chr, cur);
        }

        const auto [octets, chr] = getOctetCount(*cur++);
        return decode(cur, end, chr, octets);
    }

    /// Finishes decoding a multi-byte sequence whose lead byte has already been read.
    /// \param cur    first continuation byte
    /// \param end    one past the last readable byte
    /// \param chr    payload bits taken from the lead byte
    /// \param octets number of continuation bytes expected (values above 5 are rejected)
    /// \return the decoded value and the position just past the sequence, or sBadChar() and
    ///         the position where decoding gave up.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end, UnicodeChar chr, std::size_t octets)
    {
        if (octets > 5)
            return std::make_pair(sBadChar(), cur);

        // Compare lengths rather than forming cur + octets, which may lie beyond the buffer.
        if (cur > end || static_cast<std::size_t>(end - cur) < octets)
            return std::make_pair(sBadChar(), cur);

        for (Point const eoc = cur + octets; cur != eoc; ++cur)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair(sBadChar(), cur);
            chr = (chr << 6) | UnicodeChar(*cur & 0x3F);
        }

        return std::make_pair(chr, cur);
    }

    /// Lower-cases a single code point. Only the ranges listed below are handled;
    /// every other value is returned unchanged.
    static UnicodeChar toLowerUtf8(UnicodeChar ch)
    {
        // Russian alphabet
        if (ch >= 0x0410 && ch < 0x0430)
            return ch + 0x20;

        // Cyrillic IO character
        if (ch == 0x0401)
            return ch + 0x50;

        // Latin alphabet: 'A'..'Z' only, so that '[', '\\', ']', '^' and '_' stay untouched
        if (ch >= 0x41 && ch <= 0x5a)
            return ch + 0x20;

        // German characters
        if (ch == 0xc4 || ch == 0xd6 || ch == 0xdc)
            return ch + 0x20;
        if (ch == 0x1e9e)
            return 0xdf;

        // TODO: probably we will need to support characters from other languages

        return ch;
    }

    /// Decodes \a str as UTF-8, lower-cases every code point with toLowerUtf8() and
    /// re-encodes the result.
    /// Values above 0xFFFF, including the sBadChar() sentinel produced for malformed input,
    /// are written as four bytes with any bits above the 21st discarded.
    static std::string lowerCaseUtf8(std::string_view str)
    {
        if (str.empty())
            return std::string{ str };

        std::string out;
        out.reserve(str.length());

        Utf8Stream stream(str);
        while (!stream.eof())
            appendUtf8(out, toLowerUtf8(stream.consume()));

        return out;
    }

    /// Inspects the lead byte of a multi-byte sequence.
    /// \return the number of continuation bytes that follow (6 when \a octet matches no
    ///         lead-byte pattern) and the payload bits carried by \a octet itself.
    static std::pair<std::size_t, UnicodeChar> getOctetCount(unsigned char octet)
    {
        std::size_t octets;

        // Start with the 110xxxxx pattern and grow the prefix by one bit per iteration.
        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            mark = static_cast<unsigned char>((mark >> 1) | 0x80);
            mask = static_cast<unsigned char>((mask >> 1) | 0x80);
        }

        return std::make_pair(octets, static_cast<UnicodeChar>(octet & ~mask));
    }

private:
    /// Appends \a character to \a out using one to four bytes depending on its magnitude.
    static void appendUtf8(std::string& out, UnicodeChar character)
    {
        if (character <= 0x7f)
        {
            out.push_back(static_cast<char>(character));
        }
        else if (character <= 0x7ff)
        {
            out.push_back(static_cast<char>(0xc0 | ((character >> 6) & 0x1f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
        else if (character <= 0xffff)
        {
            out.push_back(static_cast<char>(0xe0 | ((character >> 12) & 0x0f)));
            out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
        else
        {
            out.push_back(static_cast<char>(0xf0 | ((character >> 18) & 0x07)));
            out.push_back(static_cast<char>(0x80 | ((character >> 12) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
    }

    /// Decodes the code point at nxt into val and moves nxt past it.
    void next() { std::tie(val, nxt) = decode(nxt, end); }

    Point cur; ///< first unconsumed byte
    Point nxt; ///< end of the code point cached in val; equals cur when nothing is cached
    Point end; ///< one past the last byte of the range
    UnicodeChar val; ///< most recently decoded code point
};
#endif