-
Notifications
You must be signed in to change notification settings - Fork 933
/
utf8.cpp
260 lines (217 loc) · 9.46 KB
/
utf8.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#include <components/misc/strings/format.hpp>
#include "utf8.hpp"
namespace
{
constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2
// constexpr uint32_t MAXUTF = 0x7FFFFFFFu;
constexpr uint32_t MAXUNICODE = 0x10FFFFu;
inline bool isNilOrNone(const sol::stack_proxy arg)
{
return (arg.get_type() == sol::type::lua_nil || arg.get_type() == sol::type::none);
}
inline std::int64_t getInteger(const sol::stack_proxy arg, const size_t n, std::string_view name)
{
double integer;
if (!arg.is<double>())
throw std::runtime_error(Misc::StringUtils::format("bad argument #%i to '%s' (number expected, got %s)", n,
name, sol::type_name(arg.lua_state(), arg.get_type())));
if (std::modf(arg, &integer) != 0)
throw std::runtime_error(
Misc::StringUtils::format("bad argument #%i to '%s' (number has no integer representation)", n, name));
return static_cast<std::int64_t>(integer);
}
// If the input 'pos' is negative, it is treated as counting from the end of the string,
// where -1 represents the last character position, -2 represents the second-to-last position,
// and so on. If 'pos' is non-negative, it is used as-is.
inline void relativePosition(int64_t& pos, const size_t len)
{
if (pos < 0)
pos = std::max<int64_t>(0, pos + len + 1);
}
inline void codepointToUTF8(char32_t codepoint, std::string& str)
{
if (codepoint <= 0x7Fu)
{
str.push_back(static_cast<char>(codepoint));
}
else if (codepoint <= 0x7FFu)
{
str.push_back(static_cast<char>(0xC0 | ((codepoint & 0x7C0) >> 6)));
str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
}
else if (codepoint <= 0xFFFFu)
{
str.push_back(static_cast<char>(0xE0 | ((codepoint & 0xF000) >> 12)));
str.push_back(static_cast<char>(0x80 | ((codepoint & 0xFC0) >> 6)));
str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
}
else if (codepoint <= MAXUNICODE)
{
str.push_back(static_cast<char>(0xF0 | ((codepoint & 0x1C0000) >> 18)));
str.push_back(static_cast<char>(0x80 | ((codepoint & 0x3F000) >> 12)));
str.push_back(static_cast<char>(0x80 | ((codepoint & 0xFC0) >> 6)));
str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
}
else
throw std::runtime_error("Invalid codepoint");
}
// returns: first - character pos in bytes, second - character codepoint
std::pair<int64_t, int64_t> decodeNextUTF8Character(std::string_view s, std::vector<int64_t>& pos_byte)
{
const int64_t pos = pos_byte.back() - 1;
const unsigned char ch = static_cast<unsigned char>(s[pos]);
int64_t codepoint = -1;
size_t byteSize = 0;
if ((ch & 0b10000000) == 0)
{
codepoint = ch;
byteSize = 1;
}
else if ((ch & 0b11100000) == 0b11000000)
{
codepoint = ch & 0b00011111;
byteSize = 2;
}
else if ((ch & 0b11110000) == 0b11100000)
{
codepoint = ch & 0b00001111;
byteSize = 3;
}
else if ((ch & 0b11111000) == 0b11110000)
{
codepoint = ch & 0b00000111;
byteSize = 4;
}
// construct codepoint for non-ascii
for (size_t i = 1; i < byteSize; ++i)
{
// if not a continuation byte
if ((pos + i) >= s.size() || (static_cast<unsigned char>(s[pos + i]) & 0b11000000) != 0b10000000)
{
return std::make_pair(0, -1);
}
codepoint = (codepoint << 6) | (static_cast<unsigned char>(s[pos + i]) & 0b00111111);
}
std::pair<size_t, int64_t> res = std::make_pair(pos_byte.back(), codepoint);
pos_byte.push_back(pos_byte.back() + byteSize); /* the next character (if exists) starts at this byte */
return res;
}
}
namespace LuaUtf8
{
sol::table initUtf8Package(sol::state_view& lua)
{
sol::table utf8(lua, sol::create);
utf8["charpattern"] = UTF8PATT;
utf8["char"] = [](const sol::variadic_args args) -> std::string {
std::string result{};
for (size_t i = 0; i < args.size(); ++i)
{
int64_t codepoint = getInteger(args[i], (i + 1), "char");
if (codepoint < 0 || codepoint > MAXUNICODE)
throw std::runtime_error(
"bad argument #" + std::to_string(i + 1) + " to 'char' (value out of range)");
codepointToUTF8(static_cast<char32_t>(codepoint), result);
}
return result;
};
utf8["codes"] = [](std::string_view s) {
return sol::as_function(
[s, pos_byte = std::vector<int64_t>{ 1 }]() mutable -> sol::optional<std::pair<int64_t, int64_t>> {
if (pos_byte.back() <= static_cast<int64_t>(s.size()))
{
const auto pair = decodeNextUTF8Character(s, pos_byte);
if (pair.second == -1)
throw std::runtime_error(
"Invalid UTF-8 code at position " + std::to_string(pos_byte.size()));
return pair;
}
return sol::nullopt;
});
};
utf8["len"] = [](std::string_view s,
const sol::variadic_args args) -> std::variant<size_t, std::pair<sol::object, int64_t>> {
const size_t len = s.size();
int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len");
int64_t fv = isNilOrNone(args[1]) ? -1 : getInteger(args[1], 3, "len");
relativePosition(iv, len);
relativePosition(fv, len);
if (iv <= 0)
throw std::runtime_error("bad argument #2 to 'len' (initial position out of bounds)");
if (fv > static_cast<int64_t>(len))
throw std::runtime_error("bad argument #3 to 'len' (final position out of bounds)");
if (len == 0)
return len;
std::vector<int64_t> pos_byte = { iv };
while (pos_byte.back() <= fv)
{
if (decodeNextUTF8Character(s, pos_byte).second == -1)
return std::pair(sol::lua_nil, pos_byte.back());
}
return pos_byte.size() - 1;
};
utf8["codepoint"]
= [](std::string_view s, const sol::variadic_args args) -> sol::as_returns_t<std::vector<int64_t>> {
size_t len = s.size();
int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "codepoint");
int64_t fv = isNilOrNone(args[1]) ? iv : getInteger(args[1], 3, "codepoint");
relativePosition(iv, len);
relativePosition(fv, len);
if (iv <= 0)
throw std::runtime_error("bad argument #2 to 'codepoint' (initial position out of bounds)");
if (fv > static_cast<int64_t>(len))
throw std::runtime_error("bad argument #3 to 'codepoint' (final position out of bounds)");
if (iv > fv)
return sol::as_returns(std::vector<int64_t>{}); /* empty interval; return nothing */
std::vector<int64_t> pos_byte = { iv };
std::vector<int64_t> codepoints;
while (pos_byte.back() <= fv)
{
codepoints.push_back(decodeNextUTF8Character(s, pos_byte).second);
if (codepoints.back() == -1)
throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.size()));
}
return sol::as_returns(std::move(codepoints));
};
utf8["offset"]
= [](std::string_view s, const int64_t n, const sol::variadic_args args) -> sol::optional<int64_t> {
size_t len = s.size();
int64_t iv;
if (isNilOrNone(args[0]))
{
if (n >= 0)
iv = 1;
else
iv = s.size() + 1;
}
else
iv = getInteger(args[0], 3, "offset");
std::vector<int64_t> pos_byte = { 1 };
relativePosition(iv, len);
if (iv > static_cast<int64_t>(len) + 1)
throw std::runtime_error("bad argument #3 to 'offset' (position out of bounds)");
while (pos_byte.back() <= static_cast<int64_t>(len))
decodeNextUTF8Character(s, pos_byte);
for (auto it = pos_byte.begin(); it != pos_byte.end(); ++it)
{
if (*it == iv)
{
if (n <= 0 && it + n >= pos_byte.begin())
return *(it + n);
if (n > 0 && it + n - 1 < pos_byte.end())
return *(it + n - 1);
break;
}
else if (*it > iv) /* a continuation byte */
{
if (n == 0)
return *(it - 1); /* special case */
else
throw std::runtime_error("initial position is a continuation byte");
}
}
return sol::nullopt;
};
return utf8;
}
}