Skip to content

Commit

Permalink
Adjust to the CVTUTF code being gone
Browse files Browse the repository at this point in the history
I, Luke T. Shumaker, am the sole author of the added code.

I did not reference CVTUTF when writing it.  I did reference the
Unicode standard (15.0.0), the Wikipedia article on UTF-8, and the
Wikipedia article on UTF-16.  When I saw some tests fail, I did
reference the old deleted code (but a JSON-specific part, inherently
not as based on CVTUTF) to determine that script_safe should also
escape U+2028 and U+2029.

I targeted simplicity and clarity when writing the code--it can likely
be optimized.  In my mind, the obvious next optimization is to have it
combine contiguous non-escaped characters into just one call to
fbuffer_append(), instead of calling fbuffer_append() for each
character.

Regarding the use of the "modern" types `uint32_t`, `uint16_t`, and
`bool`:
 - ruby.h is guaranteed to give us uint32_t and uint16_t.
 - Since Ruby 3.0.0, ruby.h is guaranteed to give us bool... but we
   support down to Ruby 2.3.  However, ruby.h is guaranteed to define
   HAVE_STDBOOL_H whenever the C99 stdbool.h is available; so use that
   to include stdbool.h if we can, and if not then fall back to a copy
   of the same bool definition that Ruby 3.0.5 uses with C89.

(cherry picked from commit 8720b46)
  • Loading branch information
LukeShu committed May 15, 2024
1 parent ddbc241 commit b5fdd15
Show file tree
Hide file tree
Showing 5 changed files with 230 additions and 142 deletions.
145 changes: 102 additions & 43 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,50 +24,113 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth,
i_buffer_initial_length, i_dup;

/* Escapes the UTF16 character and stores the result in the buffer buf. */
static void unicode_escape(char *buf, UTF16 character)
/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
* Characters are JSON-escaped according to:
*
* - Always: ASCII control characters (0x00-0x1F), dquote, and
* backslash.
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only)
{
const char *digits = "0123456789abcdef";
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

buf[2] = digits[character >> 12];
buf[3] = digits[(character >> 8) & 0xf];
buf[4] = digits[(character >> 4) & 0xf];
buf[5] = digits[character & 0xf];
}
const char *in_utf8_str = RSTRING_PTR(in_string);
unsigned long in_utf8_len = RSTRING_LEN(in_string);
bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);

/* Formats the \uXXXX escape for the UTF16 value `character` into the
 * caller-supplied scratch array `buf` (by delegating to unicode_escape()),
 * and then appends the 6 bytes of `buf` to the FBuffer `buffer`. */
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6],
                                     UTF16 character)
{
    /* Number of bytes of buf that are appended to the output. */
    const unsigned long escape_len = 6;

    unicode_escape(buf, character);
    fbuffer_append(buffer, buf, escape_len);
}
unsigned long pos;

/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII
* and control characters are JSON escaped. */
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string)
{
const UTF8 *source = (UTF8 *) RSTRING_PTR(string);
const UTF8 *sourceEnd = source + RSTRING_LEN(string);
char buf[6] = { '\\', 'u' };
for (pos = 0; pos < in_utf8_len;) {
uint32_t ch;
unsigned long ch_len;
bool should_escape;

RB_GC_GUARD(string);
}
/* UTF-8 decoding */
if (in_is_ascii_only) {
ch = in_utf8_str[pos];
ch_len = 1;
} else {
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
if ((pos+ch_len) > in_utf8_len)
rb_raise(rb_path2class("JSON::GeneratorError"),
"partial character in source, but hit end");
for (i = 1; i < ch_len; i++) {
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
}
if (ch > 0x10FFFF)
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}

/* Converts string to a JSON string in FBuffer buffer, where only the
* characters required by the JSON standard are JSON escaped. The remaining
* characters (should be UTF8) are just passed through and appended to the
* result. */
static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string)
{
const char *ptr = RSTRING_PTR(string), *p;
unsigned long len = RSTRING_LEN(string), start = 0, end = 0;
const char *escape = NULL;
int escape_len;
unsigned char c;
char buf[6] = { '\\', 'u' };
int ascii_only = rb_enc_str_asciionly_p(string);
/* JSON policy */
should_escape =
(ch < 0x20) ||
(ch == '"') ||
(ch == '\\') ||
(out_ascii_only && (ch > 0x7F));

/* JSON encoding */
if (should_escape) {
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default:
if (ch <= 0xFFFF) {
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
} else {
uint16_t hi, lo;
ch -= 0x10000;
hi = 0xD800 + (uint16_t)(ch >> 10);
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);

scratch[2] = hexdig[hi >> 12];
scratch[3] = hexdig[(hi >> 8) & 0xf];
scratch[4] = hexdig[(hi >> 4) & 0xf];
scratch[5] = hexdig[hi & 0xf];

scratch[8] = hexdig[lo >> 12];
scratch[9] = hexdig[(lo >> 8) & 0xf];
scratch[10] = hexdig[(lo >> 4) & 0xf];
scratch[11] = hexdig[lo & 0xf];

fbuffer_append(out_buffer, scratch, 12);
}
}
} else {
fbuffer_append(out_buffer, &in_utf8_str[pos], ch_len);
}

pos += ch_len;
}
RB_GC_GUARD(in_string);
}

static char *fstrndup(const char *ptr, unsigned long len) {
Expand Down Expand Up @@ -625,11 +688,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
obj = rb_str_encode(obj, CEncoding_UTF_8, 0, Qnil);
}
#endif
if (state->ascii_only) {
convert_UTF8_to_JSON_ASCII(buffer, obj);
} else {
convert_UTF8_to_JSON(buffer, obj);
}
convert_UTF8_to_JSON(buffer, obj, state->ascii_only);
fbuffer_append_char(buffer, '"');
}

Expand Down
13 changes: 9 additions & 4 deletions ext/json/ext/generator/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@

#include "ruby.h"

/* Make `bool` available on every supported Ruby version: ruby.h only
 * guarantees `bool` itself since Ruby 3.0.0, so prefer the C99 header when
 * HAVE_STDBOOL_H says it is available, and otherwise fall back to a local
 * definition. */
#ifdef HAVE_STDBOOL_H
#include <stdbool.h>
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 and later: _Bool is a keyword, so it must not be typedef'd. */
#define bool _Bool
#else
/* This is the fallback definition from Ruby 3.0.5. */
typedef unsigned char _Bool;
#define bool _Bool
#endif

#ifdef HAVE_RUBY_RE_H
#include "ruby/re.h"
#else
Expand All @@ -22,10 +30,7 @@

#define option_given_p(opts, key) RTEST(rb_funcall(opts, i_key_p, 1, key))

static void unicode_escape(char *buf, UTF16 character);
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string);
static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string);
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only);
static char *fstrndup(const char *ptr, unsigned long len);

/* ruby api and some helpers */
Expand Down
Loading

0 comments on commit b5fdd15

Please sign in to comment.