@@ -311,11 +311,14 @@ mpack_error_t mpack_track_destroy(mpack_track_t* track, bool cancel) {
311311
312312
313313/* The below code is from Bjoern Hoehrmann's Flexible and Economical */
314- /* UTF-8 decoder, modified to support MPack inlining and add the mpack prefix. */
314+ /* UTF-8 decoder, modified to add the mpack prefix. */
315315
316316/* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> */
317317/* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
318318
319+ #define MPACK_UTF8_ACCEPT 0
320+ #define MPACK_UTF8_REJECT 12
321+
319322static const uint8_t mpack_utf8d [] = {
320323 /* The first part of the table maps bytes to character classes, which */
321324 /* reduces the size of the transition table and creates bitmasks. */
@@ -337,7 +340,25 @@ static const uint8_t mpack_utf8d[] = {
337340 12 ,36 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,
338341};
339342
340- uint32_t mpack_utf8_decode (uint32_t * state , uint32_t * codep , uint8_t byte ) {
343+ /**
344+ * Parses one byte from a UTF-8 stream.
345+ *
346+ * Returns and sets state to:
347+ * - MPACK_UTF8_ACCEPT if the byte completes a valid Unicode code point, placing it in codep
348+ * - MPACK_UTF8_REJECT if the byte is invalid UTF-8
349+ * - something else if more bytes are needed to form a valid character
350+ *
351+ * If more bytes are needed, this should be called again with the next byte
352+ * in the string. state and codep should not be modified, since they will
353+ * contain the partially read code point.
354+ *
355+ * The initial state should be set to MPACK_UTF8_ACCEPT before parsing a string.
356+ *
357+ * This does not accept any UTF-8 variant such as Modified UTF-8, CESU-8 or
358+ * WTF-8. Overlong sequences and UTF-16 surrogates will be rejected. Only
359+ * pure UTF-8 is accepted.
360+ */
361+ static inline uint32_t mpack_utf8_decode (uint32_t * state , uint32_t * codep , uint8_t byte ) {
341362 uint32_t type = mpack_utf8d [byte ];
342363
343364 * codep = (* state != MPACK_UTF8_ACCEPT ) ?
@@ -352,7 +373,7 @@ uint32_t mpack_utf8_decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
352373
353374
354375
355- bool mpack_utf8_check (char * str , size_t bytes ) {
376+ bool mpack_utf8_check (const char * str , size_t bytes ) {
356377 uint32_t state = MPACK_UTF8_ACCEPT ;
357378 uint32_t codepoint ;
358379 for (size_t i = 0 ; i < bytes ; ++ i )
@@ -361,7 +382,7 @@ bool mpack_utf8_check(char* str, size_t bytes) {
361382 return state == MPACK_UTF8_ACCEPT ;
362383}
363384
364- bool mpack_utf8_check_no_null (char * str , size_t bytes ) {
385+ bool mpack_utf8_check_no_null (const char * str , size_t bytes ) {
365386 uint32_t state = MPACK_UTF8_ACCEPT ;
366387 uint32_t codepoint ;
367388 for (size_t i = 0 ; i < bytes ; ++ i )
@@ -370,7 +391,7 @@ bool mpack_utf8_check_no_null(char* str, size_t bytes) {
370391 return state == MPACK_UTF8_ACCEPT ;
371392}
372393
373- bool mpack_str_check_no_null (char * str , size_t bytes ) {
394+ bool mpack_str_check_no_null (const char * str , size_t bytes ) {
374395 for (size_t i = 0 ; i < bytes ; ++ i )
375396 if (str [i ] == '\0' )
376397 return false;
0 commit comments