Move the UTF-8 validation loop duplicated in strisutf8() and strtoutf8() into a static validateutf8() helper that reports the index of the first invalid sequence, and fix an off-by-one in the continuation-byte bounds check (i + j > len became i + j >= len), which allowed reading str[len], one byte past the end of the buffer. --- src/shared/util.c | 56 +++++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/src/shared/util.c b/src/shared/util.c index 4780f26b6..36c06188f 100644 --- a/src/shared/util.c +++ b/src/shared/util.c @@ -1909,7 +1909,7 @@ char *strstrip(char *str) return str; } -bool strisutf8(const char *str, size_t len) +static bool validateutf8(const char *str, size_t len, size_t *invalid_index) { size_t i = 0; @@ -1928,17 +1928,23 @@ bool strisutf8(const char *str, size_t len) size = 3; else if ((c & 0xF8) == 0xF0) size = 4; - else + else { /* Invalid UTF-8 sequence */ + if (invalid_index) + *invalid_index = i; return false; + } /* Check the following bytes to ensure they have the correct * format. */ for (size_t j = 1; j < size; ++j) { - if (i + j > len || (str[i + j] & 0xC0) != 0x80) + if (i + j >= len || (str[i + j] & 0xC0) != 0x80) { /* Invalid UTF-8 sequence */ + if (invalid_index) + *invalid_index = i; return false; + } } /* Move to the next character */ @@ -1948,6 +1954,11 @@ bool strisutf8(const char *str, size_t len) return true; } +bool strisutf8(const char *str, size_t len) +{ + return validateutf8(str, len, NULL); +} + bool argsisutf8(int argc, char *argv[]) { for (int i = 0; i < argc; i++) { @@ -1962,42 +1973,11 @@ bool argsisutf8(int argc, char *argv[]) char *strtoutf8(char *str, size_t len) { - size_t i = 0; - - while (i < len) { - unsigned char c = str[i]; - size_t size = 0; - - /* Check the first byte to determine the number of bytes in the - * UTF-8 character. - */ - if ((c & 0x80) == 0x00) - size = 1; - else if ((c & 0xE0) == 0xC0) - size = 2; - else if ((c & 0xF0) == 0xE0) - size = 3; - else if ((c & 0xF8) == 0xF0) - size = 4; - else - /* Invalid UTF-8 sequence */ - goto done; - - /* Check the following bytes to ensure they have the correct - * format. 
- */ - for (size_t j = 1; j < size; ++j) { - if (i + j > len || (str[i + j] & 0xC0) != 0x80) - /* Invalid UTF-8 sequence */ - goto done; - } + size_t invalid_index = 0; - /* Move to the next character */ - i += size; - } + if (!validateutf8(str, len, &invalid_index)) + /* Truncate to the longest valid UTF-8 string */ + memset(str + invalid_index, 0, len - invalid_index); -done: - /* Truncate to the longest valid UTF-8 string */ - memset(str + i, 0, len - i); return str; } -- 2.43.0