Re: [PATCH BlueZ] shared/util: Refactor code from strisutf8 and strtoutf8

Luiz Augusto von Dentz <luiz.dentz@xxxxxxxxx> · Wed, 9 Jul 2025 10:11:01 -0300

Hi Frédéric,

On Wed, Jul 9, 2025 at 5:46 AM Frédéric Danis
<frederic.danis@xxxxxxxxxxxxx> wrote:
>
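> Move the code duplicated between strisutf8() and strtoutf8() into a
> static validateutf8() helper, and fix an off-by-one bounds check on the
> continuation bytes of multi-byte characters ("i + j > len" allowed a
> read of str[len]; it is now "i + j >= len").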
> ---
>  src/shared/util.c | 56 +++++++++++++++--------------------------------
>  1 file changed, 18 insertions(+), 38 deletions(-)
>
> diff --git a/src/shared/util.c b/src/shared/util.c
> index 4780f26b6..36c06188f 100644
> --- a/src/shared/util.c
> +++ b/src/shared/util.c
> @@ -1909,7 +1909,7 @@ char *strstrip(char *str)
>         return str;
>  }
>
> -bool strisutf8(const char *str, size_t len)
> +static bool validateutf8(const char *str, size_t len, size_t *invalid_index)
>  {
>         size_t i = 0;
>
> @@ -1928,17 +1928,23 @@ bool strisutf8(const char *str, size_t len)
>                         size = 3;
>                 else if ((c & 0xF8) == 0xF0)
>                         size = 4;
> -               else
> +               else {
>                         /* Invalid UTF-8 sequence */
> +                       if (invalid_index)
> +                               *invalid_index = i;
>                         return false;
> +               }
>
>                 /* Check the following bytes to ensure they have the correct
>                  * format.
>                  */
>                 for (size_t j = 1; j < size; ++j) {
> -                       if (i + j > len || (str[i + j] & 0xC0) != 0x80)
> +                       if (i + j >= len || (str[i + j] & 0xC0) != 0x80) {
>                                 /* Invalid UTF-8 sequence */
> +                               if (invalid_index)
> +                                       *invalid_index = i;
>                                 return false;
> +                       }
>                 }
>
>                 /* Move to the next character */
> @@ -1948,6 +1954,11 @@ bool strisutf8(const char *str, size_t len)
>         return true;
>  }
>
> +bool strisutf8(const char *str, size_t len)
> +{
> +       return validateutf8(str, len, NULL);
> +}
> +
>  bool argsisutf8(int argc, char *argv[])
>  {
>         for (int i = 0; i < argc; i++) {
> @@ -1962,42 +1973,11 @@ bool argsisutf8(int argc, char *argv[])
>
>  char *strtoutf8(char *str, size_t len)
>  {
> -       size_t i = 0;
> -
> -       while (i < len) {
> -               unsigned char c = str[i];
> -               size_t size = 0;
> -
> -               /* Check the first byte to determine the number of bytes in the
> -                * UTF-8 character.
> -                */
> -               if ((c & 0x80) == 0x00)
> -                       size = 1;
> -               else if ((c & 0xE0) == 0xC0)
> -                       size = 2;
> -               else if ((c & 0xF0) == 0xE0)
> -                       size = 3;
> -               else if ((c & 0xF8) == 0xF0)
> -                       size = 4;
> -               else
> -                       /* Invalid UTF-8 sequence */
> -                       goto done;
> -
> -               /* Check the following bytes to ensure they have the correct
> -                * format.
> -                */
> -               for (size_t j = 1; j < size; ++j) {
> -                       if (i + j > len || (str[i + j] & 0xC0) != 0x80)
> -                               /* Invalid UTF-8 sequence */
> -                               goto done;
> -               }
> +       size_t invalid_index = 0;
>
> -               /* Move to the next character */
> -               i += size;
> -       }
> +       if (!validateutf8(str, len, &invalid_index))
> +               /* Truncate to the longest valid UTF-8 string */
> +               memset(str + invalid_index, 0, len - invalid_index);
>
> -done:
> -       /* Truncate to the longest valid UTF-8 string */
> -       memset(str + i, 0, len - i);
>         return str;
>  }
> --
> 2.43.0
>

I did something similar yesterday:

https://patchwork.kernel.org/project/bluetooth/patch/20250708174628.2949030-1-luiz.dentz@xxxxxxxxx/

Let me know if you have any comments.

-- 
Luiz Augusto von Dentz