Hi Luiz,
On 09/07/2025 15:11, Luiz Augusto von Dentz wrote:
Hi Frédéric,
On Wed, Jul 9, 2025 at 5:46 AM Frédéric Danis
<frederic.danis@xxxxxxxxxxxxx> wrote:
Move the duplicated code into a static validateutf8() helper and fix the
out-of-bounds access in the multi-byte character check.
---
src/shared/util.c | 56 +++++++++++++++--------------------------------
1 file changed, 18 insertions(+), 38 deletions(-)
diff --git a/src/shared/util.c b/src/shared/util.c
index 4780f26b6..36c06188f 100644
--- a/src/shared/util.c
+++ b/src/shared/util.c
@@ -1909,7 +1909,7 @@ char *strstrip(char *str)
return str;
}
-bool strisutf8(const char *str, size_t len)
+static bool validateutf8(const char *str, size_t len, size_t *invalid_index)
{
size_t i = 0;
@@ -1928,17 +1928,23 @@ bool strisutf8(const char *str, size_t len)
size = 3;
else if ((c & 0xF8) == 0xF0)
size = 4;
- else
+ else {
/* Invalid UTF-8 sequence */
+ if (invalid_index)
+ *invalid_index = i;
return false;
+ }
/* Check the following bytes to ensure they have the correct
* format.
*/
for (size_t j = 1; j < size; ++j) {
- if (i + j > len || (str[i + j] & 0xC0) != 0x80)
+ if (i + j >= len || (str[i + j] & 0xC0) != 0x80) {
/* Invalid UTF-8 sequence */
+ if (invalid_index)
+ *invalid_index = i;
return false;
+ }
}
/* Move to the next character */
@@ -1948,6 +1954,11 @@ bool strisutf8(const char *str, size_t len)
return true;
}
+bool strisutf8(const char *str, size_t len)
+{
+ return validateutf8(str, len, NULL);
+}
+
bool argsisutf8(int argc, char *argv[])
{
for (int i = 0; i < argc; i++) {
@@ -1962,42 +1973,11 @@ bool argsisutf8(int argc, char *argv[])
char *strtoutf8(char *str, size_t len)
{
- size_t i = 0;
-
- while (i < len) {
- unsigned char c = str[i];
- size_t size = 0;
-
- /* Check the first byte to determine the number of bytes in the
- * UTF-8 character.
- */
- if ((c & 0x80) == 0x00)
- size = 1;
- else if ((c & 0xE0) == 0xC0)
- size = 2;
- else if ((c & 0xF0) == 0xE0)
- size = 3;
- else if ((c & 0xF8) == 0xF0)
- size = 4;
- else
- /* Invalid UTF-8 sequence */
- goto done;
-
- /* Check the following bytes to ensure they have the correct
- * format.
- */
- for (size_t j = 1; j < size; ++j) {
- if (i + j > len || (str[i + j] & 0xC0) != 0x80)
- /* Invalid UTF-8 sequence */
- goto done;
- }
+ size_t invalid_index = 0;
- /* Move to the next character */
- i += size;
- }
+ if (!validateutf8(str, len, &invalid_index))
+ /* Truncate to the longest valid UTF-8 string */
+ memset(str + invalid_index, 0, len - invalid_index);
-done:
- /* Truncate to the longest valid UTF-8 string */
- memset(str + i, 0, len - i);
return str;
}
--
2.43.0
I did something similar yesterday:
https://patchwork.kernel.org/project/bluetooth/patch/20250708174628.2949030-1-luiz.dentz@xxxxxxxxx/
Let me know if you have any comments.
Sorry, I missed it; you can discard mine.
--
Frédéric Danis
Senior Software Engineer
Collabora Ltd.
Platinum Building, St John's Innovation Park, Cambridge CB4 0DS, United Kingdom
Registered in England & Wales, no. 5513718