Change utf8_count_chars return type to int and drop thesys/types.h, this way the function does not depend on posix types. Signed-off-by: Fabio Baltieri <fabiobaltieri@google.com>
127 lines
2.9 KiB
C
127 lines
2.9 KiB
C
/*
|
|
* Copyright (c) 2021 Nordic Semiconductor ASA
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
#include <zephyr/sys/__assert.h>
|
|
#include <errno.h>
|
|
#include <zephyr/sys/util_utf8.h>
|
|
|
|
#define ASCII_CHAR 0x7F
|
|
#define SEQUENCE_FIRST_MASK 0xC0
|
|
#define SEQUENCE_LEN_2_BYTE 0xC0
|
|
#define SEQUENCE_LEN_3_BYTE 0xE0
|
|
#define SEQUENCE_LEN_4_BYTE 0xF0
|
|
#define MSB_SET 0x80
|
|
|
|
char *utf8_trunc(char *utf8_str)
|
|
{
|
|
const size_t len = strlen(utf8_str);
|
|
|
|
if (len == 0U) {
|
|
/* no-op */
|
|
return utf8_str;
|
|
}
|
|
|
|
char *last_byte_p = utf8_str + len - 1U;
|
|
uint8_t bytes_truncated;
|
|
char seq_start_byte;
|
|
|
|
if ((*last_byte_p & ASCII_CHAR) == *last_byte_p) {
|
|
/* Not part of an UTF8 sequence, return */
|
|
return utf8_str;
|
|
}
|
|
|
|
/* Find the starting byte and NULL-terminate other bytes */
|
|
bytes_truncated = 0;
|
|
while ((*last_byte_p & SEQUENCE_FIRST_MASK) != SEQUENCE_FIRST_MASK &&
|
|
last_byte_p > utf8_str) {
|
|
last_byte_p--;
|
|
bytes_truncated++;
|
|
}
|
|
bytes_truncated++; /* include the starting byte */
|
|
|
|
/* Verify if the last character actually need to be truncated
|
|
* Handles the case where the number of bytes in the last UTF8-char
|
|
* matches the number of bytes we searched for the starting byte
|
|
*/
|
|
seq_start_byte = *last_byte_p;
|
|
if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
|
|
if (bytes_truncated == 4) {
|
|
return utf8_str;
|
|
}
|
|
} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
|
|
if (bytes_truncated == 3) {
|
|
return utf8_str;
|
|
}
|
|
} else if ((seq_start_byte & SEQUENCE_LEN_2_BYTE) == SEQUENCE_LEN_2_BYTE) {
|
|
if (bytes_truncated == 2) {
|
|
return utf8_str;
|
|
}
|
|
}
|
|
|
|
/* NULL-terminate the unterminated starting byte */
|
|
*last_byte_p = '\0';
|
|
|
|
return utf8_str;
|
|
}
|
|
|
|
char *utf8_lcpy(char *dst, const char *src, size_t n)
|
|
{
|
|
if (n > 0) {
|
|
strncpy(dst, src, n - 1);
|
|
dst[n - 1] = '\0';
|
|
|
|
if (n != 1) {
|
|
utf8_trunc(dst);
|
|
}
|
|
}
|
|
|
|
return dst;
|
|
}
|
|
|
|
int utf8_count_chars(const char *s)
|
|
{
|
|
int count = 0;
|
|
const char *p = s; /* getting a pointer to increment */
|
|
|
|
while (*p != '\0') {
|
|
if ((*p & MSB_SET) == 0) { /* 1-byte character: 0xxxxxxx */
|
|
p += 1;
|
|
} else if ((*p & SEQUENCE_LEN_3_BYTE) == SEQUENCE_FIRST_MASK) {
|
|
/* 2-byte character: 110xxxxx */
|
|
if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET) {
|
|
/* invalid continuation */
|
|
return -EINVAL;
|
|
}
|
|
p += 2;
|
|
} else if ((*p & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_3_BYTE) {
|
|
/* 3-byte character: 1110xxxx */
|
|
if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET
|
|
|| (p[2] & SEQUENCE_FIRST_MASK) != MSB_SET) {
|
|
/* invalid continuation */
|
|
return -EINVAL;
|
|
}
|
|
p += 3;
|
|
} else if ((*p & 0xF8) == SEQUENCE_LEN_4_BYTE) {
|
|
/* 4-byte character: 11110xxx */
|
|
if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET
|
|
|| (p[2] & SEQUENCE_FIRST_MASK) != MSB_SET
|
|
|| (p[3] & SEQUENCE_FIRST_MASK) != MSB_SET) {
|
|
/* invalid continuation */
|
|
return -EINVAL;
|
|
}
|
|
p += 4;
|
|
} else {
|
|
/* Invalid UTF-8 byte (return) */
|
|
return -EINVAL;
|
|
}
|
|
count++;
|
|
}
|
|
|
|
return count;
|
|
}
|