This is my C function for decoding UTF-8 into wide characters, one byte at a time. It also detects encoding errors.
/**
 * mpdm_utf8_to_wc - Converts from utf8 to wchar (streaming).
 * @w: converted wide char
 * @s: temporary state
 * @c: char read from stream
 *
 * Converts a stream of UTF-8 bytes to wide chars. The input
 * stream is read one byte at a time from @c and composed into @w
 * until a Unicode codepoint is ready. The @s integer keeps the
 * internal state (the number of continuation bytes still expected)
 * and must be set to 0 before the stream is read.
 *
 * Encoding errors are detected; in this case, the special Unicode
 * char U+FFFD is stored into @w and the state is reset to 0. The
 * following are treated as errors:
 *
 *  - a continuation byte (0x80..0xBF) with no sequence in progress;
 *  - a non-continuation byte in the middle of a sequence (that byte
 *    is consumed by the error report, it is not decoded again);
 *  - overlong forms (lead bytes 0xC0 and 0xC1, 0xE0 followed by
 *    0x80..0x9F, 0xF0 followed by 0x80..0x8F);
 *  - UTF-16 surrogates U+D800..U+DFFF (0xED followed by 0xA0..0xBF);
 *  - values above U+10FFFF (0xF4 followed by 0x90..0xBF, and the
 *    lead bytes 0xF5..0xFF).
 *
 * Returns the new value of @s. When 0 is returned, a new wide char
 * is available in @w. If the stream is interrupted in the middle of
 * a multibyte character, the @s state variable will not be 0.
 *
 * NOTE: 4 byte sequences assume wchar_t can hold a 21 bit value.
 */
int mpdm_utf8_to_wc(wchar_t *w, int *s, char c)
{
    /* plain char may be signed: work on the raw byte value */
    unsigned char b = (unsigned char) c;
    int valid = 0;

    if (*s == 0) {
        /* start of a new character */
        valid = 1;

        if (b < 0x80) {                         /* 1 byte char */
            *w = b;
        }
        else if (b >= 0xc2 && b <= 0xdf) {      /* 2 byte char */
            /* 0xC0 and 0xC1 could only encode overlong forms */
            *w = (b & 0x1f) << 6;
            *s = 1;
        }
        else if ((b & 0xf0) == 0xe0) {          /* 3 byte char */
            *w = (b & 0x0f) << 12;
            *s = 2;
        }
        else if (b >= 0xf0 && b <= 0xf4) {      /* 4 byte char */
            /* 0xF5..0xF7 would always exceed U+10FFFF */
            *w = (b & 0x07) << 18;
            *s = 3;
        }
        else {
            /* stray continuation byte or invalid lead byte */
            valid = 0;
        }
    }
    else if ((b & 0xc0) == 0x80) {              /* continuation byte */
        switch (*s) {
        case 3:
            /* first continuation of a 4 byte char: reject
               F0 80..8F (overlong) and F4 90..BF (> U+10FFFF) */
            if (!((*w == 0 && b < 0x90) ||
                  (*w == 0x100000 && b > 0x8f))) {
                *w |= (b & 0x3f) << 12;
                valid = 1;
            }
            break;

        case 2:
            /* a 4 byte char that passed the checks above holds at
               least 0x10000 here, so 0 and 0xD000 can only come
               from the 3 byte lead bytes E0 and ED: reject
               E0 80..9F (overlong) and ED A0..BF (surrogates) */
            if (!((*w == 0 && b < 0xa0) ||
                  (*w == 0xd000 && b >= 0xa0))) {
                *w |= (b & 0x3f) << 6;
                valid = 1;
            }
            break;

        case 1:
            *w |= (b & 0x3f);
            valid = 1;
            break;

        default:
            /* corrupted state: report an error and resync */
            break;
        }

        if (valid) {
            (*s)--;
        }
    }

    if (!valid) {
        *w = L'\xfffd';
        *s = 0;
    }

    return *s;
}
#c #unicode #utf-8 #programming