This is my C function for decoding UTF-8 into wide characters, one byte at a time. It also detects encoding errors.
/**
 * mpdm_utf8_to_wc - Converts from utf8 to wchar (streaming).
 * @w: wide char being composed / converted
 * @s: decoder state (number of continuation bytes still expected)
 * @c: byte read from the stream
 *
 * Converts a stream of UTF-8 bytes to wide chars. The input stream
 * is fed one byte at a time through @c and composed into @w until a
 * Unicode codepoint is ready. The @s integer keeps the internal state
 * between calls and must be set to 0 before the stream is read.
 *
 * Encoding errors are detected; in that case the special Unicode
 * char U+FFFD is stored into @w, the state is reset to 0 and 0 is
 * returned. The following are treated as errors:
 *
 *  - a continuation byte (10xxxxxx) when none is expected;
 *  - a non-continuation byte in the middle of a multibyte char
 *    (that byte is consumed together with the partial char; it is
 *    not re-examined as the start of a new one);
 *  - lead bytes 0xc0, 0xc1 and 0xf5-0xff;
 *  - overlong encodings (0xe0 + 0x80-0x9f, 0xf0 + 0x80-0x8f);
 *  - UTF-16 surrogates U+D800-U+DFFF (0xed + 0xa0-0xbf);
 *  - codepoints above U+10FFFF (0xf4 + 0x90-0xbf).
 *
 * When 0 is returned, a new wide char is available into @w. Any other
 * return value is the number of continuation bytes still needed. If
 * the stream is interrupted in the middle of a multibyte character,
 * the @s state variable will not be 0.
 *
 * NOTE(review): 4 byte chars are composed directly in @w, so this
 * assumes wchar_t can hold at least 21 bits -- confirm for targets
 * with a 16 bit wchar_t.
 */
int mpdm_utf8_to_wc(wchar_t *w, int *s, char c)
{
    /* work on the raw byte value; plain char may be signed */
    unsigned char b = (unsigned char) c;
    int valid = 0;

    if (*s == 0) {
        if (b < 0x80) {
            /* 1 byte char */
            *w = b;
            valid = 1;
        }
        else if (b >= 0xc2 && b <= 0xdf) {
            /* 2 byte char (0xc0 and 0xc1 can only be overlong) */
            *w = (b & 0x1f) << 6;
            *s = 1;
            valid = 1;
        }
        else if ((b & 0xf0) == 0xe0) {
            /* 3 byte char */
            *w = (b & 0x0f) << 12;
            *s = 2;
            valid = 1;
        }
        else if (b >= 0xf0 && b <= 0xf4) {
            /* 4 byte char (0xf5 and up would exceed U+10FFFF) */
            *w = (b & 0x07) << 18;
            *s = 3;
            valid = 1;
        }
    }
    else if ((b & 0xc0) == 0x80) {
        /* continuation byte */
        long acc = (long) *w;

        valid = 1;

        /* Some lead bytes restrict the range of the FIRST
           continuation byte; at that point @w still holds only
           the bits taken from the lead byte. */
        if (*s == 3) {
            /* first continuation of a 4 byte char */
            if (acc == 0L)
                valid = (b >= 0x90);    /* 0xf0: overlong */
            else if (acc == (0x04L << 18))
                valid = (b <= 0x8f);    /* 0xf4: > U+10FFFF */
        }
        else if (*s == 2 && acc < 0x10000L) {
            /* first continuation of a 3 byte char: a 4 byte char
               that got this far already holds >= 0x10000 */
            if (acc == 0L)
                valid = (b >= 0xa0);    /* 0xe0: overlong */
            else if (acc == 0xd000L)
                valid = (b <= 0x9f);    /* 0xed: surrogate */
        }

        if (valid) {
            switch (*s) {
            case 3:
                *w |= (b & 0x3f) << 12;
                break;
            case 2:
                *w |= (b & 0x3f) << 6;
                break;
            case 1:
                *w |= (b & 0x3f);
                break;
            default:
                break;
            }

            (*s)--;
        }
    }

    if (!valid) {
        /* encoding error: emit the replacement char and resync */
        *w = L'\xfffd';
        *s = 0;
    }

    return *s;
}