Ángel Ortega III

Un naufragio personal

A function to decode utf-8 in streaming mode

This is my C language function to decode utf-8 into wide characters, a byte at a time. It also detects encoding errors.

/**
 * mpdm_utf8_to_wc - Converts from utf8 to wchar (streaming).
 * @w: convert wide char
 * @s: temporal state
 * @c: char read from stream
 *
 * Converts a stream of utf8 characters to wide char ones. The input
 * stream is read one byte at a time from @c and composed into @w
 * until a Unicode codepoint is ready. The @s integer keeps an internal
 * state change and must be set to 0 before the stream is read. It
 * detects encoding errors; in this case, the special Unicode
 * char U+FFFD is returned.
 *
 * When 0 is returned, a new wide char is available into @w. If
 * the stream is interrupted in the middle of a multibyte character,
 * the @s state variable will not be 0.
 */
int mpdm_utf8_to_wc(wchar_t *w, int *s, char c)
{
    if (!*s && (c & 0x80) == 0) { /* 1 byte char */
        *w = c;
    }
    else
    if (!*s && (c & 0xe0) == 0xc0) { /* 2 byte char */
        *w = (c & 0x1f) << 6; *s = 1;
    }
    else
    if (!*s && (c & 0xf0) == 0xe0) { /* 3 byte char */
        *w = (c & 0x0f) << 12; *s = 2;
    }
    else
    if (!*s && (c & 0xf8) == 0xf0) { /* 4 byte char */
        *w = (c & 0x07) << 18; *s = 3;
    }
    else
    if (*s && (c & 0xc0) == 0x80) { /* continuation byte */
        switch (*s) {
        case 3: *w |= (c & 0x3f) << 12; break;
        case 2: *w |= (c & 0x3f) << 6;  break;
        case 1: *w |= (c & 0x3f);       break;
        }

        (*s)--;
    }
    else {
        *w = L'\xfffd';
        *s = 0;
    }

    return *s;
}
notes, c, unicode, utf-8, programming