DoxigAlpha

formatUtf8

Print the given utf8 string, encoded as UTF-8 bytes. Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD) according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder

Function parameters

Parameters

#
utf8:[]const u8
writer:*std.io.Writer

Utf8View iterates the code points of a utf-8 encoded string.

Types

#
Utf8View
Utf8View iterates the code points of a utf-8 encoded string.
Wtf8View
Wtf8View iterates the code points of a WTF-8 encoded string,
Wtf8Iterator
Asserts that `bytes` is valid WTF-8

Returns how many bytes the UTF-8 representation would require

Functions

#
utf8CodepointSequenceLength
Returns how many bytes the UTF-8 representation would require
utf8ByteSequenceLength
Given the first byte of a UTF-8 codepoint,
utf8Encode
Encodes the given codepoint into a UTF-8 byte sequence.
utf8Decode
Deprecated.
utf8ValidCodepoint
Returns true if the given unicode codepoint can be encoded in UTF-8.
utf8CountCodepoints
Returns the length of a supplied UTF-8 string literal in terms of unicode
utf8ValidateSlice
Returns true if the input consists entirely of UTF-8 codepoints
utf16CodepointSequenceLength
Returns how many code units the UTF-16 representation would require
utf16CodeUnitSequenceLength
Given the first code unit of a UTF-16 codepoint, returns a number 1-2
utf16DecodeSurrogatePair
Decodes the codepoint encoded in the given pair of UTF-16 code units.
utf16CountCodepoints
Returns the length of a supplied UTF-16 string literal in terms of unicode
fmtUtf8
Return a Formatter for a (potentially ill-formed) UTF-8 string.
utf16LeToUtf8Alloc
Caller owns returned memory.
utf16LeToUtf8AllocZ
Caller owns returned memory.
utf8ToUtf16Le
Returns index of next character.
utf8ToUtf16LeStringLiteral
Converts a UTF-8 string literal into a UTF-16LE string literal.
wtf8ToWtf16LeStringLiteral
Converts a WTF-8 string literal into a WTF-16LE string literal.
calcUtf16LeLen
Returns length in UTF-16LE of UTF-8 slice as length of []u16.
calcWtf16LeLen
Returns length in WTF-16LE of WTF-8 slice as length of []u16.
fmtUtf16Le
Return a Formatter for a (potentially ill-formed) UTF-16 LE string,
isSurrogateCodepoint
Returns true if the codepoint is a surrogate (U+D800 to U+DFFF)
wtf8Encode
Encodes the given codepoint into a WTF-8 byte sequence.
wtf8Decode
Deprecated.
wtf8ValidateSlice
Returns true if the input consists entirely of WTF-8 codepoints
wtf16LeToWtf8Alloc
Caller must free returned memory.
wtf16LeToWtf8AllocZ
Caller must free returned memory.
wtf8ToWtf16Le
Returns index of next character.
checkUtf8ToUtf16LeOverflow
Checks if calling `utf8ToUtf16Le` would overflow.
checkWtf8ToWtf16LeOverflow
Checks if calling `wtf8ToWtf16Le` would overflow.
wtf8ToUtf8Lossy
Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
calcWtf8Len
Returns the length, in bytes, that would be necessary to encode the

Error sets in this namespace

Error Sets

#

Use this to replace an unknown, unrecognized, or unrepresentable character.

Values

#
replacement_character
Use this to replace an unknown, unrecognized, or unrepresentable character.
replacement_character_utf8
= utf8EncodeComptime(replacement_character)

Source

Implementation

#
/// Writes `utf8` to `writer` as UTF-8, substituting U+FFFD for ill-formed input.
///
/// Well-formed sequences are copied through byte-for-byte. Ill-formed byte
/// sequences are replaced by `replacement_character`, following the decoder
/// state machine described at https://encoding.spec.whatwg.org/#utf-8-decoder
/// (one replacement per rejected lead byte or truncated sequence).
///
/// Output is staged in a fixed-size stack buffer and handed to
/// `writer.writeAll` whenever the buffer is close to full and once more at the
/// end. Any error returned by the writer is propagated to the caller.
fn formatUtf8(utf8: []const u8, writer: *std.io.Writer) std.io.Writer.Error!void {
    var buf: [300]u8 = undefined; // just an arbitrary size
    var u8len: usize = 0;

    // This implementation is based on this specification:
    // https://encoding.spec.whatwg.org/#utf-8-decoder
    //
    // The spec's decoder also accumulates the decoded code point value. That
    // is not needed here: a completed sequence is copied verbatim from the
    // input, so only the continuation-byte bookkeeping is tracked.
    var cont_bytes_seen: u3 = 0;
    var cont_bytes_needed: u3 = 0;
    // Inclusive range the *next* continuation byte must fall in. It is
    // narrowed for certain lead bytes and restored to 0x80...0xBF afterwards.
    var lower_boundary: u8 = 0x80;
    var upper_boundary: u8 = 0xBF;

    var i: usize = 0;
    while (i < utf8.len) {
        const byte = utf8[i];
        if (cont_bytes_needed == 0) {
            // Not inside a multi-byte sequence: `byte` must be a lead byte.
            switch (byte) {
                0x00...0x7F => {
                    // ASCII is emitted as-is.
                    buf[u8len] = byte;
                    u8len += 1;
                },
                0xC2...0xDF => {
                    cont_bytes_needed = 1;
                },
                0xE0...0xEF => {
                    // 0xE0 followed by < 0xA0 would be an overlong encoding.
                    if (byte == 0xE0) lower_boundary = 0xA0;
                    // 0xED followed by > 0x9F would encode a surrogate.
                    if (byte == 0xED) upper_boundary = 0x9F;
                    cont_bytes_needed = 2;
                },
                0xF0...0xF4 => {
                    // 0xF0 followed by < 0x90 would be an overlong encoding.
                    if (byte == 0xF0) lower_boundary = 0x90;
                    // 0xF4 followed by > 0x8F would exceed U+10FFFF.
                    if (byte == 0xF4) upper_boundary = 0x8F;
                    cont_bytes_needed = 3;
                },
                else => {
                    // Stray continuation byte or a byte that can never start
                    // a sequence (0x80...0xC1, 0xF5...0xFF).
                    u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
                },
            }
            // consume the byte
            i += 1;
        } else if (byte < lower_boundary or byte > upper_boundary) {
            // The sequence in progress is cut short: emit one replacement for
            // the bytes consumed so far and reset the decoder state.
            cont_bytes_needed = 0;
            cont_bytes_seen = 0;
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
            // do not consume the current byte, it should now be treated as a possible start byte
        } else {
            // Valid continuation byte; only the first continuation byte of a
            // sequence is subject to the narrowed range.
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            cont_bytes_seen += 1;
            // consume the byte
            i += 1;

            if (cont_bytes_seen == cont_bytes_needed) {
                // Sequence complete: copy the lead byte plus its continuation
                // bytes straight from the input.
                const codepoint_len = cont_bytes_seen + 1;
                const codepoint_start_i = i - codepoint_len;
                @memcpy(buf[u8len..][0..codepoint_len], utf8[codepoint_start_i..][0..codepoint_len]);
                u8len += codepoint_len;

                cont_bytes_needed = 0;
                cont_bytes_seen = 0;
            }
        }
        // make sure there's always enough room for another maximum length UTF-8 codepoint
        if (u8len + 4 > buf.len) {
            try writer.writeAll(buf[0..u8len]);
            u8len = 0;
        }
    }
    if (cont_bytes_needed != 0) {
        // Input ended in the middle of a sequence.
        // we know there's enough room because we always flush
        // if there's less than 4 bytes remaining in the buffer.
        u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
    }
    try writer.writeAll(buf[0..u8len]);
}