DoxigAlpha

utf8ValidateSliceImpl

Function parameters

Parameters

#
input:[]const u8

Utf8View iterates the code points of a utf-8 encoded string.

Types

#
Utf8View
Utf8View iterates the code points of a utf-8 encoded string.
Wtf8View
Wtf8View iterates the code points of a WTF-8 encoded string,
Wtf8Iterator
Asserts that `bytes` is valid WTF-8

Returns how many bytes the UTF-8 representation would require

Functions

#
utf8CodepointSequenceLength
Returns how many bytes the UTF-8 representation would require
utf8ByteSequenceLength
Given the first byte of a UTF-8 codepoint,
utf8Encode
Encodes the given codepoint into a UTF-8 byte sequence.
utf8Decode
Deprecated.
utf8ValidCodepoint
Returns true if the given unicode codepoint can be encoded in UTF-8.
utf8CountCodepoints
Returns the length of a supplied UTF-8 string literal in terms of unicode
utf8ValidateSlice
Returns true if the input consists entirely of UTF-8 codepoints
utf16CodepointSequenceLength
Returns how many code units the UTF-16 representation would require
utf16CodeUnitSequenceLength
Given the first code unit of a UTF-16 codepoint, returns a number 1-2
utf16DecodeSurrogatePair
Decodes the codepoint encoded in the given pair of UTF-16 code units.
utf16CountCodepoints
Returns the length of a supplied UTF-16 string literal in terms of unicode
fmtUtf8
Return a Formatter for a (potentially ill-formed) UTF-8 string.
utf16LeToUtf8Alloc
Caller owns returned memory.
utf16LeToUtf8AllocZ
Caller owns returned memory.
utf8ToUtf16Le
Returns index of next character.
utf8ToUtf16LeStringLiteral
Converts a UTF-8 string literal into a UTF-16LE string literal.
wtf8ToWtf16LeStringLiteral
Converts a WTF-8 string literal into a WTF-16LE string literal.
calcUtf16LeLen
Returns length in UTF-16LE of UTF-8 slice as length of []u16.
calcWtf16LeLen
Returns length in WTF-16LE of WTF-8 slice as length of []u16.
fmtUtf16Le
Return a Formatter for a (potentially ill-formed) UTF-16 LE string,
isSurrogateCodepoint
Returns true if the codepoint is a surrogate (U+D800 to U+DFFF)
wtf8Encode
Encodes the given codepoint into a WTF-8 byte sequence.
wtf8Decode
Deprecated.
wtf8ValidateSlice
Returns true if the input consists entirely of WTF-8 codepoints
wtf16LeToWtf8Alloc
Caller must free returned memory.
wtf16LeToWtf8AllocZ
Caller must free returned memory.
wtf8ToWtf16Le
Returns index of next character.
checkUtf8ToUtf16LeOverflow
Checks if calling `utf8ToUtf16Le` would overflow.
checkWtf8ToWtf16LeOverflow
Checks if calling `wtf8ToWtf16Le` would overflow.
wtf8ToUtf8Lossy
Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
calcWtf8Len
Returns the length, in bytes, that would be necessary to encode the

Error sets in this namespace

Error Sets

#

Use this to replace an unknown, unrecognized, or unrepresentable character.

Values

#
replacement_character
Use this to replace an unknown, unrecognized, or unrepresentable character.
replacement_character_utf8
= utf8EncodeComptime(replacement_character)

Source

Implementation

#
/// Returns true when `input` consists entirely of well-formed multi-byte
/// sequences and ASCII bytes.
///
/// `surrogates` only controls three-byte sequences that start with 0xED:
/// * `.cannot_encode_surrogate_half` limits the second byte to 0x80..0x9F,
///   which rejects encoded surrogate code points (U+D800 to U+DFFF).
/// * `.can_encode_surrogate_half` allows the full 0x80..0xBF range there.
///
/// In both modes the function rejects invalid lead bytes, truncated
/// sequences, continuation bytes outside their allowed range, overlong
/// encodings, and sequences above U+10FFFF.
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
    var remaining = input;

    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
        const Chunk = @Vector(chunk_len, u8);

        // Fast path. Check for and skip ASCII characters at the start of the input.
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
            const mask: Chunk = @splat(0x80);
            if (@reduce(.Or, chunk & mask == mask)) {
                // found a non ASCII byte
                break;
            }
            remaining = remaining[chunk_len..];
        }
    }

    // default lowest and highest continuation byte
    const lo_cb = 0b10000000;
    const hi_cb = 0b10111111;

    const min_non_ascii_codepoint = 0x80;

    // The first nibble is used to identify the continuation byte range to
    // accept. The second nibble is the size.
    const xx = 0xF1; // invalid: size 1
    const as = 0xF0; // ASCII: size 1
    const s1 = 0x02; // accept 0, size 2
    // Lead byte 0xE0: the second byte must be 0xA0..0xBF in every mode.
    // Anything lower would be an overlong encoding of a code point below
    // U+0800, which is ill-formed whether or not surrogates are allowed.
    const s2 = 0x13; // accept 1, size 3
    const s3 = 0x03; // accept 0, size 3
    // Lead byte 0xED: second bytes 0xA0..0xBF encode surrogate code points,
    // so this is the only table entry that depends on `surrogates`.
    const s4 = switch (surrogates) {
        .cannot_encode_surrogate_half => 0x23, // accept 2, size 3
        .can_encode_surrogate_half => 0x03, // accept 0, size 3
    };
    const s5 = 0x34; // accept 3, size 4
    const s6 = 0x04; // accept 0, size 4
    const s7 = 0x44; // accept 4, size 4

    // Information about the first byte in a UTF-8 sequence.
    // Layout: 0x00..0x7F ASCII, 0x80..0xBF invalid as a lead byte, then the
    // 64 entries for 0xC0..0xFF spelled out below.
    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
    };

    const n = remaining.len;
    var i: usize = 0;
    while (i < n) {
        const first_byte = remaining[i];
        if (first_byte < min_non_ascii_codepoint) {
            i += 1;
            continue;
        }

        const info = first[first_byte];
        if (info == xx) {
            return false; // Illegal starter byte.
        }

        // The low bits hold the sequence length; checking it against `n` here
        // keeps every `remaining[i + k]` access below in bounds.
        const size = info & 7;
        if (i + size > n) {
            return false; // Short or invalid.
        }

        // Figure out the acceptable low and high continuation bytes, starting
        // with our defaults.
        var accept_lo: u8 = lo_cb;
        var accept_hi: u8 = hi_cb;

        // The high nibble selects the restricted range for the second byte.
        switch (info >> 4) {
            0 => {},
            1 => accept_lo = 0xA0,
            2 => accept_hi = 0x9F,
            3 => accept_lo = 0x90,
            4 => accept_hi = 0x8F,
            else => unreachable,
        }

        const c1 = remaining[i + 1];
        if (c1 < accept_lo or accept_hi < c1) {
            return false;
        }

        // Any remaining continuation bytes use the default 0x80..0xBF range.
        switch (size) {
            2 => i += 2,
            3 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                i += 3;
            },
            4 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                const c3 = remaining[i + 3];
                if (c3 < lo_cb or hi_cb < c3) {
                    return false;
                }
                i += 4;
            },
            else => unreachable,
        }
    }

    return true;
}