utf8ValidateSliceImpl

Function parameters

Parameters

input:[]const u8
surrogates:Surrogates

Utf8View iterates the code points of a utf-8 encoded string.

Types

Utf8View: Utf8View iterates the code points of a utf-8 encoded string.
Utf8Iterator
Utf16LeIterator
Wtf8View: Wtf8View iterates the code points of a WTF-8 encoded string,
Wtf8Iterator: Asserts that `bytes` is valid WTF-8
Wtf16LeIterator

Returns how many bytes the UTF-8 representation would require

Functions

utf8CodepointSequenceLength: Returns how many bytes the UTF-8 representation would require
utf8ByteSequenceLength: Given the first byte of a UTF-8 codepoint,
utf8Encode: Encodes the given codepoint into a UTF-8 byte sequence.
utf8EncodeComptime
utf8Decode: Deprecated.
utf8Decode2
utf8Decode3
utf8Decode3AllowSurrogateHalf
utf8Decode4
utf8ValidCodepoint: Returns true if the given unicode codepoint can be encoded in UTF-8.
utf8CountCodepoints: Returns the length of a supplied UTF-8 string literal in terms of unicode
utf8ValidateSlice: Returns true if the input consists entirely of UTF-8 codepoints
utf16IsHighSurrogate
utf16IsLowSurrogate
utf16CodepointSequenceLength: Returns how many code units the UTF-16 representation would require
utf16CodeUnitSequenceLength: Given the first code unit of a UTF-16 codepoint, returns a number 1-2
utf16DecodeSurrogatePair: Decodes the codepoint encoded in the given pair of UTF-16 code units.
utf16CountCodepoints: Returns the length of a supplied UTF-16 string literal in terms of unicode
fmtUtf8: Return a Formatter for a (potentially ill-formed) UTF-8 string.
utf16LeToUtf8ArrayList
utf16LeToUtf8Alloc: Caller owns returned memory.
utf16LeToUtf8AllocZ: Caller owns returned memory.
utf16LeToUtf8
utf8ToUtf16LeArrayList
utf8ToUtf16LeAlloc
utf8ToUtf16LeAllocZ
utf8ToUtf16Le: Returns index of next character.
utf8ToUtf16LeImpl
utf8ToUtf16LeStringLiteral: Converts a UTF-8 string literal into a UTF-16LE string literal.
wtf8ToWtf16LeStringLiteral: Converts a WTF-8 string literal into a WTF-16LE string literal.
calcUtf16LeLenImpl
calcUtf16LeLen: Returns length in UTF-16LE of UTF-8 slice as length of []u16.
calcWtf16LeLen: Returns length in WTF-16LE of WTF-8 slice as length of []u16.
fmtUtf16Le: Return a Formatter for a (potentially ill-formed) UTF-16 LE string,
isSurrogateCodepoint: Returns true if the codepoint is a surrogate (U+D800 to U+DFFF)
wtf8Encode: Encodes the given codepoint into a WTF-8 byte sequence.
wtf8Decode: Deprecated.
wtf8ValidateSlice: Returns true if the input consists entirely of WTF-8 codepoints
wtf16LeToWtf8ArrayList
wtf16LeToWtf8Alloc: Caller must free returned memory.
wtf16LeToWtf8AllocZ: Caller must free returned memory.
wtf16LeToWtf8
wtf8ToWtf16LeArrayList
wtf8ToWtf16LeAlloc
wtf8ToWtf16LeAllocZ
wtf8ToWtf16Le: Returns index of next character.
checkUtf8ToUtf16LeOverflow: Checks if calling `utf8ToUtf16Le` would overflow.
checkWtf8ToWtf16LeOverflow: Checks if calling `wtf8ToWtf16Le` would overflow.
wtf8ToUtf8Lossy: Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
wtf8ToUtf8LossyAlloc
wtf8ToUtf8LossyAllocZ
calcWtf8Len: Returns the length, in bytes, that would be necessary to encode the

Error sets in this namespace

Error Sets

Utf16LeToUtf8AllocError
Utf16LeToUtf8Error

Use this to replace an unknown, unrecognized, or unrepresentable character.

Values

replacement_character: Use this to replace an unknown, unrecognized, or unrepresentable character.
replacement_character_utf8: = utf8EncodeComptime(replacement_character)

Source

Implementation

/// Returns true if `input` is made up entirely of well-formed byte sequences.
///
/// With `.cannot_encode_surrogate_half`, a sequence starting with 0xED must
/// have its second byte in 0x80...0x9F, which rejects encoded surrogates
/// (U+D800 to U+DFFF). With `.can_encode_surrogate_half` that restriction is
/// lifted and the full continuation range is accepted after 0xED.
///
/// In both modes the following are rejected: overlong encodings, truncated
/// sequences, continuation bytes in lead position, and the lead bytes
/// 0xC0, 0xC1 and 0xF5...0xFF. An empty `input` is valid.
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
    var remaining = input;

    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
        const Chunk = @Vector(chunk_len, u8);

        // Fast path. Check for and skip ASCII characters at the start of the input.
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
            const mask: Chunk = @splat(0x80);
            if (@reduce(.Or, chunk & mask == mask)) {
                // found a non ASCII byte
                break;
            }
            remaining = remaining[chunk_len..];
        }
    }

    // default lowest and highest continuation byte
    const lo_cb = 0b10000000;
    const hi_cb = 0b10111111;

    const min_non_ascii_codepoint = 0x80;

    // The first nibble is used to identify the continuation byte range to
    // accept. The second nibble is the size.
    const xx = 0xF1; // invalid: size 1
    const as = 0xF0; // ASCII: size 1
    const s1 = 0x02; // accept 0, size 2
    // Lead byte 0xE0: the second byte must be >= 0xA0, otherwise the sequence
    // is an overlong encoding of a code point below U+0800. This is unrelated
    // to surrogates, so it applies in both modes.
    const s2 = 0x13; // accept 1, size 3
    const s3 = 0x03; // accept 0, size 3
    // Lead byte 0xED: second bytes 0xA0...0xBF encode U+D800 to U+DFFF, so
    // this is the only entry that depends on `surrogates`.
    const s4 = switch (surrogates) {
        .cannot_encode_surrogate_half => 0x23, // accept 2, size 3
        .can_encode_surrogate_half => 0x03, // accept 0, size 3
    };
    const s5 = 0x34; // accept 3, size 4
    const s6 = 0x04; // accept 0, size 4
    const s7 = 0x44; // accept 4, size 4

    // Information about the first byte in a UTF-8 sequence.
    // Layout: 0x00-0x7F ASCII, 0x80-0xBF invalid as a lead byte, then one row
    // of 16 entries each for 0xC0-0xCF, 0xD0-0xDF, 0xE0-0xEF and 0xF0-0xFF.
    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
    };

    const n = remaining.len;
    var i: usize = 0;
    while (i < n) {
        const first_byte = remaining[i];
        if (first_byte < min_non_ascii_codepoint) {
            i += 1;
            continue;
        }

        const info = first[first_byte];
        if (info == xx) {
            return false; // Illegal starter byte.
        }

        // Low three bits hold the sequence length (2, 3 or 4 at this point).
        const size = info & 7;
        if (i + size > n) {
            return false; // Short or invalid.
        }

        // Figure out the acceptable low and high continuation bytes, starting
        // with our defaults.
        var accept_lo: u8 = lo_cb;
        var accept_hi: u8 = hi_cb;

        // Only the second byte of a sequence ever has a narrowed range; the
        // high nibble of `info` selects which one.
        switch (info >> 4) {
            0 => {},
            1 => accept_lo = 0xA0,
            2 => accept_hi = 0x9F,
            3 => accept_lo = 0x90,
            4 => accept_hi = 0x8F,
            else => unreachable,
        }

        const c1 = remaining[i + 1];
        if (c1 < accept_lo or accept_hi < c1) {
            return false;
        }

        // Any remaining bytes must be plain continuation bytes.
        switch (size) {
            2 => i += 2,
            3 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                i += 3;
            },
            4 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                const c3 = remaining[i + 3];
                if (c3 < lo_cb or hi_cb < c3) {
                    return false;
                }
                i += 4;
            },
            else => unreachable,
        }
    }

    return true;
}