DoxigAlpha

utf16LeToUtf8ArrayListImpl

Function parameters

Parameters

#
result:*std.array_list.Managed(u8)
utf16le:[]const u16

Utf8View iterates the code points of a utf-8 encoded string.

Types

#
Utf8View
Utf8View iterates the code points of a utf-8 encoded string.
Wtf8View
Wtf8View iterates the code points of a WTF-8 encoded string,
Wtf8Iterator
Asserts that `bytes` is valid WTF-8

Returns how many bytes the UTF-8 representation would require

Functions

#
utf8CodepointSequenceLength
Returns how many bytes the UTF-8 representation would require
utf8ByteSequenceLength
Given the first byte of a UTF-8 codepoint,
utf8Encode
Encodes the given codepoint into a UTF-8 byte sequence.
utf8Decode
Deprecated.
utf8ValidCodepoint
Returns true if the given unicode codepoint can be encoded in UTF-8.
utf8CountCodepoints
Returns the length of a supplied UTF-8 string literal in terms of unicode
utf8ValidateSlice
Returns true if the input consists entirely of UTF-8 codepoints
utf16CodepointSequenceLength
Returns how many code units the UTF-16 representation would require
utf16CodeUnitSequenceLength
Given the first code unit of a UTF-16 codepoint, returns a number 1-2
utf16DecodeSurrogatePair
Decodes the codepoint encoded in the given pair of UTF-16 code units.
utf16CountCodepoints
Returns the length of a supplied UTF-16 string literal in terms of unicode
fmtUtf8
Return a Formatter for a (potentially ill-formed) UTF-8 string.
utf16LeToUtf8Alloc
Caller owns returned memory.
utf16LeToUtf8AllocZ
Caller owns returned memory.
utf8ToUtf16Le
Returns index of next character.
utf8ToUtf16LeStringLiteral
Converts a UTF-8 string literal into a UTF-16LE string literal.
wtf8ToWtf16LeStringLiteral
Converts a WTF-8 string literal into a WTF-16LE string literal.
calcUtf16LeLen
Returns length in UTF-16LE of UTF-8 slice as length of []u16.
calcWtf16LeLen
Returns length in WTF-16LE of WTF-8 slice as length of []u16.
fmtUtf16Le
Return a Formatter for a (potentially ill-formed) UTF-16 LE string,
isSurrogateCodepoint
Returns true if the codepoint is a surrogate (U+D800 to U+DFFF)
wtf8Encode
Encodes the given codepoint into a WTF-8 byte sequence.
wtf8Decode
Deprecated.
wtf8ValidateSlice
Returns true if the input consists entirely of WTF-8 codepoints
wtf16LeToWtf8Alloc
Caller must free returned memory.
wtf16LeToWtf8AllocZ
Caller must free returned memory.
wtf8ToWtf16Le
Returns index of next character.
checkUtf8ToUtf16LeOverflow
Checks if calling `utf8ToUtf16Le` would overflow.
checkWtf8ToWtf16LeOverflow
Checks if calling `wtf8ToWtf16Le` would overflow.
wtf8ToUtf8Lossy
Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
calcWtf8Len
Returns the length, in bytes, that would be necessary to encode the

Error sets in this namespace

Error Sets

#

Use this to replace an unknown, unrecognized, or unrepresentable character.

Values

#
replacement_character
Use this to replace an unknown, unrecognized, or unrepresentable character.
replacement_character_utf8
= utf8EncodeComptime(replacement_character)

Source

Implementation

#
/// Appends the UTF-8 (or WTF-8) encoding of `utf16le` to `result`.
///
/// `result` must already have at least `utf16le.len` bytes of unused
/// capacity; this is asserted on entry. That reservation lets a leading run
/// of ASCII code units be appended without growing the list. Everything
/// after that run is decoded one codepoint at a time and may grow `result`.
///
/// `surrogates` is resolved at compile time and selects both the decoder and
/// the error set:
///   * `.cannot_encode_surrogate_half`: decodes with `Utf16LeIterator` and
///     propagates any error its `nextCodepoint` returns, in addition to
///     allocation failure.
///   * `.can_encode_surrogate_half`: decodes with `Wtf16LeIterator` and
///     encodes with `wtf8Encode`; only allocation can fail.
///
/// Nothing is rolled back here on failure: bytes appended before an error is
/// returned stay in `result`.
fn utf16LeToUtf8ArrayListImpl(
    result: *std.array_list.Managed(u8),
    utf16le: []const u16,
    comptime surrogates: Surrogates,
) (switch (surrogates) {
    .cannot_encode_surrogate_half => Utf16LeToUtf8AllocError,
    .can_encode_surrogate_half => Allocator.Error,
})!void {
    assert(result.unusedCapacitySlice().len >= utf16le.len);

    var rest = utf16le;

    // Fast path: copy a leading run of ASCII, one vector of code units at a
    // time. Skipped entirely when no vector length is suggested for u16.
    simd: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :simd;
        const WideVec = @Vector(lanes, u16);
        const NarrowVec = @Vector(lanes, u8);

        // The mask is converted with `nativeToLittle` so that it can be
        // compared against the code units exactly as they are stored.
        const ascii_mask: WideVec = @splat(mem.nativeToLittle(u16, 0x7F));

        while (rest.len >= lanes) : (rest = rest[lanes..]) {
            const units: WideVec = rest[0..lanes].*;

            // A code unit is ASCII exactly when OR-ing it with the mask
            // leaves the mask unchanged (no bits set outside 0x7F).
            const all_ascii = @reduce(.And, (units | ascii_mask) == ascii_mask);
            if (!all_ascii) {
                // At least one lane is not ASCII; the per-codepoint loop
                // below takes over from the start of this chunk.
                break;
            }

            // Byte-swap on big-endian hosts (no-op on little-endian ones),
            // then keep the low byte of every lane.
            const narrowed: NarrowVec = @truncate(mem.nativeToLittle(WideVec, units));

            // Capacity for one byte per input code unit was asserted on
            // entry, and so far every code unit consumed produced exactly
            // one byte, so this append cannot exceed the reserved space.
            result.addManyAsArrayAssumeCapacity(lanes).* = narrowed;
        }
    }

    // General path: decode whatever is left and append each codepoint's
    // encoding, growing `result` as needed.
    switch (surrogates) {
        .cannot_encode_surrogate_half => {
            var iter = Utf16LeIterator.init(rest);
            while (try iter.nextCodepoint()) |cp| {
                const n_bytes = utf8CodepointSequenceLength(cp) catch unreachable;
                const dest = try result.addManyAsSlice(n_bytes);
                const written = utf8Encode(cp, dest) catch unreachable;
                assert(written == n_bytes);
            }
        },
        .can_encode_surrogate_half => {
            var iter = Wtf16LeIterator.init(rest);
            while (iter.nextCodepoint()) |cp| {
                const n_bytes = utf8CodepointSequenceLength(cp) catch unreachable;
                const dest = try result.addManyAsSlice(n_bytes);
                const written = wtf8Encode(cp, dest) catch unreachable;
                assert(written == n_bytes);
            }
        },
    }
}