utf16LeToUtf8Impl
Asserts that the output buffer is big enough. Returns end byte index into utf8.
Function parameters
Parameters
- utf8:[]u8
- utf16le:[]const u16
Utf8View iterates the code points of a utf-8 encoded string.
Types
- Utf8View
- Utf8View iterates the code points of a utf-8 encoded string.
- Wtf8View
- Wtf8View iterates the code points of a WTF-8 encoded string,
- Wtf8Iterator
- Asserts that `bytes` is valid WTF-8
Returns how many bytes the UTF-8 representation would require
Functions
- utf8CodepointSequenceLength
- Returns how many bytes the UTF-8 representation would require
- utf8ByteSequenceLength
- Given the first byte of a UTF-8 codepoint,
- utf8Encode
- Encodes the given codepoint into a UTF-8 byte sequence.
- utf8Decode
- Deprecated.
- utf8ValidCodepoint
- Returns true if the given unicode codepoint can be encoded in UTF-8.
- utf8CountCodepoints
- Returns the length of a supplied UTF-8 string literal in terms of unicode
- utf8ValidateSlice
- Returns true if the input consists entirely of UTF-8 codepoints
- utf16CodepointSequenceLength
- Returns how many code units the UTF-16 representation would require
- utf16CodeUnitSequenceLength
- Given the first code unit of a UTF-16 codepoint, returns a number 1-2
- utf16DecodeSurrogatePair
- Decodes the codepoint encoded in the given pair of UTF-16 code units.
- utf16CountCodepoints
- Returns the length of a supplied UTF-16 string literal in terms of unicode
- fmtUtf8
- Return a Formatter for a (potentially ill-formed) UTF-8 string.
- utf16LeToUtf8Alloc
- Caller owns returned memory.
- utf16LeToUtf8AllocZ
- Caller owns returned memory.
- utf8ToUtf16Le
- Returns index of next character.
- utf8ToUtf16LeStringLiteral
- Converts a UTF-8 string literal into a UTF-16LE string literal.
- wtf8ToWtf16LeStringLiteral
- Converts a WTF-8 string literal into a WTF-16LE string literal.
- calcUtf16LeLen
- Returns length in UTF-16LE of UTF-8 slice as length of []u16.
- calcWtf16LeLen
- Returns length in WTF-16LE of WTF-8 slice as length of []u16.
- fmtUtf16Le
- Return a Formatter for a (potentially ill-formed) UTF-16 LE string,
- isSurrogateCodepoint
- Returns true if the codepoint is a surrogate (U+D800 to U+DFFF)
- wtf8Encode
- Encodes the given codepoint into a WTF-8 byte sequence.
- wtf8Decode
- Deprecated.
- wtf8ValidateSlice
- Returns true if the input consists entirely of WTF-8 codepoints
- wtf16LeToWtf8Alloc
- Caller must free returned memory.
- wtf16LeToWtf8AllocZ
- Caller must free returned memory.
- wtf8ToWtf16Le
- Returns index of next character.
- checkUtf8ToUtf16LeOverflow
- Checks if calling `utf8ToUtf16Le` would overflow.
- checkWtf8ToWtf16LeOverflow
- Checks if calling `wtf8ToWtf16Le` would overflow.
- wtf8ToUtf8Lossy
- Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
- calcWtf8Len
- Returns the length, in bytes, that would be necessary to encode the
Error sets in this namespace
Error Sets
Use this to replace an unknown, unrecognized, or unrepresentable character.
Values
- replacement_character
- Use this to replace an unknown, unrecognized, or unrepresentable character.
- replacement_character_utf8
- = utf8EncodeComptime(replacement_character)
Source
Implementation
/// Transcodes the little-endian UTF-16 code units in `utf16le` into `utf8`
/// and returns the number of bytes written (the end index into `utf8`).
///
/// The output buffer is indexed without any resizing, so the caller must
/// supply a `utf8` slice large enough to hold the whole result.
///
/// `surrogates` selects the decoding/encoding pair at comptime:
///  * `.cannot_encode_surrogate_half` decodes with `Utf16LeIterator`, encodes
///    with `utf8Encode`, and propagates any error the iterator reports.
///  * `.can_encode_surrogate_half` decodes with `Wtf16LeIterator`, encodes
///    with `wtf8Encode`, and has an empty error set.
fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surrogates) (switch (surrogates) {
    .cannot_encode_surrogate_half => Utf16LeToUtf8Error,
    .can_encode_surrogate_half => error{},
})!usize {
    var written: usize = 0;
    var input = utf16le;

    // Fast path: copy a leading run of pure-ASCII code units one vector at a
    // time. Skipped entirely when no vector length is suggested for u16.
    ascii_prefix: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :ascii_prefix;
        const Units = @Vector(lanes, u16);
        const Bytes = @Vector(lanes, u8);
        // 0x7F expressed in the same (little-endian) layout as the raw input
        // units, so the comparison below needs no per-chunk byte swap.
        const ascii_max: Units = @splat(mem.nativeToLittle(u16, 0x7F));

        while (input.len >= lanes) : (input = input[lanes..]) {
            const units: Units = input[0..lanes].*;

            // A unit is ASCII exactly when OR-ing it with 0x7F sets no new bits.
            const all_ascii = @reduce(.And, (units | ascii_max) == ascii_max);
            if (!all_ascii) break; // leave this chunk for the scalar loop below

            const narrowed: Bytes = @truncate(mem.nativeToLittle(Units, units));
            utf8[written..][0..lanes].* = narrowed;
            written += lanes;
        }
    }

    // Scalar path: decode whatever is left one code point at a time.
    switch (surrogates) {
        .can_encode_surrogate_half => {
            var codepoints = Wtf16LeIterator.init(input);
            while (codepoints.nextCodepoint()) |cp| {
                const encoded_len = wtf8Encode(cp, utf8[written..]) catch |err| switch (err) {
                    // UTF-16 cannot express anything above U+10FFFF, so the
                    // code point is always within the encodable range.
                    error.CodepointTooLarge => unreachable,
                };
                written += encoded_len;
            }
        },
        .cannot_encode_surrogate_half => {
            var codepoints = Utf16LeIterator.init(input);
            while (try codepoints.nextCodepoint()) |cp| {
                const encoded_len = utf8Encode(cp, utf8[written..]) catch |err| switch (err) {
                    // UTF-16 cannot express anything above U+10FFFF, so the
                    // code point is always within the encodable range.
                    error.CodepointTooLarge => unreachable,
                    // The iterator already accepted this code point as valid
                    // UTF-16, so it cannot be an unpaired surrogate.
                    error.Utf8CannotEncodeSurrogateHalf => unreachable,
                };
                written += encoded_len;
            }
        },
    }

    return written;
}