utf8ValidateSliceImpl
Function parameters
Parameters
- input:[]const u8
Utf8View iterates the code points of a utf-8 encoded string.
Types
- Utf8View
- Utf8View iterates the code points of a utf-8 encoded string.
- Wtf8View
- Wtf8View iterates the code points of a WTF-8 encoded string,
- Wtf8Iterator
- Asserts that `bytes` is valid WTF-8
Returns how many bytes the UTF-8 representation would require
Functions
- utf8CodepointSequenceLength
- Returns how many bytes the UTF-8 representation would require
- utf8ByteSequenceLength
- Given the first byte of a UTF-8 codepoint,
- utf8Encode
- Encodes the given codepoint into a UTF-8 byte sequence.
- utf8Decode
- Deprecated.
- utf8ValidCodepoint
- Returns true if the given unicode codepoint can be encoded in UTF-8.
- utf8CountCodepoints
- Returns the length of a supplied UTF-8 string literal in terms of unicode
- utf8ValidateSlice
- Returns true if the input consists entirely of UTF-8 codepoints
- utf16CodepointSequenceLength
- Returns how many code units the UTF-16 representation would require
- utf16CodeUnitSequenceLength
- Given the first code unit of a UTF-16 codepoint, returns a number 1-2
- utf16DecodeSurrogatePair
- Decodes the codepoint encoded in the given pair of UTF-16 code units.
- utf16CountCodepoints
- Returns the length of a supplied UTF-16 string literal in terms of unicode
- fmtUtf8
- Return a Formatter for a (potentially ill-formed) UTF-8 string.
- utf16LeToUtf8Alloc
- Caller owns returned memory.
- utf16LeToUtf8AllocZ
- Caller owns returned memory.
- utf8ToUtf16Le
- Returns index of next character.
- utf8ToUtf16LeStringLiteral
- Converts a UTF-8 string literal into a UTF-16LE string literal.
- wtf8ToWtf16LeStringLiteral
- Converts a WTF-8 string literal into a WTF-16LE string literal.
- calcUtf16LeLen
- Returns length in UTF-16LE of UTF-8 slice as length of []u16.
- calcWtf16LeLen
- Returns length in WTF-16LE of WTF-8 slice as length of []u16.
- fmtUtf16Le
- Return a Formatter for a (potentially ill-formed) UTF-16 LE string,
- isSurrogateCodepoint
- Returns true if the codepoint is a surrogate (U+D800 to U+DFFF)
- wtf8Encode
- Encodes the given codepoint into a WTF-8 byte sequence.
- wtf8Decode
- Deprecated.
- wtf8ValidateSlice
- Returns true if the input consists entirely of WTF-8 codepoints
- wtf16LeToWtf8Alloc
- Caller must free returned memory.
- wtf16LeToWtf8AllocZ
- Caller must free returned memory.
- wtf8ToWtf16Le
- Returns index of next character.
- checkUtf8ToUtf16LeOverflow
- Checks if calling `utf8ToUtf16Le` would overflow.
- checkWtf8ToWtf16LeOverflow
- Checks if calling `wtf8ToWtf16Le` would overflow.
- wtf8ToUtf8Lossy
- Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
- calcWtf8Len
- Returns the length, in bytes, that would be necessary to encode the
Error sets in this namespace
Error Sets
Use this to replace an unknown, unrecognized, or unrepresentable character.
Values
- replacement_character
- Use this to replace an unknown, unrecognized, or unrepresentable character.
- replacement_character_utf8
- = utf8EncodeComptime(replacement_character)
Source
Implementation
/// Returns true if every byte of `input` belongs to a well-formed multi-byte
/// sequence or is a single ASCII byte; returns false at the first illegal
/// lead byte, truncated sequence, or out-of-range continuation byte.
///
/// `surrogates` selects how three-byte sequences starting with 0xED are
/// treated:
///   * `.cannot_encode_surrogate_half`: the second byte must be 0x80..0x9F,
///     so encodings of U+D800..U+DFFF are rejected.
///   * `.can_encode_surrogate_half`: the second byte may be 0x80..0xBF, so
///     encodings of U+D800..U+DFFF are accepted.
///
/// In both modes overlong encodings are rejected: a 0xE0 lead byte requires
/// a second byte of 0xA0..0xBF and a 0xF0 lead byte requires 0x90..0xBF.
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
    var remaining = input;

    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
        const Chunk = @Vector(chunk_len, u8);

        // Fast path. Check for and skip ASCII characters at the start of the input.
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
            const mask: Chunk = @splat(0x80);
            if (@reduce(.Or, chunk & mask == mask)) {
                // found a non ASCII byte
                break;
            }
            remaining = remaining[chunk_len..];
        }
    }

    // default lowest and highest continuation byte
    const lo_cb = 0b10000000;
    const hi_cb = 0b10111111;

    const min_non_ascii_codepoint = 0x80;

    // The first nibble is used to identify the continuation byte range to
    // accept. The second nibble is the size.
    const xx = 0xF1; // invalid: size 1
    const as = 0xF0; // ASCII: size 1
    const s1 = 0x02; // accept 0, size 2
    // Lead byte 0xE0: the second byte must be 0xA0..0xBF, otherwise the
    // sequence is an overlong encoding of a code point below U+0800. This is
    // unrelated to surrogates (those use lead byte 0xED), so it must NOT be
    // relaxed when surrogate halves are allowed.
    const s2 = 0x13; // accept 1, size 3
    const s3 = 0x03; // accept 0, size 3
    // Lead byte 0xED: the only entry that depends on `surrogates`. Second
    // bytes 0xA0..0xBF encode U+D800..U+DFFF.
    const s4 = switch (surrogates) {
        .cannot_encode_surrogate_half => 0x23, // accept 2, size 3
        .can_encode_surrogate_half => 0x03, // accept 0, size 3
    };
    const s5 = 0x34; // accept 3, size 4
    const s6 = 0x04; // accept 0, size 4
    const s7 = 0x44; // accept 4, size 4

    // Information about the first byte in a UTF-8 sequence.
    // Layout: 0x00..0x7F ASCII, 0x80..0xBF stray continuation bytes (invalid
    // as a lead), then one entry per byte for 0xC0..0xFF.
    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
    };

    const n = remaining.len;
    var i: usize = 0;
    while (i < n) {
        const first_byte = remaining[i];
        if (first_byte < min_non_ascii_codepoint) {
            i += 1;
            continue;
        }

        const info = first[first_byte];
        if (info == xx) {
            return false; // Illegal starter byte.
        }

        // Low three bits of the table entry hold the sequence length (2..4).
        const size = info & 7;
        if (i + size > n) {
            return false; // Short or invalid.
        }

        // Figure out the acceptable low and high continuation bytes, starting
        // with our defaults.
        var accept_lo: u8 = lo_cb;
        var accept_hi: u8 = hi_cb;

        // High nibble selects the restricted range for the SECOND byte only.
        switch (info >> 4) {
            0 => {},
            1 => accept_lo = 0xA0, // after 0xE0: reject overlong 3-byte forms
            2 => accept_hi = 0x9F, // after 0xED: reject surrogate halves
            3 => accept_lo = 0x90, // after 0xF0: reject overlong 4-byte forms
            4 => accept_hi = 0x8F, // after 0xF4: reject values above U+10FFFF
            else => unreachable,
        }

        const c1 = remaining[i + 1];
        if (c1 < accept_lo or accept_hi < c1) {
            return false;
        }

        // Remaining continuation bytes always use the default 0x80..0xBF range.
        switch (size) {
            2 => i += 2,
            3 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                i += 3;
            },
            4 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                const c3 = remaining[i + 3];
                if (c3 < lo_cb or hi_cb < c3) {
                    return false;
                }
                i += 4;
            },
            else => unreachable,
        }
    }

    return true;
}