formatUtf8
Writes the given (potentially ill-formed) UTF-8 string to `writer` as well-formed UTF-8 bytes.
Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD)
according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of
the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
Function parameters
Parameters
- utf8:[]const u8
- writer:*std.io.Writer
Utf8View iterates the code points of a utf-8 encoded string.
Types
- Utf8View
- Utf8View iterates the code points of a utf-8 encoded string.
- Wtf8View
- Wtf8View iterates the code points of a WTF-8 encoded string,
- Wtf8Iterator
- Asserts that `bytes` is valid WTF-8
Returns how many bytes the UTF-8 representation would require
Functions
- utf8CodepointSequenceLength
- Returns how many bytes the UTF-8 representation would require
- utf8ByteSequenceLength
- Given the first byte of a UTF-8 codepoint,
- utf8Encode
- Encodes the given codepoint into a UTF-8 byte sequence.
- utf8Decode
- Deprecated.
- utf8ValidCodepoint
- Returns true if the given unicode codepoint can be encoded in UTF-8.
- utf8CountCodepoints
- Returns the length of a supplied UTF-8 string literal in terms of unicode
- utf8ValidateSlice
- Returns true if the input consists entirely of UTF-8 codepoints
- utf16CodepointSequenceLength
- Returns how many code units the UTF-16 representation would require
- utf16CodeUnitSequenceLength
- Given the first code unit of a UTF-16 codepoint, returns a number 1-2
- utf16DecodeSurrogatePair
- Decodes the codepoint encoded in the given pair of UTF-16 code units.
- utf16CountCodepoints
- Returns the length of a supplied UTF-16 string literal in terms of unicode
- fmtUtf8
- Return a Formatter for a (potentially ill-formed) UTF-8 string.
- utf16LeToUtf8Alloc
- Caller owns returned memory.
- utf16LeToUtf8AllocZ
- Caller owns returned memory.
- utf8ToUtf16Le
- Returns index of next character.
- utf8ToUtf16LeStringLiteral
- Converts a UTF-8 string literal into a UTF-16LE string literal.
- wtf8ToWtf16LeStringLiteral
- Converts a WTF-8 string literal into a WTF-16LE string literal.
- calcUtf16LeLen
- Returns length in UTF-16LE of UTF-8 slice as length of []u16.
- calcWtf16LeLen
- Returns length in WTF-16LE of WTF-8 slice as length of []u16.
- fmtUtf16Le
- Return a Formatter for a (potentially ill-formed) UTF-16 LE string,
- isSurrogateCodepoint
- Returns true if the codepoint is a surrogate (U+D800 to U+DFFF)
- wtf8Encode
- Encodes the given codepoint into a WTF-8 byte sequence.
- wtf8Decode
- Deprecated.
- wtf8ValidateSlice
- Returns true if the input consists entirely of WTF-8 codepoints
- wtf16LeToWtf8Alloc
- Caller must free returned memory.
- wtf16LeToWtf8AllocZ
- Caller must free returned memory.
- wtf8ToWtf16Le
- Returns index of next character.
- checkUtf8ToUtf16LeOverflow
- Checks if calling `utf8ToUtf16Le` would overflow.
- checkWtf8ToWtf16LeOverflow
- Checks if calling `wtf8ToWtf16Le` would overflow.
- wtf8ToUtf8Lossy
- Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
- calcWtf8Len
- Returns the length, in bytes, that would be necessary to encode the
Error sets in this namespace
Error Sets
Use this to replace an unknown, unrecognized, or unrepresentable character.
Values
- replacement_character
- Use this to replace an unknown, unrecognized, or unrepresentable character.
- replacement_character_utf8
- = utf8EncodeComptime(replacement_character)
Source
Implementation
/// Writes `utf8` to `writer`, replacing every ill-formed byte sequence with
/// the replacement character.
///
/// The input is scanned with the state machine described by
/// https://encoding.spec.whatwg.org/#utf-8-decoder. Well-formed sequences are
/// copied through verbatim from the input; each maximal ill-formed subpart
/// produces exactly one `replacement_character`.
///
/// Output is staged in a small stack buffer and handed to the writer in
/// chunks, so the only errors returned are those of `writer.writeAll`.
fn formatUtf8(utf8: []const u8, writer: *std.io.Writer) std.io.Writer.Error!void {
    var buf: [300]u8 = undefined; // just an arbitrary size
    var u8len: usize = 0;

    // This implementation is based on this specification:
    // https://encoding.spec.whatwg.org/#utf-8-decoder
    //
    // The specification also accumulates the decoded code point. That value
    // is not needed here: a well-formed sequence is emitted by copying its
    // original bytes, so only the continuation-byte bookkeeping is tracked.
    var cont_bytes_seen: u3 = 0;
    var cont_bytes_needed: u3 = 0;
    // Inclusive range accepted for the *next* continuation byte. It is
    // narrowed after certain lead bytes and restored to 0x80...0xBF once the
    // first continuation byte has been checked.
    var lower_boundary: u8 = 0x80;
    var upper_boundary: u8 = 0xBF;

    var i: usize = 0;
    while (i < utf8.len) {
        const byte = utf8[i];
        if (cont_bytes_needed == 0) {
            // Not inside a multi-byte sequence: `byte` must be ASCII or a lead byte.
            switch (byte) {
                0x00...0x7F => {
                    buf[u8len] = byte;
                    u8len += 1;
                },
                0xC2...0xDF => {
                    cont_bytes_needed = 1;
                },
                0xE0...0xEF => {
                    // 0xE0 must be followed by 0xA0 or above (no overlong forms);
                    // 0xED must be followed by 0x9F or below (no surrogates).
                    if (byte == 0xE0) lower_boundary = 0xA0;
                    if (byte == 0xED) upper_boundary = 0x9F;
                    cont_bytes_needed = 2;
                },
                0xF0...0xF4 => {
                    // 0xF0 must be followed by 0x90 or above (no overlong forms);
                    // 0xF4 must be followed by 0x8F or below (nothing past U+10FFFF).
                    if (byte == 0xF0) lower_boundary = 0x90;
                    if (byte == 0xF4) upper_boundary = 0x8F;
                    cont_bytes_needed = 3;
                },
                else => {
                    // Stray continuation byte or a byte that can never start a sequence.
                    // The flush at the bottom of the loop keeps at least 4 bytes free in
                    // `buf`, so encoding a single code point here cannot run out of room.
                    u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
                },
            }
            // consume the byte
            i += 1;
        } else if (byte < lower_boundary or byte > upper_boundary) {
            // The sequence in progress was cut short: emit one replacement for
            // the bytes consumed so far and reset the state machine.
            cont_bytes_needed = 0;
            cont_bytes_seen = 0;
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
            // do not consume the current byte, it should now be treated as a possible start byte
        } else {
            // Valid continuation byte; any narrowed range applied only to this one.
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            cont_bytes_seen += 1;
            // consume the byte
            i += 1;
            if (cont_bytes_seen == cont_bytes_needed) {
                // Sequence complete: copy the lead byte and its continuation
                // bytes straight from the input.
                const codepoint_len = cont_bytes_seen + 1;
                const codepoint_start_i = i - codepoint_len;
                @memcpy(buf[u8len..][0..codepoint_len], utf8[codepoint_start_i..][0..codepoint_len]);
                u8len += codepoint_len;
                cont_bytes_needed = 0;
                cont_bytes_seen = 0;
            }
        }
        // make sure there's always enough room for another maximum length UTF-8 codepoint
        if (u8len + 4 > buf.len) {
            try writer.writeAll(buf[0..u8len]);
            u8len = 0;
        }
    }
    if (cont_bytes_needed != 0) {
        // The input ended in the middle of a sequence.
        // we know there's enough room because we always flush
        // if there's less than 4 bytes remaining in the buffer.
        u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
    }
    try writer.writeAll(buf[0..u8len]);
}