diff --git a/benchmark_test.go b/benchmark_test.go new file mode 100644 index 0000000..831acac --- /dev/null +++ b/benchmark_test.go @@ -0,0 +1,71 @@ +package toml + +import ( + "bytes" + "testing" +) + +var valid10Ascii = []byte("1234567890") +var valid10Utf8 = []byte("日本語a") +var valid1kUtf8 = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16) +var valid1MUtf8 = bytes.Repeat(valid1kUtf8, 1024) +var valid1kAscii = bytes.Repeat([]byte("012345678998jhjklasDJKLAAdjdfjsdklfjdslkabcdefghijklmnopqrstuvwx"), 16) +var valid1MAscii = bytes.Repeat(valid1kAscii, 1024) + +func BenchmarkScanComments(b *testing.B) { + wrap := func(x []byte) []byte { + return []byte("# " + string(x) + "\n") + } + + inputs := map[string][]byte{ + "10Valid": wrap(valid10Ascii), + "1kValid": wrap(valid1kAscii), + "1MValid": wrap(valid1MAscii), + "10ValidUtf8": wrap(valid10Utf8), + "1kValidUtf8": wrap(valid1kUtf8), + "1MValidUtf8": wrap(valid1MUtf8), + } + + for name, input := range inputs { + b.Run(name, func(b *testing.B) { + b.SetBytes(int64(len(input))) + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + scanComment(input) + } + }) + } +} + +func BenchmarkParseLiteralStringValid(b *testing.B) { + wrap := func(x []byte) []byte { + return []byte("'" + string(x) + "'") + } + + inputs := map[string][]byte{ + "10Valid": wrap(valid10Ascii), + "1kValid": wrap(valid1kAscii), + "1MValid": wrap(valid1MAscii), + "10ValidUtf8": wrap(valid10Utf8), + "1kValidUtf8": wrap(valid1kUtf8), + "1MValidUtf8": wrap(valid1MUtf8), + } + + for name, input := range inputs { + b.Run(name, func(b *testing.B) { + p := parser{} + b.SetBytes(int64(len(input))) + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _, _, err := p.parseLiteralString(input) + if err != nil { + panic(err) + } + } + }) + } +} diff --git a/parser.go b/parser.go index 4e6fa32..014c4d9 100644 --- a/parser.go +++ b/parser.go @@ -2,6 +2,7 @@ package toml import ( "bytes" + "unicode" 
"github.com/pelletier/go-toml/v2/internal/ast" "github.com/pelletier/go-toml/v2/internal/danger" @@ -106,9 +107,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) { } if b[0] == '#' { - _, rest := scanComment(b) - - return ref, rest, nil + _, rest, err := scanComment(b) + return ref, rest, err } if b[0] == '\n' || b[0] == '\r' { @@ -129,9 +129,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) { b = p.parseWhitespace(b) if len(b) > 0 && b[0] == '#' { - _, rest := scanComment(b) - - return ref, rest, nil + _, rest, err := scanComment(b) + return ref, rest, err } return ref, b, nil @@ -479,7 +478,10 @@ func (p *parser) parseOptionalWhitespaceCommentNewline(b []byte) ([]byte, error) b = p.parseWhitespace(b) if len(b) > 0 && b[0] == '#' { - _, b = scanComment(b) + _, b, err = scanComment(b) + if err != nil { + return nil, err + } } if len(b) == 0 { @@ -529,7 +531,7 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er // mlb-quotes = 1*2quotation-mark // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii // mlb-escaped-nl = escape ws newline *( wschar / newline ) - token, rest, err := scanMultilineBasicString(b) + token, escaped, rest, err := scanMultilineBasicString(b) if err != nil { return nil, nil, nil, err } @@ -546,16 +548,20 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er // fast path startIdx := i endIdx := len(token) - len(`"""`) - for ; i < endIdx; i++ { - if token[i] == '\\' { - break + + if escaped < 0 { + str := token[startIdx:endIdx] + verr := utf8TomlValidAlreadyEscaped(str) + if verr.Zero() { + return token, str, rest, nil } - } - if i == endIdx { - return token, token[startIdx:endIdx], rest, nil + return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8") } + i = escaped + var builder bytes.Buffer + // grow? 
builder.Write(token[startIdx:i]) // The scanner ensures that the token starts and ends with quotes and that @@ -705,25 +711,30 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) { // escape-seq-char =/ %x74 ; t tab U+0009 // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX - token, rest, err := scanBasicString(b) + token, escaped, rest, err := scanBasicString(b) if err != nil { return nil, nil, nil, err } - // fast path - i := len(`"`) - startIdx := i + startIdx := len(`"`) endIdx := len(token) - len(`"`) - for ; i < endIdx; i++ { - if token[i] == '\\' { - break + + // Fast path. If there is no escape sequence, the string should just be + // an UTF-8 encoded string, which is the same as Go. In that case, + // validate the string and return a direct reference to the buffer. + if escaped < 0 { + str := token[startIdx:endIdx] + verr := utf8TomlValidAlreadyEscaped(str) + if verr.Zero() { + return token, str, rest, nil } - } - if i == endIdx { - return token, token[startIdx:endIdx], rest, nil + return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8") } + i := escaped + var builder bytes.Buffer + // grow? 
builder.Write(token[startIdx:i]) // The scanner ensures that the token starts and ends with quotes and that @@ -780,22 +791,27 @@ func hexToRune(b []byte, length int) (rune, error) { } b = b[:length] - var r rune + var r uint32 for i, c := range b { + d := uint32(0) switch { case '0' <= c && c <= '9': - c = c - '0' + d = uint32(c - '0') case 'a' <= c && c <= 'f': - c = c - 'a' + 10 + d = uint32(c - 'a' + 10) case 'A' <= c && c <= 'F': - c = c - 'A' + 10 + d = uint32(c - 'A' + 10) default: return -1, newDecodeError(b[i:i+1], "non-hex character") } - r = r*16 + rune(c) + r = r*16 + d } - return r, nil + if r > unicode.MaxRune || 0xD800 <= r && r < 0xE000 { + return -1, newDecodeError(b, "escape sequence is invalid Unicode code point") + } + + return rune(r), nil } func (p *parser) parseWhitespace(b []byte) []byte { diff --git a/scanner.go b/scanner.go index 043adc3..4a5ccda 100644 --- a/scanner.go +++ b/scanner.go @@ -49,13 +49,18 @@ func scanLiteralString(b []byte) ([]byte, []byte, error) { // literal-string = apostrophe *literal-char apostrophe // apostrophe = %x27 ; ' apostrophe // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii - for i := 1; i < len(b); i++ { + for i := 1; i < len(b); { switch b[i] { case '\'': return b[:i+1], b[i+1:], nil case '\n': return nil, nil, newDecodeError(b[i:i+1], "literal strings cannot have new lines") } + size := utf8ValidNext(b[i:]) + if size == 0 { + return nil, nil, newDecodeError(b[i:i+1], "invalid character") + } + i += size } return nil, nil, newDecodeError(b[len(b):], "unterminated literal string") @@ -70,10 +75,15 @@ func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) { // mll-content = mll-char / newline // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii // mll-quotes = 1*2apostrophe - for i := 3; i < len(b); i++ { + for i := 3; i < len(b); { if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) { return b[:i+3], b[i+3:], nil } + size := utf8ValidNext(b[i:]) + if size == 0 { + return nil, 
nil, newDecodeError(b[i:i+1], "invalid character") + } + i += size } return nil, nil, newDecodeError(b[len(b):], `multiline literal string not terminated by '''`) @@ -106,45 +116,72 @@ func scanWhitespace(b []byte) ([]byte, []byte) { } //nolint:unparam -func scanComment(b []byte) ([]byte, []byte) { +func scanComment(b []byte) ([]byte, []byte, error) { // comment-start-symbol = %x23 ; # // non-ascii = %x80-D7FF / %xE000-10FFFF // non-eol = %x09 / %x20-7F / non-ascii // // comment = comment-start-symbol *non-eol - for i := 1; i < len(b); i++ { + + for i := 1; i < len(b); { if b[i] == '\n' { - return b[:i], b[i:] + return b[:i], b[i:], nil } + size := utf8ValidNext(b[i:]) + if size == 0 { + return nil, nil, newDecodeError(b[i:i+1], "invalid character in comment") + } + + i += size } - return b, b[len(b):] + return b, b[len(b):], nil } -func scanBasicString(b []byte) ([]byte, []byte, error) { +func scanBasicString(b []byte) ([]byte, int, []byte, error) { // basic-string = quotation-mark *basic-char quotation-mark // quotation-mark = %x22 ; " // basic-char = basic-unescaped / escaped // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii // escaped = escape escape-seq-char - for i := 1; i < len(b); i++ { + escaped := -1 // index of the first \. -1 means no escape character in there. 
+ i := 1 + +loop: + for ; i < len(b); i++ { switch b[i] { case '"': - return b[:i+1], b[i+1:], nil + return b[:i+1], escaped, b[i+1:], nil case '\n': - return nil, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines") + return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines") case '\\': if len(b) < i+2 { - return nil, nil, newDecodeError(b[i:i+1], "need a character after \\") + return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\") + } + escaped = i + i += 2 // skip the next character + break loop + } + } + + for ; i < len(b); i++ { + switch b[i] { + case '"': + return b[:i+1], escaped, b[i+1:], nil + case '\n': + return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines") + case '\\': + if len(b) < i+2 { + return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\") } i++ // skip the next character } } - return nil, nil, newDecodeError(b[len(b):], `basic string not terminated by "`) + return nil, escaped, nil, newDecodeError(b[len(b):], `basic string not terminated by "`) } -func scanMultilineBasicString(b []byte) ([]byte, []byte, error) { +func scanMultilineBasicString(b []byte) ([]byte, int, []byte, error) { // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body // ml-basic-string-delim // ml-basic-string-delim = 3quotation-mark @@ -155,19 +192,40 @@ func scanMultilineBasicString(b []byte) ([]byte, []byte, error) { // mlb-quotes = 1*2quotation-mark // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii // mlb-escaped-nl = escape ws newline *( wschar / newline ) - for i := 3; i < len(b); i++ { + + escaped := -1 + i := 3 + +loop: + for ; i < len(b); i++ { switch b[i] { case '"': if scanFollowsMultilineBasicStringDelimiter(b[i:]) { - return b[:i+3], b[i+3:], nil + return b[:i+3], escaped, b[i+3:], nil } case '\\': if len(b) < i+2 { - return nil, nil, newDecodeError(b[len(b):], "need a character after \\") + return nil, 
escaped, nil, newDecodeError(b[len(b):], "need a character after \\") + } + escaped = i + i += 2 // skip the next character + break loop + } + } + + for ; i < len(b); i++ { + switch b[i] { + case '"': + if scanFollowsMultilineBasicStringDelimiter(b[i:]) { + return b[:i+3], escaped, b[i+3:], nil + } + case '\\': + if len(b) < i+2 { + return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\") } i++ // skip the next character } } - return nil, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`) + return nil, escaped, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`) } diff --git a/toml_testgen_test.go b/toml_testgen_test.go index de70ba9..f552cb4 100644 --- a/toml_testgen_test.go +++ b/toml_testgen_test.go @@ -71,139 +71,106 @@ func TestTOMLTest_Invalid_Bool_WrongCaseTrue(t *testing.T) { } func TestTOMLTest_Invalid_Control_CommentDel(t *testing.T) { - t.Skip("FIXME") input := "comment-del = \"0x7f\" # \u007f\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_CommentLf(t *testing.T) { - t.Skip("FIXME") input := "comment-lf = \"ctrl-P\" # \x10\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_CommentNull(t *testing.T) { - t.Skip("FIXME") input := "comment-null = \"null\" # \x00\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_CommentUs(t *testing.T) { - t.Skip("FIXME") input := "comment-us = \"ctrl-_\" # \x1f\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_MultiDel(t *testing.T) { - t.Skip("FIXME") - input := "multi-del = \"\"\"null\u007f\"\"\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_MultiLf(t *testing.T) { - t.Skip("FIXME") - input := "multi-lf = \"\"\"null\x10\"\"\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_MultiNull(t *testing.T) { - t.Skip("FIXME") - input := "multi-null = \"\"\"null\x00\"\"\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_MultiUs(t *testing.T) { - 
t.Skip("FIXME") - input := "multi-us = \"\"\"null\x1f\"\"\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawmultiDel(t *testing.T) { - t.Skip("FIXME") - input := "rawmulti-del = '''null\u007f'''\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawmultiLf(t *testing.T) { - t.Skip("FIXME") - input := "rawmulti-lf = '''null\x10'''\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawmultiNull(t *testing.T) { - t.Skip("FIXME") - input := "rawmulti-null = '''null\x00'''\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawmultiUs(t *testing.T) { - t.Skip("FIXME") - input := "rawmulti-us = '''null\x1f'''\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawstringDel(t *testing.T) { - t.Skip("FIXME") - input := "rawstring-del = 'null\u007f'\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawstringLf(t *testing.T) { - t.Skip("FIXME") - input := "rawstring-lf = 'null\x10'\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawstringNull(t *testing.T) { - t.Skip("FIXME") - input := "rawstring-null = 'null\x00'\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_RawstringUs(t *testing.T) { - t.Skip("FIXME") - input := "rawstring-us = 'null\x1f'\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_StringBs(t *testing.T) { - t.Skip("FIXME") input := "string-bs = \"backspace\b\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_StringDel(t *testing.T) { - t.Skip("FIXME") input := "string-del = \"null\u007f\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_StringLf(t *testing.T) { - t.Skip("FIXME") input := "string-lf = \"null\x10\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_StringNull(t *testing.T) { - t.Skip("FIXME") input := "string-null = \"null\x00\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_Control_StringUs(t *testing.T) { - t.Skip("FIXME") input := "string-us = \"null\x1f\"\n" 
testgenInvalid(t, input) } @@ -757,7 +724,6 @@ func TestTOMLTest_Invalid_String_BadByteEscape(t *testing.T) { } func TestTOMLTest_Invalid_String_BadCodepoint(t *testing.T) { - t.Skip("FIXME") input := "invalid-codepoint = \"This string contains a non scalar unicode codepoint \\uD801\"\n" testgenInvalid(t, input) } @@ -793,13 +759,11 @@ func TestTOMLTest_Invalid_String_BasicByteEscapes(t *testing.T) { } func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape1(t *testing.T) { - t.Skip("FIXME") input := "a = \"\"\"\\UFFFFFFFF\"\"\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape2(t *testing.T) { - t.Skip("FIXME") input := "a = \"\"\"\\U00D80000\"\"\"\n" testgenInvalid(t, input) } @@ -815,13 +779,11 @@ func TestTOMLTest_Invalid_String_BasicMultilineUnknownEscape(t *testing.T) { } func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape1(t *testing.T) { - t.Skip("FIXME") input := "a = \"\\UFFFFFFFF\"\n" testgenInvalid(t, input) } func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape2(t *testing.T) { - t.Skip("FIXME") input := "a = \"\\U00D80000\"\n" testgenInvalid(t, input) } diff --git a/unmarshaler.go b/unmarshaler.go index 1fdd686..dbe7cb0 100644 --- a/unmarshaler.go +++ b/unmarshaler.go @@ -899,18 +899,16 @@ func (d *decoder) unmarshalInteger(value *ast.Node, v reflect.Value) error { } func (d *decoder) unmarshalString(value *ast.Node, v reflect.Value) error { - var err error - switch v.Kind() { case reflect.String: v.SetString(string(value.Data)) case reflect.Interface: v.Set(reflect.ValueOf(string(value.Data))) default: - err = newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind()) + return newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind()) } - return err + return nil } func (d *decoder) handleKeyValue(expr *ast.Node, v reflect.Value) (reflect.Value, error) { diff --git a/unmarshaler_test.go b/unmarshaler_test.go index 
b77ced1..7b05afa 100644 --- a/unmarshaler_test.go +++ b/unmarshaler_test.go @@ -246,6 +246,20 @@ func TestUnmarshal(t *testing.T) { } }, }, + { + desc: "kv literal string", + input: `A = 'foo 🙂 '`, + gen: func() test { + type doc struct { + A string + } + + return test{ + target: &doc{}, + expected: &doc{A: "foo 🙂 "}, + } + }, + }, { desc: "time.time with negative zone", input: `a = 1979-05-27T00:32:00-07:00 `, // space intentional @@ -2009,6 +2023,74 @@ world'`, desc: `invalid nan`, data: `A = non`, }, + { + desc: `invalid character in comment in array`, + data: "A = [#\x00\n]", + }, + { + desc: "invalid utf8 character in long string with no escape sequence", + data: "a = \"aaaa\x80aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"", + }, + { + desc: "invalid ascii character in long string with no escape sequence", + data: "a = \"aaaa\x00aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"", + }, + { + desc: "unfinished 2-byte utf8 character in string with no escape sequence", + data: "a = \"aaaa\xC2\"", + }, + { + desc: "unfinished 3-byte utf8 character in string with no escape sequence", + data: "a = \"aaaa\xE2\x00\x00\"", + }, + { + desc: "invalid 3rd byte of 3-byte utf8 character in string with no escape sequence", + data: "a = \"aaaa\xE2\x80\x00\"", + }, + { + desc: "invalid 4rd byte of 4-byte utf8 character in string with no escape sequence", + data: "a = \"aaaa\xF2\x81\x81\x00\"", + }, + { + desc: "unfinished 2-byte utf8 character in literal string", + data: "a = 'aaa\xC2'", + }, + { + desc: "unfinished 3-byte utf8 character in literal string", + data: "a = 'aaaa\xE2\x00\x00'", + }, + { + desc: "invalid 3rd byte of 3-byte utf8 character in literal string", + data: "a = 'aaaa\xE2\x80\x00'", + }, + { + desc: "invalid 4rd byte of 4-byte utf8 character in literal string", + data: "a = 'aaaa\xF2\x81\x81\x00'", + }, + { + desc: "invalid start utf8 character in literal string", + data: "a = '\x80'", + }, + { + desc: "utf8 character with not enough bytes before end in 
// utf8Err locates an invalid character inside a byte slice.
type utf8Err struct {
	Index int // byte offset of the first byte of the offending character
	Size  int // number of bytes to report; 0 means "no error"
}

// Zero reports whether u describes no error (its Size is 0).
func (u utf8Err) Zero() bool {
	return u.Size == 0
}

// utf8TomlValidAlreadyEscaped verifies that p is only made of valid UTF-8
// characters that this package allows unescaped in TOML text. The TOML spec
// says:
//
//	Any Unicode character may be used except those that must be escaped:
//	quotation mark, backslash, and the control characters other than tab
//	(U+0000 to U+0008, U+000A to U+001F, U+007F).
//
// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
// when a character is not allowed.
//
// The returned utf8Err is Zero() if p is valid; otherwise it holds the byte
// index of the first invalid character and the number of bytes to report.
//
// Byte classification actually applied (see invalidAscii):
//
//	quotation mark => not checked here (expected to be handled by the caller)
//	backslash      => not checked here (expected to be handled by the caller)
//	0x00 - 0x08    => invalid
//	0x09           => tab, ok
//	0x0A, 0x0D     => LF and CR are accepted by this function
//	0x0B, 0x0C     => invalid
//	0x0E - 0x1F    => invalid
//	0x7F           => invalid
//
// NOTE(review): LF/CR being accepted deviates from the spec text quoted above;
// presumably line endings are dealt with by the scanners - confirm before
// tightening.
func utf8TomlValidAlreadyEscaped(p []byte) utf8Err {
	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
	offset := 0
	for len(p) >= 8 {
		// Combining two 32 bit loads allows the same code to be used
		// for 32 and 64 bit platforms.
		// The compiler can generate a 32bit load for first32 and second32
		// on many platforms. See test/codegen/memcombine.go.
		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
		if (first32|second32)&0x80808080 != 0 {
			// Found a non ASCII byte (>= RuneSelf); let the slow path
			// below handle the rest.
			break
		}

		for i, b := range p[:8] {
			if invalidAscii(b) {
				return utf8Err{Index: offset + i, Size: 1}
			}
		}

		p = p[8:]
		offset += 8
	}

	// Slow path: decode one character at a time.
	n := len(p)
	for i := 0; i < n; {
		pi := p[i]
		if pi < utf8.RuneSelf {
			if invalidAscii(pi) {
				return utf8Err{Index: offset + i, Size: 1}
			}
			i++
			continue
		}

		x := first[pi]
		if x == xx {
			// Illegal starter byte.
			return utf8Err{Index: offset + i, Size: 1}
		}

		size := int(x & 7)
		if i+size > n {
			// Truncated sequence: report whatever bytes are left.
			return utf8Err{Index: offset + i, Size: n - i}
		}

		// The second byte has a starter-dependent range (this is what rejects
		// overlong encodings and surrogates); later bytes are plain
		// continuation bytes.
		accept := acceptRanges[x>>4]
		if c := p[i+1]; c < accept.lo || accept.hi < c {
			return utf8Err{Index: offset + i, Size: 2}
		}
		if size >= 3 {
			if c := p[i+2]; c < locb || hicb < c {
				return utf8Err{Index: offset + i, Size: 3}
			}
		}
		if size == 4 {
			if c := p[i+3]; c < locb || hicb < c {
				return utf8Err{Index: offset + i, Size: 4}
			}
		}

		i += size
	}

	return utf8Err{}
}

// utf8ValidNext returns the size in bytes of the character at the start of p
// if it is valid UTF-8 and allowed by invalidAscii, or 0 otherwise. An empty p
// yields 0.
func utf8ValidNext(p []byte) int {
	if len(p) == 0 {
		// Nothing to decode; avoid indexing past the end.
		return 0
	}

	c := p[0]
	if c < utf8.RuneSelf {
		if invalidAscii(c) {
			return 0
		}
		return 1
	}

	x := first[c]
	if x == xx {
		// Illegal starter byte.
		return 0
	}

	size := int(x & 7)
	if size > len(p) {
		// Truncated sequence.
		return 0
	}

	accept := acceptRanges[x>>4]
	if b := p[1]; b < accept.lo || accept.hi < b {
		return 0
	}
	if size >= 3 {
		if b := p[2]; b < locb || hicb < b {
			return 0
		}
	}
	if size == 4 {
		if b := p[3]; b < locb || hicb < b {
			return 0
		}
	}

	return size
}

// invalidAscii reports whether b is a control character rejected by the
// validators in this file: everything below 0x20 except tab (0x09), LF (0x0A)
// and CR (0x0D), plus DEL (0x7F). Bytes >= 0x80 are never reported here.
func invalidAscii(b byte) bool {
	return (b < 0x20 && b != '\t' && b != '\n' && b != '\r') || b == 0x7F
}

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}

// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
var acceptRanges = [16]acceptRange{
	0: {locb, hicb},
	1: {0xA0, hicb},
	2: {locb, 0x9F},
	3: {0x90, hicb},
	4: {locb, 0x8F},
}

// first is information about the first byte in a UTF-8 sequence.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}

const (
	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// first table. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)