Validate UTF-8 (#629)

2021-10-15 16:13:21 -07:00
parent cc0d1a90ff
commit cd54472d03
7 changed files with 479 additions and 89 deletions
@@ -0,0 +1,71 @@
 package toml
 import (
 	"bytes"
 	"testing"
 )
 // Benchmark fixtures. Each size (nominally 10 B, 1 KiB, 1 MiB) comes in an
 // ASCII-only flavour and a flavour that mixes ASCII with multi-byte UTF-8. The
 // larger fixtures are built by repeating the smaller ones.
 var (
 	valid10Ascii = []byte("1234567890")
 	valid10Utf8  = []byte("日本語a")
 	valid1kAscii = bytes.Repeat([]byte("012345678998jhjklasDJKLAAdjdfjsdklfjdslkabcdefghijklmnopqrstuvwx"), 16)
 	valid1kUtf8  = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
 	valid1MAscii = bytes.Repeat(valid1kAscii, 1024)
 	valid1MUtf8  = bytes.Repeat(valid1kUtf8, 1024)
 )
 // BenchmarkScanComments measures scanComment on comment lines of increasing
 // size, for both ASCII-only and UTF-8 payloads.
 //
 // Cases live in a slice rather than a map so sub-benchmarks run in the same
 // order on every invocation, and the error returned by scanComment is checked
 // so the benchmark cannot silently time a failing scan.
 func BenchmarkScanComments(b *testing.B) {
 	// wrap turns a payload into a complete comment line: "# <payload>\n".
 	wrap := func(x []byte) []byte {
 		return []byte("# " + string(x) + "\n")
 	}
 	cases := []struct {
 		name string
 		data []byte
 	}{
 		{"10Valid", wrap(valid10Ascii)},
 		{"1kValid", wrap(valid1kAscii)},
 		{"1MValid", wrap(valid1MAscii)},
 		{"10ValidUtf8", wrap(valid10Utf8)},
 		{"1kValidUtf8", wrap(valid1kUtf8)},
 		{"1MValidUtf8", wrap(valid1MUtf8)},
 	}
 	for _, tc := range cases {
 		input := tc.data
 		b.Run(tc.name, func(b *testing.B) {
 			b.SetBytes(int64(len(input)))
 			b.ReportAllocs()
 			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
 				if _, _, err := scanComment(input); err != nil {
 					b.Fatal(err)
 				}
 			}
 		})
 	}
 }
 // BenchmarkParseLiteralStringValid measures parser.parseLiteralString on valid
 // single-quoted strings of increasing size, for both ASCII-only and UTF-8
 // payloads.
 //
 // Cases live in a slice rather than a map so sub-benchmarks run in the same
 // order on every invocation. A parse error fails the benchmark through
 // b.Fatal instead of panicking.
 func BenchmarkParseLiteralStringValid(b *testing.B) {
 	// wrap surrounds a payload with the literal-string delimiters.
 	wrap := func(x []byte) []byte {
 		return []byte("'" + string(x) + "'")
 	}
 	cases := []struct {
 		name string
 		data []byte
 	}{
 		{"10Valid", wrap(valid10Ascii)},
 		{"1kValid", wrap(valid1kAscii)},
 		{"1MValid", wrap(valid1MAscii)},
 		{"10ValidUtf8", wrap(valid10Utf8)},
 		{"1kValidUtf8", wrap(valid1kUtf8)},
 		{"1MValidUtf8", wrap(valid1MUtf8)},
 	}
 	for _, tc := range cases {
 		input := tc.data
 		b.Run(tc.name, func(b *testing.B) {
 			p := parser{}
 			b.SetBytes(int64(len(input)))
 			b.ReportAllocs()
 			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
 				if _, _, _, err := p.parseLiteralString(input); err != nil {
 					b.Fatal(err)
 				}
 			}
 		})
 	}
 }
@@ -2,6 +2,7 @@ package toml
 import (
 	"bytes"
 	"unicode"
 	"github.com/pelletier/go-toml/v2/internal/ast"
 	"github.com/pelletier/go-toml/v2/internal/danger"
@@ -106,9 +107,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
 	}
 	if b[0] == '#' {
-		_, rest := scanComment(b)
+		_, rest, err := scanComment(b)
-
+		return ref, rest, err
 		return ref, rest, nil
 	}
 	if b[0] == '\n' || b[0] == '\r' {
@@ -129,9 +129,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
 	b = p.parseWhitespace(b)
 	if len(b) > 0 && b[0] == '#' {
-		_, rest := scanComment(b)
+		_, rest, err := scanComment(b)
-
+		return ref, rest, err
 		return ref, rest, nil
 	}
 	return ref, b, nil
@@ -479,7 +478,10 @@ func (p *parser) parseOptionalWhitespaceCommentNewline(b []byte) ([]byte, error)
 		b = p.parseWhitespace(b)
 		if len(b) > 0 && b[0] == '#' {
-			_, b = scanComment(b)
+			_, b, err = scanComment(b)
 			if err != nil {
 				return nil, err
 			}
 		}
 		if len(b) == 0 {
@@ -529,7 +531,7 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
 	// mlb-quotes = 1*2quotation-mark
 	// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
 	// mlb-escaped-nl = escape ws newline *( wschar / newline )
-	token, rest, err := scanMultilineBasicString(b)
+	token, escaped, rest, err := scanMultilineBasicString(b)
 	if err != nil {
 		return nil, nil, nil, err
 	}
@@ -546,16 +548,20 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
 	// fast path
 	startIdx := i
 	endIdx := len(token) - len(`"""`)
-	for ; i < endIdx; i++ {
+
-		if token[i] == '\\' {
+	if escaped < 0 {
-			break
+		str := token[startIdx:endIdx]
 		verr := utf8TomlValidAlreadyEscaped(str)
 		if verr.Zero() {
 			return token, str, rest, nil
 		}
-	}
+		return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
 	if i == endIdx {
 		return token, token[startIdx:endIdx], rest, nil
 	}
 	i = escaped
 	var builder bytes.Buffer
 	// grow?
 	builder.Write(token[startIdx:i])
 	// The scanner ensures that the token starts and ends with quotes and that
@@ -705,25 +711,30 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
 	// escape-seq-char =/ %x74         ; t    tab             U+0009
 	// escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
 	// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
-	token, rest, err := scanBasicString(b)
+	token, escaped, rest, err := scanBasicString(b)
 	if err != nil {
 		return nil, nil, nil, err
 	}
-	// fast path
+	startIdx := len(`"`)
 	i := len(`"`)
 	startIdx := i
 	endIdx := len(token) - len(`"`)
-	for ; i < endIdx; i++ {
+
-		if token[i] == '\\' {
+	// Fast path. If there is no escape sequence, the string should just be
-			break
+	// an UTF-8 encoded string, which is the same as Go. In that case,
 	// validate the string and return a direct reference to the buffer.
 	if escaped < 0 {
 		str := token[startIdx:endIdx]
 		verr := utf8TomlValidAlreadyEscaped(str)
 		if verr.Zero() {
 			return token, str, rest, nil
 		}
-	}
+		return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
 	if i == endIdx {
 		return token, token[startIdx:endIdx], rest, nil
 	}
 	i := escaped
 	var builder bytes.Buffer
 	// grow?
 	builder.Write(token[startIdx:i])
 	// The scanner ensures that the token starts and ends with quotes and that
@@ -780,22 +791,27 @@ func hexToRune(b []byte, length int) (rune, error) {
 	}
 	b = b[:length]
-	var r rune
+	var r uint32
 	for i, c := range b {
 		d := uint32(0)
 		switch {
 		case '0' <= c && c <= '9':
-			c = c - '0'
+			d = uint32(c - '0')
 		case 'a' <= c && c <= 'f':
-			c = c - 'a' + 10
+			d = uint32(c - 'a' + 10)
 		case 'A' <= c && c <= 'F':
-			c = c - 'A' + 10
+			d = uint32(c - 'A' + 10)
 		default:
 			return -1, newDecodeError(b[i:i+1], "non-hex character")
 		}
-		r = r*16 + rune(c)
+		r = r*16 + d
 	}
-	return r, nil
+	if r > unicode.MaxRune || 0xD800 <= r && r < 0xE000 {
 		return -1, newDecodeError(b, "escape sequence is invalid Unicode code point")
 	}
 	return rune(r), nil
 }
 func (p *parser) parseWhitespace(b []byte) []byte {
@@ -49,13 +49,18 @@ func scanLiteralString(b []byte) ([]byte, []byte, error) {
 	// literal-string = apostrophe *literal-char apostrophe
 	// apostrophe = %x27 ; ' apostrophe
 	// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
-	for i := 1; i < len(b); i++ {
+	for i := 1; i < len(b); {
 		switch b[i] {
 		case '\'':
 			return b[:i+1], b[i+1:], nil
 		case '\n':
 			return nil, nil, newDecodeError(b[i:i+1], "literal strings cannot have new lines")
 		}
 		size := utf8ValidNext(b[i:])
 		if size == 0 {
 			return nil, nil, newDecodeError(b[i:i+1], "invalid character")
 		}
 		i += size
 	}
 	return nil, nil, newDecodeError(b[len(b):], "unterminated literal string")
@@ -70,10 +75,15 @@ func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
 	// mll-content = mll-char / newline
 	// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
 	// mll-quotes = 1*2apostrophe
-	for i := 3; i < len(b); i++ {
+	for i := 3; i < len(b); {
 		if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
 			return b[:i+3], b[i+3:], nil
 		}
 		size := utf8ValidNext(b[i:])
 		if size == 0 {
 			return nil, nil, newDecodeError(b[i:i+1], "invalid character")
 		}
 		i += size
 	}
 	return nil, nil, newDecodeError(b[len(b):], `multiline literal string not terminated by '''`)
@@ -106,45 +116,72 @@ func scanWhitespace(b []byte) ([]byte, []byte) {
 }
 //nolint:unparam
-func scanComment(b []byte) ([]byte, []byte) {
+func scanComment(b []byte) ([]byte, []byte, error) {
 	// comment-start-symbol = %x23 ; #
 	// non-ascii = %x80-D7FF / %xE000-10FFFF
 	// non-eol = %x09 / %x20-7F / non-ascii
 	//
 	// comment = comment-start-symbol *non-eol
-	for i := 1; i < len(b); i++ {
+
 	for i := 1; i < len(b); {
 		if b[i] == '\n' {
-			return b[:i], b[i:]
+			return b[:i], b[i:], nil
 		}
 		size := utf8ValidNext(b[i:])
 		if size == 0 {
 			return nil, nil, newDecodeError(b[i:i+1], "invalid character in comment")
 		}
 		i += size
 	}
-	return b, b[len(b):]
+	return b, b[len(b):], nil
 }
-func scanBasicString(b []byte) ([]byte, []byte, error) {
+func scanBasicString(b []byte) ([]byte, int, []byte, error) {
 	// basic-string = quotation-mark *basic-char quotation-mark
 	// quotation-mark = %x22            ; "
 	// basic-char = basic-unescaped / escaped
 	// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
 	// escaped = escape escape-seq-char
-	for i := 1; i < len(b); i++ {
+	escaped := -1 // index of the first \. -1 means no escape character in there.
 	i := 1
 loop:
 	for ; i < len(b); i++ {
 		switch b[i] {
 		case '"':
-			return b[:i+1], b[i+1:], nil
+			return b[:i+1], escaped, b[i+1:], nil
 		case '\n':
-			return nil, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
+			return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
 		case '\\':
 			if len(b) < i+2 {
-				return nil, nil, newDecodeError(b[i:i+1], "need a character after \\")
+				return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
 			}
 			escaped = i
 			i += 2 // skip the next character
 			break loop
 		}
 	}
 	for ; i < len(b); i++ {
 		switch b[i] {
 		case '"':
 			return b[:i+1], escaped, b[i+1:], nil
 		case '\n':
 			return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
 		case '\\':
 			if len(b) < i+2 {
 				return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
 			}
 			i++ // skip the next character
 		}
 	}
-	return nil, nil, newDecodeError(b[len(b):], `basic string not terminated by "`)
+	return nil, escaped, nil, newDecodeError(b[len(b):], `basic string not terminated by "`)
 }
-func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
+func scanMultilineBasicString(b []byte) ([]byte, int, []byte, error) {
 	// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
 	// ml-basic-string-delim
 	// ml-basic-string-delim = 3quotation-mark
@@ -155,19 +192,40 @@ func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
 	// mlb-quotes = 1*2quotation-mark
 	// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
 	// mlb-escaped-nl = escape ws newline *( wschar / newline )
-	for i := 3; i < len(b); i++ {
+
 	escaped := -1
 	i := 3
 loop:
 	for ; i < len(b); i++ {
 		switch b[i] {
 		case '"':
 			if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
-				return b[:i+3], b[i+3:], nil
+				return b[:i+3], escaped, b[i+3:], nil
 			}
 		case '\\':
 			if len(b) < i+2 {
-				return nil, nil, newDecodeError(b[len(b):], "need a character after \\")
+				return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
 			}
 			escaped = i
 			i += 2 // skip the next character
 			break loop
 		}
 	}
 	for ; i < len(b); i++ {
 		switch b[i] {
 		case '"':
 			if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
 				return b[:i+3], escaped, b[i+3:], nil
 			}
 		case '\\':
 			if len(b) < i+2 {
 				return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
 			}
 			i++ // skip the next character
 		}
 	}
-	return nil, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`)
+	return nil, escaped, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`)
 }
@@ -71,139 +71,106 @@ func TestTOMLTest_Invalid_Bool_WrongCaseTrue(t *testing.T) {
 }
 func TestTOMLTest_Invalid_Control_CommentDel(t *testing.T) {
 	t.Skip("FIXME")
 	input := "comment-del = \"0x7f\" # \u007f\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_CommentLf(t *testing.T) {
 	t.Skip("FIXME")
 	input := "comment-lf = \"ctrl-P\" # \x10\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_CommentNull(t *testing.T) {
 	t.Skip("FIXME")
 	input := "comment-null = \"null\" # \x00\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_CommentUs(t *testing.T) {
 	t.Skip("FIXME")
 	input := "comment-us = \"ctrl-_\" # \x1f\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_MultiDel(t *testing.T) {
 	t.Skip("FIXME")
 	input := "multi-del = \"\"\"null\u007f\"\"\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_MultiLf(t *testing.T) {
 	t.Skip("FIXME")
 	input := "multi-lf = \"\"\"null\x10\"\"\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_MultiNull(t *testing.T) {
 	t.Skip("FIXME")
 	input := "multi-null = \"\"\"null\x00\"\"\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_MultiUs(t *testing.T) {
 	t.Skip("FIXME")
 	input := "multi-us = \"\"\"null\x1f\"\"\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawmultiDel(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawmulti-del = '''null\u007f'''\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawmultiLf(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawmulti-lf = '''null\x10'''\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawmultiNull(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawmulti-null = '''null\x00'''\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawmultiUs(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawmulti-us = '''null\x1f'''\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawstringDel(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawstring-del = 'null\u007f'\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawstringLf(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawstring-lf = 'null\x10'\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawstringNull(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawstring-null = 'null\x00'\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_RawstringUs(t *testing.T) {
 	t.Skip("FIXME")
 	input := "rawstring-us = 'null\x1f'\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_StringBs(t *testing.T) {
 	t.Skip("FIXME")
 	input := "string-bs = \"backspace\b\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_StringDel(t *testing.T) {
 	t.Skip("FIXME")
 	input := "string-del = \"null\u007f\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_StringLf(t *testing.T) {
 	t.Skip("FIXME")
 	input := "string-lf = \"null\x10\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_StringNull(t *testing.T) {
 	t.Skip("FIXME")
 	input := "string-null = \"null\x00\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_Control_StringUs(t *testing.T) {
 	t.Skip("FIXME")
 	input := "string-us = \"null\x1f\"\n"
 	testgenInvalid(t, input)
 }
@@ -757,7 +724,6 @@ func TestTOMLTest_Invalid_String_BadByteEscape(t *testing.T) {
 }
 func TestTOMLTest_Invalid_String_BadCodepoint(t *testing.T) {
 	t.Skip("FIXME")
 	input := "invalid-codepoint = \"This string contains a non scalar unicode codepoint \\uD801\"\n"
 	testgenInvalid(t, input)
 }
@@ -793,13 +759,11 @@ func TestTOMLTest_Invalid_String_BasicByteEscapes(t *testing.T) {
 }
 func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape1(t *testing.T) {
 	t.Skip("FIXME")
 	input := "a = \"\"\"\\UFFFFFFFF\"\"\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape2(t *testing.T) {
 	t.Skip("FIXME")
 	input := "a = \"\"\"\\U00D80000\"\"\"\n"
 	testgenInvalid(t, input)
 }
@@ -815,13 +779,11 @@ func TestTOMLTest_Invalid_String_BasicMultilineUnknownEscape(t *testing.T) {
 }
 func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape1(t *testing.T) {
 	t.Skip("FIXME")
 	input := "a = \"\\UFFFFFFFF\"\n"
 	testgenInvalid(t, input)
 }
 func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape2(t *testing.T) {
 	t.Skip("FIXME")
 	input := "a = \"\\U00D80000\"\n"
 	testgenInvalid(t, input)
 }
@@ -899,18 +899,16 @@ func (d *decoder) unmarshalInteger(value *ast.Node, v reflect.Value) error {
 }
 func (d *decoder) unmarshalString(value *ast.Node, v reflect.Value) error {
 	var err error
 	switch v.Kind() {
 	case reflect.String:
 		v.SetString(string(value.Data))
 	case reflect.Interface:
 		v.Set(reflect.ValueOf(string(value.Data)))
 	default:
-		err = newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind())
+		return newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind())
 	}
-	return err
+	return nil
 }
 func (d *decoder) handleKeyValue(expr *ast.Node, v reflect.Value) (reflect.Value, error) {
@@ -246,6 +246,20 @@ func TestUnmarshal(t *testing.T) {
 				}
 			},
 		},
 		{
 			desc:  "kv literal string",
 			input: `A = 'foo 🙂 '`,
 			gen: func() test {
 				type doc struct {
 					A string
 				}
 				return test{
 					target:   &doc{},
 					expected: &doc{A: "foo 🙂 "},
 				}
 			},
 		},
 		{
 			desc:  "time.time with negative zone",
 			input: `a = 1979-05-27T00:32:00-07:00 `, // space intentional
@@ -2009,6 +2023,74 @@ world'`,
 			desc: `invalid nan`,
 			data: `A = non`,
 		},
 		{
 			desc: `invalid character in comment in array`,
 			data: "A = [#\x00\n]",
 		},
 		{
 			desc: "invalid utf8 character in long string with no escape sequence",
 			data: "a = \"aaaa\x80aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
 		},
 		{
 			desc: "invalid ascii character in long string with no escape sequence",
 			data: "a = \"aaaa\x00aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
 		},
 		{
 			desc: "unfinished 2-byte utf8 character in string with no escape sequence",
 			data: "a = \"aaaa\xC2\"",
 		},
 		{
 			desc: "unfinished 3-byte utf8 character in string with no escape sequence",
 			data: "a = \"aaaa\xE2\x00\x00\"",
 		},
 		{
 			desc: "invalid 3rd byte of 3-byte utf8 character in string with no escape sequence",
 			data: "a = \"aaaa\xE2\x80\x00\"",
 		},
 		{
 			desc: "invalid 4rd byte of 4-byte utf8 character in string with no escape sequence",
 			data: "a = \"aaaa\xF2\x81\x81\x00\"",
 		},
 		{
 			desc: "unfinished 2-byte utf8 character in literal string",
 			data: "a = 'aaa\xC2'",
 		},
 		{
 			desc: "unfinished 3-byte utf8 character in literal string",
 			data: "a = 'aaaa\xE2\x00\x00'",
 		},
 		{
 			desc: "invalid 3rd byte of 3-byte utf8 character in literal string",
 			data: "a = 'aaaa\xE2\x80\x00'",
 		},
 		{
 			desc: "invalid 4rd byte of 4-byte utf8 character in literal string",
 			data: "a = 'aaaa\xF2\x81\x81\x00'",
 		},
 		{
 			desc: "invalid start utf8 character in literal string",
 			data: "a = '\x80'",
 		},
 		{
 			desc: "utf8 character with not enough bytes before end in literal string",
 			data: "a = '\xEF'",
 		},
 		{
 			desc: "basic string with newline after the first escape code",
 			data: "a = \"\\t\n\"",
 		},
 		{
 			desc: "basic string with unfinished escape sequence after the first escape code",
 			data: "a = \"\\t\\",
 		},
 		{
 			desc: "basic string with unfinished after the first escape code",
 			data: "a = \"\\t",
 		},
 		{
 			desc: "multiline basic string with unfinished escape sequence after the first escape code",
 			data: "a = \"\"\"\\t\\",
 		},
 	}
 	for _, e := range examples {
@@ -0,0 +1,203 @@
 package toml
 import (
 	"unicode/utf8"
 )
 // utf8Err describes where validation stopped. The zero value means that no
 // invalid character was found.
 type utf8Err struct {
 	// Index is the byte offset of the offending character.
 	Index int
 	// Size is the number of bytes reported for the offending character; it is
 	// 0 only when there is no error.
 	Size int
 }
 // Zero reports whether e carries no error, i.e. its Size is 0.
 func (e utf8Err) Zero() bool {
 	return e.Size == 0
 }
 // utf8TomlValidAlreadyEscaped verifies that p is well-formed UTF-8 and holds
 // none of the ASCII control bytes rejected by invalidAscii.
 //
 // The TOML spec allows any Unicode character in a string except those that
 // must be escaped: quotation mark, backslash, and control characters other
 // than tab. This function does not look for quotation marks or backslashes;
 // the caller is expected to have handled them already. Line feed (0x0A) and
 // carriage return (0x0D) are not rejected here either, because invalidAscii
 // lets them through.
 //
 // The decoding logic is a copy of the Go 1.17 utf8.Valid implementation,
 // tweaked to stop at the first character that is not allowed.
 //
 // The returned utf8Err is Zero() if p is valid. Otherwise Index is the byte
 // offset of the offending character in p and Size the number of bytes
 // reported for it.
 func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
 	// base counts the bytes already consumed from the front of p, so that
 	// reported indexes stay relative to the slice the caller passed in.
 	base := 0
 	// Fast path: consume 8 ASCII bytes per iteration. Two 32-bit words are
 	// assembled so the same code suits 32 and 64 bit platforms.
 	for ; len(p) >= 8; p, base = p[8:], base+8 {
 		lower := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
 		upper := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
 		if (lower|upper)&0x80808080 != 0 {
 			// At least one byte is >= utf8.RuneSelf: let the slow path decode it.
 			break
 		}
 		for k, c := range p[:8] {
 			if invalidAscii(c) {
 				return utf8Err{Index: base + k, Size: 1}
 			}
 		}
 	}
 	// Slow path: decode one character at a time.
 	for i, n := 0, len(p); i < n; {
 		lead := p[i]
 		if lead < utf8.RuneSelf {
 			if invalidAscii(lead) {
 				return utf8Err{Index: base + i, Size: 1}
 			}
 			i++
 			continue
 		}
 		info := first[lead]
 		if info == xx {
 			// Illegal starter byte.
 			return utf8Err{Index: base + i, Size: 1}
 		}
 		size := int(info & 7)
 		if i+size > n {
 			// The sequence is cut short by the end of the input.
 			return utf8Err{Index: base + i, Size: n - i}
 		}
 		// The second byte has a lead-byte-specific range; the third and fourth
 		// bytes must be plain continuation bytes.
 		second := acceptRanges[info>>4]
 		if c := p[i+1]; c < second.lo || second.hi < c {
 			return utf8Err{Index: base + i, Size: 2}
 		}
 		if size >= 3 {
 			if c := p[i+2]; c < locb || hicb < c {
 				return utf8Err{Index: base + i, Size: 3}
 			}
 		}
 		if size == 4 {
 			if c := p[i+3]; c < locb || hicb < c {
 				return utf8Err{Index: base + i, Size: 4}
 			}
 		}
 		i += size
 	}
 	return utf8Err{}
 }
 // utf8ValidNext returns the size in bytes of the character encoded at the
 // start of p, or 0 if that character is not acceptable.
 //
 // A character is rejected when it is an ASCII byte flagged by invalidAscii,
 // when its first byte cannot start a UTF-8 sequence, when p ends before the
 // sequence is complete, or when a continuation byte is out of range. An empty
 // p also yields 0 instead of panicking.
 func utf8ValidNext(p []byte) int {
 	if len(p) == 0 {
 		// Nothing to decode: report "invalid" rather than indexing p[0].
 		return 0
 	}
 	c := p[0]
 	if c < utf8.RuneSelf {
 		if invalidAscii(c) {
 			return 0
 		}
 		return 1
 	}
 	x := first[c]
 	if x == xx {
 		// Illegal starter byte.
 		return 0
 	}
 	size := int(x & 7)
 	if size > len(p) {
 		// Short or invalid.
 		return 0
 	}
 	// The second byte has a lead-byte-specific range; the third and fourth
 	// bytes must be plain continuation bytes.
 	accept := acceptRanges[x>>4]
 	if c := p[1]; c < accept.lo || accept.hi < c {
 		return 0
 	}
 	if size >= 3 {
 		if c := p[2]; c < locb || hicb < c {
 			return 0
 		}
 	}
 	if size == 4 {
 		if c := p[3]; c < locb || hicb < c {
 			return 0
 		}
 	}
 	return size
 }
 // invalidAscii reports whether b is an ASCII control byte that is rejected:
 // 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F. Tab (0x09), line feed (0x0A) and
 // carriage return (0x0D) are accepted, as is every byte from 0x20 up except
 // 0x7F.
 func invalidAscii(b byte) bool {
 	switch {
 	case b == 0x09, b == 0x0A, b == 0x0D:
 		// The three control bytes that are let through.
 		return false
 	case b <= 0x1F, b == 0x7F:
 		return true
 	}
 	return false
 }
 // acceptRange is the inclusive interval [lo, hi] of values allowed for the
 // second byte of a UTF-8 sequence.
 type acceptRange struct {
 	lo, hi uint8 // lowest and highest value for the second byte.
 }
 // acceptRanges is indexed by the high nibble of an entry of the first table.
 // It has 16 entries, more than are populated, to avoid bounds checks in the
 // code that uses it.
 var acceptRanges = [16]acceptRange{
 	0: {lo: locb, hi: hicb}, // default: any continuation byte
 	1: {lo: 0xA0, hi: hicb}, // lead byte 0xE0
 	2: {lo: locb, hi: 0x9F}, // lead byte 0xED
 	3: {lo: 0x90, hi: hicb}, // lead byte 0xF0
 	4: {lo: locb, hi: 0x8F}, // lead byte 0xF4
 }
 // first is information about the first byte in a UTF-8 sequence. It is
 // indexed by the byte value. Each entry is one of the xx/as/s1..s7 constants:
 // the high nibble is an index into acceptRanges (or F for the one-byte
 // cases) and the low bits give the length of the sequence.
 var first = [256]uint8{
 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
 	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
 	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
 	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
 	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
 }
 const (
 	// The default lowest and highest continuation byte.
 	locb = 0b10000000
 	hicb = 0b10111111
 	// The names of these constants are chosen to give nice alignment in the
 	// first table. The first nibble is an index into acceptRanges or F for
 	// special one-byte cases. The second nibble is the Rune length or the
 	// Status for the special one-byte case.
 	xx = 0xF1 // invalid: size 1
 	as = 0xF0 // ASCII: size 1
 	s1 = 0x02 // accept 0, size 2
 	s2 = 0x13 // accept 1, size 3
 	s3 = 0x03 // accept 0, size 3
 	s4 = 0x23 // accept 2, size 3
 	s5 = 0x34 // accept 3, size 4
 	s6 = 0x04 // accept 0, size 4
 	s7 = 0x44 // accept 4, size 4
 )