Validate UTF-8 (#629)

2021-10-15 16:13:21 -07:00
parent cc0d1a90ff
commit cd54472d03
7 changed files with 479 additions and 89 deletions
@@ -0,0 +1,71 @@
+package toml
+
+import (
+	"bytes"
+	"testing"
+)
+
+var valid10Ascii = []byte("1234567890")
+var valid10Utf8 = []byte("日本語a")
+var valid1kUtf8 = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
+var valid1MUtf8 = bytes.Repeat(valid1kUtf8, 1024)
+var valid1kAscii = bytes.Repeat([]byte("012345678998jhjklasDJKLAAdjdfjsdklfjdslkabcdefghijklmnopqrstuvwx"), 16)
+var valid1MAscii = bytes.Repeat(valid1kAscii, 1024)
+
+func BenchmarkScanComments(b *testing.B) {
+	wrap := func(x []byte) []byte {
+		return []byte("# " + string(x) + "\n")
+	}
+
+	inputs := map[string][]byte{
+		"10Valid":     wrap(valid10Ascii),
+		"1kValid":     wrap(valid1kAscii),
+		"1MValid":     wrap(valid1MAscii),
+		"10ValidUtf8": wrap(valid10Utf8),
+		"1kValidUtf8": wrap(valid1kUtf8),
+		"1MValidUtf8": wrap(valid1MUtf8),
+	}
+
+	for name, input := range inputs {
+		b.Run(name, func(b *testing.B) {
+			b.SetBytes(int64(len(input)))
+			b.ReportAllocs()
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				scanComment(input)
+			}
+		})
+	}
+}
+
+func BenchmarkParseLiteralStringValid(b *testing.B) {
+	wrap := func(x []byte) []byte {
+		return []byte("'" + string(x) + "'")
+	}
+
+	inputs := map[string][]byte{
+		"10Valid":     wrap(valid10Ascii),
+		"1kValid":     wrap(valid1kAscii),
+		"1MValid":     wrap(valid1MAscii),
+		"10ValidUtf8": wrap(valid10Utf8),
+		"1kValidUtf8": wrap(valid1kUtf8),
+		"1MValidUtf8": wrap(valid1MUtf8),
+	}
+
+	for name, input := range inputs {
+		b.Run(name, func(b *testing.B) {
+			p := parser{}
+			b.SetBytes(int64(len(input)))
+			b.ReportAllocs()
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				_, _, _, err := p.parseLiteralString(input)
+				if err != nil {
+					panic(err)
+				}
+			}
+		})
+	}
+}
@@ -2,6 +2,7 @@ package toml

 import (
 	"bytes"
+	"unicode"

 	"github.com/pelletier/go-toml/v2/internal/ast"
 	"github.com/pelletier/go-toml/v2/internal/danger"
@@ -106,9 +107,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
 	}

 	if b[0] == '#' {
-		_, rest := scanComment(b)
-
-		return ref, rest, nil
+		_, rest, err := scanComment(b)
+		return ref, rest, err
 	}

 	if b[0] == '\n' || b[0] == '\r' {
@@ -129,9 +129,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
 	b = p.parseWhitespace(b)

 	if len(b) > 0 && b[0] == '#' {
-		_, rest := scanComment(b)
-
-		return ref, rest, nil
+		_, rest, err := scanComment(b)
+		return ref, rest, err
 	}

 	return ref, b, nil
@@ -479,7 +478,10 @@ func (p *parser) parseOptionalWhitespaceCommentNewline(b []byte) ([]byte, error)
 		b = p.parseWhitespace(b)

 		if len(b) > 0 && b[0] == '#' {
-			_, b = scanComment(b)
+			_, b, err = scanComment(b)
+			if err != nil {
+				return nil, err
+			}
 		}

 		if len(b) == 0 {
@@ -529,7 +531,7 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
 	// mlb-quotes = 1*2quotation-mark
 	// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
 	// mlb-escaped-nl = escape ws newline *( wschar / newline )
-	token, rest, err := scanMultilineBasicString(b)
+	token, escaped, rest, err := scanMultilineBasicString(b)
 	if err != nil {
 		return nil, nil, nil, err
 	}
@@ -546,16 +548,20 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
 	// fast path
 	startIdx := i
 	endIdx := len(token) - len(`"""`)
-	for ; i < endIdx; i++ {
-		if token[i] == '\\' {
-			break
+
+	if escaped < 0 {
+		str := token[startIdx:endIdx]
+		verr := utf8TomlValidAlreadyEscaped(str)
+		if verr.Zero() {
+			return token, str, rest, nil
 		}
-	}
-	if i == endIdx {
-		return token, token[startIdx:endIdx], rest, nil
+		return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
 	}

+	i = escaped
+
 	var builder bytes.Buffer
+	// TODO: consider calling builder.Grow to pre-size the buffer.
 	builder.Write(token[startIdx:i])

 	// The scanner ensures that the token starts and ends with quotes and that
@@ -705,25 +711,30 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
 	// escape-seq-char =/ %x74         ; t    tab             U+0009
 	// escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
 	// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
-	token, rest, err := scanBasicString(b)
+	token, escaped, rest, err := scanBasicString(b)
 	if err != nil {
 		return nil, nil, nil, err
 	}

-	// fast path
-	i := len(`"`)
-	startIdx := i
+	startIdx := len(`"`)
 	endIdx := len(token) - len(`"`)
-	for ; i < endIdx; i++ {
-		if token[i] == '\\' {
-			break
+
+	// Fast path. If there is no escape sequence, the string should just be
+	// a UTF-8 encoded string, which is the same as Go. In that case,
+	// validate the string and return a direct reference to the buffer.
+	if escaped < 0 {
+		str := token[startIdx:endIdx]
+		verr := utf8TomlValidAlreadyEscaped(str)
+		if verr.Zero() {
+			return token, str, rest, nil
 		}
-	}
-	if i == endIdx {
-		return token, token[startIdx:endIdx], rest, nil
+		return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
 	}

+	i := escaped
+
 	var builder bytes.Buffer
+	// TODO: consider calling builder.Grow to pre-size the buffer.
 	builder.Write(token[startIdx:i])

 	// The scanner ensures that the token starts and ends with quotes and that
@@ -780,22 +791,27 @@ func hexToRune(b []byte, length int) (rune, error) {
 	}
 	b = b[:length]

-	var r rune
+	var r uint32
 	for i, c := range b {
+		d := uint32(0)
 		switch {
 		case '0' <= c && c <= '9':
-			c = c - '0'
+			d = uint32(c - '0')
 		case 'a' <= c && c <= 'f':
-			c = c - 'a' + 10
+			d = uint32(c - 'a' + 10)
 		case 'A' <= c && c <= 'F':
-			c = c - 'A' + 10
+			d = uint32(c - 'A' + 10)
 		default:
 			return -1, newDecodeError(b[i:i+1], "non-hex character")
 		}
-		r = r*16 + rune(c)
+		r = r*16 + d
 	}

-	return r, nil
+	if r > unicode.MaxRune || 0xD800 <= r && r < 0xE000 {
+		return -1, newDecodeError(b, "escape sequence is invalid Unicode code point")
+	}
+
+	return rune(r), nil
 }

 func (p *parser) parseWhitespace(b []byte) []byte {
@@ -49,13 +49,18 @@ func scanLiteralString(b []byte) ([]byte, []byte, error) {
 	// literal-string = apostrophe *literal-char apostrophe
 	// apostrophe = %x27 ; ' apostrophe
 	// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
-	for i := 1; i < len(b); i++ {
+	for i := 1; i < len(b); {
 		switch b[i] {
 		case '\'':
 			return b[:i+1], b[i+1:], nil
 		case '\n':
 			return nil, nil, newDecodeError(b[i:i+1], "literal strings cannot have new lines")
 		}
+		size := utf8ValidNext(b[i:])
+		if size == 0 {
+			return nil, nil, newDecodeError(b[i:i+1], "invalid character")
+		}
+		i += size
 	}

 	return nil, nil, newDecodeError(b[len(b):], "unterminated literal string")
@@ -70,10 +75,15 @@ func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
 	// mll-content = mll-char / newline
 	// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
 	// mll-quotes = 1*2apostrophe
-	for i := 3; i < len(b); i++ {
+	for i := 3; i < len(b); {
 		if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
 			return b[:i+3], b[i+3:], nil
 		}
+		size := utf8ValidNext(b[i:])
+		if size == 0 {
+			return nil, nil, newDecodeError(b[i:i+1], "invalid character")
+		}
+		i += size
 	}

 	return nil, nil, newDecodeError(b[len(b):], `multiline literal string not terminated by '''`)
@@ -106,45 +116,72 @@ func scanWhitespace(b []byte) ([]byte, []byte) {
 }

 //nolint:unparam
-func scanComment(b []byte) ([]byte, []byte) {
+func scanComment(b []byte) ([]byte, []byte, error) {
 	// comment-start-symbol = %x23 ; #
 	// non-ascii = %x80-D7FF / %xE000-10FFFF
 	// non-eol = %x09 / %x20-7F / non-ascii
 	//
 	// comment = comment-start-symbol *non-eol
-	for i := 1; i < len(b); i++ {
+
+	for i := 1; i < len(b); {
 		if b[i] == '\n' {
-			return b[:i], b[i:]
+			return b[:i], b[i:], nil
 		}
+		size := utf8ValidNext(b[i:])
+		if size == 0 {
+			return nil, nil, newDecodeError(b[i:i+1], "invalid character in comment")
+		}
+
+		i += size
 	}

-	return b, b[len(b):]
+	return b, b[len(b):], nil
 }

-func scanBasicString(b []byte) ([]byte, []byte, error) {
+func scanBasicString(b []byte) ([]byte, int, []byte, error) {
 	// basic-string = quotation-mark *basic-char quotation-mark
 	// quotation-mark = %x22            ; "
 	// basic-char = basic-unescaped / escaped
 	// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
 	// escaped = escape escape-seq-char
-	for i := 1; i < len(b); i++ {
+	escaped := -1 // index of the first \. -1 means no escape character in there.
+	i := 1
+
+loop:
+	for ; i < len(b); i++ {
 		switch b[i] {
 		case '"':
-			return b[:i+1], b[i+1:], nil
+			return b[:i+1], escaped, b[i+1:], nil
 		case '\n':
-			return nil, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
+			return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
 		case '\\':
 			if len(b) < i+2 {
-				return nil, nil, newDecodeError(b[i:i+1], "need a character after \\")
+				return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
+			}
+			escaped = i
+			i += 2 // skip the next character
+			break loop
+		}
+	}
+
+	for ; i < len(b); i++ {
+		switch b[i] {
+		case '"':
+			return b[:i+1], escaped, b[i+1:], nil
+		case '\n':
+			return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
+		case '\\':
+			if len(b) < i+2 {
+				return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
 			}
 			i++ // skip the next character
 		}
 	}

-	return nil, nil, newDecodeError(b[len(b):], `basic string not terminated by "`)
+	return nil, escaped, nil, newDecodeError(b[len(b):], `basic string not terminated by "`)
 }

-func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
+func scanMultilineBasicString(b []byte) ([]byte, int, []byte, error) {
 	// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
 	// ml-basic-string-delim
 	// ml-basic-string-delim = 3quotation-mark
@@ -155,19 +192,40 @@ func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
 	// mlb-quotes = 1*2quotation-mark
 	// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
 	// mlb-escaped-nl = escape ws newline *( wschar / newline )
-	for i := 3; i < len(b); i++ {
+
+	escaped := -1
+	i := 3
+
+loop:
+	for ; i < len(b); i++ {
 		switch b[i] {
 		case '"':
 			if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
-				return b[:i+3], b[i+3:], nil
+				return b[:i+3], escaped, b[i+3:], nil
 			}
 		case '\\':
 			if len(b) < i+2 {
-				return nil, nil, newDecodeError(b[len(b):], "need a character after \\")
+				return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
+			}
+			escaped = i
+			i += 2 // skip the next character
+			break loop
+		}
+	}
+
+	for ; i < len(b); i++ {
+		switch b[i] {
+		case '"':
+			if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
+				return b[:i+3], escaped, b[i+3:], nil
+			}
+		case '\\':
+			if len(b) < i+2 {
+				return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
 			}
 			i++ // skip the next character
 		}
 	}

-	return nil, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`)
+	return nil, escaped, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`)
 }
@@ -71,139 +71,106 @@ func TestTOMLTest_Invalid_Bool_WrongCaseTrue(t *testing.T) {
 }

 func TestTOMLTest_Invalid_Control_CommentDel(t *testing.T) {
-	t.Skip("FIXME")
 	input := "comment-del = \"0x7f\" # \u007f\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_CommentLf(t *testing.T) {
-	t.Skip("FIXME")
 	input := "comment-lf = \"ctrl-P\" # \x10\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_CommentNull(t *testing.T) {
-	t.Skip("FIXME")
 	input := "comment-null = \"null\" # \x00\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_CommentUs(t *testing.T) {
-	t.Skip("FIXME")
 	input := "comment-us = \"ctrl-_\" # \x1f\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_MultiDel(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "multi-del = \"\"\"null\u007f\"\"\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_MultiLf(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "multi-lf = \"\"\"null\x10\"\"\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_MultiNull(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "multi-null = \"\"\"null\x00\"\"\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_MultiUs(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "multi-us = \"\"\"null\x1f\"\"\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawmultiDel(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawmulti-del = '''null\u007f'''\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawmultiLf(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawmulti-lf = '''null\x10'''\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawmultiNull(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawmulti-null = '''null\x00'''\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawmultiUs(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawmulti-us = '''null\x1f'''\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawstringDel(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawstring-del = 'null\u007f'\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawstringLf(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawstring-lf = 'null\x10'\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawstringNull(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawstring-null = 'null\x00'\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_RawstringUs(t *testing.T) {
-	t.Skip("FIXME")
-
 	input := "rawstring-us = 'null\x1f'\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_StringBs(t *testing.T) {
-	t.Skip("FIXME")
 	input := "string-bs = \"backspace\b\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_StringDel(t *testing.T) {
-	t.Skip("FIXME")
 	input := "string-del = \"null\u007f\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_StringLf(t *testing.T) {
-	t.Skip("FIXME")
 	input := "string-lf = \"null\x10\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_StringNull(t *testing.T) {
-	t.Skip("FIXME")
 	input := "string-null = \"null\x00\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_Control_StringUs(t *testing.T) {
-	t.Skip("FIXME")
 	input := "string-us = \"null\x1f\"\n"
 	testgenInvalid(t, input)
 }
@@ -757,7 +724,6 @@ func TestTOMLTest_Invalid_String_BadByteEscape(t *testing.T) {
 }

 func TestTOMLTest_Invalid_String_BadCodepoint(t *testing.T) {
-	t.Skip("FIXME")
 	input := "invalid-codepoint = \"This string contains a non scalar unicode codepoint \\uD801\"\n"
 	testgenInvalid(t, input)
 }
@@ -793,13 +759,11 @@ func TestTOMLTest_Invalid_String_BasicByteEscapes(t *testing.T) {
 }

 func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape1(t *testing.T) {
-	t.Skip("FIXME")
 	input := "a = \"\"\"\\UFFFFFFFF\"\"\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape2(t *testing.T) {
-	t.Skip("FIXME")
 	input := "a = \"\"\"\\U00D80000\"\"\"\n"
 	testgenInvalid(t, input)
 }
@@ -815,13 +779,11 @@ func TestTOMLTest_Invalid_String_BasicMultilineUnknownEscape(t *testing.T) {
 }

 func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape1(t *testing.T) {
-	t.Skip("FIXME")
 	input := "a = \"\\UFFFFFFFF\"\n"
 	testgenInvalid(t, input)
 }

 func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape2(t *testing.T) {
-	t.Skip("FIXME")
 	input := "a = \"\\U00D80000\"\n"
 	testgenInvalid(t, input)
 }
@@ -899,18 +899,16 @@ func (d *decoder) unmarshalInteger(value *ast.Node, v reflect.Value) error {
 }

 func (d *decoder) unmarshalString(value *ast.Node, v reflect.Value) error {
-	var err error
-
 	switch v.Kind() {
 	case reflect.String:
 		v.SetString(string(value.Data))
 	case reflect.Interface:
 		v.Set(reflect.ValueOf(string(value.Data)))
 	default:
-		err = newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind())
+		return newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind())
 	}

-	return err
+	return nil
 }

 func (d *decoder) handleKeyValue(expr *ast.Node, v reflect.Value) (reflect.Value, error) {
@@ -246,6 +246,20 @@ func TestUnmarshal(t *testing.T) {
 				}
 			},
 		},
+		{
+			desc:  "kv literal string",
+			input: `A = 'foo 🙂 '`,
+			gen: func() test {
+				type doc struct {
+					A string
+				}
+
+				return test{
+					target:   &doc{},
+					expected: &doc{A: "foo 🙂 "},
+				}
+			},
+		},
 		{
 			desc:  "time.time with negative zone",
 			input: `a = 1979-05-27T00:32:00-07:00 `, // space intentional
@@ -2009,6 +2023,74 @@ world'`,
 			desc: `invalid nan`,
 			data: `A = non`,
 		},
+		{
+			desc: `invalid character in comment in array`,
+			data: "A = [#\x00\n]",
+		},
+		{
+			desc: "invalid utf8 character in long string with no escape sequence",
+			data: "a = \"aaaa\x80aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
+		},
+		{
+			desc: "invalid ascii character in long string with no escape sequence",
+			data: "a = \"aaaa\x00aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
+		},
+		{
+			desc: "unfinished 2-byte utf8 character in string with no escape sequence",
+			data: "a = \"aaaa\xC2\"",
+		},
+		{
+			desc: "unfinished 3-byte utf8 character in string with no escape sequence",
+			data: "a = \"aaaa\xE2\x00\x00\"",
+		},
+		{
+			desc: "invalid 3rd byte of 3-byte utf8 character in string with no escape sequence",
+			data: "a = \"aaaa\xE2\x80\x00\"",
+		},
+		{
+			desc: "invalid 4rd byte of 4-byte utf8 character in string with no escape sequence",
+			data: "a = \"aaaa\xF2\x81\x81\x00\"",
+		},
+		{
+			desc: "unfinished 2-byte utf8 character in literal string",
+			data: "a = 'aaa\xC2'",
+		},
+		{
+			desc: "unfinished 3-byte utf8 character in literal string",
+			data: "a = 'aaaa\xE2\x00\x00'",
+		},
+		{
+			desc: "invalid 3rd byte of 3-byte utf8 character in literal string",
+			data: "a = 'aaaa\xE2\x80\x00'",
+		},
+		{
+			desc: "invalid 4rd byte of 4-byte utf8 character in literal string",
+			data: "a = 'aaaa\xF2\x81\x81\x00'",
+		},
+		{
+			desc: "invalid start utf8 character in literal string",
+			data: "a = '\x80'",
+		},
+		{
+			desc: "utf8 character with not enough bytes before end in literal string",
+			data: "a = '\xEF'",
+		},
+		{
+			desc: "basic string with newline after the first escape code",
+			data: "a = \"\\t\n\"",
+		},
+		{
+			desc: "basic string with unfinished escape sequence after the first escape code",
+			data: "a = \"\\t\\",
+		},
+		{
+			desc: "basic string with unfinished after the first escape code",
+			data: "a = \"\\t",
+		},
+		{
+			desc: "multiline basic string with unfinished escape sequence after the first escape code",
+			data: "a = \"\"\"\\t\\",
+		},
 	}

 	for _, e := range examples {
@@ -0,0 +1,203 @@
+package toml
+
+import (
+	"unicode/utf8"
+)
+
+type utf8Err struct {
+	Index int
+	Size  int
+}
+
+func (u utf8Err) Zero() bool {
+	return u.Size == 0
+}
+
+// Verifies that a given string is only made of valid UTF-8 characters allowed
+// by the TOML spec:
+//
+// Any Unicode character may be used except those that must be escaped:
+// quotation mark, backslash, and the control characters other than tab (U+0000
+// to U+0008, U+000A to U+001F, U+007F).
+//
+// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
+// when a character is not allowed.
+//
+// The returned utf8Err is Zero() if the string is valid, or contains the byte
+// index and size of the invalid character.
+//
+// quotation mark => already checked
+// backslash => already checked
+// 0-0x8 => invalid
+// 0x9 => tab, ok
+// 0xA - 0x1F => invalid, except 0xA (LF) and 0xD (CR), which are accepted
+// 0x7F => invalid
+func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
+	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
+	offset := 0
+	for len(p) >= 8 {
+		// Combining two 32 bit loads allows the same code to be used
+		// for 32 and 64 bit platforms.
+		// The compiler can generate a 32bit load for first32 and second32
+		// on many platforms. See test/codegen/memcombine.go.
+		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
+		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
+		if (first32|second32)&0x80808080 != 0 {
+			// Found a non ASCII byte (>= RuneSelf).
+			break
+		}
+
+		for i, b := range p[:8] {
+			if invalidAscii(b) {
+				err.Index = offset + i
+				err.Size = 1
+				return
+			}
+		}
+
+		p = p[8:]
+		offset += 8
+	}
+	n := len(p)
+	for i := 0; i < n; {
+		pi := p[i]
+		if pi < utf8.RuneSelf {
+			if invalidAscii(pi) {
+				err.Index = offset + i
+				err.Size = 1
+				return
+			}
+			i++
+			continue
+		}
+		x := first[pi]
+		if x == xx {
+			// Illegal starter byte.
+			err.Index = offset + i
+			err.Size = 1
+			return
+		}
+		size := int(x & 7)
+		if i+size > n {
+			// Short or invalid.
+			err.Index = offset + i
+			err.Size = n - i
+			return
+		}
+		accept := acceptRanges[x>>4]
+		if c := p[i+1]; c < accept.lo || accept.hi < c {
+			err.Index = offset + i
+			err.Size = 2
+			return
+		} else if size == 2 {
+		} else if c := p[i+2]; c < locb || hicb < c {
+			err.Index = offset + i
+			err.Size = 3
+			return
+		} else if size == 3 {
+		} else if c := p[i+3]; c < locb || hicb < c {
+			err.Index = offset + i
+			err.Size = 4
+			return
+		}
+		i += size
+	}
+	return
+}
+
+// Returns the size in bytes of the rune starting at p[0] if it is valid, 0 otherwise. p must not be empty.
+func utf8ValidNext(p []byte) int {
+	c := p[0]
+
+	if c < utf8.RuneSelf {
+		if invalidAscii(c) {
+			return 0
+		}
+		return 1
+	}
+
+	x := first[c]
+	if x == xx {
+		// Illegal starter byte.
+		return 0
+	}
+	size := int(x & 7)
+	if size > len(p) {
+		// Short or invalid.
+		return 0
+	}
+	accept := acceptRanges[x>>4]
+	if c := p[1]; c < accept.lo || accept.hi < c {
+		return 0
+	} else if size == 2 {
+	} else if c := p[2]; c < locb || hicb < c {
+		return 0
+	} else if size == 3 {
+	} else if c := p[3]; c < locb || hicb < c {
+		return 0
+	}
+
+	return size
+}
+
+func invalidAscii(b byte) bool {
+	return b <= 0x08 || (b > 0x0A && b < 0x0D) || (b > 0x0D && b <= 0x1F) || b == 0x7F
+}
+
+// acceptRange gives the range of valid values for the second byte in a UTF-8
+// sequence.
+type acceptRange struct {
+	lo uint8 // lowest value for second byte.
+	hi uint8 // highest value for second byte.
+}
+
+// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
+var acceptRanges = [16]acceptRange{
+	0: {locb, hicb},
+	1: {0xA0, hicb},
+	2: {locb, 0x9F},
+	3: {0x90, hicb},
+	4: {locb, 0x8F},
+}
+
+// first is information about the first byte in a UTF-8 sequence.
+var first = [256]uint8{
+	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
+	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
+	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
+	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
+	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
+	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
+}
+
+const (
+	// The default lowest and highest continuation byte.
+	locb = 0b10000000
+	hicb = 0b10111111
+
+	// The names of these constants are chosen to give nice alignment in the
+	// table above. The first nibble is an index into acceptRanges or F for
+	// special one-byte cases. The second nibble is the Rune length or the
+	// Status for the special one-byte case.
+	xx = 0xF1 // invalid: size 1
+	as = 0xF0 // ASCII: size 1
+	s1 = 0x02 // accept 0, size 2
+	s2 = 0x13 // accept 1, size 3
+	s3 = 0x03 // accept 0, size 3
+	s4 = 0x23 // accept 2, size 3
+	s5 = 0x34 // accept 3, size 4
+	s6 = 0x04 // accept 0, size 4
+	s7 = 0x44 // accept 4, size 4
+)