Validate UTF-8 (#629)

This commit is contained in:
Thomas Pelletier
2021-10-15 16:13:21 -07:00
committed by GitHub
parent cc0d1a90ff
commit cd54472d03
7 changed files with 479 additions and 89 deletions
+71
View File
@@ -0,0 +1,71 @@
package toml
import (
"bytes"
"testing"
)
// Benchmark fixtures: valid text of increasing size, once as pure ASCII and
// once as a mix of ASCII and multi-byte UTF-8 characters.
var (
	// 10 bytes of ASCII digits.
	valid10Ascii = []byte("1234567890")
	// 10 bytes: three 3-byte characters followed by one ASCII letter.
	valid10Utf8 = []byte("日本語a")

	// A 64-byte pure ASCII pattern repeated 16 times.
	valid1kAscii = bytes.Repeat([]byte("012345678998jhjklasDJKLAAdjdfjsdklfjdslkabcdefghijklmnopqrstuvwx"), 16)
	// valid1kAscii repeated 1024 times.
	valid1MAscii = bytes.Repeat(valid1kAscii, 1024)

	// A 64-byte mixed ASCII / multi-byte pattern repeated 16 times.
	valid1kUtf8 = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
	// valid1kUtf8 repeated 1024 times.
	valid1MUtf8 = bytes.Repeat(valid1kUtf8, 1024)
)
// BenchmarkScanComments runs scanComment over comment lines of several sizes,
// in both ASCII and multi-byte flavors. Each fixture is turned into a complete
// comment line ("# " prefix, trailing newline) before the timer is reset, so
// only the scan itself is timed.
func BenchmarkScanComments(b *testing.B) {
	asComment := func(payload []byte) []byte {
		return []byte("# " + string(payload) + "\n")
	}

	cases := map[string][]byte{
		"10Valid":     asComment(valid10Ascii),
		"1kValid":     asComment(valid1kAscii),
		"1MValid":     asComment(valid1MAscii),
		"10ValidUtf8": asComment(valid10Utf8),
		"1kValidUtf8": asComment(valid1kUtf8),
		"1MValidUtf8": asComment(valid1MUtf8),
	}

	for label, line := range cases {
		b.Run(label, func(sub *testing.B) {
			// Report throughput in bytes/s and allocations per iteration.
			sub.SetBytes(int64(len(line)))
			sub.ReportAllocs()
			sub.ResetTimer()

			for iter := 0; iter < sub.N; iter++ {
				scanComment(line)
			}
		})
	}
}
// BenchmarkParseLiteralStringValid runs parser.parseLiteralString over
// single-quoted literal strings of several sizes, in both ASCII and multi-byte
// flavors. Every input is expected to parse; an error fails the
// sub-benchmark.
func BenchmarkParseLiteralStringValid(b *testing.B) {
	// wrap surrounds the fixture with the literal string delimiters.
	wrap := func(x []byte) []byte {
		return []byte("'" + string(x) + "'")
	}

	inputs := map[string][]byte{
		"10Valid":     wrap(valid10Ascii),
		"1kValid":     wrap(valid1kAscii),
		"1MValid":     wrap(valid1MAscii),
		"10ValidUtf8": wrap(valid10Utf8),
		"1kValidUtf8": wrap(valid1kUtf8),
		"1MValidUtf8": wrap(valid1MUtf8),
	}

	for name, input := range inputs {
		b.Run(name, func(b *testing.B) {
			p := parser{}
			// Report throughput in bytes/s and allocations per iteration.
			b.SetBytes(int64(len(input)))
			b.ReportAllocs()
			b.ResetTimer()

			for i := 0; i < b.N; i++ {
				_, _, _, err := p.parseLiteralString(input)
				if err != nil {
					// Fail through the testing framework rather than
					// panicking, so the failure is attributed to this
					// sub-benchmark.
					b.Fatal(err)
				}
			}
		})
	}
}
+46 -30
View File
@@ -2,6 +2,7 @@ package toml
import ( import (
"bytes" "bytes"
"unicode"
"github.com/pelletier/go-toml/v2/internal/ast" "github.com/pelletier/go-toml/v2/internal/ast"
"github.com/pelletier/go-toml/v2/internal/danger" "github.com/pelletier/go-toml/v2/internal/danger"
@@ -106,9 +107,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
} }
if b[0] == '#' { if b[0] == '#' {
_, rest := scanComment(b) _, rest, err := scanComment(b)
return ref, rest, err
return ref, rest, nil
} }
if b[0] == '\n' || b[0] == '\r' { if b[0] == '\n' || b[0] == '\r' {
@@ -129,9 +129,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
b = p.parseWhitespace(b) b = p.parseWhitespace(b)
if len(b) > 0 && b[0] == '#' { if len(b) > 0 && b[0] == '#' {
_, rest := scanComment(b) _, rest, err := scanComment(b)
return ref, rest, err
return ref, rest, nil
} }
return ref, b, nil return ref, b, nil
@@ -479,7 +478,10 @@ func (p *parser) parseOptionalWhitespaceCommentNewline(b []byte) ([]byte, error)
b = p.parseWhitespace(b) b = p.parseWhitespace(b)
if len(b) > 0 && b[0] == '#' { if len(b) > 0 && b[0] == '#' {
_, b = scanComment(b) _, b, err = scanComment(b)
if err != nil {
return nil, err
}
} }
if len(b) == 0 { if len(b) == 0 {
@@ -529,7 +531,7 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
// mlb-quotes = 1*2quotation-mark // mlb-quotes = 1*2quotation-mark
// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
// mlb-escaped-nl = escape ws newline *( wschar / newline ) // mlb-escaped-nl = escape ws newline *( wschar / newline )
token, rest, err := scanMultilineBasicString(b) token, escaped, rest, err := scanMultilineBasicString(b)
if err != nil { if err != nil {
return nil, nil, nil, err return nil, nil, nil, err
} }
@@ -546,16 +548,20 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
// fast path // fast path
startIdx := i startIdx := i
endIdx := len(token) - len(`"""`) endIdx := len(token) - len(`"""`)
for ; i < endIdx; i++ {
if token[i] == '\\' { if escaped < 0 {
break str := token[startIdx:endIdx]
verr := utf8TomlValidAlreadyEscaped(str)
if verr.Zero() {
return token, str, rest, nil
} }
} return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
if i == endIdx {
return token, token[startIdx:endIdx], rest, nil
} }
i = escaped
var builder bytes.Buffer var builder bytes.Buffer
// grow?
builder.Write(token[startIdx:i]) builder.Write(token[startIdx:i])
// The scanner ensures that the token starts and ends with quotes and that // The scanner ensures that the token starts and ends with quotes and that
@@ -705,25 +711,30 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
// escape-seq-char =/ %x74 ; t tab U+0009 // escape-seq-char =/ %x74 ; t tab U+0009
// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
token, rest, err := scanBasicString(b) token, escaped, rest, err := scanBasicString(b)
if err != nil { if err != nil {
return nil, nil, nil, err return nil, nil, nil, err
} }
// fast path startIdx := len(`"`)
i := len(`"`)
startIdx := i
endIdx := len(token) - len(`"`) endIdx := len(token) - len(`"`)
for ; i < endIdx; i++ {
if token[i] == '\\' { // Fast path. If there is no escape sequence, the string should just be
break // an UTF-8 encoded string, which is the same as Go. In that case,
// validate the string and return a direct reference to the buffer.
if escaped < 0 {
str := token[startIdx:endIdx]
verr := utf8TomlValidAlreadyEscaped(str)
if verr.Zero() {
return token, str, rest, nil
} }
} return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
if i == endIdx {
return token, token[startIdx:endIdx], rest, nil
} }
i := escaped
var builder bytes.Buffer var builder bytes.Buffer
// grow?
builder.Write(token[startIdx:i]) builder.Write(token[startIdx:i])
// The scanner ensures that the token starts and ends with quotes and that // The scanner ensures that the token starts and ends with quotes and that
@@ -780,22 +791,27 @@ func hexToRune(b []byte, length int) (rune, error) {
} }
b = b[:length] b = b[:length]
var r rune var r uint32
for i, c := range b { for i, c := range b {
d := uint32(0)
switch { switch {
case '0' <= c && c <= '9': case '0' <= c && c <= '9':
c = c - '0' d = uint32(c - '0')
case 'a' <= c && c <= 'f': case 'a' <= c && c <= 'f':
c = c - 'a' + 10 d = uint32(c - 'a' + 10)
case 'A' <= c && c <= 'F': case 'A' <= c && c <= 'F':
c = c - 'A' + 10 d = uint32(c - 'A' + 10)
default: default:
return -1, newDecodeError(b[i:i+1], "non-hex character") return -1, newDecodeError(b[i:i+1], "non-hex character")
} }
r = r*16 + rune(c) r = r*16 + d
} }
return r, nil if r > unicode.MaxRune || 0xD800 <= r && r < 0xE000 {
return -1, newDecodeError(b, "escape sequence is invalid Unicode code point")
}
return rune(r), nil
} }
func (p *parser) parseWhitespace(b []byte) []byte { func (p *parser) parseWhitespace(b []byte) []byte {
+75 -17
View File
@@ -49,13 +49,18 @@ func scanLiteralString(b []byte) ([]byte, []byte, error) {
// literal-string = apostrophe *literal-char apostrophe // literal-string = apostrophe *literal-char apostrophe
// apostrophe = %x27 ; ' apostrophe // apostrophe = %x27 ; ' apostrophe
// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
for i := 1; i < len(b); i++ { for i := 1; i < len(b); {
switch b[i] { switch b[i] {
case '\'': case '\'':
return b[:i+1], b[i+1:], nil return b[:i+1], b[i+1:], nil
case '\n': case '\n':
return nil, nil, newDecodeError(b[i:i+1], "literal strings cannot have new lines") return nil, nil, newDecodeError(b[i:i+1], "literal strings cannot have new lines")
} }
size := utf8ValidNext(b[i:])
if size == 0 {
return nil, nil, newDecodeError(b[i:i+1], "invalid character")
}
i += size
} }
return nil, nil, newDecodeError(b[len(b):], "unterminated literal string") return nil, nil, newDecodeError(b[len(b):], "unterminated literal string")
@@ -70,10 +75,15 @@ func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
// mll-content = mll-char / newline // mll-content = mll-char / newline
// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
// mll-quotes = 1*2apostrophe // mll-quotes = 1*2apostrophe
for i := 3; i < len(b); i++ { for i := 3; i < len(b); {
if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) { if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
return b[:i+3], b[i+3:], nil return b[:i+3], b[i+3:], nil
} }
size := utf8ValidNext(b[i:])
if size == 0 {
return nil, nil, newDecodeError(b[i:i+1], "invalid character")
}
i += size
} }
return nil, nil, newDecodeError(b[len(b):], `multiline literal string not terminated by '''`) return nil, nil, newDecodeError(b[len(b):], `multiline literal string not terminated by '''`)
@@ -106,45 +116,72 @@ func scanWhitespace(b []byte) ([]byte, []byte) {
} }
//nolint:unparam //nolint:unparam
func scanComment(b []byte) ([]byte, []byte) { func scanComment(b []byte) ([]byte, []byte, error) {
// comment-start-symbol = %x23 ; # // comment-start-symbol = %x23 ; #
// non-ascii = %x80-D7FF / %xE000-10FFFF // non-ascii = %x80-D7FF / %xE000-10FFFF
// non-eol = %x09 / %x20-7F / non-ascii // non-eol = %x09 / %x20-7F / non-ascii
// //
// comment = comment-start-symbol *non-eol // comment = comment-start-symbol *non-eol
for i := 1; i < len(b); i++ {
for i := 1; i < len(b); {
if b[i] == '\n' { if b[i] == '\n' {
return b[:i], b[i:] return b[:i], b[i:], nil
} }
size := utf8ValidNext(b[i:])
if size == 0 {
return nil, nil, newDecodeError(b[i:i+1], "invalid character in comment")
}
i += size
} }
return b, b[len(b):] return b, b[len(b):], nil
} }
func scanBasicString(b []byte) ([]byte, []byte, error) { func scanBasicString(b []byte) ([]byte, int, []byte, error) {
// basic-string = quotation-mark *basic-char quotation-mark // basic-string = quotation-mark *basic-char quotation-mark
// quotation-mark = %x22 ; " // quotation-mark = %x22 ; "
// basic-char = basic-unescaped / escaped // basic-char = basic-unescaped / escaped
// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
// escaped = escape escape-seq-char // escaped = escape escape-seq-char
for i := 1; i < len(b); i++ { escaped := -1 // index of the first \. -1 means no escape character in there.
i := 1
loop:
for ; i < len(b); i++ {
switch b[i] { switch b[i] {
case '"': case '"':
return b[:i+1], b[i+1:], nil return b[:i+1], escaped, b[i+1:], nil
case '\n': case '\n':
return nil, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines") return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
case '\\': case '\\':
if len(b) < i+2 { if len(b) < i+2 {
return nil, nil, newDecodeError(b[i:i+1], "need a character after \\") return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
}
escaped = i
i += 2 // skip the next character
break loop
}
}
for ; i < len(b); i++ {
switch b[i] {
case '"':
return b[:i+1], escaped, b[i+1:], nil
case '\n':
return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
case '\\':
if len(b) < i+2 {
return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
} }
i++ // skip the next character i++ // skip the next character
} }
} }
return nil, nil, newDecodeError(b[len(b):], `basic string not terminated by "`) return nil, escaped, nil, newDecodeError(b[len(b):], `basic string not terminated by "`)
} }
func scanMultilineBasicString(b []byte) ([]byte, []byte, error) { func scanMultilineBasicString(b []byte) ([]byte, int, []byte, error) {
// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
// ml-basic-string-delim // ml-basic-string-delim
// ml-basic-string-delim = 3quotation-mark // ml-basic-string-delim = 3quotation-mark
@@ -155,19 +192,40 @@ func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
// mlb-quotes = 1*2quotation-mark // mlb-quotes = 1*2quotation-mark
// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
// mlb-escaped-nl = escape ws newline *( wschar / newline ) // mlb-escaped-nl = escape ws newline *( wschar / newline )
for i := 3; i < len(b); i++ {
escaped := -1
i := 3
loop:
for ; i < len(b); i++ {
switch b[i] { switch b[i] {
case '"': case '"':
if scanFollowsMultilineBasicStringDelimiter(b[i:]) { if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
return b[:i+3], b[i+3:], nil return b[:i+3], escaped, b[i+3:], nil
} }
case '\\': case '\\':
if len(b) < i+2 { if len(b) < i+2 {
return nil, nil, newDecodeError(b[len(b):], "need a character after \\") return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
}
escaped = i
i += 2 // skip the next character
break loop
}
}
for ; i < len(b); i++ {
switch b[i] {
case '"':
if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
return b[:i+3], escaped, b[i+3:], nil
}
case '\\':
if len(b) < i+2 {
return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
} }
i++ // skip the next character i++ // skip the next character
} }
} }
return nil, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`) return nil, escaped, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`)
} }
-38
View File
@@ -71,139 +71,106 @@ func TestTOMLTest_Invalid_Bool_WrongCaseTrue(t *testing.T) {
} }
func TestTOMLTest_Invalid_Control_CommentDel(t *testing.T) { func TestTOMLTest_Invalid_Control_CommentDel(t *testing.T) {
t.Skip("FIXME")
input := "comment-del = \"0x7f\" # \u007f\n" input := "comment-del = \"0x7f\" # \u007f\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_CommentLf(t *testing.T) { func TestTOMLTest_Invalid_Control_CommentLf(t *testing.T) {
t.Skip("FIXME")
input := "comment-lf = \"ctrl-P\" # \x10\n" input := "comment-lf = \"ctrl-P\" # \x10\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_CommentNull(t *testing.T) { func TestTOMLTest_Invalid_Control_CommentNull(t *testing.T) {
t.Skip("FIXME")
input := "comment-null = \"null\" # \x00\n" input := "comment-null = \"null\" # \x00\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_CommentUs(t *testing.T) { func TestTOMLTest_Invalid_Control_CommentUs(t *testing.T) {
t.Skip("FIXME")
input := "comment-us = \"ctrl-_\" # \x1f\n" input := "comment-us = \"ctrl-_\" # \x1f\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_MultiDel(t *testing.T) { func TestTOMLTest_Invalid_Control_MultiDel(t *testing.T) {
t.Skip("FIXME")
input := "multi-del = \"\"\"null\u007f\"\"\"\n" input := "multi-del = \"\"\"null\u007f\"\"\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_MultiLf(t *testing.T) { func TestTOMLTest_Invalid_Control_MultiLf(t *testing.T) {
t.Skip("FIXME")
input := "multi-lf = \"\"\"null\x10\"\"\"\n" input := "multi-lf = \"\"\"null\x10\"\"\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_MultiNull(t *testing.T) { func TestTOMLTest_Invalid_Control_MultiNull(t *testing.T) {
t.Skip("FIXME")
input := "multi-null = \"\"\"null\x00\"\"\"\n" input := "multi-null = \"\"\"null\x00\"\"\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_MultiUs(t *testing.T) { func TestTOMLTest_Invalid_Control_MultiUs(t *testing.T) {
t.Skip("FIXME")
input := "multi-us = \"\"\"null\x1f\"\"\"\n" input := "multi-us = \"\"\"null\x1f\"\"\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawmultiDel(t *testing.T) { func TestTOMLTest_Invalid_Control_RawmultiDel(t *testing.T) {
t.Skip("FIXME")
input := "rawmulti-del = '''null\u007f'''\n" input := "rawmulti-del = '''null\u007f'''\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawmultiLf(t *testing.T) { func TestTOMLTest_Invalid_Control_RawmultiLf(t *testing.T) {
t.Skip("FIXME")
input := "rawmulti-lf = '''null\x10'''\n" input := "rawmulti-lf = '''null\x10'''\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawmultiNull(t *testing.T) { func TestTOMLTest_Invalid_Control_RawmultiNull(t *testing.T) {
t.Skip("FIXME")
input := "rawmulti-null = '''null\x00'''\n" input := "rawmulti-null = '''null\x00'''\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawmultiUs(t *testing.T) { func TestTOMLTest_Invalid_Control_RawmultiUs(t *testing.T) {
t.Skip("FIXME")
input := "rawmulti-us = '''null\x1f'''\n" input := "rawmulti-us = '''null\x1f'''\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawstringDel(t *testing.T) { func TestTOMLTest_Invalid_Control_RawstringDel(t *testing.T) {
t.Skip("FIXME")
input := "rawstring-del = 'null\u007f'\n" input := "rawstring-del = 'null\u007f'\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawstringLf(t *testing.T) { func TestTOMLTest_Invalid_Control_RawstringLf(t *testing.T) {
t.Skip("FIXME")
input := "rawstring-lf = 'null\x10'\n" input := "rawstring-lf = 'null\x10'\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawstringNull(t *testing.T) { func TestTOMLTest_Invalid_Control_RawstringNull(t *testing.T) {
t.Skip("FIXME")
input := "rawstring-null = 'null\x00'\n" input := "rawstring-null = 'null\x00'\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_RawstringUs(t *testing.T) { func TestTOMLTest_Invalid_Control_RawstringUs(t *testing.T) {
t.Skip("FIXME")
input := "rawstring-us = 'null\x1f'\n" input := "rawstring-us = 'null\x1f'\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_StringBs(t *testing.T) { func TestTOMLTest_Invalid_Control_StringBs(t *testing.T) {
t.Skip("FIXME")
input := "string-bs = \"backspace\b\"\n" input := "string-bs = \"backspace\b\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_StringDel(t *testing.T) { func TestTOMLTest_Invalid_Control_StringDel(t *testing.T) {
t.Skip("FIXME")
input := "string-del = \"null\u007f\"\n" input := "string-del = \"null\u007f\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_StringLf(t *testing.T) { func TestTOMLTest_Invalid_Control_StringLf(t *testing.T) {
t.Skip("FIXME")
input := "string-lf = \"null\x10\"\n" input := "string-lf = \"null\x10\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_StringNull(t *testing.T) { func TestTOMLTest_Invalid_Control_StringNull(t *testing.T) {
t.Skip("FIXME")
input := "string-null = \"null\x00\"\n" input := "string-null = \"null\x00\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_Control_StringUs(t *testing.T) { func TestTOMLTest_Invalid_Control_StringUs(t *testing.T) {
t.Skip("FIXME")
input := "string-us = \"null\x1f\"\n" input := "string-us = \"null\x1f\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
@@ -757,7 +724,6 @@ func TestTOMLTest_Invalid_String_BadByteEscape(t *testing.T) {
} }
func TestTOMLTest_Invalid_String_BadCodepoint(t *testing.T) { func TestTOMLTest_Invalid_String_BadCodepoint(t *testing.T) {
t.Skip("FIXME")
input := "invalid-codepoint = \"This string contains a non scalar unicode codepoint \\uD801\"\n" input := "invalid-codepoint = \"This string contains a non scalar unicode codepoint \\uD801\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
@@ -793,13 +759,11 @@ func TestTOMLTest_Invalid_String_BasicByteEscapes(t *testing.T) {
} }
func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape1(t *testing.T) { func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape1(t *testing.T) {
t.Skip("FIXME")
input := "a = \"\"\"\\UFFFFFFFF\"\"\"\n" input := "a = \"\"\"\\UFFFFFFFF\"\"\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape2(t *testing.T) { func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape2(t *testing.T) {
t.Skip("FIXME")
input := "a = \"\"\"\\U00D80000\"\"\"\n" input := "a = \"\"\"\\U00D80000\"\"\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
@@ -815,13 +779,11 @@ func TestTOMLTest_Invalid_String_BasicMultilineUnknownEscape(t *testing.T) {
} }
func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape1(t *testing.T) { func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape1(t *testing.T) {
t.Skip("FIXME")
input := "a = \"\\UFFFFFFFF\"\n" input := "a = \"\\UFFFFFFFF\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape2(t *testing.T) { func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape2(t *testing.T) {
t.Skip("FIXME")
input := "a = \"\\U00D80000\"\n" input := "a = \"\\U00D80000\"\n"
testgenInvalid(t, input) testgenInvalid(t, input)
} }
+2 -4
View File
@@ -899,18 +899,16 @@ func (d *decoder) unmarshalInteger(value *ast.Node, v reflect.Value) error {
} }
func (d *decoder) unmarshalString(value *ast.Node, v reflect.Value) error { func (d *decoder) unmarshalString(value *ast.Node, v reflect.Value) error {
var err error
switch v.Kind() { switch v.Kind() {
case reflect.String: case reflect.String:
v.SetString(string(value.Data)) v.SetString(string(value.Data))
case reflect.Interface: case reflect.Interface:
v.Set(reflect.ValueOf(string(value.Data))) v.Set(reflect.ValueOf(string(value.Data)))
default: default:
err = newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind()) return newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind())
} }
return err return nil
} }
func (d *decoder) handleKeyValue(expr *ast.Node, v reflect.Value) (reflect.Value, error) { func (d *decoder) handleKeyValue(expr *ast.Node, v reflect.Value) (reflect.Value, error) {
+82
View File
@@ -246,6 +246,20 @@ func TestUnmarshal(t *testing.T) {
} }
}, },
}, },
{
desc: "kv literal string",
input: `A = 'foo 🙂 '`,
gen: func() test {
type doc struct {
A string
}
return test{
target: &doc{},
expected: &doc{A: "foo 🙂 "},
}
},
},
{ {
desc: "time.time with negative zone", desc: "time.time with negative zone",
input: `a = 1979-05-27T00:32:00-07:00 `, // space intentional input: `a = 1979-05-27T00:32:00-07:00 `, // space intentional
@@ -2009,6 +2023,74 @@ world'`,
desc: `invalid nan`, desc: `invalid nan`,
data: `A = non`, data: `A = non`,
}, },
{
desc: `invalid character in comment in array`,
data: "A = [#\x00\n]",
},
{
desc: "invalid utf8 character in long string with no escape sequence",
data: "a = \"aaaa\x80aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
},
{
desc: "invalid ascii character in long string with no escape sequence",
data: "a = \"aaaa\x00aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
},
{
desc: "unfinished 2-byte utf8 character in string with no escape sequence",
data: "a = \"aaaa\xC2\"",
},
{
desc: "unfinished 3-byte utf8 character in string with no escape sequence",
data: "a = \"aaaa\xE2\x00\x00\"",
},
{
desc: "invalid 3rd byte of 3-byte utf8 character in string with no escape sequence",
data: "a = \"aaaa\xE2\x80\x00\"",
},
{
desc: "invalid 4rd byte of 4-byte utf8 character in string with no escape sequence",
data: "a = \"aaaa\xF2\x81\x81\x00\"",
},
{
desc: "unfinished 2-byte utf8 character in literal string",
data: "a = 'aaa\xC2'",
},
{
desc: "unfinished 3-byte utf8 character in literal string",
data: "a = 'aaaa\xE2\x00\x00'",
},
{
desc: "invalid 3rd byte of 3-byte utf8 character in literal string",
data: "a = 'aaaa\xE2\x80\x00'",
},
{
desc: "invalid 4rd byte of 4-byte utf8 character in literal string",
data: "a = 'aaaa\xF2\x81\x81\x00'",
},
{
desc: "invalid start utf8 character in literal string",
data: "a = '\x80'",
},
{
desc: "utf8 character with not enough bytes before end in literal string",
data: "a = '\xEF'",
},
{
desc: "basic string with newline after the first escape code",
data: "a = \"\\t\n\"",
},
{
desc: "basic string with unfinished escape sequence after the first escape code",
data: "a = \"\\t\\",
},
{
desc: "basic string with unfinished after the first escape code",
data: "a = \"\\t",
},
{
desc: "multiline basic string with unfinished escape sequence after the first escape code",
data: "a = \"\"\"\\t\\",
},
} }
for _, e := range examples { for _, e := range examples {
+203
View File
@@ -0,0 +1,203 @@
package toml
import (
"unicode/utf8"
)
// utf8Err locates an invalid character inside a validated byte slice. Its
// zero value means that no invalid character was found.
type utf8Err struct {
	// Index is the byte offset at which the invalid character starts.
	Index int
	// Size is the number of bytes reported as invalid; 0 means no error.
	Size int
}

// Zero reports whether e describes no error, i.e. whether its Size is 0.
func (e utf8Err) Zero() bool {
	return e.Size == 0
}
// utf8TomlValidAlreadyEscaped verifies that p is well-formed UTF-8 and holds
// none of the ASCII bytes rejected by invalidAscii.
//
// The TOML spec states:
//
//	Any Unicode character may be used except those that must be escaped:
//	quotation mark, backslash, and the control characters other than tab
//	(U+0000 to U+0008, U+000A to U+001F, U+007F).
//
// Quotation marks and backslashes are not examined here; as the name says,
// they are expected to have been dealt with already. ASCII bytes are
// classified by invalidAscii, which rejects 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F
// and 0x7F, and lets tab (0x09), LF (0x0A) and CR (0x0D) through.
//
// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
// when a character is not allowed.
//
// The returned utf8Err is Zero() if p is valid. Otherwise it holds the byte
// index of the offending character and the number of bytes to report.
func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
	// Fast path: consume 8 bytes at a time while they are all ASCII.
	consumed := 0
	for len(p) >= 8 {
		// Two 32 bit loads keep the same code usable on 32 and 64 bit
		// platforms; the compiler can combine the byte reads on many of them.
		// See test/codegen/memcombine.go.
		lower := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
		upper := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
		if (lower|upper)&0x80808080 != 0 {
			// A byte >= RuneSelf is present: switch to the general loop.
			break
		}

		for k, c := range p[:8] {
			if invalidAscii(c) {
				return utf8Err{Index: consumed + k, Size: 1}
			}
		}

		p = p[8:]
		consumed += 8
	}

	// General path: decode one character at a time over what is left.
	total := len(p)
	pos := 0
	for pos < total {
		lead := p[pos]
		if lead < utf8.RuneSelf {
			if invalidAscii(lead) {
				return utf8Err{Index: consumed + pos, Size: 1}
			}
			pos++
			continue
		}

		info := first[lead]
		if info == xx {
			// Illegal starter byte.
			return utf8Err{Index: consumed + pos, Size: 1}
		}

		width := int(info & 7)
		if pos+width > total {
			// Short or invalid: report everything up to the end of p.
			return utf8Err{Index: consumed + pos, Size: total - pos}
		}

		// The second byte has a range that depends on the lead byte; the
		// remaining ones must be plain continuation bytes.
		bounds := acceptRanges[info>>4]
		if second := p[pos+1]; second < bounds.lo || bounds.hi < second {
			return utf8Err{Index: consumed + pos, Size: 2}
		}
		if width != 2 {
			if third := p[pos+2]; third < locb || hicb < third {
				return utf8Err{Index: consumed + pos, Size: 3}
			}
			if width != 3 {
				if fourth := p[pos+3]; fourth < locb || hicb < fourth {
					return utf8Err{Index: consumed + pos, Size: 4}
				}
			}
		}

		pos += width
	}

	return utf8Err{}
}
// utf8ValidNext returns the size in bytes of the character at the start of p
// if it is acceptable, and 0 otherwise.
//
// A character is acceptable when it is either an ASCII byte not rejected by
// invalidAscii, or a well-formed multi-byte UTF-8 sequence entirely contained
// in p. An empty p yields 0.
func utf8ValidNext(p []byte) int {
	if len(p) == 0 {
		// Nothing to decode; reading p[0] would panic.
		return 0
	}

	c := p[0]
	if c < utf8.RuneSelf {
		if invalidAscii(c) {
			return 0
		}
		return 1
	}

	x := first[c]
	if x == xx {
		// Illegal starter byte.
		return 0
	}

	size := int(x & 7)
	if size > len(p) {
		// Short or invalid.
		return 0
	}

	// The second byte has a range that depends on the lead byte; the
	// remaining ones must be plain continuation bytes.
	accept := acceptRanges[x>>4]
	if c := p[1]; c < accept.lo || accept.hi < c {
		return 0
	}
	if size == 2 {
		return size
	}
	if c := p[2]; c < locb || hicb < c {
		return 0
	}
	if size == 3 {
		return size
	}
	if c := p[3]; c < locb || hicb < c {
		return 0
	}

	return size
}
// invalidAscii reports whether b is an ASCII control byte that is rejected:
// any byte in 0x00-0x1F other than tab (0x09), line feed (0x0A) and carriage
// return (0x0D), or DEL (0x7F). Every other byte value yields false.
func invalidAscii(b byte) bool {
	switch b {
	case '\t', '\n', '\r':
		// The only control bytes that are let through.
		return false
	}
	return b <= 0x1F || b == 0x7F
}
// acceptRange is the inclusive interval of values allowed for the second byte
// of a multi-byte UTF-8 sequence.
type acceptRange struct {
	// lo is the smallest second byte accepted.
	lo uint8
	// hi is the largest second byte accepted.
	hi uint8
}
// acceptRanges lists the allowed second-byte intervals, indexed by the upper
// nibble of an entry of the first table. Only indices 0 to 4 are populated;
// the array is sized 16 so that indexing with a 4-bit value needs no bounds
// check.
var acceptRanges = [16]acceptRange{
	0: {lo: locb, hi: hicb}, // any continuation byte
	1: {lo: 0xA0, hi: hicb}, // after lead byte 0xE0
	2: {lo: locb, hi: 0x9F}, // after lead byte 0xED
	3: {lo: 0x90, hi: hicb}, // after lead byte 0xF0
	4: {lo: locb, hi: 0x8F}, // after lead byte 0xF4
}
// first is information about the first byte in a UTF-8 sequence.
//
// The table is indexed by the byte value. Each entry is one of the as, xx or
// s1-s7 constants: the upper nibble is an index into acceptRanges (or F for
// the one-byte special cases) and the lower bits hold the sequence length in
// bytes. Bytes 0x00-0x7F are ASCII (as); 0x80-0xC1 and 0xF5-0xFF cannot start
// a sequence (xx).
var first = [256]uint8{
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
// Bounds of an ordinary UTF-8 continuation byte (bit pattern 10xxxxxx).
const (
	locb = 0x80 // lowest continuation byte
	hicb = 0xBF // highest continuation byte
)

// Entry values of the first table. The short names keep the table rows
// aligned. The upper nibble is an index into acceptRanges, or F for the
// special one-byte cases. The lower nibble is the length of the sequence in
// bytes, or a status for the special one-byte cases.
const (
	as = 0xF0 // ASCII: size 1
	xx = 0xF1 // invalid: size 1

	s1 = 0x02 // accept 0, size 2

	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3

	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)