Validate UTF-8 (#629)
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
package toml
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Benchmark fixtures. Every input is valid for the scanners under test; the
// name states the payload size in bytes (10 B, 1 KiB, 1 MiB) and whether it
// is pure ASCII or contains multi-byte UTF-8 sequences.
var (
	// 10 bytes of ASCII digits.
	valid10Ascii = []byte("1234567890")
	// 10 bytes: three 3-byte runes followed by one ASCII letter.
	valid10Utf8 = []byte("日本語a")
	// 64-byte mixed ASCII/UTF-8 pattern repeated 16 times (1024 bytes).
	valid1kUtf8 = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
	// 1024 copies of valid1kUtf8 (1 MiB).
	valid1MUtf8 = bytes.Repeat(valid1kUtf8, 1024)
	// 64-byte ASCII pattern repeated 16 times (1024 bytes).
	valid1kAscii = bytes.Repeat([]byte("012345678998jhjklasDJKLAAdjdfjsdklfjdslkabcdefghijklmnopqrstuvwx"), 16)
	// 1024 copies of valid1kAscii (1 MiB).
	valid1MAscii = bytes.Repeat(valid1kAscii, 1024)
)
|
||||
|
||||
func BenchmarkScanComments(b *testing.B) {
|
||||
wrap := func(x []byte) []byte {
|
||||
return []byte("# " + string(x) + "\n")
|
||||
}
|
||||
|
||||
inputs := map[string][]byte{
|
||||
"10Valid": wrap(valid10Ascii),
|
||||
"1kValid": wrap(valid1kAscii),
|
||||
"1MValid": wrap(valid1MAscii),
|
||||
"10ValidUtf8": wrap(valid10Utf8),
|
||||
"1kValidUtf8": wrap(valid1kUtf8),
|
||||
"1MValidUtf8": wrap(valid1MUtf8),
|
||||
}
|
||||
|
||||
for name, input := range inputs {
|
||||
b.Run(name, func(b *testing.B) {
|
||||
b.SetBytes(int64(len(input)))
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
scanComment(input)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkParseLiteralStringValid(b *testing.B) {
|
||||
wrap := func(x []byte) []byte {
|
||||
return []byte("'" + string(x) + "'")
|
||||
}
|
||||
|
||||
inputs := map[string][]byte{
|
||||
"10Valid": wrap(valid10Ascii),
|
||||
"1kValid": wrap(valid1kAscii),
|
||||
"1MValid": wrap(valid1MAscii),
|
||||
"10ValidUtf8": wrap(valid10Utf8),
|
||||
"1kValidUtf8": wrap(valid1kUtf8),
|
||||
"1MValidUtf8": wrap(valid1MUtf8),
|
||||
}
|
||||
|
||||
for name, input := range inputs {
|
||||
b.Run(name, func(b *testing.B) {
|
||||
p := parser{}
|
||||
b.SetBytes(int64(len(input)))
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, _, _, err := p.parseLiteralString(input)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package toml
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode"
|
||||
|
||||
"github.com/pelletier/go-toml/v2/internal/ast"
|
||||
"github.com/pelletier/go-toml/v2/internal/danger"
|
||||
@@ -106,9 +107,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
|
||||
}
|
||||
|
||||
if b[0] == '#' {
|
||||
_, rest := scanComment(b)
|
||||
|
||||
return ref, rest, nil
|
||||
_, rest, err := scanComment(b)
|
||||
return ref, rest, err
|
||||
}
|
||||
|
||||
if b[0] == '\n' || b[0] == '\r' {
|
||||
@@ -129,9 +129,8 @@ func (p *parser) parseExpression(b []byte) (ast.Reference, []byte, error) {
|
||||
b = p.parseWhitespace(b)
|
||||
|
||||
if len(b) > 0 && b[0] == '#' {
|
||||
_, rest := scanComment(b)
|
||||
|
||||
return ref, rest, nil
|
||||
_, rest, err := scanComment(b)
|
||||
return ref, rest, err
|
||||
}
|
||||
|
||||
return ref, b, nil
|
||||
@@ -479,7 +478,10 @@ func (p *parser) parseOptionalWhitespaceCommentNewline(b []byte) ([]byte, error)
|
||||
b = p.parseWhitespace(b)
|
||||
|
||||
if len(b) > 0 && b[0] == '#' {
|
||||
_, b = scanComment(b)
|
||||
_, b, err = scanComment(b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if len(b) == 0 {
|
||||
@@ -529,7 +531,7 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
|
||||
// mlb-quotes = 1*2quotation-mark
|
||||
// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
|
||||
// mlb-escaped-nl = escape ws newline *( wschar / newline )
|
||||
token, rest, err := scanMultilineBasicString(b)
|
||||
token, escaped, rest, err := scanMultilineBasicString(b)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
@@ -546,16 +548,20 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
|
||||
// fast path
|
||||
startIdx := i
|
||||
endIdx := len(token) - len(`"""`)
|
||||
for ; i < endIdx; i++ {
|
||||
if token[i] == '\\' {
|
||||
break
|
||||
|
||||
if escaped < 0 {
|
||||
str := token[startIdx:endIdx]
|
||||
verr := utf8TomlValidAlreadyEscaped(str)
|
||||
if verr.Zero() {
|
||||
return token, str, rest, nil
|
||||
}
|
||||
}
|
||||
if i == endIdx {
|
||||
return token, token[startIdx:endIdx], rest, nil
|
||||
return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
|
||||
}
|
||||
|
||||
i = escaped
|
||||
|
||||
var builder bytes.Buffer
|
||||
// grow?
|
||||
builder.Write(token[startIdx:i])
|
||||
|
||||
// The scanner ensures that the token starts and ends with quotes and that
|
||||
@@ -705,25 +711,30 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
|
||||
// escape-seq-char =/ %x74 ; t tab U+0009
|
||||
// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
|
||||
// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
|
||||
token, rest, err := scanBasicString(b)
|
||||
token, escaped, rest, err := scanBasicString(b)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
// fast path
|
||||
i := len(`"`)
|
||||
startIdx := i
|
||||
startIdx := len(`"`)
|
||||
endIdx := len(token) - len(`"`)
|
||||
for ; i < endIdx; i++ {
|
||||
if token[i] == '\\' {
|
||||
break
|
||||
|
||||
// Fast path. If there is no escape sequence, the string should just be
|
||||
// an UTF-8 encoded string, which is the same as Go. In that case,
|
||||
// validate the string and return a direct reference to the buffer.
|
||||
if escaped < 0 {
|
||||
str := token[startIdx:endIdx]
|
||||
verr := utf8TomlValidAlreadyEscaped(str)
|
||||
if verr.Zero() {
|
||||
return token, str, rest, nil
|
||||
}
|
||||
}
|
||||
if i == endIdx {
|
||||
return token, token[startIdx:endIdx], rest, nil
|
||||
return nil, nil, nil, newDecodeError(str[verr.Index:verr.Index+verr.Size], "invalid UTF-8")
|
||||
}
|
||||
|
||||
i := escaped
|
||||
|
||||
var builder bytes.Buffer
|
||||
// grow?
|
||||
builder.Write(token[startIdx:i])
|
||||
|
||||
// The scanner ensures that the token starts and ends with quotes and that
|
||||
@@ -780,22 +791,27 @@ func hexToRune(b []byte, length int) (rune, error) {
|
||||
}
|
||||
b = b[:length]
|
||||
|
||||
var r rune
|
||||
var r uint32
|
||||
for i, c := range b {
|
||||
d := uint32(0)
|
||||
switch {
|
||||
case '0' <= c && c <= '9':
|
||||
c = c - '0'
|
||||
d = uint32(c - '0')
|
||||
case 'a' <= c && c <= 'f':
|
||||
c = c - 'a' + 10
|
||||
d = uint32(c - 'a' + 10)
|
||||
case 'A' <= c && c <= 'F':
|
||||
c = c - 'A' + 10
|
||||
d = uint32(c - 'A' + 10)
|
||||
default:
|
||||
return -1, newDecodeError(b[i:i+1], "non-hex character")
|
||||
}
|
||||
r = r*16 + rune(c)
|
||||
r = r*16 + d
|
||||
}
|
||||
|
||||
return r, nil
|
||||
if r > unicode.MaxRune || 0xD800 <= r && r < 0xE000 {
|
||||
return -1, newDecodeError(b, "escape sequence is invalid Unicode code point")
|
||||
}
|
||||
|
||||
return rune(r), nil
|
||||
}
|
||||
|
||||
func (p *parser) parseWhitespace(b []byte) []byte {
|
||||
|
||||
+75
-17
@@ -49,13 +49,18 @@ func scanLiteralString(b []byte) ([]byte, []byte, error) {
|
||||
// literal-string = apostrophe *literal-char apostrophe
|
||||
// apostrophe = %x27 ; ' apostrophe
|
||||
// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
|
||||
for i := 1; i < len(b); i++ {
|
||||
for i := 1; i < len(b); {
|
||||
switch b[i] {
|
||||
case '\'':
|
||||
return b[:i+1], b[i+1:], nil
|
||||
case '\n':
|
||||
return nil, nil, newDecodeError(b[i:i+1], "literal strings cannot have new lines")
|
||||
}
|
||||
size := utf8ValidNext(b[i:])
|
||||
if size == 0 {
|
||||
return nil, nil, newDecodeError(b[i:i+1], "invalid character")
|
||||
}
|
||||
i += size
|
||||
}
|
||||
|
||||
return nil, nil, newDecodeError(b[len(b):], "unterminated literal string")
|
||||
@@ -70,10 +75,15 @@ func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
|
||||
// mll-content = mll-char / newline
|
||||
// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
|
||||
// mll-quotes = 1*2apostrophe
|
||||
for i := 3; i < len(b); i++ {
|
||||
for i := 3; i < len(b); {
|
||||
if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
|
||||
return b[:i+3], b[i+3:], nil
|
||||
}
|
||||
size := utf8ValidNext(b[i:])
|
||||
if size == 0 {
|
||||
return nil, nil, newDecodeError(b[i:i+1], "invalid character")
|
||||
}
|
||||
i += size
|
||||
}
|
||||
|
||||
return nil, nil, newDecodeError(b[len(b):], `multiline literal string not terminated by '''`)
|
||||
@@ -106,45 +116,72 @@ func scanWhitespace(b []byte) ([]byte, []byte) {
|
||||
}
|
||||
|
||||
//nolint:unparam
|
||||
func scanComment(b []byte) ([]byte, []byte) {
|
||||
func scanComment(b []byte) ([]byte, []byte, error) {
|
||||
// comment-start-symbol = %x23 ; #
|
||||
// non-ascii = %x80-D7FF / %xE000-10FFFF
|
||||
// non-eol = %x09 / %x20-7F / non-ascii
|
||||
//
|
||||
// comment = comment-start-symbol *non-eol
|
||||
for i := 1; i < len(b); i++ {
|
||||
|
||||
for i := 1; i < len(b); {
|
||||
if b[i] == '\n' {
|
||||
return b[:i], b[i:]
|
||||
return b[:i], b[i:], nil
|
||||
}
|
||||
size := utf8ValidNext(b[i:])
|
||||
if size == 0 {
|
||||
return nil, nil, newDecodeError(b[i:i+1], "invalid character in comment")
|
||||
}
|
||||
|
||||
i += size
|
||||
}
|
||||
|
||||
return b, b[len(b):]
|
||||
return b, b[len(b):], nil
|
||||
}
|
||||
|
||||
func scanBasicString(b []byte) ([]byte, []byte, error) {
|
||||
func scanBasicString(b []byte) ([]byte, int, []byte, error) {
|
||||
// basic-string = quotation-mark *basic-char quotation-mark
|
||||
// quotation-mark = %x22 ; "
|
||||
// basic-char = basic-unescaped / escaped
|
||||
// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
|
||||
// escaped = escape escape-seq-char
|
||||
for i := 1; i < len(b); i++ {
|
||||
escaped := -1 // index of the first \. -1 means no escape character in there.
|
||||
i := 1
|
||||
|
||||
loop:
|
||||
for ; i < len(b); i++ {
|
||||
switch b[i] {
|
||||
case '"':
|
||||
return b[:i+1], b[i+1:], nil
|
||||
return b[:i+1], escaped, b[i+1:], nil
|
||||
case '\n':
|
||||
return nil, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
|
||||
return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
|
||||
case '\\':
|
||||
if len(b) < i+2 {
|
||||
return nil, nil, newDecodeError(b[i:i+1], "need a character after \\")
|
||||
return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
|
||||
}
|
||||
escaped = i
|
||||
i += 2 // skip the next character
|
||||
break loop
|
||||
}
|
||||
}
|
||||
|
||||
for ; i < len(b); i++ {
|
||||
switch b[i] {
|
||||
case '"':
|
||||
return b[:i+1], escaped, b[i+1:], nil
|
||||
case '\n':
|
||||
return nil, escaped, nil, newDecodeError(b[i:i+1], "basic strings cannot have new lines")
|
||||
case '\\':
|
||||
if len(b) < i+2 {
|
||||
return nil, escaped, nil, newDecodeError(b[i:i+1], "need a character after \\")
|
||||
}
|
||||
i++ // skip the next character
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil, newDecodeError(b[len(b):], `basic string not terminated by "`)
|
||||
return nil, escaped, nil, newDecodeError(b[len(b):], `basic string not terminated by "`)
|
||||
}
|
||||
|
||||
func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
|
||||
func scanMultilineBasicString(b []byte) ([]byte, int, []byte, error) {
|
||||
// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
|
||||
// ml-basic-string-delim
|
||||
// ml-basic-string-delim = 3quotation-mark
|
||||
@@ -155,19 +192,40 @@ func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
|
||||
// mlb-quotes = 1*2quotation-mark
|
||||
// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
|
||||
// mlb-escaped-nl = escape ws newline *( wschar / newline )
|
||||
for i := 3; i < len(b); i++ {
|
||||
|
||||
escaped := -1
|
||||
i := 3
|
||||
|
||||
loop:
|
||||
for ; i < len(b); i++ {
|
||||
switch b[i] {
|
||||
case '"':
|
||||
if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
|
||||
return b[:i+3], b[i+3:], nil
|
||||
return b[:i+3], escaped, b[i+3:], nil
|
||||
}
|
||||
case '\\':
|
||||
if len(b) < i+2 {
|
||||
return nil, nil, newDecodeError(b[len(b):], "need a character after \\")
|
||||
return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
|
||||
}
|
||||
escaped = i
|
||||
i += 2 // skip the next character
|
||||
break loop
|
||||
}
|
||||
}
|
||||
|
||||
for ; i < len(b); i++ {
|
||||
switch b[i] {
|
||||
case '"':
|
||||
if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
|
||||
return b[:i+3], escaped, b[i+3:], nil
|
||||
}
|
||||
case '\\':
|
||||
if len(b) < i+2 {
|
||||
return nil, escaped, nil, newDecodeError(b[len(b):], "need a character after \\")
|
||||
}
|
||||
i++ // skip the next character
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`)
|
||||
return nil, escaped, nil, newDecodeError(b[len(b):], `multiline basic string not terminated by """`)
|
||||
}
|
||||
|
||||
@@ -71,139 +71,106 @@ func TestTOMLTest_Invalid_Bool_WrongCaseTrue(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_CommentDel(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "comment-del = \"0x7f\" # \u007f\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_CommentLf(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "comment-lf = \"ctrl-P\" # \x10\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_CommentNull(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "comment-null = \"null\" # \x00\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_CommentUs(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "comment-us = \"ctrl-_\" # \x1f\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_MultiDel(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "multi-del = \"\"\"null\u007f\"\"\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_MultiLf(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "multi-lf = \"\"\"null\x10\"\"\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_MultiNull(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "multi-null = \"\"\"null\x00\"\"\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_MultiUs(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "multi-us = \"\"\"null\x1f\"\"\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawmultiDel(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawmulti-del = '''null\u007f'''\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawmultiLf(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawmulti-lf = '''null\x10'''\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawmultiNull(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawmulti-null = '''null\x00'''\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawmultiUs(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawmulti-us = '''null\x1f'''\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawstringDel(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawstring-del = 'null\u007f'\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawstringLf(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawstring-lf = 'null\x10'\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawstringNull(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawstring-null = 'null\x00'\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_RawstringUs(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
|
||||
input := "rawstring-us = 'null\x1f'\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_StringBs(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "string-bs = \"backspace\b\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_StringDel(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "string-del = \"null\u007f\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_StringLf(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "string-lf = \"null\x10\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_StringNull(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "string-null = \"null\x00\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_Control_StringUs(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "string-us = \"null\x1f\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
@@ -757,7 +724,6 @@ func TestTOMLTest_Invalid_String_BadByteEscape(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_String_BadCodepoint(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "invalid-codepoint = \"This string contains a non scalar unicode codepoint \\uD801\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
@@ -793,13 +759,11 @@ func TestTOMLTest_Invalid_String_BasicByteEscapes(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape1(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "a = \"\"\"\\UFFFFFFFF\"\"\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_String_BasicMultilineOutOfRangeUnicodeEscape2(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "a = \"\"\"\\U00D80000\"\"\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
@@ -815,13 +779,11 @@ func TestTOMLTest_Invalid_String_BasicMultilineUnknownEscape(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape1(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "a = \"\\UFFFFFFFF\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
func TestTOMLTest_Invalid_String_BasicOutOfRangeUnicodeEscape2(t *testing.T) {
|
||||
t.Skip("FIXME")
|
||||
input := "a = \"\\U00D80000\"\n"
|
||||
testgenInvalid(t, input)
|
||||
}
|
||||
|
||||
+2
-4
@@ -899,18 +899,16 @@ func (d *decoder) unmarshalInteger(value *ast.Node, v reflect.Value) error {
|
||||
}
|
||||
|
||||
func (d *decoder) unmarshalString(value *ast.Node, v reflect.Value) error {
|
||||
var err error
|
||||
|
||||
switch v.Kind() {
|
||||
case reflect.String:
|
||||
v.SetString(string(value.Data))
|
||||
case reflect.Interface:
|
||||
v.Set(reflect.ValueOf(string(value.Data)))
|
||||
default:
|
||||
err = newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind())
|
||||
return newDecodeError(d.p.Raw(value.Raw), "cannot store TOML string into a Go %s", v.Kind())
|
||||
}
|
||||
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *decoder) handleKeyValue(expr *ast.Node, v reflect.Value) (reflect.Value, error) {
|
||||
|
||||
@@ -246,6 +246,20 @@ func TestUnmarshal(t *testing.T) {
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "kv literal string",
|
||||
input: `A = 'foo 🙂 '`,
|
||||
gen: func() test {
|
||||
type doc struct {
|
||||
A string
|
||||
}
|
||||
|
||||
return test{
|
||||
target: &doc{},
|
||||
expected: &doc{A: "foo 🙂 "},
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "time.time with negative zone",
|
||||
input: `a = 1979-05-27T00:32:00-07:00 `, // space intentional
|
||||
@@ -2009,6 +2023,74 @@ world'`,
|
||||
desc: `invalid nan`,
|
||||
data: `A = non`,
|
||||
},
|
||||
{
|
||||
desc: `invalid character in comment in array`,
|
||||
data: "A = [#\x00\n]",
|
||||
},
|
||||
{
|
||||
desc: "invalid utf8 character in long string with no escape sequence",
|
||||
data: "a = \"aaaa\x80aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
|
||||
},
|
||||
{
|
||||
desc: "invalid ascii character in long string with no escape sequence",
|
||||
data: "a = \"aaaa\x00aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"",
|
||||
},
|
||||
{
|
||||
desc: "unfinished 2-byte utf8 character in string with no escape sequence",
|
||||
data: "a = \"aaaa\xC2\"",
|
||||
},
|
||||
{
|
||||
desc: "unfinished 3-byte utf8 character in string with no escape sequence",
|
||||
data: "a = \"aaaa\xE2\x00\x00\"",
|
||||
},
|
||||
{
|
||||
desc: "invalid 3rd byte of 3-byte utf8 character in string with no escape sequence",
|
||||
data: "a = \"aaaa\xE2\x80\x00\"",
|
||||
},
|
||||
{
|
||||
desc: "invalid 4rd byte of 4-byte utf8 character in string with no escape sequence",
|
||||
data: "a = \"aaaa\xF2\x81\x81\x00\"",
|
||||
},
|
||||
{
|
||||
desc: "unfinished 2-byte utf8 character in literal string",
|
||||
data: "a = 'aaa\xC2'",
|
||||
},
|
||||
{
|
||||
desc: "unfinished 3-byte utf8 character in literal string",
|
||||
data: "a = 'aaaa\xE2\x00\x00'",
|
||||
},
|
||||
{
|
||||
desc: "invalid 3rd byte of 3-byte utf8 character in literal string",
|
||||
data: "a = 'aaaa\xE2\x80\x00'",
|
||||
},
|
||||
{
|
||||
desc: "invalid 4rd byte of 4-byte utf8 character in literal string",
|
||||
data: "a = 'aaaa\xF2\x81\x81\x00'",
|
||||
},
|
||||
{
|
||||
desc: "invalid start utf8 character in literal string",
|
||||
data: "a = '\x80'",
|
||||
},
|
||||
{
|
||||
desc: "utf8 character with not enough bytes before end in literal string",
|
||||
data: "a = '\xEF'",
|
||||
},
|
||||
{
|
||||
desc: "basic string with newline after the first escape code",
|
||||
data: "a = \"\\t\n\"",
|
||||
},
|
||||
{
|
||||
desc: "basic string with unfinished escape sequence after the first escape code",
|
||||
data: "a = \"\\t\\",
|
||||
},
|
||||
{
|
||||
desc: "basic string with unfinished after the first escape code",
|
||||
data: "a = \"\\t",
|
||||
},
|
||||
{
|
||||
desc: "multiline basic string with unfinished escape sequence after the first escape code",
|
||||
data: "a = \"\"\"\\t\\",
|
||||
},
|
||||
}
|
||||
|
||||
for _, e := range examples {
|
||||
|
||||
@@ -0,0 +1,203 @@
|
||||
package toml
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// utf8Err locates an invalid character inside a validated byte slice.
// Its zero value means that no invalid character was found.
type utf8Err struct {
	// Index is the byte offset of the first byte of the invalid character.
	Index int
	// Size is the number of bytes attributed to the invalid character. It
	// is 0 when there is no error.
	Size int
}

// Zero reports whether u carries no error, i.e. its Size is 0.
func (u utf8Err) Zero() bool {
	return u.Size == 0
}
|
||||
|
||||
// Verified that a given string is only made of valid UTF-8 characters allowed
|
||||
// by the TOML spec:
|
||||
//
|
||||
// Any Unicode character may be used except those that must be escaped:
|
||||
// quotation mark, backslash, and the control characters other than tab (U+0000
|
||||
// to U+0008, U+000A to U+001F, U+007F).
|
||||
//
|
||||
// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
|
||||
// when a character is not allowed.
|
||||
//
|
||||
// The returned utf8Err is Zero() if the string is valid, or contains the byte
|
||||
// index and size of the invalid character.
|
||||
//
|
||||
// quotation mark => already checked
|
||||
// backslash => already checked
|
||||
// 0-0x8 => invalid
|
||||
// 0x9 => tab, ok
|
||||
// 0xA - 0x1F => invalid
|
||||
// 0x7F => invalid
|
||||
func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
|
||||
// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
|
||||
offset := 0
|
||||
for len(p) >= 8 {
|
||||
// Combining two 32 bit loads allows the same code to be used
|
||||
// for 32 and 64 bit platforms.
|
||||
// The compiler can generate a 32bit load for first32 and second32
|
||||
// on many platforms. See test/codegen/memcombine.go.
|
||||
first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
|
||||
second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
|
||||
if (first32|second32)&0x80808080 != 0 {
|
||||
// Found a non ASCII byte (>= RuneSelf).
|
||||
break
|
||||
}
|
||||
|
||||
for i, b := range p[:8] {
|
||||
if invalidAscii(b) {
|
||||
err.Index = offset + i
|
||||
err.Size = 1
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
p = p[8:]
|
||||
offset += 8
|
||||
}
|
||||
n := len(p)
|
||||
for i := 0; i < n; {
|
||||
pi := p[i]
|
||||
if pi < utf8.RuneSelf {
|
||||
if invalidAscii(pi) {
|
||||
err.Index = offset + i
|
||||
err.Size = 1
|
||||
return
|
||||
}
|
||||
i++
|
||||
continue
|
||||
}
|
||||
x := first[pi]
|
||||
if x == xx {
|
||||
// Illegal starter byte.
|
||||
err.Index = offset + i
|
||||
err.Size = 1
|
||||
return
|
||||
}
|
||||
size := int(x & 7)
|
||||
if i+size > n {
|
||||
// Short or invalid.
|
||||
err.Index = offset + i
|
||||
err.Size = n - i
|
||||
return
|
||||
}
|
||||
accept := acceptRanges[x>>4]
|
||||
if c := p[i+1]; c < accept.lo || accept.hi < c {
|
||||
err.Index = offset + i
|
||||
err.Size = 2
|
||||
return
|
||||
} else if size == 2 {
|
||||
} else if c := p[i+2]; c < locb || hicb < c {
|
||||
err.Index = offset + i
|
||||
err.Size = 3
|
||||
return
|
||||
} else if size == 3 {
|
||||
} else if c := p[i+3]; c < locb || hicb < c {
|
||||
err.Index = offset + i
|
||||
err.Size = 4
|
||||
return
|
||||
}
|
||||
i += size
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// utf8ValidNext returns the size in bytes of the character at the start of p
// if it is acceptable, and 0 otherwise.
//
// An ASCII byte is acceptable unless invalidAscii rejects it. A multi-byte
// character is acceptable when it is a well-formed UTF-8 sequence. An empty
// p yields 0.
func utf8ValidNext(p []byte) int {
	if len(p) == 0 {
		return 0
	}

	if c := p[0]; c < utf8.RuneSelf {
		if invalidAscii(c) {
			return 0
		}

		return 1
	}

	// For a non-ASCII first byte, utf8.DecodeRune reports size 1 only when
	// the encoding is invalid (illegal starter, truncated sequence, or
	// out-of-range continuation byte). A well-formed sequence has size 2-4.
	_, size := utf8.DecodeRune(p)
	if size == 1 {
		return 0
	}

	return size
}

// invalidAscii reports whether the ASCII byte b is a control character that
// is rejected: everything up to 0x1F except tab (0x09), line feed (0x0A) and
// carriage return (0x0D), plus DEL (0x7F).
func invalidAscii(b byte) bool {
	return b <= 0x08 || (b > 0x0A && b < 0x0D) || (b > 0x0D && b <= 0x1F) || b == 0x7F
}
|
||||
|
||||
// acceptRange gives the range of valid values for the second byte in a UTF-8
|
||||
// sequence.
|
||||
type acceptRange struct {
|
||||
lo uint8 // lowest value for second byte.
|
||||
hi uint8 // highest value for second byte.
|
||||
}
|
||||
|
||||
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
|
||||
var acceptRanges = [16]acceptRange{
|
||||
0: {locb, hicb},
|
||||
1: {0xA0, hicb},
|
||||
2: {locb, 0x9F},
|
||||
3: {0x90, hicb},
|
||||
4: {locb, 0x8F},
|
||||
}
|
||||
|
||||
// first is information about the first byte in a UTF-8 sequence.
|
||||
var first = [256]uint8{
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
|
||||
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
|
||||
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
|
||||
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
|
||||
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
|
||||
}
|
||||
|
||||
const (
	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// first table. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
|
||||
Reference in New Issue
Block a user