Unicode parsing optimization (#568)
Inlines the call to hexToRune and uses specialized parsing, as found in encoding/json. Co-authored-by: Thomas Pelletier <thomas@pelletier.codes>
This commit is contained in:
@@ -2,7 +2,6 @@ package toml
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strconv"
|
||||
|
||||
"github.com/pelletier/go-toml/v2/internal/ast"
|
||||
"github.com/pelletier/go-toml/v2/internal/danger"
|
||||
@@ -593,20 +592,19 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
|
||||
case 't':
|
||||
builder.WriteByte('\t')
|
||||
case 'u':
|
||||
x, err := hexToString(atmost(token[i+1:], 4), 4)
|
||||
x, err := hexToRune(atmost(token[i+1:], 4), 4)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
builder.WriteString(x)
|
||||
builder.WriteRune(x)
|
||||
i += 4
|
||||
case 'U':
|
||||
x, err := hexToString(atmost(token[i+1:], 8), 8)
|
||||
x, err := hexToRune(atmost(token[i+1:], 8), 8)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
builder.WriteString(x)
|
||||
builder.WriteRune(x)
|
||||
i += 8
|
||||
default:
|
||||
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
|
||||
@@ -742,20 +740,20 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
|
||||
case 't':
|
||||
builder.WriteByte('\t')
|
||||
case 'u':
|
||||
x, err := hexToString(token[i+1:len(token)-1], 4)
|
||||
x, err := hexToRune(token[i+1:len(token)-1], 4)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
builder.WriteString(x)
|
||||
builder.WriteRune(x)
|
||||
i += 4
|
||||
case 'U':
|
||||
x, err := hexToString(token[i+1:len(token)-1], 8)
|
||||
x, err := hexToRune(token[i+1:len(token)-1], 8)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
builder.WriteString(x)
|
||||
builder.WriteRune(x)
|
||||
i += 8
|
||||
default:
|
||||
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
|
||||
@@ -768,20 +766,28 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
|
||||
return token, builder.Bytes(), rest, nil
|
||||
}
|
||||
|
||||
func hexToString(b []byte, length int) (string, error) {
|
||||
func hexToRune(b []byte, length int) (rune, error) {
|
||||
if len(b) < length {
|
||||
return "", newDecodeError(b, "unicode point needs %d character, not %d", length, len(b))
|
||||
return -1, newDecodeError(b, "unicode point needs %d character, not %d", length, len(b))
|
||||
}
|
||||
b = b[:length]
|
||||
|
||||
//nolint:godox
|
||||
// TODO: slow
|
||||
intcode, err := strconv.ParseInt(string(b), 16, 32)
|
||||
if err != nil {
|
||||
return "", newDecodeError(b, "couldn't parse hexadecimal number: %w", err)
|
||||
var r rune
|
||||
for i, c := range b {
|
||||
switch {
|
||||
case '0' <= c && c <= '9':
|
||||
c = c - '0'
|
||||
case 'a' <= c && c <= 'f':
|
||||
c = c - 'a' + 10
|
||||
case 'A' <= c && c <= 'F':
|
||||
c = c - 'A' + 10
|
||||
default:
|
||||
return -1, newDecodeError(b[i:i+1], "non-hex character")
|
||||
}
|
||||
r = r*16 + rune(c)
|
||||
}
|
||||
|
||||
return string(rune(intcode)), nil
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func (p *parser) parseWhitespace(b []byte) []byte {
|
||||
|
||||
@@ -348,3 +348,25 @@ func TestParser_AST(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkParseBasicStringWithUnicode(b *testing.B) {
|
||||
p := &parser{}
|
||||
b.Run("4", func(b *testing.B) {
|
||||
input := []byte(`"\u1234\u5678\u9ABC\u1234\u5678\u9ABC"`)
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(len(input)))
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
p.parseBasicString(input)
|
||||
}
|
||||
})
|
||||
b.Run("8", func(b *testing.B) {
|
||||
input := []byte(`"\u12345678\u9ABCDEF0\u12345678\u9ABCDEF0"`)
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(len(input)))
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
p.parseBasicString(input)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user