Unicode parsing optimization (#568)

Inlines the call to hexToRune and uses specialized hexadecimal parsing, as found in encoding/json.

Co-authored-by: Thomas Pelletier <thomas@pelletier.codes>
This commit is contained in:
kkHAIKE
2021-07-21 16:50:03 +08:00
committed by GitHub
parent 9c24fbeaad
commit a93b34d984
2 changed files with 46 additions and 18 deletions
+24 -18
View File
@@ -2,7 +2,6 @@ package toml
import (
"bytes"
"strconv"
"github.com/pelletier/go-toml/v2/internal/ast"
"github.com/pelletier/go-toml/v2/internal/danger"
@@ -593,20 +592,19 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
case 't':
builder.WriteByte('\t')
case 'u':
x, err := hexToString(atmost(token[i+1:], 4), 4)
x, err := hexToRune(atmost(token[i+1:], 4), 4)
if err != nil {
return nil, nil, nil, err
}
builder.WriteString(x)
builder.WriteRune(x)
i += 4
case 'U':
x, err := hexToString(atmost(token[i+1:], 8), 8)
x, err := hexToRune(atmost(token[i+1:], 8), 8)
if err != nil {
return nil, nil, nil, err
}
builder.WriteString(x)
builder.WriteRune(x)
i += 8
default:
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
@@ -742,20 +740,20 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
case 't':
builder.WriteByte('\t')
case 'u':
x, err := hexToString(token[i+1:len(token)-1], 4)
x, err := hexToRune(token[i+1:len(token)-1], 4)
if err != nil {
return nil, nil, nil, err
}
builder.WriteString(x)
builder.WriteRune(x)
i += 4
case 'U':
x, err := hexToString(token[i+1:len(token)-1], 8)
x, err := hexToRune(token[i+1:len(token)-1], 8)
if err != nil {
return nil, nil, nil, err
}
builder.WriteString(x)
builder.WriteRune(x)
i += 8
default:
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
@@ -768,20 +766,28 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
return token, builder.Bytes(), rest, nil
}
func hexToString(b []byte, length int) (string, error) {
func hexToRune(b []byte, length int) (rune, error) {
if len(b) < length {
return "", newDecodeError(b, "unicode point needs %d character, not %d", length, len(b))
return -1, newDecodeError(b, "unicode point needs %d character, not %d", length, len(b))
}
b = b[:length]
//nolint:godox
// TODO: slow
intcode, err := strconv.ParseInt(string(b), 16, 32)
if err != nil {
return "", newDecodeError(b, "couldn't parse hexadecimal number: %w", err)
var r rune
for i, c := range b {
switch {
case '0' <= c && c <= '9':
c = c - '0'
case 'a' <= c && c <= 'f':
c = c - 'a' + 10
case 'A' <= c && c <= 'F':
c = c - 'A' + 10
default:
return -1, newDecodeError(b[i:i+1], "non-hex character")
}
r = r*16 + rune(c)
}
return string(rune(intcode)), nil
return r, nil
}
func (p *parser) parseWhitespace(b []byte) []byte {
+22
View File
@@ -348,3 +348,25 @@ func TestParser_AST(t *testing.T) {
})
}
}
// BenchmarkParseBasicStringWithUnicode measures parseBasicString on basic
// strings made up entirely of unicode escapes. Sub-benchmark "4" uses
// four-digit \u escapes and sub-benchmark "8" uses eight-digit \U escapes.
func BenchmarkParseBasicStringWithUnicode(b *testing.B) {
	p := &parser{}

	b.Run("4", func(b *testing.B) {
		input := []byte(`"\u1234\u5678\u9ABC\u1234\u5678\u9ABC"`)

		// Fail fast if the input does not parse; otherwise the loop below
		// would silently time the error path.
		if _, _, _, err := p.parseBasicString(input); err != nil {
			b.Fatal(err)
		}

		b.ReportAllocs()
		b.SetBytes(int64(len(input)))
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			p.parseBasicString(input)
		}
	})

	b.Run("8", func(b *testing.B) {
		// Uppercase \U selects the eight-digit escape. With a lowercase \u
		// only the first four digits form the escape and the rest are
		// literal characters, so the eight-digit path would never run.
		input := []byte(`"\U0001F600\U0010FFFF\U0001F600\U0010FFFF"`)

		if _, _, _, err := p.parseBasicString(input); err != nil {
			b.Fatal(err)
		}

		b.ReportAllocs()
		b.SetBytes(int64(len(input)))
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			p.parseBasicString(input)
		}
	})
}