Unicode parsing optimization (#568)
Inlines the call to hexToRune and uses specialized hex parsing, as found in encoding/json. Co-authored-by: Thomas Pelletier <thomas@pelletier.codes>
This commit is contained in:
@@ -2,7 +2,6 @@ package toml
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"strconv"
|
|
||||||
|
|
||||||
"github.com/pelletier/go-toml/v2/internal/ast"
|
"github.com/pelletier/go-toml/v2/internal/ast"
|
||||||
"github.com/pelletier/go-toml/v2/internal/danger"
|
"github.com/pelletier/go-toml/v2/internal/danger"
|
||||||
@@ -593,20 +592,19 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
|
|||||||
case 't':
|
case 't':
|
||||||
builder.WriteByte('\t')
|
builder.WriteByte('\t')
|
||||||
case 'u':
|
case 'u':
|
||||||
x, err := hexToString(atmost(token[i+1:], 4), 4)
|
x, err := hexToRune(atmost(token[i+1:], 4), 4)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, nil, err
|
||||||
}
|
}
|
||||||
|
builder.WriteRune(x)
|
||||||
builder.WriteString(x)
|
|
||||||
i += 4
|
i += 4
|
||||||
case 'U':
|
case 'U':
|
||||||
x, err := hexToString(atmost(token[i+1:], 8), 8)
|
x, err := hexToRune(atmost(token[i+1:], 8), 8)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.WriteString(x)
|
builder.WriteRune(x)
|
||||||
i += 8
|
i += 8
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
|
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
|
||||||
@@ -742,20 +740,20 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
|
|||||||
case 't':
|
case 't':
|
||||||
builder.WriteByte('\t')
|
builder.WriteByte('\t')
|
||||||
case 'u':
|
case 'u':
|
||||||
x, err := hexToString(token[i+1:len(token)-1], 4)
|
x, err := hexToRune(token[i+1:len(token)-1], 4)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.WriteString(x)
|
builder.WriteRune(x)
|
||||||
i += 4
|
i += 4
|
||||||
case 'U':
|
case 'U':
|
||||||
x, err := hexToString(token[i+1:len(token)-1], 8)
|
x, err := hexToRune(token[i+1:len(token)-1], 8)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.WriteString(x)
|
builder.WriteRune(x)
|
||||||
i += 8
|
i += 8
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
|
return nil, nil, nil, newDecodeError(token[i:i+1], "invalid escaped character %#U", c)
|
||||||
@@ -768,20 +766,28 @@ func (p *parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
|
|||||||
return token, builder.Bytes(), rest, nil
|
return token, builder.Bytes(), rest, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func hexToString(b []byte, length int) (string, error) {
|
func hexToRune(b []byte, length int) (rune, error) {
|
||||||
if len(b) < length {
|
if len(b) < length {
|
||||||
return "", newDecodeError(b, "unicode point needs %d character, not %d", length, len(b))
|
return -1, newDecodeError(b, "unicode point needs %d character, not %d", length, len(b))
|
||||||
}
|
}
|
||||||
b = b[:length]
|
b = b[:length]
|
||||||
|
|
||||||
//nolint:godox
|
var r rune
|
||||||
// TODO: slow
|
for i, c := range b {
|
||||||
intcode, err := strconv.ParseInt(string(b), 16, 32)
|
switch {
|
||||||
if err != nil {
|
case '0' <= c && c <= '9':
|
||||||
return "", newDecodeError(b, "couldn't parse hexadecimal number: %w", err)
|
c = c - '0'
|
||||||
|
case 'a' <= c && c <= 'f':
|
||||||
|
c = c - 'a' + 10
|
||||||
|
case 'A' <= c && c <= 'F':
|
||||||
|
c = c - 'A' + 10
|
||||||
|
default:
|
||||||
|
return -1, newDecodeError(b[i:i+1], "non-hex character")
|
||||||
|
}
|
||||||
|
r = r*16 + rune(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
return string(rune(intcode)), nil
|
return r, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *parser) parseWhitespace(b []byte) []byte {
|
func (p *parser) parseWhitespace(b []byte) []byte {
|
||||||
|
|||||||
@@ -348,3 +348,25 @@ func TestParser_AST(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func BenchmarkParseBasicStringWithUnicode(b *testing.B) {
|
||||||
|
p := &parser{}
|
||||||
|
b.Run("4", func(b *testing.B) {
|
||||||
|
input := []byte(`"\u1234\u5678\u9ABC\u1234\u5678\u9ABC"`)
|
||||||
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(int64(len(input)))
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
p.parseBasicString(input)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("8", func(b *testing.B) {
|
||||||
|
input := []byte(`"\u12345678\u9ABCDEF0\u12345678\u9ABCDEF0"`)
|
||||||
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(int64(len(input)))
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
p.parseBasicString(input)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user