seen tracker: use array for storage (#545)

name                              old time/op    new time/op    delta
UnmarshalDataset/config-32          81.2ms ± 3%    77.8ms ± 3%   -4.25%  (p=0.000 n=20+19)
UnmarshalDataset/canada-32           104ms ± 5%     105ms ± 4%     ~     (p=0.102 n=20+20)
UnmarshalDataset/citm_catalog-32    57.5ms ± 5%    59.0ms ± 5%   +2.54%  (p=0.033 n=20+20)
UnmarshalDataset/twitter-32         25.7ms ± 7%    28.1ms ± 5%   +9.33%  (p=0.000 n=20+20)
UnmarshalDataset/code-32             305ms ± 6%     292ms ± 5%   -4.29%  (p=0.000 n=20+19)
UnmarshalDataset/example-32          519µs ± 6%     522µs ± 5%     ~     (p=0.659 n=20+20)
UnmarshalSimple/struct-32           1.44µs ± 1%    1.17µs ± 6%  -18.78%  (p=0.000 n=14+20)
UnmarshalSimple/map-32              2.30µs ± 4%    1.99µs ± 4%  -13.65%  (p=0.000 n=20+19)
ReferenceFile/struct-32             44.1µs ± 4%    38.1µs ± 5%  -13.61%  (p=0.000 n=18+20)
ReferenceFile/map-32                 197µs ± 7%     189µs ± 5%   -3.91%  (p=0.000 n=20+20)
HugoFrontMatter-32                  45.9µs ± 6%    39.3µs ± 6%  -14.46%  (p=0.000 n=19+20)

name                              old speed      new speed      delta
UnmarshalDataset/config-32        12.9MB/s ± 3%  13.5MB/s ± 3%   +4.42%  (p=0.000 n=20+19)
UnmarshalDataset/canada-32        21.1MB/s ± 5%  20.9MB/s ± 4%     ~     (p=0.101 n=20+20)
UnmarshalDataset/citm_catalog-32  9.72MB/s ± 6%  9.47MB/s ± 5%   -2.53%  (p=0.031 n=20+20)
UnmarshalDataset/twitter-32       17.2MB/s ± 7%  15.8MB/s ± 5%   -8.57%  (p=0.000 n=20+20)
UnmarshalDataset/code-32          8.81MB/s ± 7%  9.20MB/s ± 5%   +4.47%  (p=0.000 n=20+19)
UnmarshalDataset/example-32       15.6MB/s ± 6%  15.5MB/s ± 5%     ~     (p=0.644 n=20+20)
UnmarshalSimple/struct-32         7.61MB/s ± 1%  9.39MB/s ± 7%  +23.33%  (p=0.000 n=15+20)
UnmarshalSimple/map-32            4.78MB/s ± 4%  5.54MB/s ± 5%  +15.85%  (p=0.000 n=20+19)
ReferenceFile/struct-32            119MB/s ± 4%   138MB/s ± 5%  +15.79%  (p=0.000 n=18+20)
ReferenceFile/map-32              26.6MB/s ± 7%  27.7MB/s ± 5%   +4.06%  (p=0.000 n=20+20)
HugoFrontMatter-32                11.9MB/s ± 6%  13.9MB/s ± 6%  +16.91%  (p=0.000 n=19+20)

name                              old alloc/op   new alloc/op   delta
UnmarshalDataset/config-32          16.9MB ± 0%    13.9MB ± 0%  -17.48%  (p=0.000 n=18+18)
UnmarshalDataset/canada-32          74.3MB ± 0%    74.3MB ± 0%   -0.00%  (p=0.000 n=20+20)
UnmarshalDataset/citm_catalog-32    37.3MB ± 0%    37.3MB ± 0%   +0.11%  (p=0.000 n=20+20)
UnmarshalDataset/twitter-32         15.6MB ± 0%    15.6MB ± 0%     ~     (p=0.211 n=19+20)
UnmarshalDataset/code-32            59.5MB ± 0%    52.4MB ± 0%  -11.96%  (p=0.000 n=19+20)
UnmarshalDataset/example-32          238kB ± 0%     239kB ± 0%   +0.02%  (p=0.000 n=18+20)
UnmarshalSimple/struct-32             981B ± 0%      709B ± 0%  -27.73%  (p=0.000 n=20+20)
UnmarshalSimple/map-32              1.45kB ± 0%    1.17kB ± 0%  -18.82%  (p=0.000 n=20+20)
ReferenceFile/struct-32             11.8kB ± 0%     9.7kB ± 0%  -17.64%  (p=0.000 n=20+20)
ReferenceFile/map-32                51.5kB ± 0%    52.2kB ± 0%   +1.30%  (p=0.000 n=20+17)
HugoFrontMatter-32                  12.1kB ± 0%    11.1kB ± 0%   -7.97%  (p=0.000 n=20+19)

name                              old allocs/op  new allocs/op  delta
UnmarshalDataset/config-32            645k ± 0%      557k ± 0%  -13.76%  (p=0.000 n=20+16)
UnmarshalDataset/canada-32            896k ± 0%      896k ± 0%   -0.00%  (p=0.000 n=20+20)
UnmarshalDataset/citm_catalog-32      380k ± 0%      377k ± 0%   -0.75%  (p=0.000 n=19+20)
UnmarshalDataset/twitter-32           158k ± 0%      158k ± 0%   -0.01%  (p=0.000 n=18+18)
UnmarshalDataset/code-32             2.92M ± 0%     2.57M ± 0%  -11.87%  (p=0.000 n=20+20)
UnmarshalDataset/example-32          3.66k ± 0%     3.64k ± 0%   -0.63%  (p=0.000 n=20+20)
UnmarshalSimple/struct-32             13.0 ± 0%      10.0 ± 0%  -23.08%  (p=0.000 n=20+20)
UnmarshalSimple/map-32                22.0 ± 0%      19.0 ± 0%  -13.64%  (p=0.000 n=20+20)
ReferenceFile/struct-32                253 ± 0%       155 ± 0%  -38.74%  (p=0.000 n=20+20)
ReferenceFile/map-32                 1.67k ± 0%     1.44k ± 0%  -14.23%  (p=0.000 n=20+20)
HugoFrontMatter-32                     357 ± 0%       313 ± 0%  -12.32%  (p=0.000 n=20+20)
This commit is contained in:
Thomas Pelletier
2021-05-26 18:47:00 -04:00
committed by GitHub
parent c2d1fd86e5
commit 840df4a229
2 changed files with 235 additions and 122 deletions
+146 -89
View File
@@ -1,6 +1,7 @@
package tracker
import (
"bytes"
"fmt"
"github.com/pelletier/go-toml/v2/internal/ast"
@@ -29,67 +30,92 @@ func (k keyKind) String() string {
panic("missing keyKind string mapping")
}
// SeenTracker tracks which keys have been seen with which TOML type to flag duplicates
// and mismatches according to the spec.
// SeenTracker tracks which keys have been seen with which TOML type to flag
// duplicates and mismatches according to the spec.
//
// Each node in the visited tree is represented by an entry. Each entry has an
// identifier, which is provided by a counter. Entries are stored in the array
// entries. As new nodes are discovered (referenced for the first time in the
// TOML document), entries are created and appended to the array. An entry
// points to its parent using its id.
//
// To find whether a given key (sequence of []byte) has already been visited,
// the entries are linearly searched, looking for one with the right name and
// parent id.
//
// Given that all keys appear in the document after their parent, it is
// guaranteed that all descendants of a node are stored after the node, this
// speeds up the search process.
//
// When encountering [[array tables]], the descendants of that node are removed
// to allow that branch of the tree to be "rediscovered". To maintain the
// invariant above, the deletion process needs to keep the order of entries.
// This results in more copies in that case.
type SeenTracker struct {
root *info
current *info
entries []entry
currentIdx int
nextID int
}
type info struct {
parent *info
type entry struct {
id int
parent int
name []byte
kind keyKind
children map[string]*info
explicit bool
}
func (i *info) Clear() {
i.children = nil
// Remove all descendent of node at position idx.
func (s *SeenTracker) clear(idx int) {
p := s.entries[idx].id
rest := clear(p, s.entries[idx+1:])
s.entries = s.entries[:idx+1+len(rest)]
}
func (i *info) Has(k string) (*info, bool) {
c, ok := i.children[k]
return c, ok
}
func (i *info) SetKind(kind keyKind) {
i.kind = kind
}
func (i *info) CreateTable(k string, explicit bool) *info {
return i.createChild(k, tableKind, explicit)
}
func (i *info) CreateArrayTable(k string, explicit bool) *info {
return i.createChild(k, arrayTableKind, explicit)
}
func (i *info) createChild(k string, kind keyKind, explicit bool) *info {
if i.children == nil {
i.children = make(map[string]*info, 1)
func clear(parentID int, entries []entry) []entry {
for i := 0; i < len(entries); {
if entries[i].parent == parentID {
id := entries[i].id
copy(entries[i:], entries[i+1:])
entries = entries[:len(entries)-1]
rest := clear(id, entries[i:])
entries = entries[:i+len(rest)]
} else {
i++
}
}
return entries
}
x := &info{
parent: i,
func (s *SeenTracker) create(parentIdx int, name []byte, kind keyKind, explicit bool) int {
parentID := s.id(parentIdx)
idx := len(s.entries)
s.entries = append(s.entries, entry{
id: s.nextID,
parent: parentID,
name: name,
kind: kind,
explicit: explicit,
}
i.children[k] = x
return x
})
s.nextID++
return idx
}
// CheckExpression takes a top-level node and checks that it does not contain keys
// that have been seen in previous calls, and validates that types are consistent.
func (s *SeenTracker) CheckExpression(node ast.Node) error {
if s.root == nil {
s.root = &info{
kind: tableKind,
}
s.current = s.root
if s.entries == nil {
//s.entries = make([]entry, 0, 8)
// Skip ID = 0 to remove the confusion between nodes whose parent has
// id 0 and root nodes (parent id is 0 because it's the zero value).
s.nextID = 1
// Start unscoped, so idx is negative.
s.currentIdx = -1
}
switch node.Kind {
case ast.KeyValue:
return s.checkKeyValue(s.current, node)
return s.checkKeyValue(node)
case ast.Table:
return s.checkTable(node)
case ast.ArrayTable:
@@ -97,104 +123,135 @@ func (s *SeenTracker) CheckExpression(node ast.Node) error {
default:
panic(fmt.Errorf("this should not be a top level node type: %s", node.Kind))
}
}
func (s *SeenTracker) checkTable(node ast.Node) error {
s.current = s.root
func (s *SeenTracker) checkTable(node ast.Node) error {
it := node.Key()
// handle the first parts of the key, excluding the last one
parentIdx := -1
// This code is duplicated in checkArrayTable. This is because factoring
// it in a function requires to copy the iterator, or allocate it to the
// heap, which is not cheap.
for it.Next() {
if !it.Node().Next().Valid() {
break
}
k := string(it.Node().Data)
child, found := s.current.Has(k)
if !found {
child = s.current.CreateTable(k, false)
k := it.Node().Data
idx := s.find(parentIdx, k)
if idx < 0 {
idx = s.create(parentIdx, k, tableKind, false)
}
s.current = child
parentIdx = idx
}
// handle the last part of the key
k := string(it.Node().Data)
k := it.Node().Data
idx := s.find(parentIdx, k)
i, found := s.current.Has(k)
if found {
if i.kind != tableKind {
return fmt.Errorf("toml: key %s should be a table, not a %s", k, i.kind)
if idx >= 0 {
kind := s.entries[idx].kind
if kind != tableKind {
return fmt.Errorf("toml: key %s should be a table, not a %s", string(k), kind)
}
if i.explicit {
return fmt.Errorf("toml: table %s already exists", k)
if s.entries[idx].explicit {
return fmt.Errorf("toml: table %s already exists", string(k))
}
i.explicit = true
s.current = i
s.entries[idx].explicit = true
} else {
s.current = s.current.CreateTable(k, true)
idx = s.create(parentIdx, k, tableKind, true)
}
s.currentIdx = idx
return nil
}
func (s *SeenTracker) checkArrayTable(node ast.Node) error {
s.current = s.root
it := node.Key()
// handle the first parts of the key, excluding the last one
parentIdx := -1
for it.Next() {
if !it.Node().Next().Valid() {
break
}
k := string(it.Node().Data)
child, found := s.current.Has(k)
if !found {
child = s.current.CreateTable(k, false)
k := it.Node().Data
idx := s.find(parentIdx, k)
if idx < 0 {
idx = s.create(parentIdx, k, tableKind, false)
}
s.current = child
parentIdx = idx
}
// handle the last part of the key
k := string(it.Node().Data)
k := it.Node().Data
idx := s.find(parentIdx, k)
info, found := s.current.Has(k)
if found {
if info.kind != arrayTableKind {
return fmt.Errorf("toml: key %s already exists as a %s, but should be an array table", info.kind, k)
if idx >= 0 {
kind := s.entries[idx].kind
if kind != arrayTableKind {
return fmt.Errorf("toml: key %s already exists as a %s, but should be an array table", kind, string(k))
}
info.Clear()
s.clear(idx)
} else {
info = s.current.CreateArrayTable(k, true)
idx = s.create(parentIdx, k, arrayTableKind, true)
}
s.current = info
s.currentIdx = idx
return nil
}
func (s *SeenTracker) checkKeyValue(context *info, node ast.Node) error {
func (s *SeenTracker) checkKeyValue(node ast.Node) error {
it := node.Key()
// handle the first parts of the key, excluding the last one
parentIdx := s.currentIdx
for it.Next() {
k := string(it.Node().Data)
child, found := context.Has(k)
if found {
if child.kind != tableKind {
return fmt.Errorf("toml: expected %s to be a table, not a %s", k, child.kind)
k := it.Node().Data
idx := s.find(parentIdx, k)
if idx >= 0 {
if s.entries[idx].kind != tableKind {
return fmt.Errorf("toml: expected %s to be a table, not a %s", string(k), s.entries[idx].kind)
}
} else {
child = context.CreateTable(k, false)
idx = s.create(parentIdx, k, tableKind, false)
}
context = child
parentIdx = idx
}
kind := valueKind
if node.Value().Kind == ast.InlineTable {
context.SetKind(tableKind)
} else {
context.SetKind(valueKind)
kind = tableKind
}
s.entries[parentIdx].kind = kind
return nil
}
func (s *SeenTracker) id(idx int) int {
if idx >= 0 {
return s.entries[idx].id
}
return 0
}
func (s *SeenTracker) find(parentIdx int, k []byte) int {
parentID := s.id(parentIdx)
for i := parentIdx + 1; i < len(s.entries); i++ {
if s.entries[i].parent == parentID && bytes.Equal(s.entries[i].name, k) {
return i
}
}
return -1
}