Files
go-toml/unstable/parser.go
T
Cursor Agent a646ffd9fa Make error position tracking explicit with Offset field on ParserError
Thread byte offset information through all error creation sites,
eliminating the need for SubsliceOffset to recover position from
pointer comparison.

Changes:
- Add Offset field to ParserError struct
- Add offset parameter to NewParserError
- Add Parser.offsetOf helper for suffix-length arithmetic
- Thread base offset through scanner functions (scanComment,
  scanBasicString, scanMultilineBasicString, scanLiteralString,
  scanMultilineLiteralString, scanWindowsNewline)
- Thread base offset through standalone functions (expect, hexToRune)
- Thread base offset through all decode functions (parseInteger,
  parseFloat, parseLocalDate, parseLocalTime, parseLocalDateTime,
  parseDateTime, checkAndRemoveUnderscores*)
- Update all unmarshaler call sites to pass value.Raw.Offset
- Update localtime.go UnmarshalText methods with base=0
- Update strict.go to populate Offset from key ranges
- Change wrapDecodeError to read de.Offset directly
- Change Utf8TomlValidAlreadyEscaped to return int index (-1 if valid)
  instead of a byte subslice
- Unexport SubsliceOffset (now only used internally by Range())

This makes error positions self-describing: each ParserError carries its
own byte offset, so callers no longer need the original document slice
and address arithmetic to determine where an error occurred.

Co-authored-by: Thomas Pelletier <thomas@pelletier.dev>
2026-04-12 19:08:55 +00:00

1268 lines
28 KiB
Go

package unstable
import (
"bytes"
"fmt"
"unicode"
"github.com/pelletier/go-toml/v2/internal/characters"
)
// ParserError describes an error relative to the content of the document.
//
// It cannot outlive the instance of Parser it refers to, and may cause panics
// if the parser is reset.
type ParserError struct {
// Highlight is the span of the document the error points at. It is expected
// to be a subslice of the parser input.
Highlight []byte
// Message is the human-readable description returned by Error.
Message string
Key []string // optional
// Offset is the byte offset attached to the error when it was created.
Offset int
}
// Error satisfies the error interface; the returned text is the Message field
// verbatim.
func (e *ParserError) Error() string {
	msg := e.Message

	return msg
}
// NewParserError builds a *ParserError that carries the given highlight and
// byte offset, with a message produced by formatting args according to format.
//
// Warning: Highlight needs to be a subslice of Parser.data, so only slices
// returned by Parser.Raw are valid candidates.
func NewParserError(highlight []byte, offset int, format string, args ...interface{}) error {
	perr := new(ParserError)
	perr.Highlight = highlight
	perr.Offset = offset
	// Formatting goes through fmt.Errorf so that every verb it accepts keeps
	// rendering the same way in the message.
	perr.Message = fmt.Errorf(format, args...).Error()

	return perr
}
// Parser scans over a TOML-encoded document and generates an iterative AST.
//
// To prime the Parser, first reset it with the contents of a TOML document.
// Then, process all top-level expressions sequentially. See Example.
//
// Don't forget to check Error() after you're done parsing.
//
// Each top-level expression needs to be fully processed before calling
// NextExpression() again. Otherwise, calls to various Node methods may panic if
// the parser has moved on to the next expression.
//
// For performance reasons, go-toml doesn't make a copy of the input bytes to
// the parser. Make sure to copy all the bytes you need to outlive the slice
// given to the parser.
type Parser struct {
// data is the slice provided to the last call to Reset.
data []byte
// builder holds the nodes of the expression being parsed; it is reset at
// the start of every NextExpression call.
builder builder
// ref refers to the expression produced by the last NextExpression call,
// or is invalidReference when there is none.
ref reference
// left is the part of data that has not been consumed yet.
left []byte
// err is the error that stopped parsing, if any.
err error
// first stays true until a call to parseExpression has succeeded; after
// that, NextExpression consumes a newline before parsing an expression.
first bool
// KeepComments makes the parser push Comment nodes for the comments it
// scans instead of dropping them.
KeepComments bool
}
// Data gives access to the input most recently handed to Reset. The slice is
// returned as-is, without copying.
func (p *Parser) Data() []byte {
	input := p.data

	return input
}
// offsetOf returns the byte offset at which b starts in the parser input,
// computed as the input length minus len(b). The result is only meaningful
// when b is a suffix of the input, such as the not-yet-consumed remainder.
func (p *Parser) offsetOf(b []byte) int {
	total := len(p.data)
	remaining := len(b)

	return total - remaining
}
// Range describes where b sits inside the parser input, as an offset/length
// pair. It panics when b is not a subslice of the input.
//
// Prefer using ParserError.Offset directly for error position information.
func (p *Parser) Range(b []byte) Range {
	start := subsliceOffset(p.data, b)

	var r Range
	r.Offset = uint32(start)  //nolint:gosec // TOML documents are small
	r.Length = uint32(len(b)) //nolint:gosec // TOML documents are small

	return r
}
// rangeOfToken returns the Range of token, given rest, the bytes that remain
// after it. The offset is derived purely from lengths: the token is taken to
// end where rest begins, and rest is taken to run to the end of the input.
func (p *Parser) rangeOfToken(token, rest []byte) Range {
	size := len(token)
	start := len(p.data) - size - len(rest)

	return Range{
		Offset: uint32(start), //nolint:gosec // TOML documents are small
		Length: uint32(size),  //nolint:gosec // TOML documents are small
	}
}
// Raw slices the parser input according to raw and returns the bytes it
// covers. The result aliases the input; nothing is copied.
func (p *Parser) Raw(raw Range) []byte {
	start := raw.Offset
	end := raw.Offset + raw.Length

	return p.data[start:end]
}
// Reset prepares the parser to read b from the start. Internal storage is
// wiped and reused to reduce allocations, and any previous error is cleared.
func (p *Parser) Reset(b []byte) {
	p.builder.Reset()

	// The whole document is both the reference input and what is left to read.
	p.data, p.left = b, b
	p.ref = invalidReference
	p.first = true
	p.err = nil
}
// NextExpression advances to the next top-level expression. It reports true
// when one was parsed, and false when the document is exhausted or an error
// occurred (check Error() to tell the two apart).
//
// Retrieve the parsed expression with Expression().
func (p *Parser) NextExpression() bool {
	if len(p.left) == 0 || p.err != nil {
		return false
	}

	p.builder.Reset()
	p.ref = invalidReference

	// Keep going until something yields a valid node: blank lines (and
	// comments, unless they are kept) produce no reference.
	for len(p.left) > 0 && p.err == nil {
		if !p.first {
			// Every expression after the first must start on a new line.
			p.left, p.err = p.parseNewline(p.left)
			if len(p.left) == 0 || p.err != nil {
				return false
			}
		}

		p.ref, p.left, p.err = p.parseExpression(p.left)
		if p.err != nil {
			return false
		}

		p.first = false

		if p.ref.Valid() {
			return true
		}
	}

	return false
}
// Expression looks up, in the builder, the node for the expression produced
// by the last successful NextExpression call and returns a pointer to it.
func (p *Parser) Expression() *Node {
	node := p.builder.NodeAt(p.ref)

	return node
}
// Error reports the error recorded while parsing, or nil when none has
// occurred since the last Reset.
func (p *Parser) Error() error {
	err := p.err

	return err
}
// Position describes a position in the input.
type Position struct {
// Number of bytes from the beginning of the input.
Offset int
// Line number, starting at 1. Lines are delimited by '\n' bytes.
Line int
// Column number, starting at 1, counted in bytes from the start of the
// line.
Column int
}
// Shape describes the position of a range in the input.
type Shape struct {
// Start is the position of the first byte of the range.
Start Position
// End is the position at offset+length, that is, just past the last byte
// of the range.
End Position
}
// Shape converts a byte range into start and end positions (offset, line and
// column). It panics when the range does not lie within the input.
func (p *Parser) Shape(r Range) Shape {
	from := int(r.Offset)
	to := int(r.Offset + r.Length)

	var s Shape
	s.Start = p.positionAt(from)
	s.End = p.positionAt(to)

	return s
}
// positionAt translates a byte offset into a Position. The line is one more
// than the number of '\n' bytes before the offset, and the column is the
// distance in bytes from the last '\n' (or from the start of the document).
func (p *Parser) positionAt(offset int) Position {
	before := p.data[:offset]
	newlines := bytes.Count(before, []byte{'\n'})
	// -1 when there is no newline, which makes the first column 1.
	lastNewline := bytes.LastIndexByte(before, '\n')

	return Position{
		Offset: offset,
		Line:   newlines + 1,
		Column: len(before) - lastNewline,
	}
}
// parseNewline consumes one line terminator at the start of b and returns the
// remaining bytes. A lone '\n' is accepted directly; input starting with '\r'
// is delegated to scanWindowsNewline. Anything else is an error. b must not be
// empty.
func (p *Parser) parseNewline(b []byte) ([]byte, error) {
	switch b[0] {
	case '\n':
		return b[1:], nil
	case '\r':
		_, rest, err := scanWindowsNewline(b, p.offsetOf(b))

		return rest, err
	default:
		return nil, NewParserError(b[0:1], p.offsetOf(b), "expected newline but got %#U", b[0])
	}
}
// parseComment scans the comment at the start of b and returns the bytes that
// follow it. A Comment node is pushed, and its reference returned, only when
// KeepComments is set and the scan succeeded; otherwise the reference is
// invalidReference.
func (p *Parser) parseComment(b []byte) (reference, []byte, error) {
	data, rest, err := scanComment(b, p.offsetOf(b))
	if err != nil || !p.KeepComments {
		return invalidReference, rest, err
	}

	node := Node{
		Kind: Comment,
		Raw:  p.rangeOfToken(data, rest),
		Data: data,
	}

	return p.builder.Push(node), rest, nil
}
// parseExpression parses one top-level expression from the start of b and
// returns its reference along with the unconsumed bytes. The reference is
// invalidReference when the line holds nothing but whitespace, or holds a
// comment that was not kept.
//
//	expression =  ws [ comment ]
//	expression =/ ws keyval ws [ comment ]
//	expression =/ ws table ws [ comment ]
func (p *Parser) parseExpression(b []byte) (reference, []byte, error) {
	b = p.parseWhitespace(b)
	if len(b) == 0 {
		return invalidReference, b, nil
	}

	switch b[0] {
	case '#':
		return p.parseComment(b)
	case '\n', '\r':
		// Blank line: leave the newline for the caller to consume.
		return invalidReference, b, nil
	}

	var (
		ref reference
		err error
	)

	if b[0] == '[' {
		ref, b, err = p.parseTable(b)
	} else {
		ref, b, err = p.parseKeyval(b)
	}

	if err != nil {
		return ref, nil, err
	}

	b = p.parseWhitespace(b)
	if len(b) == 0 || b[0] != '#' {
		return ref, b, nil
	}

	// Trailing comment on the same line: chain it after the expression.
	commentRef, rest, err := p.parseComment(b)
	if commentRef != invalidReference {
		p.builder.Chain(ref, commentRef)
	}

	return ref, rest, err
}
// parseTable dispatches on the second byte of b: "[[" introduces an array
// table, a single "[" a standard table.
//
//	table = std-table / array-table
func (p *Parser) parseTable(b []byte) (reference, []byte, error) {
	doubleBracket := len(b) > 1 && b[1] == '['
	if !doubleBracket {
		return p.parseStdTable(b)
	}

	return p.parseArrayTable(b)
}
// parseArrayTable parses an array-table header. b must start with "[[". The
// ArrayTable node is pushed first and the parsed key attached as its child.
//
//	array-table       = array-table-open key array-table-close
//	array-table-open  = %x5B.5B ws  ; [[ Double left square bracket
//	array-table-close = ws %x5D.5D  ; ]] Double right square bracket
func (p *Parser) parseArrayTable(b []byte) (reference, []byte, error) {
	table := p.builder.Push(Node{Kind: ArrayTable})

	name, rest, err := p.parseKey(p.parseWhitespace(b[2:]))
	if err != nil {
		return table, nil, err
	}

	p.builder.AttachChild(table, name)

	rest = p.parseWhitespace(rest)

	// The header closes with two consecutive brackets.
	rest, err = expect(']', rest, p.offsetOf(rest))
	if err != nil {
		return table, nil, err
	}

	rest, err = expect(']', rest, p.offsetOf(rest))

	return table, rest, err
}
// parseStdTable parses a standard table header. b must start with "[". The
// Table node is pushed first and the parsed key attached as its child.
//
//	std-table       = std-table-open key std-table-close
//	std-table-open  = %x5B ws  ; [ Left square bracket
//	std-table-close = ws %x5D  ; ] Right square bracket
func (p *Parser) parseStdTable(b []byte) (reference, []byte, error) {
	table := p.builder.Push(Node{Kind: Table})

	name, rest, err := p.parseKey(p.parseWhitespace(b[1:]))
	if err != nil {
		return table, nil, err
	}

	p.builder.AttachChild(table, name)

	rest = p.parseWhitespace(rest)
	rest, err = expect(']', rest, p.offsetOf(rest))

	return table, rest, err
}
// parseKeyval parses a `key = value` pair at the start of b. On success the
// returned KeyValue node has the value as its child (with the key chained
// after the value), and its Raw range spans the whole pair.
//
//	keyval     = key keyval-sep val
//	keyval-sep = ws %x3D ws ; =
func (p *Parser) parseKeyval(b []byte) (reference, []byte, error) {
	// Remember where the pair starts so the final Raw range can cover it.
	input := b

	kv := p.builder.Push(Node{Kind: KeyValue})

	key, rest, err := p.parseKey(input)
	if err != nil {
		return invalidReference, nil, err
	}

	rest = p.parseWhitespace(rest)
	if len(rest) == 0 {
		consumed := input[:len(input)-len(rest)]

		return invalidReference, nil, NewParserError(consumed, p.offsetOf(input), "expected = after a key, but the document ends there")
	}

	rest, err = expect('=', rest, p.offsetOf(rest))
	if err != nil {
		return invalidReference, nil, err
	}

	val, rest, err := p.parseVal(p.parseWhitespace(rest))
	if err != nil {
		return kv, rest, err
	}

	p.builder.Chain(val, key)
	p.builder.AttachChild(kv, val)

	// Write Raw straight into the node slice rather than through NodeAt, to
	// avoid the write barrier that NodeAt's nodes-pointer setup would trigger.
	p.builder.tree.nodes[kv].Raw = p.rangeOfToken(input[:len(input)-len(rest)], rest)

	return kv, rest, nil
}
// parseVal parses a single TOML value at the start of b, dispatching on its
// first byte, and returns the reference of the pushed node together with the
// unconsumed bytes. An empty b is reported as an error.
//
// String and boolean nodes are pushed here with Raw set to the range of the
// token in the document; arrays, inline tables, numbers and date-times are
// delegated to their own parsers.
//
//	val = string / boolean / array / inline-table / date-time / float / integer
//
//nolint:cyclop,funlen
func (p *Parser) parseVal(b []byte) (reference, []byte, error) {
	ref := invalidReference

	if len(b) == 0 {
		return ref, nil, NewParserError(b, p.offsetOf(b), "expected value, not eof")
	}

	var err error

	c := b[0]

	switch c {
	case '"':
		var raw []byte
		var v []byte

		if scanFollowsMultilineBasicStringDelimiter(b) {
			raw, v, b, err = p.parseMultilineBasicString(b)
		} else {
			raw, v, b, err = p.parseBasicString(b)
		}

		if err == nil {
			ref = p.builder.Push(Node{
				Kind: String,
				Raw:  p.rangeOfToken(raw, b),
				Data: v,
			})
		}

		return ref, b, err
	case '\'':
		var raw []byte
		var v []byte

		if scanFollowsMultilineLiteralStringDelimiter(b) {
			raw, v, b, err = p.parseMultilineLiteralString(b)
		} else {
			raw, v, b, err = p.parseLiteralString(b)
		}

		if err == nil {
			ref = p.builder.Push(Node{
				Kind: String,
				Raw:  p.rangeOfToken(raw, b),
				Data: v,
			})
		}

		return ref, b, err
	case 't':
		if !scanFollowsTrue(b) {
			return ref, nil, NewParserError(atmost(b, 4), p.offsetOf(b), "expected 'true'")
		}

		// Record Raw like every other scalar node so that consumers relying
		// on Raw.Offset get the real position instead of 0.
		ref = p.builder.Push(Node{
			Kind: Bool,
			Data: b[:4],
			Raw:  p.rangeOfToken(b[:4], b[4:]),
		})

		return ref, b[4:], nil
	case 'f':
		if !scanFollowsFalse(b) {
			return ref, nil, NewParserError(atmost(b, 5), p.offsetOf(b), "expected 'false'")
		}

		ref = p.builder.Push(Node{
			Kind: Bool,
			Data: b[:5],
			Raw:  p.rangeOfToken(b[:5], b[5:]),
		})

		return ref, b[5:], nil
	case '[':
		return p.parseValArray(b)
	case '{':
		return p.parseInlineTable(b)
	default:
		return p.parseIntOrFloatOrDateTime(b)
	}
}
// atmost returns the first n bytes of b, or all of b when it holds n bytes or
// fewer. The result aliases b.
func atmost(b []byte, n int) []byte {
	if len(b) > n {
		return b[:n]
	}

	return b
}
// parseLiteralString scans a single-quoted literal string at the start of b.
// It returns the raw token (quotes included), the content between the quotes,
// and the bytes that follow the token.
func (p *Parser) parseLiteralString(b []byte) ([]byte, []byte, []byte, error) {
	token, rest, err := scanLiteralString(b, p.offsetOf(b))
	if err != nil {
		return nil, nil, nil, err
	}

	// Drop the opening and closing quote.
	content := token[1 : len(token)-1]

	return token, content, rest, nil
}
// parseInlineTable parses an inline table. b must start with '{'. The
// InlineTable node's Raw covers the opening brace only. The first key-value
// pair is attached as its child and later pairs are chained after it.
//
//	inline-table         = inline-table-open [ inline-table-keyvals ] inline-table-close
//	inline-table-open    = %x7B ws     ; {
//	inline-table-close   = ws %x7D     ; }
//	inline-table-sep     = ws %x2C ws  ; , Comma
//	inline-table-keyvals = keyval [ inline-table-sep inline-table-keyvals ]
func (p *Parser) parseInlineTable(b []byte) (reference, []byte, error) {
	table := p.builder.Push(Node{
		Kind: InlineTable,
		Raw:  p.rangeOfToken(b[:1], b[1:]),
	})

	var (
		previous reference // last pair added, meaningful once count > 0
		count    int       // number of pairs parsed so far
		err      error
	)

	b = b[1:]

	for len(b) > 0 {
		beforeWS := b

		b = p.parseWhitespace(b)
		if len(b) == 0 {
			return table, nil, NewParserError(beforeWS[:1], p.offsetOf(beforeWS), "inline table is incomplete")
		}

		if b[0] == '}' {
			break
		}

		// Every pair but the first must be preceded by a comma.
		if count > 0 {
			if b, err = expect(',', b, p.offsetOf(b)); err != nil {
				return table, nil, err
			}

			b = p.parseWhitespace(b)
		}

		var pair reference

		pair, b, err = p.parseKeyval(b)
		if err != nil {
			return table, nil, err
		}

		if count == 0 {
			p.builder.AttachChild(table, pair)
		} else {
			p.builder.Chain(previous, pair)
		}

		previous = pair
		count++
	}

	b, err = expect('}', b, p.offsetOf(b))

	return table, b, err
}
// parseValArray parses an array value. b must start with '['. Values, and any
// comment nodes returned while skipping whitespace/comments/newlines, are
// added under the Array node in document order: the first one is attached as
// a child and the following ones are chained.
//
//	array              = array-open [ array-values ] ws-comment-newline array-close
//	array-open         = %x5B ; [
//	array-close        = %x5D ; ]
//	array-values       = ws-comment-newline val ws-comment-newline array-sep array-values
//	array-values       =/ ws-comment-newline val ws-comment-newline [ array-sep ]
//	array-sep          = %x2C ; , Comma
//	ws-comment-newline = *( wschar / [ comment ] newline )
//
//nolint:funlen,cyclop
func (p *Parser) parseValArray(b []byte) (reference, []byte, error) {
	opening := b
	b = b[1:]

	array := p.builder.Push(Node{Kind: Array})

	// elements counts the values (not comments) parsed so far.
	elements := 0
	tail := invalidReference

	appendChild := func(child reference) {
		if tail == invalidReference {
			p.builder.AttachChild(array, child)
		} else {
			p.builder.Chain(tail, child)
		}

		tail = child
	}

	// skipTrivia consumes whitespace, comments and newlines at the current
	// position and records any comment node that comes back.
	skipTrivia := func() error {
		commentRef, rest, err := p.parseOptionalWhitespaceCommentNewline(b)
		if err != nil {
			return err
		}

		b = rest

		if commentRef != invalidReference {
			appendChild(commentRef)
		}

		return nil
	}

	for len(b) > 0 {
		if err := skipTrivia(); err != nil {
			return array, nil, err
		}

		if len(b) == 0 {
			return array, nil, NewParserError(opening[:1], p.offsetOf(opening), "array is incomplete")
		}

		if b[0] == ']' {
			break
		}

		switch {
		case b[0] == ',':
			if elements == 0 {
				return array, nil, NewParserError(b[0:1], p.offsetOf(b), "array cannot start with comma")
			}

			b = b[1:]

			if err := skipTrivia(); err != nil {
				return array, nil, err
			}
		case elements > 0:
			return array, nil, NewParserError(b[0:1], p.offsetOf(b), "array elements must be separated by commas")
		}

		// TOML allows trailing commas in arrays.
		if len(b) > 0 && b[0] == ']' {
			break
		}

		value, rest, err := p.parseVal(b)
		if err != nil {
			return array, nil, err
		}

		b = rest
		appendChild(value)

		if err := skipTrivia(); err != nil {
			return array, nil, err
		}

		elements++
	}

	rest, err := expect(']', b, p.offsetOf(b))

	return array, rest, err
}
// parseOptionalWhitespaceCommentNewline skips any run of whitespace, comments
// and newlines at the start of b and returns the remaining bytes.
//
// The returned reference is the first comment node produced along the way, or
// invalidReference when there was none. The second comment is attached as a
// child of the first, and later ones are chained after the second.
func (p *Parser) parseOptionalWhitespaceCommentNewline(b []byte) (reference, []byte, error) {
	root := invalidReference
	latest := invalidReference

	for len(b) > 0 {
		b = p.parseWhitespace(b)

		if len(b) > 0 && b[0] == '#' {
			commentRef, rest, err := p.parseComment(b)
			if err != nil {
				return invalidReference, nil, err
			}

			b = rest

			if commentRef != invalidReference {
				if root == invalidReference {
					root = commentRef
				} else {
					if latest == invalidReference {
						p.builder.AttachChild(root, commentRef)
					} else {
						p.builder.Chain(latest, commentRef)
					}

					latest = commentRef
				}
			}
		}

		// Stop at the end of input or at the first byte that does not start a
		// newline.
		if len(b) == 0 || (b[0] != '\n' && b[0] != '\r') {
			break
		}

		rest, err := p.parseNewline(b)
		if err != nil {
			return invalidReference, nil, err
		}

		b = rest
	}

	return root, b, nil
}
// parseMultilineLiteralString scans a multi-line literal string at the start
// of b. It returns the raw token (delimiters included), the content with the
// three-byte delimiters and one immediately following newline removed, and
// the bytes after the token.
func (p *Parser) parseMultilineLiteralString(b []byte) ([]byte, []byte, []byte, error) {
	const delimLen = 3

	token, rest, err := scanMultilineLiteralString(b, p.offsetOf(b))
	if err != nil {
		return nil, nil, nil, err
	}

	start := delimLen

	// A newline right after the opening delimiter is not part of the value.
	switch {
	case token[start] == '\n':
		start++
	case token[start] == '\r' && token[start+1] == '\n':
		start += 2
	}

	return token, token[start : len(token)-delimLen], rest, nil
}
// parseMultilineBasicString scans and decodes a multi-line basic string at
// the start of b.
//
// It returns the raw token (delimiters included), the decoded value, and the
// bytes that follow the token. When the string contains no escape sequence
// the value is a direct subslice of the input; otherwise it is a freshly
// built buffer with escapes resolved and line-ending backslashes trimmed.
//
//	ml-basic-string       = ml-basic-string-delim [ newline ] ml-basic-body
//	                        ml-basic-string-delim
//	ml-basic-string-delim = 3quotation-mark
//	ml-basic-body         = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
//
//	mlb-content    = mlb-char / newline / mlb-escaped-nl
//	mlb-char       = mlb-unescaped / escaped
//	mlb-quotes     = 1*2quotation-mark
//	mlb-unescaped  = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
//	mlb-escaped-nl = escape ws newline *( wschar / newline )
//
//nolint:funlen,gocognit,cyclop
func (p *Parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, error) {
	token, escaped, rest, err := scanMultilineBasicString(b, p.offsetOf(b))
	if err != nil {
		return nil, nil, nil, err
	}

	i := 3

	// skip the immediate new line
	if token[i] == '\n' {
		i++
	} else if token[i] == '\r' && token[i+1] == '\n' {
		i += 2
	}

	// fast path
	startIdx := i
	endIdx := len(token) - len(`"""`)

	// Byte offset of the token in the document. offsetOf only works on
	// suffixes of the input, and token is a prefix of b, so derive the start
	// from the remainder: the token ends exactly where rest begins.
	tokenBase := p.offsetOf(rest) - len(token)

	if !escaped {
		str := token[startIdx:endIdx]

		invalidIdx := characters.Utf8TomlValidAlreadyEscaped(str)
		if invalidIdx < 0 {
			return token, str, rest, nil
		}

		return nil, nil, nil, NewParserError(str[invalidIdx:invalidIdx+1], tokenBase+startIdx+invalidIdx, "invalid UTF-8")
	}

	var builder bytes.Buffer

	// The scanner ensures that the token starts and ends with quotes and that
	// escapes are balanced.
	for i < len(token)-3 {
		c := token[i]

		//nolint:nestif
		if c == '\\' {
			// When the last non-whitespace character on a line is an unescaped \,
			// it will be trimmed along with all whitespace (including newlines) up
			// to the next non-whitespace character or closing delimiter.
			isLastNonWhitespaceOnLine := false
			j := 1
		findEOLLoop:
			for ; j < len(token)-3-i; j++ {
				switch token[i+j] {
				case ' ', '\t':
					continue
				case '\r':
					if token[i+j+1] == '\n' {
						continue
					}
				case '\n':
					isLastNonWhitespaceOnLine = true
				}

				break findEOLLoop
			}

			if isLastNonWhitespaceOnLine {
				i += j

				// Skip everything up to the next non-whitespace byte.
				for ; i < len(token)-3; i++ {
					c := token[i]
					if c != '\n' && c != '\r' && c != ' ' && c != '\t' {
						i--

						break
					}
				}

				i++

				continue
			}

			// handle escaping
			i++
			c = token[i]

			switch c {
			case '"', '\\':
				builder.WriteByte(c)
			case 'b':
				builder.WriteByte('\b')
			case 'f':
				builder.WriteByte('\f')
			case 'n':
				builder.WriteByte('\n')
			case 'r':
				builder.WriteByte('\r')
			case 't':
				builder.WriteByte('\t')
			case 'e':
				builder.WriteByte(0x1B)
			case 'u':
				x, err := hexToRune(atmost(token[i+1:], 4), tokenBase+i+1, 4)
				if err != nil {
					return nil, nil, nil, err
				}

				builder.WriteRune(x)
				i += 4
			case 'U':
				x, err := hexToRune(atmost(token[i+1:], 8), tokenBase+i+1, 8)
				if err != nil {
					return nil, nil, nil, err
				}

				builder.WriteRune(x)
				i += 8
			default:
				return nil, nil, nil, NewParserError(token[i:i+1], tokenBase+i, "invalid escaped character %#U", c)
			}

			i++
		} else {
			size := characters.Utf8ValidNext(token[i:])
			if size == 0 {
				return nil, nil, nil, NewParserError(token[i:i+1], tokenBase+i, "invalid character %#U", c)
			}

			builder.Write(token[i : i+size])
			i += size
		}
	}

	return token, builder.Bytes(), rest, nil
}
// parseKey parses a possibly dotted key at the start of b. A Key node is
// pushed for the first part and returned; each following part is pushed and
// chained after the previous one. Whitespace around dots is skipped.
//
//	key        = simple-key / dotted-key
//	simple-key = quoted-key / unquoted-key
//
//	unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _
//	quoted-key   = basic-string / literal-string
//	dotted-key   = simple-key 1*( dot-sep simple-key )
//
//	dot-sep = ws %x2E ws ; . Period
func (p *Parser) parseKey(b []byte) (reference, []byte, error) {
	raw, name, rest, err := p.parseSimpleKey(b)
	if err != nil {
		return invalidReference, nil, err
	}

	head := p.builder.Push(Node{
		Kind: Key,
		Raw:  p.rangeOfToken(raw, rest),
		Data: name,
	})

	for {
		rest = p.parseWhitespace(rest)
		if len(rest) == 0 || rest[0] != '.' {
			return head, rest, nil
		}

		// Consume the dot, then the next part of the key.
		raw, name, rest, err = p.parseSimpleKey(p.parseWhitespace(rest[1:]))
		if err != nil {
			return head, nil, err
		}

		p.builder.PushAndChain(Node{
			Kind: Key,
			Raw:  p.rangeOfToken(raw, rest),
			Data: name,
		})
	}
}
// parseSimpleKey parses one key part at the start of b: a literal string, a
// basic string, or an unquoted key. It returns the raw token, the key bytes
// and the remaining input. For an unquoted key, raw and key are the same
// slice. An empty b, or a first byte that cannot start a key, is an error.
func (p *Parser) parseSimpleKey(b []byte) (raw, key, rest []byte, err error) {
	if len(b) == 0 {
		return nil, nil, nil, NewParserError(b, p.offsetOf(b), "expected key but found none")
	}

	switch first := b[0]; first {
	case '\'':
		return p.parseLiteralString(b)
	case '"':
		return p.parseBasicString(b)
	default:
		if !isUnquotedKeyChar(first) {
			return nil, nil, nil, NewParserError(b[0:1], p.offsetOf(b), "invalid character at start of key: %c", first)
		}

		key, rest = scanUnquotedKey(b)

		return key, key, rest, nil
	}
}
// parseBasicString scans and decodes a basic (double-quoted, single-line)
// string at the start of b.
//
// It returns the raw token (quotes included), the decoded value, and the
// bytes that follow the token. When the string contains no escape sequence
// the value is a direct subslice of the input; otherwise it is a freshly
// built buffer with escapes resolved.
//
//	basic-string    = quotation-mark *basic-char quotation-mark
//	quotation-mark  = %x22            ; "
//	basic-char      = basic-unescaped / escaped
//	basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
//	escaped         = escape escape-seq-char
//	escape-seq-char =  %x22           ; "    quotation mark  U+0022
//	escape-seq-char =/ %x5C           ; \    reverse solidus U+005C
//	escape-seq-char =/ %x62           ; b    backspace       U+0008
//	escape-seq-char =/ %x66           ; f    form feed       U+000C
//	escape-seq-char =/ %x6E           ; n    line feed       U+000A
//	escape-seq-char =/ %x72           ; r    carriage return U+000D
//	escape-seq-char =/ %x74           ; t    tab             U+0009
//	escape-seq-char =/ %x75 4HEXDIG   ; uXXXX                U+XXXX
//	escape-seq-char =/ %x55 8HEXDIG   ; UXXXXXXXX            U+XXXXXXXX
//
//nolint:funlen,cyclop
func (p *Parser) parseBasicString(b []byte) ([]byte, []byte, []byte, error) {
	token, escaped, rest, err := scanBasicString(b, p.offsetOf(b))
	if err != nil {
		return nil, nil, nil, err
	}

	startIdx := len(`"`)
	endIdx := len(token) - len(`"`)

	// Byte offset of the token in the document. offsetOf only works on
	// suffixes of the input, and token is a prefix of b, so derive the start
	// from the remainder: the token ends exactly where rest begins.
	tokenBase := p.offsetOf(rest) - len(token)

	// Fast path. If there is no escape sequence, the string should just be
	// an UTF-8 encoded string, which is the same as Go. In that case,
	// validate the string and return a direct reference to the buffer.
	if !escaped {
		str := token[startIdx:endIdx]

		invalidIdx := characters.Utf8TomlValidAlreadyEscaped(str)
		if invalidIdx < 0 {
			return token, str, rest, nil
		}

		return nil, nil, nil, NewParserError(str[invalidIdx:invalidIdx+1], tokenBase+startIdx+invalidIdx, "invalid UTF-8")
	}

	i := startIdx

	var builder bytes.Buffer

	// The scanner ensures that the token starts and ends with quotes and that
	// escapes are balanced.
	for i < len(token)-1 {
		c := token[i]
		if c == '\\' {
			i++
			c = token[i]

			switch c {
			case '"', '\\':
				builder.WriteByte(c)
			case 'b':
				builder.WriteByte('\b')
			case 'f':
				builder.WriteByte('\f')
			case 'n':
				builder.WriteByte('\n')
			case 'r':
				builder.WriteByte('\r')
			case 't':
				builder.WriteByte('\t')
			case 'e':
				builder.WriteByte(0x1B)
			case 'u':
				x, err := hexToRune(token[i+1:len(token)-1], tokenBase+i+1, 4)
				if err != nil {
					return nil, nil, nil, err
				}

				builder.WriteRune(x)
				i += 4
			case 'U':
				x, err := hexToRune(token[i+1:len(token)-1], tokenBase+i+1, 8)
				if err != nil {
					return nil, nil, nil, err
				}

				builder.WriteRune(x)
				i += 8
			default:
				return nil, nil, nil, NewParserError(token[i:i+1], tokenBase+i, "invalid escaped character %#U", c)
			}

			i++
		} else {
			size := characters.Utf8ValidNext(token[i:])
			if size == 0 {
				return nil, nil, nil, NewParserError(token[i:i+1], tokenBase+i, "invalid character %#U", c)
			}

			builder.Write(token[i : i+size])
			i += size
		}
	}

	return token, builder.Bytes(), rest, nil
}
// hexToRune decodes the first length bytes of b as hexadecimal digits and
// returns the corresponding rune. base is the document offset of b[0] and is
// used to position errors.
//
// It fails when b is shorter than length, when a byte is not a hex digit, or
// when the value is above unicode.MaxRune or in the range 0xD800-0xDFFF.
func hexToRune(b []byte, base int, length int) (rune, error) {
	if len(b) < length {
		return -1, NewParserError(b, base, "unicode point needs %d character, not %d", length, len(b))
	}

	digits := b[:length]

	var value uint32

	for pos := 0; pos < len(digits); pos++ {
		c := digits[pos]

		var nibble uint32

		if c >= '0' && c <= '9' {
			nibble = uint32(c - '0')
		} else if c >= 'a' && c <= 'f' {
			nibble = uint32(c-'a') + 10
		} else if c >= 'A' && c <= 'F' {
			nibble = uint32(c-'A') + 10
		} else {
			return -1, NewParserError(digits[pos:pos+1], base+pos, "non-hex character")
		}

		// Shift in four bits per digit.
		value = value<<4 | nibble
	}

	inSurrogateRange := value >= 0xD800 && value < 0xE000
	if value > unicode.MaxRune || inSurrogateRange {
		return -1, NewParserError(digits, base, "escape sequence is invalid Unicode code point")
	}

	return rune(value), nil
}
// parseWhitespace drops the whitespace found by scanWhitespace at the start
// of b and returns what follows it.
//
//	ws      = *wschar
//	wschar  =  %x20  ; Space
//	wschar  =/ %x09  ; Horizontal tab
func (p *Parser) parseWhitespace(b []byte) []byte {
	_, afterWS := scanWhitespace(b)

	return afterWS
}
// parseIntOrFloatOrDateTime decides which scanner handles the value at the
// start of b, which must not be empty.
//
// "inf" and "nan" are pushed directly as Float nodes, and a leading sign goes
// to scanIntOrFloat. Otherwise the first bytes are inspected: digits followed
// by ':' at index 2 or '-' at index 4 mean a time or a date; everything else
// is scanned as a number.
//
//nolint:cyclop
func (p *Parser) parseIntOrFloatOrDateTime(b []byte) (reference, []byte, error) {
	switch b[0] {
	case 'i':
		if scanFollowsInf(b) {
			node := Node{
				Kind: Float,
				Data: b[:3],
				Raw:  p.rangeOfToken(b[:3], b[3:]),
			}

			return p.builder.Push(node), b[3:], nil
		}

		return invalidReference, nil, NewParserError(atmost(b, 3), p.offsetOf(b), "expected 'inf'")
	case 'n':
		if scanFollowsNan(b) {
			node := Node{
				Kind: Float,
				Data: b[:3],
				Raw:  p.rangeOfToken(b[:3], b[3:]),
			}

			return p.builder.Push(node), b[3:], nil
		}

		return invalidReference, nil, NewParserError(atmost(b, 3), p.offsetOf(b), "expected 'nan'")
	case '+', '-':
		return p.scanIntOrFloat(b)
	}

	if len(b) >= 3 {
		// Only the first five bytes are needed to tell a date or time apart
		// from a number.
		limit := len(b)
		if limit > 5 {
			limit = 5
		}

		for idx := 0; idx < limit; idx++ {
			c := b[idx]
			if isDigit(c) {
				continue
			}

			if (idx == 2 && c == ':') || (idx == 4 && c == '-') {
				return p.scanDateTime(b)
			}

			break
		}
	}

	return p.scanIntOrFloat(b)
}
// scanDateTime scans a date, time or date-time token at the start of b and
// pushes a node for it. It never fails; validating the scanned text is left
// to whoever decodes the node.
//
// The token is made of contiguous characters in [0-9T:Z.+-] (t and z are
// accepted too), and up to one space if followed by a digit. The node kind is
// derived from the characters seen: LocalDate, LocalTime, LocalDateTime or
// DateTime. The node's Raw is set to the range of the token in the document
// so that its position is available to consumers.
func (p *Parser) scanDateTime(b []byte) (reference, []byte, error) {
	hasDate := false
	hasTime := false
	hasTz := false
	seenSpace := false

	i := 0
byteLoop:
	for ; i < len(b); i++ {
		c := b[i]

		switch {
		case isDigit(c):
		case c == '-':
			hasDate = true

			// A '-' this far into the token is treated as a timezone
			// offset sign.
			const minOffsetOfTz = 8
			if i >= minOffsetOfTz {
				hasTz = true
			}
		case c == 'T' || c == 't' || c == ':' || c == '.':
			hasTime = true
		case c == '+' || c == 'Z' || c == 'z':
			hasTz = true
		case c == ' ':
			if !seenSpace && i+1 < len(b) && isDigit(b[i+1]) {
				i += 2
				// Avoid reaching past the end of the document in case the time
				// is malformed. See TestIssue585.
				if i >= len(b) {
					i--
				}

				seenSpace = true
				hasTime = true
			} else {
				break byteLoop
			}
		default:
			break byteLoop
		}
	}

	var kind Kind

	if hasTime {
		if hasDate {
			if hasTz {
				kind = DateTime
			} else {
				kind = LocalDateTime
			}
		} else {
			kind = LocalTime
		}
	} else {
		kind = LocalDate
	}

	return p.builder.Push(Node{
		Kind: kind,
		Data: b[:i],
		Raw:  p.rangeOfToken(b[:i], b[i:]),
	}), b[i:], nil
}
// scanIntOrFloat scans an integer or float token at the start of b and pushes
// an Integer or Float node for it, with Raw set to the token's range.
//
// Input longer than two bytes that starts with '0' not followed by '.', 'e'
// or 'E' is treated as a prefixed integer: 0x, 0o and 0b consume the digits
// of their base (underscores included), and anything else yields the single
// '0'. Otherwise digits, signs and underscores are consumed, '.', 'e' and 'E'
// mark the token as a float, and a trailing "inf" or "nan" ends the token as
// a float. An 'i' or 'n' not starting such a word, or an empty token, is an
// error.
//
//nolint:funlen,gocognit,cyclop
func (p *Parser) scanIntOrFloat(b []byte) (reference, []byte, error) {
	if len(b) > 2 && b[0] == '0' && b[1] != '.' && b[1] != 'e' && b[1] != 'E' {
		var accepts validRuneFn

		switch b[1] {
		case 'x':
			accepts = isValidHexRune
		case 'o':
			accepts = isValidOctalRune
		case 'b':
			accepts = isValidBinaryRune
		}

		// Without a known prefix only the leading zero is taken.
		end := 1

		if accepts != nil {
			end = 2
			for end < len(b) && accepts(b[end]) {
				end++
			}
		}

		return p.builder.Push(Node{
			Kind: Integer,
			Data: b[:end],
			Raw:  p.rangeOfToken(b[:end], b[end:]),
		}), b[end:], nil
	}

	kind := Integer
	end := 0

scan:
	for ; end < len(b); end++ {
		switch c := b[end]; {
		case c >= '0' && c <= '9', c == '+', c == '-', c == '_':
			// Part of the number; keep scanning.
		case c == '.', c == 'e', c == 'E':
			kind = Float
		case c == 'i':
			if scanFollowsInf(b[end:]) {
				return p.builder.Push(Node{
					Kind: Float,
					Data: b[:end+3],
					Raw:  p.rangeOfToken(b[:end+3], b[end+3:]),
				}), b[end+3:], nil
			}

			return invalidReference, nil, NewParserError(b[end:end+1], p.offsetOf(b)+end, "unexpected character 'i' while scanning for a number")
		case c == 'n':
			if scanFollowsNan(b[end:]) {
				return p.builder.Push(Node{
					Kind: Float,
					Data: b[:end+3],
					Raw:  p.rangeOfToken(b[:end+3], b[end+3:]),
				}), b[end+3:], nil
			}

			return invalidReference, nil, NewParserError(b[end:end+1], p.offsetOf(b)+end, "unexpected character 'n' while scanning for a number")
		default:
			break scan
		}
	}

	if end == 0 {
		return invalidReference, b, NewParserError(b, p.offsetOf(b), "incomplete number")
	}

	return p.builder.Push(Node{
		Kind: kind,
		Data: b[:end],
		Raw:  p.rangeOfToken(b[:end], b[end:]),
	}), b[end:], nil
}
// isDigit reports whether r is an ASCII decimal digit ('0' through '9').
func isDigit(r byte) bool {
	if r < '0' {
		return false
	}

	return r <= '9'
}
// validRuneFn reports whether a byte may appear in the digits of a prefixed
// integer; scanIntOrFloat picks one per base.
type validRuneFn func(r byte) bool
// isValidHexRune reports whether r may appear in a hexadecimal integer: a
// decimal digit, a letter a-f in either case, or an underscore.
func isValidHexRune(r byte) bool {
	switch {
	case r >= '0' && r <= '9':
		return true
	case r >= 'a' && r <= 'f':
		return true
	case r >= 'A' && r <= 'F':
		return true
	default:
		return r == '_'
	}
}
// isValidOctalRune reports whether r may appear in an octal integer: a digit
// from 0 to 7, or an underscore.
func isValidOctalRune(r byte) bool {
	if r == '_' {
		return true
	}

	return '0' <= r && r <= '7'
}
// isValidBinaryRune reports whether r may appear in a binary integer: '0',
// '1', or an underscore.
func isValidBinaryRune(r byte) bool {
	switch r {
	case '0', '1', '_':
		return true
	default:
		return false
	}
}
// expect checks that b starts with the byte x and returns the bytes after it.
// base is the document offset of b[0], used to position the error reported
// when b is empty or starts with a different byte. On error the returned
// slice is nil.
func expect(x byte, b []byte, base int) ([]byte, error) {
	switch {
	case len(b) == 0:
		return nil, NewParserError(b, base, "expected character %c but the document ended here", x)
	case b[0] != x:
		return nil, NewParserError(b[0:1], base, "expected character %c", x)
	default:
		return b[1:], nil
	}
}