Multiline basic string parsing

This commit is contained in:
Thomas Pelletier
2021-02-05 17:46:40 -05:00
parent ca12c0670d
commit 736a75748b
2 changed files with 122 additions and 1 deletions
+2 -1
View File
@@ -58,7 +58,8 @@ func scan(b []byte) ([]byte, []byte, error) {
case comment:
return scanComment(b)
case ' ', '\t':
return scanWhitespace(b)
data, rest := scanWhitespace(b)
return data, rest, nil
case '\r':
return scanWindowsNewline(b)
case '\n':
+120
View File
@@ -92,6 +92,126 @@ func parseKeyval(b []byte) ([]byte, error) {
// parseVal parses the value found at the start of b and returns the bytes
// that follow it.
//
//	val = string / boolean / array / inline-table / date-time / float / integer
//
// Only the four string forms are recognized so far. The leading byte selects
// between basic (") and literal (') strings, and the scanFollows* helpers
// decide whether the multiline variant applies. The decoded string itself is
// currently discarded; only the remaining input is returned.
//
// An error is returned when b is empty, when the leading byte does not start
// a supported value, or when the selected string parser fails.
func parseVal(b []byte) ([]byte, error) {
	// Indexing b[0] on empty input would panic; report it as an error instead.
	if len(b) == 0 {
		return nil, fmt.Errorf("expected a value but reached end of input")
	}

	switch b[0] {
	// strings
	case '"':
		var rest []byte
		var err error
		if scanFollowsMultilineBasicStringDelimiter(b) {
			_, rest, err = parseMultilineBasicString(b)
		} else {
			_, rest, err = parseBasicString(b)
		}
		return rest, err
	case '\'':
		if scanFollowsMultilineLiteralStringDelimiter(b) {
			return parseMultilineLiteralString(b)
		}
		_, rest, err := scanLiteralString(b)
		return rest, err
	// TODO boolean
	// TODO array
	// TODO inline-table
	// TODO date-time
	// TODO float
	// TODO integer
	default:
		return nil, fmt.Errorf("unexpected char")
	}
}
// parseMultilineBasicString decodes the multiline basic string ("""...""")
// found at the start of b.
//
// It returns the decoded contents, the bytes of b that follow the string as
// reported by scanMultilineBasicString, and an error if scanning fails, an
// unknown escape sequence is found, or hexToString rejects a unicode escape.
//
// Decoding rules applied to the body between the delimiters:
//   - a newline (LF or CRLF) immediately after the opening delimiter is dropped;
//   - a backslash immediately followed by a newline removes that newline and
//     all whitespace and newlines after it, up to the next non-whitespace
//     byte or the closing delimiter;
//   - \" \\ \b \f \n \r \t are replaced by the byte they denote;
//   - \uXXXX and \UXXXXXXXX are converted by hexToString;
//   - every other byte is copied verbatim.
func parseMultilineBasicString(b []byte) (string, []byte, error) {
	//ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
	//ml-basic-string-delim
	//ml-basic-string-delim = 3quotation-mark
	//ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
	//
	//mlb-content = mlb-char / newline / mlb-escaped-nl
	//mlb-char = mlb-unescaped / escaped
	//mlb-quotes = 1*2quotation-mark
	//mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
	//mlb-escaped-nl = escape ws newline *( wschar / newline )
	token, rest, err := scanMultilineBasicString(b)
	if err != nil {
		return "", nil, err
	}

	// Index of the first byte of the closing delimiter; the body is
	// token[3:end].
	end := len(token) - 3

	var builder strings.Builder

	i := 3

	// skip the immediate new line
	if token[i] == '\n' {
		i++
	} else if token[i] == '\r' && token[i+1] == '\n' {
		i += 2
	}

	// The scanner ensures that the token starts and ends with quotes and that
	// escapes are balanced.
	for ; i < end; i++ {
		c := token[i]
		if c != '\\' {
			builder.WriteByte(c)
			continue
		}

		// When the last non-whitespace character on a line is an unescaped \,
		// it will be trimmed along with all whitespace (including newlines) up
		// to the next non-whitespace character or closing delimiter.
		if token[i+1] == '\n' || (token[i+1] == '\r' && token[i+2] == '\n') {
			i++ // skip the \
			for i < end {
				w := token[i]
				if w != '\n' && w != '\r' && w != ' ' && w != '\t' {
					break
				}
				i++
			}
			// i now sits on the first byte that must be kept (or on the
			// closing delimiter). Step back one so the outer loop's i++
			// lands on it instead of skipping it.
			i--
			continue
		}

		// handle escaping
		i++
		c = token[i]
		switch c {
		case '"', '\\':
			builder.WriteByte(c)
		case 'b':
			builder.WriteByte('\b')
		case 'f':
			builder.WriteByte('\f')
		case 'n':
			builder.WriteByte('\n')
		case 'r':
			builder.WriteByte('\r')
		case 't':
			builder.WriteByte('\t')
		case 'u':
			// i points at the 'u'; the hex digits start right after it.
			// NOTE(review): assumes hexToString reads the first 4 bytes of
			// its argument and errors when fewer are available - confirm.
			x, err := hexToString(token[i+1:end], 4)
			if err != nil {
				return "", nil, err
			}
			builder.WriteString(x)
			i += 4
		case 'U':
			// Same as 'u' but with 8 hex digits.
			x, err := hexToString(token[i+1:end], 8)
			if err != nil {
				return "", nil, err
			}
			builder.WriteString(x)
			i += 8
		default:
			return "", nil, fmt.Errorf("invalid escaped character: %#U", c)
		}
	}

	return builder.String(), rest, nil
}
func parseKey(b []byte) ([]byte, error) {