Trying the scanner approach

2021-02-04 10:14:11 -05:00
parent b123c357c5
commit 0ee0fe7f7c
9 changed files with 261 additions and 1126 deletions
@@ -0,0 +1,236 @@
+package toml
+
+import "fmt"
+
+// scanFollows builds a predicate that reports whether a byte slice begins
+// with pattern. The predicate returns false when its argument is shorter
+// than pattern.
+func scanFollows(pattern []byte) func(b []byte) bool {
+	return func(b []byte) bool {
+		n := len(pattern)
+		if n > len(b) {
+			return false
+		}
+		for i := 0; i < n; i++ {
+			if pattern[i] != b[i] {
+				return false
+			}
+		}
+		return true
+	}
+}
+
+// Prefix predicates built with scanFollows; each reports whether its input
+// starts with the named multi-byte sequence.
+var scanFollowsMultilineBasicStringDelimiter = scanFollows([]byte(`"""`))
+var scanFollowsMultilineLiteralStringDelimiter = scanFollows([]byte(`'''`))
+var scanFollowsTrue = scanFollows([]byte("true"))
+var scanFollowsFalse = scanFollows([]byte("false"))
+var scanFollowsArrayTableBegin = scanFollows([]byte{arrayOrTableBegin, arrayOrTableBegin})
+var scanFollowsArrayTableEnd = scanFollows([]byte{arrayOrTableEnd, arrayOrTableEnd})
+
+const ( // single bytes that scan switches on
+	dot               = '.' // scan returns this as a one-byte token
+	equal             = '=' // scan returns this as a one-byte token
+	comma             = ',' // scan returns this as a one-byte token
+	inlineTableBegin  = '{' // scan returns this as a one-byte token
+	inlineTableEnd    = '}' // scan returns this as a one-byte token
+	comment           = '#' // scan hands input starting with this to scanComment
+	arrayOrTableBegin = '[' // one-byte token, or a two-byte token when doubled
+	arrayOrTableEnd   = ']' // one-byte token, or a two-byte token when doubled
+)
+
+// scan returns a []byte containing the next lexical token, the bytes left
+// after that token, and an error.
+//
+// eof is signaled by an empty token and nil error. Whitespace runs, newlines
+// and comments come back as tokens of their own. A leading byte that starts
+// no token scan recognizes (numbers and date-times are not handled yet)
+// produces an error.
+func scan(b []byte) ([]byte, []byte, error) {
+	if len(b) == 0 {
+		return b, b, nil
+	}
+
+	switch b[0] {
+	case dot, equal, inlineTableBegin, inlineTableEnd, comma:
+		return b[:1], b[1:], nil
+	case '"':
+		if scanFollowsMultilineBasicStringDelimiter(b) {
+			return scanMultilineBasicString(b)
+		}
+		return scanBasicString(b)
+	case '\'':
+		if scanFollowsMultilineLiteralStringDelimiter(b) {
+			return scanMultilineLiteralString(b)
+		}
+		return scanLiteralString(b)
+	case comment:
+		return scanComment(b)
+	case ' ', '\t':
+		return scanWhitespace(b)
+	case '\r':
+		return scanWindowsNewline(b)
+	case '\n':
+		return b[:1], b[1:], nil
+	case 't':
+		// Anything other than the literal "true" leaves the switch and is
+		// tried as an unquoted key below.
+		if scanFollowsTrue(b) {
+			return b[:4], b[4:], nil
+		}
+	case 'f':
+		// Same as 't': only the literal "false" is a token here.
+		if scanFollowsFalse(b) {
+			return b[:5], b[5:], nil
+		}
+	case arrayOrTableBegin:
+		if scanFollowsArrayTableBegin(b) {
+			return b[:2], b[2:], nil
+		}
+		return b[:1], b[1:], nil
+	case arrayOrTableEnd:
+		if scanFollowsArrayTableEnd(b) {
+			return b[:2], b[2:], nil
+		}
+		return b[:1], b[1:], nil
+	}
+
+	if isUnquotedKeyChar(b[0]) {
+		return scanUnquotedKey(b)
+	}
+
+	// TODO: numbers, date-time
+	// Unknown input is reported through the error result so that a malformed
+	// document cannot crash the caller.
+	return nil, nil, fmt.Errorf("unexpected character %q", b[0])
+}
+
+// scanUnquotedKey splits b after its leading run of unquoted-key characters.
+// The first byte is taken as part of the key without being checked. When the
+// run reaches the end of b, the whole input is the token and the remainder
+// is nil. The error is always nil.
+//
+// Grammar: unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _
+func scanUnquotedKey(b []byte) ([]byte, []byte, error) {
+	end := 1
+	for end < len(b) && isUnquotedKeyChar(b[end]) {
+		end++
+	}
+	if end >= len(b) {
+		return b, nil, nil
+	}
+	return b[:end], b[end:], nil
+}
+
+// isUnquotedKeyChar reports whether r is an ASCII letter, an ASCII digit,
+// '-' or '_'.
+func isUnquotedKeyChar(r byte) bool {
+	switch {
+	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
+		return true
+	case r == '-', r == '_':
+		return true
+	default:
+		return false
+	}
+}
+
+// scanLiteralString scans a single-line literal string. b[0] is taken to be
+// the opening apostrophe and is not checked. The token runs through the next
+// apostrophe; a line feed before that apostrophe, or reaching the end of b
+// without one, is an error.
+//
+// Grammar:
+//   literal-string = apostrophe *literal-char apostrophe
+//   apostrophe     = %x27 ; ' apostrophe
+//   literal-char   = %x09 / %x20-26 / %x28-7E / non-ascii
+func scanLiteralString(b []byte) ([]byte, []byte, error) {
+	for pos := 1; pos < len(b); pos++ {
+		c := b[pos]
+		if c == '\'' {
+			end := pos + 1
+			return b[:end], b[end:], nil
+		}
+		if c == '\n' {
+			return nil, nil, fmt.Errorf("literal strings cannot have new lines")
+		}
+	}
+	return nil, nil, fmt.Errorf("unterminated literal string")
+}
+
+// scanMultilineLiteralString scans a multi-line literal string. The first
+// three bytes of b are taken to be the opening ''' delimiter and are not
+// re-checked; the search for the closing delimiter starts right after them.
+//
+// It returns the token including both delimiters, the bytes that follow the
+// closing delimiter, and an error when no closing ''' is found.
+func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
+	//ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
+	//ml-literal-string-delim
+	//ml-literal-string-delim = 3apostrophe
+	//ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
+	//
+	//mll-content = mll-char / newline
+	//mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
+	//mll-quotes = 1*2apostrophe
+	for i := 3; i < len(b); i++ {
+		if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
+			end := i + 3
+			// The remainder must begin after the closing delimiter so the
+			// caller makes progress on its next call.
+			return b[:end], b[end:], nil
+		}
+	}
+
+	return nil, nil, fmt.Errorf(`multiline literal string not terminated by '''`)
+}
+
+// scanWindowsNewline scans a two-byte \r\n newline. b[0] is not inspected;
+// only the byte after it is required to be a line feed. Input shorter than
+// two bytes, or a second byte other than \n, is an error.
+func scanWindowsNewline(b []byte) ([]byte, []byte, error) {
+	switch {
+	case len(b) < 2:
+		return nil, nil, fmt.Errorf(`windows new line missing \n`)
+	case b[1] != '\n':
+		return nil, nil, fmt.Errorf(`windows new line should be \r\n`)
+	default:
+		return b[:2], b[2:], nil
+	}
+}
+
+// scanWhitespace splits b after its leading run of spaces and tabs. The
+// first byte is taken as part of the run without being checked. When the run
+// reaches the end of b, the whole input is the token and the remainder is
+// nil. The error is always nil.
+func scanWhitespace(b []byte) ([]byte, []byte, error) {
+	end := 1
+	for end < len(b) && (b[end] == ' ' || b[end] == '\t') {
+		end++
+	}
+	if end >= len(b) {
+		return b, nil, nil
+	}
+	return b[:end], b[end:], nil
+}
+
+// scanComment scans a comment. b[0] is taken to be the comment start symbol
+// and is not checked. The token extends through the first line feed found
+// after b[0], and that line feed is part of the token. Without a line feed
+// the whole input is the token and the remainder is nil. The error is always
+// nil.
+//
+// Grammar:
+//   comment-start-symbol = %x23 ; #
+//   non-ascii            = %x80-D7FF / %xE000-10FFFF
+//   non-eol              = %x09 / %x20-7F / non-ascii
+//   comment              = comment-start-symbol *non-eol
+func scanComment(b []byte) ([]byte, []byte, error) {
+	for i := 1; i < len(b); i++ {
+		if b[i] != '\n' {
+			continue
+		}
+		end := i + 1
+		return b[:end], b[end:], nil
+	}
+	return b, nil, nil
+}
+
+// scanBasicString scans a single-line basic string. b[0] is taken to be the
+// opening quotation mark and is not checked. The token runs through the next
+// quotation mark that is not preceded by a backslash escape. The byte after
+// a backslash is skipped without being examined.
+//
+// Errors: a line feed inside the string, a backslash as the last byte of b,
+// or no closing quotation mark.
+//
+// Grammar:
+//   basic-string    = quotation-mark *basic-char quotation-mark
+//   quotation-mark  = %x22            ; "
+//   basic-char      = basic-unescaped / escaped
+//   basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
+//   escaped         = escape escape-seq-char
+//
+// TODO perform validation on the string?
+func scanBasicString(b []byte) ([]byte, []byte, error) {
+	for pos := 1; pos < len(b); pos++ {
+		c := b[pos]
+		if c == '"' {
+			end := pos + 1
+			return b[:end], b[end:], nil
+		}
+		if c == '\n' {
+			return nil, nil, fmt.Errorf("basic strings cannot have new lines")
+		}
+		if c == '\\' {
+			if pos+2 > len(b) {
+				return nil, nil, fmt.Errorf("need a character after \\")
+			}
+			pos++ // step over the escaped byte
+		}
+	}
+
+	return nil, nil, fmt.Errorf(`basic string not terminated by "`)
+}
+
+// scanMultilineBasicString scans a multi-line basic string. The first three
+// bytes of b are taken to be the opening """ delimiter and are not
+// re-checked; the search for the closing delimiter starts right after them.
+// The byte after a backslash is skipped without being examined, so an
+// escaped quotation mark never starts a closing delimiter.
+//
+// It returns the token including both delimiters, the bytes that follow the
+// closing delimiter, and an error when b ends with a backslash or no closing
+// """ is found.
+//
+// TODO perform validation on the string?
+func scanMultilineBasicString(b []byte) ([]byte, []byte, error) {
+	//ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
+	//ml-basic-string-delim
+	//ml-basic-string-delim = 3quotation-mark
+	//ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
+	//
+	//mlb-content = mlb-char / newline / mlb-escaped-nl
+	//mlb-char = mlb-unescaped / escaped
+	//mlb-quotes = 1*2quotation-mark
+	//mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
+	//mlb-escaped-nl = escape ws newline *( wschar / newline )
+
+	for i := 3; i < len(b); i++ {
+		switch b[i] {
+		case '"':
+			if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
+				end := i + 3
+				// The remainder must begin after the closing delimiter so
+				// the caller makes progress on its next call.
+				return b[:end], b[end:], nil
+			}
+		case '\\':
+			if len(b) < i+2 {
+				return nil, nil, fmt.Errorf("need a character after \\")
+			}
+			i++ // skip the next character
+		}
+	}
+
+	return nil, nil, fmt.Errorf(`multiline basic string not terminated by """`)
+}