semi-functional prototype

2014-09-03 21:33:25 -04:00
parent fb5423fba2
commit 9942463786
6 changed files with 1334 additions and 0 deletions
@@ -0,0 +1,449 @@
+// TOML JSONPath lexer.
+//
+// Written using the principles developped by Rob Pike in
+// http://www.youtube.com/watch?v=HxaD_trXwRE
+
+package jpath
+
+import (
+  . "github.com/pelletier/go-toml"
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+var dateRegexp *regexp.Regexp
+
+// Define tokens
+type tokenType int
+
+const (
+	eof = -(iota + 1)
+)
+
+const (
+	tokenError tokenType = iota
+	tokenEOF
+  tokenKey
+  tokenString
+  tokenFloat
+  tokenInteger
+  tokenAtCost
+  tokenDollar
+  tokenLBracket
+  tokenRBracket
+  tokenDot
+  tokenDotDot
+  tokenStar
+  tokenComma
+  tokenColon
+  tokenQuestion
+  tokenLParen
+  tokenRParen
+)
+
+var tokenTypeNames = []string{
+	"EOF",
+  "Key",
+  "String",
+  "Float",
+  "Integer",
+  "@",
+  "$",
+  "[",
+  "]",
+  ".",
+  "..",
+  "*",
+  ",",
+  ":",
+  "?",
+  "(",
+  ")",
+}
+
+type token struct {
+	Position
+	typ tokenType
+	val string
+}
+
+func (tt tokenType) String() string {
+	idx := int(tt)
+	if idx < len(tokenTypeNames) {
+		return tokenTypeNames[idx]
+	}
+	return "Unknown"
+}
+
+func (i token) String() string {
+	switch i.typ {
+	case tokenEOF:
+		return "EOF"
+	case tokenError:
+		return i.val
+	}
+
+	if len(i.val) > 10 {
+		return fmt.Sprintf("%.10q...", i.val)
+	}
+	return fmt.Sprintf("%q", i.val)
+}
+
+func isSpace(r rune) bool {
+	return r == ' ' || r == '\t'
+}
+
+func isAlphanumeric(r rune) bool {
+	return unicode.IsLetter(r) || r == '_'
+}
+
+func isKeyChar(r rune) bool {
+	// "Keys start with the first non-whitespace character and end with the last
+	// non-whitespace character before the equals sign."
+	return !(isSpace(r) || r == '\r' || r == '\n' || r == eof || r == '=')
+}
+
+func isDigit(r rune) bool {
+	return unicode.IsNumber(r)
+}
+
+func isHexDigit(r rune) bool {
+	return isDigit(r) ||
+		r == 'A' || r == 'B' || r == 'C' || r == 'D' || r == 'E' || r == 'F'
+}
+
+// Define lexer
+type lexer struct {
+	input      string
+	start      int
+	pos        int
+	width      int
+	tokens     chan token
+	depth      int
+	line       int
+	col        int
+  stringTerm string
+}
+
+func (l *lexer) run() {
+	for state := lexVoid; state != nil; {
+		state = state(l)
+	}
+	close(l.tokens)
+}
+
+func (l *lexer) nextStart() {
+	// iterate by runes (utf8 characters)
+	// search for newlines and advance line/col counts
+	for i := l.start; i < l.pos; {
+		r, width := utf8.DecodeRuneInString(l.input[i:])
+		if r == '\n' {
+			l.line++
+			l.col = 1
+		} else {
+			l.col++
+		}
+		i += width
+	}
+	// advance start position to next token
+	l.start = l.pos
+}
+
+func (l *lexer) emit(t tokenType) {
+	l.tokens <- token{
+		Position: Position{l.line, l.col},
+		typ:      t,
+		val:      l.input[l.start:l.pos],
+	}
+	l.nextStart()
+}
+
+func (l *lexer) emitWithValue(t tokenType, value string) {
+	l.tokens <- token{
+		Position: Position{l.line, l.col},
+		typ:      t,
+		val:      value,
+	}
+	l.nextStart()
+}
+
+func (l *lexer) next() rune {
+	if l.pos >= len(l.input) {
+		l.width = 0
+		return eof
+	}
+	var r rune
+	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
+	l.pos += l.width
+	return r
+}
+
+func (l *lexer) ignore() {
+	l.nextStart()
+}
+
+func (l *lexer) backup() {
+	l.pos -= l.width
+}
+
+func (l *lexer) errorf(format string, args ...interface{}) stateFn {
+	l.tokens <- token{
+		Position: Position{l.line, l.col},
+		typ:      tokenError,
+		val:      fmt.Sprintf(format, args...),
+	}
+	return nil
+}
+
+func (l *lexer) peek() rune {
+	r := l.next()
+	l.backup()
+	return r
+}
+
+func (l *lexer) accept(valid string) bool {
+	if strings.IndexRune(valid, l.next()) >= 0 {
+		return true
+	}
+	l.backup()
+	return false
+}
+
+func (l *lexer) follow(next string) bool {
+	return strings.HasPrefix(l.input[l.pos:], next)
+}
+
+// Define state functions
+type stateFn func(*lexer) stateFn
+
+func lexVoid(l *lexer) stateFn {
+  for {
+    next := l.peek()
+    switch next {
+    case '$':
+      l.pos++
+      l.emit(tokenDollar)
+      continue
+    case '.':
+      if l.follow("..") {
+        l.pos += 2
+        l.emit(tokenDotDot)
+      } else {
+        l.pos++
+        l.emit(tokenDot)
+      }
+      continue
+    case '@':
+      l.pos++
+      l.emit(tokenAtCost)
+      continue
+    case '[':
+      l.pos++
+      l.emit(tokenLBracket)
+      continue
+    case ']':
+      l.pos++
+      l.emit(tokenRBracket)
+      continue
+    case ',':
+      l.pos++
+      l.emit(tokenComma)
+      continue
+    case '*':
+      l.pos++
+      l.emit(tokenStar)
+      continue
+    case '(':
+      l.pos++
+      l.emit(tokenLParen)
+      continue
+    case ')':
+      l.pos++
+      l.emit(tokenRParen)
+      continue
+    case '?':
+      l.pos++
+      l.emit(tokenQuestion)
+      continue
+    case ':':
+      l.pos++
+      l.emit(tokenColon)
+      continue
+    case '\'':
+      l.ignore()
+      l.stringTerm = string(next)
+      return lexString
+    case '"':
+      l.ignore()
+      l.stringTerm = string(next)
+      return lexString
+    }
+
+		if isAlphanumeric(next) {
+			return lexKey
+		}
+
+		if next == '+' || next == '-' || isDigit(next) {
+			return lexNumber
+		}
+
+    if isAlphanumeric(next) {
+			return lexKey
+		}
+
+		if isSpace(next) {
+			l.ignore()
+		}
+
+    if l.next() == eof {
+			break
+    }
+
+    return l.errorf("unexpected char: '%v'", next)
+  }
+	l.emit(tokenEOF)
+	return nil
+}
+
+func lexKey(l *lexer) stateFn {
+  for {
+    next := l.peek()
+    if !isAlphanumeric(next) {
+      l.emit(tokenKey)
+      return lexVoid
+    }
+
+    if l.next() == eof {
+			break
+    }
+  }
+	l.emit(tokenEOF)
+	return nil
+}
+
+func lexString(l *lexer) stateFn {
+	l.pos++
+	l.ignore()
+	growingString := ""
+
+	for {
+		if l.follow(l.stringTerm) {
+			l.emitWithValue(tokenString, growingString)
+			l.pos++
+			l.ignore()
+			return lexVoid
+		}
+
+		if l.follow("\\\"") {
+			l.pos++
+			growingString += "\""
+		} else if l.follow("\\'") {
+			l.pos++
+			growingString += "'"
+		} else if l.follow("\\n") {
+			l.pos++
+			growingString += "\n"
+		} else if l.follow("\\b") {
+			l.pos++
+			growingString += "\b"
+		} else if l.follow("\\f") {
+			l.pos++
+			growingString += "\f"
+		} else if l.follow("\\/") {
+			l.pos++
+			growingString += "/"
+		} else if l.follow("\\t") {
+			l.pos++
+			growingString += "\t"
+		} else if l.follow("\\r") {
+			l.pos++
+			growingString += "\r"
+		} else if l.follow("\\\\") {
+			l.pos++
+			growingString += "\\"
+		} else if l.follow("\\u") {
+			l.pos += 2
+			code := ""
+			for i := 0; i < 4; i++ {
+				c := l.peek()
+				l.pos++
+				if !isHexDigit(c) {
+					return l.errorf("unfinished unicode escape")
+				}
+				code = code + string(c)
+			}
+			l.pos--
+			intcode, err := strconv.ParseInt(code, 16, 32)
+			if err != nil {
+				return l.errorf("invalid unicode escape: \\u" + code)
+			}
+			growingString += string(rune(intcode))
+		} else if l.follow("\\") {
+			l.pos++
+			return l.errorf("invalid escape sequence: \\" + string(l.peek()))
+		} else {
+			growingString += string(l.peek())
+		}
+
+		if l.next() == eof {
+			break
+		}
+	}
+
+	return l.errorf("unclosed string")
+}
+
+func lexNumber(l *lexer) stateFn {
+	l.ignore()
+	if !l.accept("+") {
+		l.accept("-")
+	}
+	pointSeen := false
+	digitSeen := false
+	for {
+		next := l.next()
+		if next == '.' {
+			if pointSeen {
+				return l.errorf("cannot have two dots in one float")
+			}
+			if !isDigit(l.peek()) {
+				return l.errorf("float cannot end with a dot")
+			}
+			pointSeen = true
+		} else if isDigit(next) {
+			digitSeen = true
+		} else {
+			l.backup()
+			break
+		}
+		if pointSeen && !digitSeen {
+			return l.errorf("cannot start float with a dot")
+		}
+	}
+
+	if !digitSeen {
+		return l.errorf("no digit in that number")
+	}
+	if pointSeen {
+		l.emit(tokenFloat)
+	} else {
+		l.emit(tokenInteger)
+	}
+	return lexVoid
+}
+
+// Entry point
+func lex(input string) (*lexer, chan token) {
+	l := &lexer{
+		input:  input,
+		tokens: make(chan token),
+		line:   1,
+		col:    1,
+	}
+	go l.run()
+	return l, l.tokens
+}