go-toml/query/lexer.go

// TOML JSONPath lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE

package query

import (
	"fmt"
	"strconv"
	"strings"
	"unicode/utf8"
	"github.com/pelletier/go-toml"
)

// Lexer state function
type queryLexStateFn func() queryLexStateFn

// Lexer definition
type queryLexer struct {
	input      string
	start      int
	pos        int
	width      int
	tokens     chan token
	depth      int
	line       int
	col        int
	stringTerm string
}

func (l *queryLexer) run() {
	for state := l.lexVoid; state != nil; {
		state = state()
	}
	close(l.tokens)
}

func (l *queryLexer) nextStart() {
	// iterate by runes (utf8 characters)
	// search for newlines and advance line/col counts
	for i := l.start; i < l.pos; {
		r, width := utf8.DecodeRuneInString(l.input[i:])
		if r == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		i += width
	}
	// advance start position to next token
	l.start = l.pos
}

func (l *queryLexer) emit(t tokenType) {
	l.tokens <- token{
		Position: toml.Position{Line:l.line, Col:l.col},
		typ:      t,
		val:      l.input[l.start:l.pos],
	}
	l.nextStart()
}

func (l *queryLexer) emitWithValue(t tokenType, value string) {
	l.tokens <- token{
		Position: toml.Position{Line:l.line, Col:l.col},
		typ:      t,
		val:      value,
	}
	l.nextStart()
}

func (l *queryLexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	var r rune
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}

func (l *queryLexer) ignore() {
	l.nextStart()
}

func (l *queryLexer) backup() {
	l.pos -= l.width
}

func (l *queryLexer) errorf(format string, args ...interface{}) queryLexStateFn {
	l.tokens <- token{
		Position: toml.Position{Line:l.line, Col:l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	}
	return nil
}

func (l *queryLexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

func (l *queryLexer) accept(valid string) bool {
	if strings.ContainsRune(valid, l.next()) {
		return true
	}
	l.backup()
	return false
}

func (l *queryLexer) follow(next string) bool {
	return strings.HasPrefix(l.input[l.pos:], next)
}

func (l *queryLexer) lexVoid() queryLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '$':
			l.pos++
			l.emit(tokenDollar)
			continue
		case '.':
			if l.follow("..") {
				l.pos += 2
				l.emit(tokenDotDot)
			} else {
				l.pos++
				l.emit(tokenDot)
			}
			continue
		case '[':
			l.pos++
			l.emit(tokenLeftBracket)
			continue
		case ']':
			l.pos++
			l.emit(tokenRightBracket)
			continue
		case ',':
			l.pos++
			l.emit(tokenComma)
			continue
		case '*':
			l.pos++
			l.emit(tokenStar)
			continue
		case '(':
			l.pos++
			l.emit(tokenLeftParen)
			continue
		case ')':
			l.pos++
			l.emit(tokenRightParen)
			continue
		case '?':
			l.pos++
			l.emit(tokenQuestion)
			continue
		case ':':
			l.pos++
			l.emit(tokenColon)
			continue
		case '\'':
			l.ignore()
			l.stringTerm = string(next)
			return l.lexString
		case '"':
			l.ignore()
			l.stringTerm = string(next)
			return l.lexString
		}

		if isSpace(next) {
			l.next()
			l.ignore()
			continue
		}

		if isAlphanumeric(next) {
			return l.lexKey
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		if l.next() == eof {
			break
		}

		return l.errorf("unexpected char: '%v'", next)
	}
	l.emit(tokenEOF)
	return nil
}

func (l *queryLexer) lexKey() queryLexStateFn {
	for {
		next := l.peek()
		if !isAlphanumeric(next) {
			l.emit(tokenKey)
			return l.lexVoid
		}

		if l.next() == eof {
			break
		}
	}
	l.emit(tokenEOF)
	return nil
}

func (l *queryLexer) lexString() queryLexStateFn {
	l.pos++
	l.ignore()
	growingString := ""

	for {
		if l.follow(l.stringTerm) {
			l.emitWithValue(tokenString, growingString)
			l.pos++
			l.ignore()
			return l.lexVoid
		}

		if l.follow("\\\"") {
			l.pos++
			growingString += "\""
		} else if l.follow("\\'") {
			l.pos++
			growingString += "'"
		} else if l.follow("\\n") {
			l.pos++
			growingString += "\n"
		} else if l.follow("\\b") {
			l.pos++
			growingString += "\b"
		} else if l.follow("\\f") {
			l.pos++
			growingString += "\f"
		} else if l.follow("\\/") {
			l.pos++
			growingString += "/"
		} else if l.follow("\\t") {
			l.pos++
			growingString += "\t"
		} else if l.follow("\\r") {
			l.pos++
			growingString += "\r"
		} else if l.follow("\\\\") {
			l.pos++
			growingString += "\\"
		} else if l.follow("\\u") {
			l.pos += 2
			code := ""
			for i := 0; i < 4; i++ {
				c := l.peek()
				l.pos++
				if !isHexDigit(c) {
					return l.errorf("unfinished unicode escape")
				}
				code = code + string(c)
			}
			l.pos--
			intcode, err := strconv.ParseInt(code, 16, 32)
			if err != nil {
				return l.errorf("invalid unicode escape: \\u" + code)
			}
			growingString += string(rune(intcode))
		} else if l.follow("\\U") {
			l.pos += 2
			code := ""
			for i := 0; i < 8; i++ {
				c := l.peek()
				l.pos++
				if !isHexDigit(c) {
					return l.errorf("unfinished unicode escape")
				}
				code = code + string(c)
			}
			l.pos--
			intcode, err := strconv.ParseInt(code, 16, 32)
			if err != nil {
				return l.errorf("invalid unicode escape: \\u" + code)
			}
			growingString += string(rune(intcode))
		} else if l.follow("\\") {
			l.pos++
			return l.errorf("invalid escape sequence: \\" + string(l.peek()))
		} else {
			growingString += string(l.peek())
		}

		if l.next() == eof {
			break
		}
	}

	return l.errorf("unclosed string")
}

func (l *queryLexer) lexNumber() queryLexStateFn {
	l.ignore()
	if !l.accept("+") {
		l.accept("-")
	}
	pointSeen := false
	digitSeen := false
	for {
		next := l.next()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if isDigit(next) {
			digitSeen = true
		} else {
			l.backup()
			break
		}
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexVoid
}

// Entry point
func lexQuery(input string) chan token {
	l := &queryLexer{
		input:  input,
		tokens: make(chan token),
		line:   1,
		col:    1,
	}
	go l.run()
	return l.tokens
}