Add lexer

Thomas Pelletier
2013-02-24 18:45:05 +01:00
commit a4e5fe8d12
7 changed files with 653 additions and 0 deletions
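The lexer follows the state-function design from Rob Pike's "Lexical Scanning in Go" talk: each lexing state is a function that returns the next state, and tokens are emitted on a channel from a goroutine so a future parser can consume them as they are produced.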
@@ -0,0 +1,3 @@
# go-toml
Go library for the [TOML](https://github.com/mojombo/toml) format.
@@ -0,0 +1,29 @@
# This is a TOML document. Boom.

title = "TOML Example"

[owner]
name = "Tom Preston-Werner"
organization = "GitHub"
bio = "GitHub Cofounder & CEO\nLikes tater tots and beer."
dob = 1979-05-27T07:32:00Z # First class dates? Why not?

[database]
server = "192.168.1.1"
ports = [ 8001, 8001, 8002 ]
connection_max = 5000
enabled = true

[servers]

  # You can indent as you please. Tabs or spaces. TOML don't care.
  [servers.alpha]
  ip = "10.0.0.1"
  dc = "eqdc10"

  [servers.beta]
  ip = "10.0.0.2"
  dc = "eqdc10"

[clients]
data = [ ["gamma", "delta"], [1, 2] ] # just an update to make sure parsers support it
@@ -0,0 +1,389 @@
// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE
package toml
import (
	"fmt"
	"regexp"
	"strings"
	"unicode/utf8"
)
var dateRegexp *regexp.Regexp
// Define tokens
type tokenType int
const (
	EOF = -(iota + 1)
)
const (
tokenError tokenType = iota
tokenEOF
tokenComment
tokenKey
tokenEqual
tokenString
tokenInteger
tokenTrue
tokenFalse
tokenFloat
tokenLeftBracket
tokenRightBracket
tokenDate
tokenKeyGroup
tokenComma
)
type token struct {
typ tokenType
val string
}
func (i token) String() string {
switch i.typ {
case tokenEOF:
return "EOF"
case tokenError:
return i.val
}
if len(i.val) > 10 {
return fmt.Sprintf("%.10q...", i.val);
}
return fmt.Sprintf("%q", i.val)
}
func isSpace(r rune) bool {
return r == ' ' || r == '\t'
}
func isAlpha(r rune) bool {
return r >= 'a' && r <= 'z'
}
func isDigit(r rune) bool {
return r >= '0' && r <= '9'
}
// Define lexer
type lexer struct {
input string
start int
pos int
width int
tokens chan token
}
func (l *lexer) run() {
	for state := lexVoid; state != nil; {
		state = state(l)
	}
	close(l.tokens)
}
func (l *lexer) emit(t tokenType) {
l.tokens <- token{t, l.input[l.start:l.pos]}
l.start = l.pos
}
func (l *lexer) emitWithValue(t tokenType, value string) {
l.tokens <- token{t, value}
l.start = l.pos
}
func (l *lexer) next() rune {
if l.pos >= len(l.input) {
l.width = 0
return EOF
}
var r rune
r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
l.pos += l.width
return r
}
func (l *lexer) ignore() {
l.start = l.pos
}
func (l *lexer) backup() {
l.pos -= l.width
}
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
l.tokens <- token{
tokenError,
fmt.Sprintf(format, args...),
}
return nil
}
func (l *lexer) peek() rune {
r := l.next()
l.backup()
return r
}
func (l *lexer) accept(valid string) bool {
if strings.IndexRune(valid, l.next()) >= 0 {
return true
}
l.backup()
return false
}
func (l *lexer) follow(next string) bool {
return strings.HasPrefix(l.input[l.pos:], next)
}
// Define state functions
type stateFn func(*lexer) stateFn
func lexVoid(l *lexer) stateFn {
for {
next := l.peek()
switch next {
case '[':
return lexKeyGroup
case '#':
return lexComment
case '=':
return lexEqual
}
if isAlpha(next) {
return lexKey
}
if isSpace(next) {
l.ignore()
}
		if l.next() == EOF {
			break
		}
}
l.emit(tokenEOF)
return nil
}
func lexRvalue(l *lexer) stateFn {
for {
next := l.peek()
switch next {
case '[':
return lexLeftBracket
case ']':
return lexRightBracket
case '#':
return lexComment
case '"':
return lexString
case ',':
return lexComma
case '\n':
return lexVoid
}
if l.follow("true") {
return lexTrue
}
if l.follow("false") {
return lexFalse
}
if isAlpha(next) {
return lexKey
}
if dateRegexp.FindString(l.input[l.pos:]) != "" {
return lexDate
}
if next == '+' || next == '-' || isDigit(next) {
return lexNumber
}
if isSpace(next) {
l.ignore()
}
		if l.next() == EOF {
			break
		}
}
l.emit(tokenEOF)
return nil
}
func lexDate(l *lexer) stateFn {
l.ignore()
	l.pos += 20 // a TOML date is exactly 20 characters, e.g. 1979-05-27T07:32:00Z
l.emit(tokenDate)
return lexRvalue
}
func lexTrue(l *lexer) stateFn {
l.ignore()
l.pos += 4
l.emit(tokenTrue)
return lexRvalue
}
func lexFalse(l *lexer) stateFn {
l.ignore()
l.pos += 5
l.emit(tokenFalse)
return lexRvalue
}
func lexEqual(l *lexer) stateFn {
l.ignore()
l.accept("=")
l.emit(tokenEqual)
return lexRvalue
}
func lexComma(l *lexer) stateFn {
l.ignore()
l.accept(",")
l.emit(tokenComma)
return lexRvalue
}
func lexKey(l *lexer) stateFn {
	l.ignore() // drop any whitespace consumed before the key
	for isAlpha(l.next()) {
	}
	l.backup()
	l.emit(tokenKey)
	return lexVoid
}
func lexComment(l *lexer) stateFn {
for {
next := l.next()
if next == '\n' || next == EOF {
break
}
}
l.ignore()
return lexVoid
}
func lexLeftBracket(l *lexer) stateFn {
l.ignore()
l.pos += 1
l.emit(tokenLeftBracket)
return lexRvalue
}
func lexString(l *lexer) stateFn {
	l.pos += 1
	l.ignore()
	growingString := ""
	for {
		if l.peek() == '"' {
			l.emitWithValue(tokenString, growingString)
			l.pos += 1
			l.ignore()
			return lexRvalue // a string may be followed by more rvalue content, e.g. inside an array
		}
		if l.follow("\\\"") {
			l.pos += 1
			growingString += "\""
		} else {
			growingString += string(l.peek())
		}
		if l.next() == EOF {
			break
		}
	}
	return l.errorf("unclosed string")
}
func lexKeyGroup(l *lexer) stateFn {
l.ignore()
l.pos += 1
l.emit(tokenLeftBracket)
return lexInsideKeyGroup
}
func lexInsideKeyGroup(l *lexer) stateFn {
for {
if l.peek() == ']' {
if l.pos > l.start {
l.emit(tokenKeyGroup)
}
l.ignore()
l.pos += 1
l.emit(tokenRightBracket)
return lexVoid
}
		if l.next() == EOF {
			break
		}
}
return l.errorf("unclosed key group")
}
func lexRightBracket(l *lexer) stateFn {
l.ignore()
l.pos += 1
l.emit(tokenRightBracket)
return lexRvalue
}
func lexNumber(l *lexer) stateFn {
	l.ignore()
	l.accept("+-") // optional leading sign
	pointSeen := false
	digitSeen := false
	for {
		next := l.next()
		if next == '.' {
			pointSeen = true
		} else if isDigit(next) {
			digitSeen = true
		} else {
			break
		}
	}
	l.backup() // do not swallow the rune that ended the number
	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return lexRvalue
}
func init() {
	dateRegexp = regexp.MustCompile(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z`)
}
// Entry point
func lex(input string) (*lexer, chan token) {
l := &lexer {
input: input,
tokens: make(chan token),
}
go l.run()
return l, l.tokens
}
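A minimal sketch, not part of this commit, of how the token stream might be drained from inside the package (for example in a test); the helper name and input literal are hypothetical:

// Hypothetical helper, same package: read until the lexer
// goroutine closes the channel.
func drainTokens() {
	_, ch := lex("foo = \"bar\" # comment")
	for tok := range ch {
		fmt.Println(tok) // uses token.String(); error tokens carry their message in val
	}
}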
@@ -0,0 +1,220 @@
package toml
import "testing"
func testFlow(t *testing.T, input string, expectedFlow []token) {
_, ch := lex(input)
for _, expected := range expectedFlow {
token := <- ch
if token != expected {
t.Log("compared", token, "to", expected)
t.Log(token.val, "<->", expected.val)
t.Log(token.typ, "<->", expected.typ)
t.FailNow()
}
}
tok, ok := <- ch
if ok {
t.Log("channel is not closed!")
t.Log(len(ch) + 1, "tokens remaining:")
t.Log("token ->", tok)
for token := range ch {
t.Log("token ->", token)
}
t.FailNow()
}
}
func TestValidKeyGroup(t *testing.T) {
testFlow(t, "[hello world]", []token{
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "hello world"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestUnclosedKeyGroup(t *testing.T) {
testFlow(t, "[hello world", []token{
token{tokenLeftBracket, "["},
token{tokenError, "unclosed key group"},
})
}
func TestComment(t *testing.T) {
testFlow(t, "# blahblah", []token{
token{tokenEOF, ""},
})
}
func TestKeyGroupComment(t *testing.T) {
testFlow(t, "[hello world] # blahblah", []token{
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "hello world"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestMultipleKeyGroupsComment(t *testing.T) {
testFlow(t, "[hello world] # blahblah\n[test]", []token{
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "hello world"},
token{tokenRightBracket, "]"},
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "test"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestBasicKey(t *testing.T) {
testFlow(t, "hello", []token{
token{tokenKey, "hello"},
token{tokenEOF, ""},
})
}
func TestBasicKeyAndEqual(t *testing.T) {
testFlow(t, "hello =", []token{
token{tokenKey, "hello"},
token{tokenEqual, "="},
token{tokenEOF, ""},
})
}
func TestKeyEqualStringEscape(t *testing.T) {
testFlow(t, "foo = \"hello\\\"\"", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenString, "hello\""},
token{tokenEOF, ""},
})
}
func TestKeyEqualStringUnfinished(t *testing.T) {
testFlow(t, "foo = \"bar", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenError, "unclosed string"},
})
}
func TestKeyEqualString(t *testing.T) {
testFlow(t, "foo = \"bar\"", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenString, "bar"},
token{tokenEOF, ""},
})
}
func TestKeyEqualTrue(t *testing.T) {
testFlow(t, "foo = true", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenTrue, "true"},
token{tokenEOF, ""},
})
}
func TestKeyEqualFalse(t *testing.T) {
testFlow(t, "foo = false", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFalse, "false"},
token{tokenEOF, ""},
})
}
func TestKeyEqualArrayBools(t *testing.T) {
testFlow(t, "foo = [true, false, true]", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenLeftBracket, "["},
token{tokenTrue, "true"},
token{tokenComma, ","},
token{tokenFalse, "false"},
token{tokenComma, ","},
token{tokenTrue, "true"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestKeyEqualArrayBoolsWithComments(t *testing.T) {
testFlow(t, "foo = [true, false, true] # YEAH", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenLeftBracket, "["},
token{tokenTrue, "true"},
token{tokenComma, ","},
token{tokenFalse, "false"},
token{tokenComma, ","},
token{tokenTrue, "true"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestDateRegexp(t *testing.T) {
if dateRegexp.FindString("1979-05-27T07:32:00Z") == "" {
t.Fail()
}
}
func TestKeyEqualDate(t *testing.T) {
testFlow(t, "foo = 1979-05-27T07:32:00Z", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenDate, "1979-05-27T07:32:00Z"},
token{tokenEOF, ""},
})
}
func TestKeyEqualNumber(t *testing.T) {
testFlow(t, "foo = 42", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenInteger, "42"},
token{tokenEOF, ""},
})
testFlow(t, "foo = +42", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenInteger, "+42"},
token{tokenEOF, ""},
})
testFlow(t, "foo = -42", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenInteger, "-42"},
token{tokenEOF, ""},
})
testFlow(t, "foo = 4.2", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFloat, "4.2"},
token{tokenEOF, ""},
})
testFlow(t, "foo = +4.2", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFloat, "+4.2"},
token{tokenEOF, ""},
})
testFlow(t, "foo = -4.2", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFloat, "-4.2"},
token{tokenEOF, ""},
})
}
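One flow the suite above leaves untested is a string inside an array, which exercises the hand-off from lexString back to lexRvalue; a hypothetical extra case, not part of this commit, could look like:

func TestKeyEqualArrayStrings(t *testing.T) {
	testFlow(t, "foo = [\"hello\", \"world\"]", []token{
		token{tokenKey, "foo"},
		token{tokenEqual, "="},
		token{tokenLeftBracket, "["},
		token{tokenString, "hello"},
		token{tokenComma, ","},
		token{tokenString, "world"},
		token{tokenRightBracket, "]"},
		token{tokenEOF, ""},
	})
}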
@@ -0,0 +1,3 @@
// TOML Parser.
package toml
@@ -0,0 +1,8 @@
// TOML interface.
package toml
// Load is the entry point for parsing; for now it is a stub that
// returns an empty map.
func Load() map[string]interface{} {
result := make(map[string]interface{})
return result
}
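Since Load is only a stub at this point, a caller simply gets an empty map back; a minimal sketch assuming in-package use, with a hypothetical function name:

// Hypothetical usage of the current stub.
func exampleLoad() {
	config := Load()
	fmt.Println(len(config)) // prints 0 until the parser fills it in
}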
@@ -0,0 +1 @@
package toml