Add lexer

Thomas Pelletier
2013-02-24 18:45:05 +01:00
commit a4e5fe8d12
7 changed files with 653 additions and 0 deletions
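The lexer follows the state-function design from Rob Pike's "Lexical Scanning in Go" talk: each lexing state is a function that returns the next state, and tokens are emitted on a channel from a goroutine so a future parser can consume them as they are produced.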
@@ -0,0 +1,3 @@
# go-toml
Go library for the [TOML](https://github.com/mojombo/toml) format.
@@ -0,0 +1,29 @@
# This is a TOML document. Boom.

title = "TOML Example"

[owner]
name = "Tom Preston-Werner"
organization = "GitHub"
bio = "GitHub Cofounder & CEO\nLikes tater tots and beer."
dob = 1979-05-27T07:32:00Z # First class dates? Why not?

[database]
server = "192.168.1.1"
ports = [ 8001, 8001, 8002 ]
connection_max = 5000
enabled = true

[servers]

  # You can indent as you please. Tabs or spaces. TOML don't care.
  [servers.alpha]
  ip = "10.0.0.1"
  dc = "eqdc10"

  [servers.beta]
  ip = "10.0.0.2"
  dc = "eqdc10"

[clients]
data = [ ["gamma", "delta"], [1, 2] ] # just an update to make sure parsers support it
@@ -0,0 +1,389 @@
// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE
package toml
import (
	"fmt"
	"regexp"
	"strings"
	"unicode/utf8"
)
var dateRegexp *regexp.Regexp
// Define tokens
type tokenType int
const (
	EOF = -(iota + 1)
)
const (
tokenError tokenType = iota
tokenEOF
tokenComment
tokenKey
tokenEqual
tokenString
tokenInteger
tokenTrue
tokenFalse
tokenFloat
tokenLeftBracket
tokenRightBracket
tokenDate
tokenKeyGroup
tokenComma
)
type token struct {
typ tokenType
val string
}
func (i token) String() string {
switch i.typ {
case tokenEOF:
return "EOF"
case tokenError:
return i.val
}
if len(i.val) > 10 {
return fmt.Sprintf("%.10q...", i.val);
}
return fmt.Sprintf("%q", i.val)
}
func isSpace(r rune) bool {
return r == ' ' || r == '\t'
}
func isAlpha(r rune) bool {
return r >= 'a' && r <= 'z'
}
func isDigit(r rune) bool {
return r >= '0' && r <= '9'
}
// Define lexer
type lexer struct {
input string
start int
pos int
width int
tokens chan token
}
func (l *lexer) run() {
	for state := lexVoid; state != nil; {
		state = state(l)
	}
	close(l.tokens)
}
func (l *lexer) emit(t tokenType) {
l.tokens <- token{t, l.input[l.start:l.pos]}
l.start = l.pos
}
func (l *lexer) emitWithValue(t tokenType, value string) {
l.tokens <- token{t, value}
l.start = l.pos
}
func (l *lexer) next() rune {
if l.pos >= len(l.input) {
l.width = 0
return EOF
}
var r rune
r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
l.pos += l.width
return r
}
func (l *lexer) ignore() {
l.start = l.pos
}
func (l *lexer) backup() {
l.pos -= l.width
}
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
l.tokens <- token{
tokenError,
fmt.Sprintf(format, args...),
}
return nil
}
func (l *lexer) peek() rune {
r := l.next()
l.backup()
return r
}
func (l *lexer) accept(valid string) bool {
if strings.IndexRune(valid, l.next()) >= 0 {
return true
}
l.backup()
return false
}
func (l *lexer) follow(next string) bool {
return strings.HasPrefix(l.input[l.pos:], next)
}
// Define state functions
type stateFn func(*lexer) stateFn
func lexVoid(l *lexer) stateFn {
for {
next := l.peek()
switch next {
case '[':
return lexKeyGroup
case '#':
return lexComment
case '=':
return lexEqual
}
if isAlpha(next) {
return lexKey
}
if isSpace(next) {
l.ignore()
}
		if l.next() == EOF {
			break
		}
}
l.emit(tokenEOF)
return nil
}
func lexRvalue(l *lexer) stateFn {
for {
next := l.peek()
switch next {
case '[':
return lexLeftBracket
case ']':
return lexRightBracket
case '#':
return lexComment
case '"':
return lexString
case ',':
return lexComma
case '\n':
return lexVoid
}
if l.follow("true") {
return lexTrue
}
if l.follow("false") {
return lexFalse
}
if isAlpha(next) {
return lexKey
}
if dateRegexp.FindString(l.input[l.pos:]) != "" {
return lexDate
}
if next == '+' || next == '-' || isDigit(next) {
return lexNumber
}
if isSpace(next) {
l.ignore()
}
		if l.next() == EOF {
			break
		}
}
l.emit(tokenEOF)
return nil
}
func lexDate(l *lexer) stateFn {
l.ignore()
	l.pos += 20 // a TOML date is exactly 20 characters, e.g. 1979-05-27T07:32:00Z
l.emit(tokenDate)
return lexRvalue
}
func lexTrue(l *lexer) stateFn {
l.ignore()
l.pos += 4
l.emit(tokenTrue)
return lexRvalue
}
func lexFalse(l *lexer) stateFn {
l.ignore()
l.pos += 5
l.emit(tokenFalse)
return lexRvalue
}
func lexEqual(l *lexer) stateFn {
l.ignore()
l.accept("=")
l.emit(tokenEqual)
return lexRvalue
}
func lexComma(l *lexer) stateFn {
l.ignore()
l.accept(",")
l.emit(tokenComma)
return lexRvalue
}
func lexKey(l *lexer) stateFn {
	l.ignore() // drop any whitespace consumed before the key
	for isAlpha(l.next()) {
	}
	l.backup()
	l.emit(tokenKey)
	return lexVoid
}
func lexComment(l *lexer) stateFn {
for {
next := l.next()
if next == '\n' || next == EOF {
break
}
}
l.ignore()
return lexVoid
}
func lexLeftBracket(l *lexer) stateFn {
l.ignore()
l.pos += 1
l.emit(tokenLeftBracket)
return lexRvalue
}
func lexString(l *lexer) stateFn {
	l.pos += 1
	l.ignore()
	growingString := ""
	for {
		if l.peek() == '"' {
			l.emitWithValue(tokenString, growingString)
			l.pos += 1
			l.ignore()
			return lexRvalue // a string may be followed by more rvalue content, e.g. inside an array
		}
		if l.follow("\\\"") {
			l.pos += 1
			growingString += "\""
		} else {
			growingString += string(l.peek())
		}
		if l.next() == EOF {
			break
		}
	}
	return l.errorf("unclosed string")
}
func lexKeyGroup(l *lexer) stateFn {
l.ignore()
l.pos += 1
l.emit(tokenLeftBracket)
return lexInsideKeyGroup
}
func lexInsideKeyGroup(l *lexer) stateFn {
for {
if l.peek() == ']' {
if l.pos > l.start {
l.emit(tokenKeyGroup)
}
l.ignore()
l.pos += 1
l.emit(tokenRightBracket)
return lexVoid
}
		if l.next() == EOF {
			break
		}
}
return l.errorf("unclosed key group")
}
func lexRightBracket(l *lexer) stateFn {
l.ignore()
l.pos += 1
l.emit(tokenRightBracket)
return lexRvalue
}
func lexNumber(l *lexer) stateFn {
	l.ignore()
	l.accept("+-") // optional leading sign
	pointSeen := false
	digitSeen := false
	for {
		next := l.next()
		if next == '.' {
			pointSeen = true
		} else if isDigit(next) {
			digitSeen = true
		} else {
			break
		}
	}
	l.backup() // do not swallow the rune that ended the number
	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return lexRvalue
}
func init() {
	dateRegexp = regexp.MustCompile(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z`)
}
// Entry point
func lex(input string) (*lexer, chan token) {
l := &lexer {
input: input,
tokens: make(chan token),
}
go l.run()
return l, l.tokens
}
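A minimal sketch, not part of this commit, of how the token stream might be drained from inside the package (for example in a test); the helper name and input literal are hypothetical:

// Hypothetical helper, same package: read until the lexer
// goroutine closes the channel.
func drainTokens() {
	_, ch := lex("foo = \"bar\" # comment")
	for tok := range ch {
		fmt.Println(tok) // uses token.String(); error tokens carry their message in val
	}
}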
@@ -0,0 +1,220 @@
package toml
import "testing"
func testFlow(t *testing.T, input string, expectedFlow []token) {
_, ch := lex(input)
for _, expected := range expectedFlow {
token := <- ch
if token != expected {
t.Log("compared", token, "to", expected)
t.Log(token.val, "<->", expected.val)
t.Log(token.typ, "<->", expected.typ)
t.FailNow()
}
}
tok, ok := <- ch
if ok {
t.Log("channel is not closed!")
t.Log(len(ch) + 1, "tokens remaining:")
t.Log("token ->", tok)
for token := range ch {
t.Log("token ->", token)
}
t.FailNow()
}
}
func TestValidKeyGroup(t *testing.T) {
testFlow(t, "[hello world]", []token{
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "hello world"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestUnclosedKeyGroup(t *testing.T) {
testFlow(t, "[hello world", []token{
token{tokenLeftBracket, "["},
token{tokenError, "unclosed key group"},
})
}
func TestComment(t *testing.T) {
testFlow(t, "# blahblah", []token{
token{tokenEOF, ""},
})
}
func TestKeyGroupComment(t *testing.T) {
testFlow(t, "[hello world] # blahblah", []token{
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "hello world"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestMultipleKeyGroupsComment(t *testing.T) {
testFlow(t, "[hello world] # blahblah\n[test]", []token{
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "hello world"},
token{tokenRightBracket, "]"},
token{tokenLeftBracket, "["},
token{tokenKeyGroup, "test"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestBasicKey(t *testing.T) {
testFlow(t, "hello", []token{
token{tokenKey, "hello"},
token{tokenEOF, ""},
})
}
func TestBasicKeyAndEqual(t *testing.T) {
testFlow(t, "hello =", []token{
token{tokenKey, "hello"},
token{tokenEqual, "="},
token{tokenEOF, ""},
})
}
func TestKeyEqualStringEscape(t *testing.T) {
testFlow(t, "foo = \"hello\\\"\"", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenString, "hello\""},
token{tokenEOF, ""},
})
}
func TestKeyEqualStringUnfinished(t *testing.T) {
testFlow(t, "foo = \"bar", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenError, "unclosed string"},
})
}
func TestKeyEqualString(t *testing.T) {
testFlow(t, "foo = \"bar\"", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenString, "bar"},
token{tokenEOF, ""},
})
}
func TestKeyEqualTrue(t *testing.T) {
testFlow(t, "foo = true", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenTrue, "true"},
token{tokenEOF, ""},
})
}
func TestKeyEqualFalse(t *testing.T) {
testFlow(t, "foo = false", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFalse, "false"},
token{tokenEOF, ""},
})
}
func TestKeyEqualArrayBools(t *testing.T) {
testFlow(t, "foo = [true, false, true]", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenLeftBracket, "["},
token{tokenTrue, "true"},
token{tokenComma, ","},
token{tokenFalse, "false"},
token{tokenComma, ","},
token{tokenTrue, "true"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestKeyEqualArrayBoolsWithComments(t *testing.T) {
testFlow(t, "foo = [true, false, true] # YEAH", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenLeftBracket, "["},
token{tokenTrue, "true"},
token{tokenComma, ","},
token{tokenFalse, "false"},
token{tokenComma, ","},
token{tokenTrue, "true"},
token{tokenRightBracket, "]"},
token{tokenEOF, ""},
})
}
func TestDateRegexp(t *testing.T) {
if dateRegexp.FindString("1979-05-27T07:32:00Z") == "" {
t.Fail()
}
}
func TestKeyEqualDate(t *testing.T) {
testFlow(t, "foo = 1979-05-27T07:32:00Z", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenDate, "1979-05-27T07:32:00Z"},
token{tokenEOF, ""},
})
}
func TestKeyEqualNumber(t *testing.T) {
testFlow(t, "foo = 42", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenInteger, "42"},
token{tokenEOF, ""},
})
testFlow(t, "foo = +42", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenInteger, "+42"},
token{tokenEOF, ""},
})
testFlow(t, "foo = -42", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenInteger, "-42"},
token{tokenEOF, ""},
})
testFlow(t, "foo = 4.2", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFloat, "4.2"},
token{tokenEOF, ""},
})
testFlow(t, "foo = +4.2", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFloat, "+4.2"},
token{tokenEOF, ""},
})
testFlow(t, "foo = -4.2", []token{
token{tokenKey, "foo"},
token{tokenEqual, "="},
token{tokenFloat, "-4.2"},
token{tokenEOF, ""},
})
}
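One flow the suite above leaves untested is a string inside an array, which exercises the hand-off from lexString back to lexRvalue; a hypothetical extra case, not part of this commit, could look like:

func TestKeyEqualArrayStrings(t *testing.T) {
	testFlow(t, "foo = [\"hello\", \"world\"]", []token{
		token{tokenKey, "foo"},
		token{tokenEqual, "="},
		token{tokenLeftBracket, "["},
		token{tokenString, "hello"},
		token{tokenComma, ","},
		token{tokenString, "world"},
		token{tokenRightBracket, "]"},
		token{tokenEOF, ""},
	})
}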
@@ -0,0 +1,3 @@
// TOML Parser.
package toml
@@ -0,0 +1,8 @@
// TOML interface.
package toml
// Load is the entry point for parsing; for now it is a stub that
// returns an empty map.
func Load() map[string]interface{} {
result := make(map[string]interface{})
return result
}
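Since Load is only a stub at this point, a caller simply gets an empty map back; a minimal sketch assuming in-package use, with a hypothetical function name:

// Hypothetical usage of the current stub.
func exampleLoad() {
	config := Load()
	fmt.Println(len(config)) // prints 0 until the parser fills it in
}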
@@ -0,0 +1 @@
package toml