diff --git a/toml.go b/toml.go new file mode 100644 index 0000000..115d073 --- /dev/null +++ b/toml.go @@ -0,0 +1,467 @@ +package toml + +import ( + "fmt" + "unicode/utf8" +) + +func Unmarshal(data []byte, v interface{}) error { + // TODO + return nil +} + +func Marshal(v interface{}) ([]byte, error) { + // TODO + return nil, nil +} + +type Document struct { +} + +type builder interface { + Whitespace(b []byte) + Comment(b []byte) + UnquotedKey(b []byte) + LiteralString(b []byte) + BasicString(b []byte) +} + +type position struct { + line int + column int +} + +type documentBuilder struct { + document Document +} + +func (d *documentBuilder) BasicString(b []byte) { + s := string(b) + fmt.Printf("BasicString: '%s'\n", s) +} + +func (d *documentBuilder) LiteralString(b []byte) { + s := string(b) + fmt.Printf("LiteralString: '%s'\n", s) +} + +func (d *documentBuilder) UnquotedKey(b []byte) { + s := string(b) + fmt.Printf("UnquotedKey: '%s'\n", s) +} + +func (d *documentBuilder) Comment(b []byte) { + s := string(b) + fmt.Printf("Comment: '%s'\n", s) +} + +func (d *documentBuilder) Whitespace(b []byte) { + s := string(b) + fmt.Printf("Whitespace: '%s'\n", s) +} + +func Parse(b []byte) (Document, error) { + builder := documentBuilder{} + p := parser{builder: &builder} + err := p.parse(b) + if err != nil { + return Document{}, err + } + return builder.document, nil +} + +// eof is a rune value indicating end-of-file. +const eof = -1 + +type parser struct { + builder builder +} + +type InvalidCharacter struct { + r rune +} + +func (e *InvalidCharacter) Error() string { + return fmt.Sprintf("unexpected character '%#U'", e.r) +} + +type UnexpectedCharacter struct { + r rune + expected rune +} + +func (e *UnexpectedCharacter) Error() string { + return fmt.Sprintf("expected character '%#U' but got '%#U'", e.expected, e.r) +} + +func (p *parser) parse(b []byte) error { + next := b + var err error + for { + next, err = p.parseExpression(next) + if err != nil { + return err + } + if len(next) == 0 { + return nil + } + + // new lines between expressions + r, size, err := readRune(next) + if err != nil { + return err + } + if r == '\n' { + next = next[size:] + continue + } + if r == '\r' { + r, size2, err := readRune(next) + if err != nil { + return err + } + if r == '\n' { + next = next[size+size2:] + continue + } + } + return &InvalidCharacter{r: r} + } +} + +func (p *parser) parseExpression(b []byte) ([]byte, error) { + next, err := p.parseWhitespace(b) + if err != nil { + return nil, err + } + r, _, err := readRune(next) + if err != nil { + return nil, err + } + // Line with just whitespace and a comment. We can exit early. + if r == '#' { + return p.parseComment(next) + } + + // or line with something? + + if r == '[' { + // parse table. could be either a standard table or an array table + } + + // it has to be a keyval + + if isUnquotedKeyRune(r) || r == '\'' || r == '"' { + next, err = p.parseKeyval(next) + if err != nil { + return nil, err + } + } + + // parse trailing whitespace and comment + + next, err = p.parseWhitespace(next) + if err != nil { + return nil, err + } + + r, _, err = readRune(next) + if err != nil { + return nil, err + } + + if r == '#' { + return p.parseComment(next) + } + + return next, nil +} + +func (p *parser) parseKeyval(b []byte) ([]byte, error) { + // key keyval-sep val + next, err := p.parseKey(b) + if err != nil { + return nil, err + } + return next, nil +} + +func (p *parser) parseKey(b []byte) ([]byte, error) { + // simple-key / dotted-key + // dotted-key = simple-key 1*( dot-sep simple-key ) + + return p.parseSimpleKey(b) + // TODO: dotted key +} + +func isUnquotedKeyRune(r rune) bool { + return (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r == '_' +} + +func (p *parser) parseSimpleKey(b []byte) ([]byte, error) { + // simple-key = quoted-key / unquoted-key + // quoted-key = basic-string / literal-string + // unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _ + // basic-string = quotation-mark *basic-char quotation-mark + // literal-string = apostrophe *literal-char apostrophe + + r, _, err := readRune(b) + if err != nil { + return nil, err + } + + if r == '\'' { + return p.parseLiteralString(b) + } + if r == '"' { + return p.parseBasicString(b) + } + + return p.parseUnquotedKey(b) +} + +func (p *parser) parseUnquotedKey(b []byte) ([]byte, error) { + length := 0 + r, size, err := readRune(b) + if err != nil { + return nil, err + } + + if !isUnquotedKeyRune(r) { + return nil, &InvalidCharacter{r: r} + } + length += size + + for { + r, size, err := readRune(b[length:]) + if err != nil { + return nil, err + } + if !isUnquotedKeyRune(r) { + break + } + length += size + } + p.builder.UnquotedKey(b[:length]) + return b[length:], nil +} + +func expectRune(b []byte, expected rune) (int, error) { + r, size, err := readRune(b) + if err != nil { + return 0, err + } + if r != expected { + return 0, &UnexpectedCharacter{ + r: r, + expected: expected, + } + } + return size, nil +} + +func (p *parser) parseComment(b []byte) ([]byte, error) { + length := 0 + + size, err := expectRune(b, '#') + if err != nil { + return b, err + } + length += size + + for { + r, size, err := readRune(b[length:]) + if err != nil { + return nil, err + } + if r == eof || r == '\n' { + if length > 0 { + p.builder.Comment(b[:length]) + } + return b[length:], nil + } + length += size + } +} + +func isWhitespace(r rune) bool { + switch r { + case 0x20, 0x09: + return true + default: + return false + } +} + +type InvalidUnicodeError struct { + r rune +} + +func (e *InvalidUnicodeError) Error() string { + return fmt.Sprintf("invalid unicode: %#U", e.r) +} + +func readRune(b []byte) (rune, int, error) { + r, size := utf8.DecodeRune(b) + if r == utf8.RuneError { + if size == 0 { // eof + return eof, 0, nil + } + if size == 1 { // invalid rune + return utf8.RuneError, 1, &InvalidUnicodeError{r: r} + } + } + return r, size, nil +} + +func (p *parser) parseWhitespace(b []byte) ([]byte, error) { + length := 0 + for { + r, size, err := readRune(b[length:]) + if err != nil { + return nil, err + } + if isWhitespace(r) { + length += size + } else { + if length > 0 { + p.builder.Whitespace(b[:length]) + } + return b[length:], nil + } + } +} + +func isNonAsciiChar(r rune) bool { + return (r >= 0x80 && r <= 0xD7FF) || (r >= 0xE000 && r <= 0x10FFFF) +} + +func isLiteralChar(r rune) bool { + return r == 0x09 || (r >= 0x20 && r <= 0x26) || (r >= 0x28 && r <= 0x7E) || isNonAsciiChar(r) +} + +func (p *parser) parseLiteralString(b []byte) ([]byte, error) { + // literal-string = apostrophe *literal-char apostrophe + // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii + // non-ascii = %x80-D7FF / %xE000-10FFFF + + length := 0 + + start, err := expectRune(b, '\'') + if err != nil { + return nil, err + } + + for { + r, size, err := readRune(b[start+length:]) + if err != nil { + return nil, err + } + if r == '\'' { + p.builder.LiteralString(b[start : start+length]) + return b[start+length+size:], nil + } + if !isLiteralChar(r) { + return nil, &InvalidCharacter{r: r} + } + length += size + } +} + +func isBasicStringChar(r rune) bool { + return r == ' ' || r == 0x21 || r >= 0x23 && r <= 0x5B || r >= 0x5D && r <= 0x7E || isNonAsciiChar(r) +} + +func isEscapeChar(r rune) bool { + return r == '"' || r == '\\' || r == 'b' || r == 'f' || r == 'n' || r == 'r' || r == 't' +} + +func isHex(r rune) bool { + return (r >= '0' && r <= '9') || (r >= 'A' && r <= 'F') +} + +func (p *parser) parseBasicString(b []byte) ([]byte, error) { + // basic-string = quotation-mark *basic-char quotation-mark + // basic-char = basic-unescaped / escaped + // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii + // escaped = escape escape-seq-char + //escape = %x5C ; \ + //escape-seq-char = %x22 ; " quotation mark U+0022 + //escape-seq-char =/ %x5C ; \ reverse solidus U+005C + //escape-seq-char =/ %x62 ; b backspace U+0008 + //escape-seq-char =/ %x66 ; f form feed U+000C + //escape-seq-char =/ %x6E ; n line feed U+000A + //escape-seq-char =/ %x72 ; r carriage return U+000D + //escape-seq-char =/ %x74 ; t tab U+0009 + //escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX + //escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX + // HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" + length := 0 + + start, err := expectRune(b, '"') + if err != nil { + return nil, err + } + + for { + r, size, err := readRune(b[start+length:]) + if err != nil { + return nil, err + } + if r == '"' { + p.builder.BasicString(b[start : start+length]) + return b[start+length+size:], nil + } + + if r == '\\' { + length += size + r, size, err := readRune(b[start+length:]) + if err != nil { + return nil, err + } + + if isEscapeChar(r) { + length += size + continue + } + + if r == 'u' { + length += size + for i := 0; i < 4; i++ { + r, size, err := readRune(b[start+length:]) + if err != nil { + return nil, err + } + if !isHex(r) { + return nil, &InvalidCharacter{r: r} + } + length += size + } + continue + } + + if r == 'U' { + length += size + for i := 0; i < 8; i++ { + r, size, err := readRune(b[start+length:]) + if err != nil { + return nil, err + } + if !isHex(r) { + return nil, &InvalidCharacter{r: r} + } + length += size + } + continue + } + + return nil, &InvalidCharacter{r: r} + + } + + if isBasicStringChar(r) { + length += size + continue + } + } +} diff --git a/toml_test.go b/toml_test.go new file mode 100644 index 0000000..1aae1eb --- /dev/null +++ b/toml_test.go @@ -0,0 +1,34 @@ +package toml + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParse(t *testing.T) { + + inputs := []string{ + ` #foo`, + `#foo`, + `#`, + "\n\n\n", + "#one\n # two \n", + `a`, + `abc`, + ` abc # foo`, + `'abc'`, + `"foo bar"`, + `"hello\tworld"`, + `"hello \u1234 foo"`, + } + + for i, data := range inputs { + t.Run(fmt.Sprintf("example %d", i), func(t *testing.T) { + doc, err := Parse([]byte(data)) + assert.NoError(t, err) + fmt.Println(doc) + }) + } +}