From d7b15313dc554871cc0973924323f7407722121a Mon Sep 17 00:00:00 2001 From: franck cuny Date: Sat, 11 Jan 2020 13:34:54 +0100 Subject: lexer: initial lexer The initial lexer for the monkey language. We only support a small subset at this stage. We have some simple tests to ensure that we can parse some small snippets, and that the minimum number of tokens we need are also all supported correctly. --- users/fcuny/exp/monkey/pkg/lexer/lexer.go | 114 +++++++++++++++++++++++++ users/fcuny/exp/monkey/pkg/lexer/lexer_test.go | 104 ++++++++++++++++++++++ 2 files changed, 218 insertions(+) create mode 100644 users/fcuny/exp/monkey/pkg/lexer/lexer.go create mode 100644 users/fcuny/exp/monkey/pkg/lexer/lexer_test.go (limited to 'users/fcuny/exp/monkey/pkg/lexer') diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer.go b/users/fcuny/exp/monkey/pkg/lexer/lexer.go new file mode 100644 index 0000000..fc29371 --- /dev/null +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer.go @@ -0,0 +1,114 @@ +package lexer + +import "monkey/pkg/token" + +// Lexer represents the lexer +type Lexer struct { + input string + // current position in input + position int + // current reading position in input (after a char) + readPosition int + // current character under examination + ch byte +} + +// New returns a new lexer +func New(input string) *Lexer { + l := &Lexer{input: input} + l.readChar() + return l +} + +// readChar reads the next character and advances our position in the input string. +func (l *Lexer) readChar() { + // if we've reached the end of the input, we set the current character to 0, + // which is the ASCII code for NUL. 
+ if l.readPosition >= len(l.input) { + l.ch = 0 + } else { + l.ch = l.input[l.readPosition] + } + l.position = l.readPosition + l.readPosition++ +} + +func (l *Lexer) readIdentifier() string { + position := l.position + for isLetter(l.ch) { + l.readChar() + } + return l.input[position:l.position] +} + +func (l *Lexer) readNumber() string { + position := l.position + for isDigit(l.ch) { + l.readChar() + } + return l.input[position:l.position] +} + +// we don't care about white space characters, we skip them when we find them. +func (l *Lexer) skipWhitespace() { + for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' { + l.readChar() + } +} + +// NextToken reads the next token from the lexer and returns the current token. +func (l *Lexer) NextToken() token.Token { + var tok token.Token + + l.skipWhitespace() + + switch l.ch { + case '=': + tok = newToken(token.ASSIGN, l.ch) + case '+': + tok = newToken(token.PLUS, l.ch) + case ';': + tok = newToken(token.SEMICOLON, l.ch) + case ',': + tok = newToken(token.COMMA, l.ch) + case '(': + tok = newToken(token.LPAREN, l.ch) + case ')': + tok = newToken(token.RPAREN, l.ch) + case '{': + tok = newToken(token.LBRACE, l.ch) + case '}': + tok = newToken(token.RBRACE, l.ch) + case 0: + tok.Literal = "" + tok.Type = token.EOF + default: + if isLetter(l.ch) { + tok.Literal = l.readIdentifier() + tok.Type = token.LookupIdent(tok.Literal) + return tok + } else if isDigit(l.ch) { + tok.Type = token.INT + tok.Literal = l.readNumber() + return tok + } else { + tok = newToken(token.ILLEGAL, l.ch) + } + + } + + l.readChar() + return tok +} + +func newToken(tokenType token.TokenType, ch byte) token.Token { + return token.Token{Type: tokenType, Literal: string(ch)} +} + +func isLetter(ch byte) bool { + return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' +} + +func isDigit(ch byte) bool { + return '0' <= ch && ch <= '9' +} diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go 
b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go new file mode 100644 index 0000000..73b27fb --- /dev/null +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go @@ -0,0 +1,104 @@ +package lexer + +import ( + "monkey/pkg/token" + "testing" +) + +func TestNextTokenBasic(t *testing.T) { + input := `=+(){},;` + + tests := []struct { + expectedType token.TokenType + expectedLiteral string + }{ + {token.ASSIGN, "="}, + {token.PLUS, "+"}, + {token.LPAREN, "("}, + {token.RPAREN, ")"}, + {token.LBRACE, "{"}, + {token.RBRACE, "}"}, + {token.COMMA, ","}, + {token.SEMICOLON, ";"}, + } + + l := New(input) + + for i, tt := range tests { + tok := l.NextToken() + if tok.Type != tt.expectedType { + t.Fatalf("tests[%d] - tokentype wrong. expected=%q, got=%q", i, tt.expectedType, tok.Type) + } + + if tok.Literal != tt.expectedLiteral { + t.Fatalf("tests[%d] - tokenliteral wrong. expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) + } + } +} + +func TestNextTokenMonkey(t *testing.T) { + input := `let five = 5; +let ten = 10; + +let add = fn(x, y) { + x + y +}; + +let result = add(five, ten);` + + tests := []struct { + expectedType token.TokenType + expectedLiteral string + }{ + {token.LET, "let"}, + {token.IDENT, "five"}, + {token.ASSIGN, "="}, + {token.INT, "5"}, + {token.SEMICOLON, ";"}, + + {token.LET, "let"}, + {token.IDENT, "ten"}, + {token.ASSIGN, "="}, + {token.INT, "10"}, + {token.SEMICOLON, ";"}, + + {token.LET, "let"}, + {token.IDENT, "add"}, + {token.ASSIGN, "="}, + {token.FUNCTION, "fn"}, + {token.LPAREN, "("}, + {token.IDENT, "x"}, + {token.COMMA, ","}, + {token.IDENT, "y"}, + {token.RPAREN, ")"}, + {token.LBRACE, "{"}, + {token.IDENT, "x"}, + {token.PLUS, "+"}, + {token.IDENT, "y"}, + {token.RBRACE, "}"}, + {token.SEMICOLON, ";"}, + + {token.LET, "let"}, + {token.IDENT, "result"}, + {token.ASSIGN, "="}, + {token.IDENT, "add"}, + {token.LPAREN, "("}, + {token.IDENT, "five"}, + {token.COMMA, ","}, + {token.IDENT, "ten"}, + {token.RPAREN, ")"}, + {token.SEMICOLON, 
";"}, + } + + l := New(input) + for i, tt := range tests { + tok := l.NextToken() + if tok.Type != tt.expectedType { + t.Fatalf("tests[%d] - tokentype wrong. expected=%q, got=%q", i, tt.expectedType, tok.Type) + } + + if tok.Literal != tt.expectedLiteral { + t.Fatalf("tests[%d] - tokenliteral wrong. expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) + } + } +} -- cgit v1.2.3 From bcd7ed3ec9f59f350a29eb95ffbac71345d93e6d Mon Sep 17 00:00:00 2001 From: franck cuny Date: Sat, 11 Jan 2020 13:53:44 +0100 Subject: lexer: support more operator tokens. Support the operator tokens that were added to our tokenizer. This also add a few more tests to ensure we handle them correctly. --- users/fcuny/exp/monkey/pkg/lexer/lexer.go | 13 +++++++++++++ users/fcuny/exp/monkey/pkg/lexer/lexer_test.go | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'users/fcuny/exp/monkey/pkg/lexer') diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer.go b/users/fcuny/exp/monkey/pkg/lexer/lexer.go index fc29371..d538cf5 100644 --- a/users/fcuny/exp/monkey/pkg/lexer/lexer.go +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer.go @@ -67,6 +67,19 @@ func (l *Lexer) NextToken() token.Token { tok = newToken(token.ASSIGN, l.ch) case '+': tok = newToken(token.PLUS, l.ch) + case '-': + tok = newToken(token.MINUS, l.ch) + case '!': + tok = newToken(token.BANG, l.ch) + case '*': + tok = newToken(token.ASTERISK, l.ch) + case '/': + tok = newToken(token.SLASH, l.ch) + case '<': + tok = newToken(token.LT, l.ch) + case '>': + tok = newToken(token.GT, l.ch) + case ';': tok = newToken(token.SEMICOLON, l.ch) case ',': diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go index 73b27fb..ba7fa07 100644 --- a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go @@ -44,7 +44,10 @@ let add = fn(x, y) { x + y }; -let result = add(five, ten);` +let result = add(five, ten); +!-/*5; +5 < 10 
> 5; +` tests := []struct { expectedType token.TokenType @@ -88,6 +91,20 @@ let result = add(five, ten);` {token.IDENT, "ten"}, {token.RPAREN, ")"}, {token.SEMICOLON, ";"}, + + {token.BANG, "!"}, + {token.MINUS, "-"}, + {token.SLASH, "/"}, + {token.ASTERISK, "*"}, + {token.INT, "5"}, + {token.SEMICOLON, ";"}, + + {token.INT, "5"}, + {token.LT, "<"}, + {token.INT, "10"}, + {token.GT, ">"}, + {token.INT, "5"}, + {token.SEMICOLON, ";"}, } l := New(input) -- cgit v1.2.3 From be6ab89f58b0572d0999701d4f1b454e98dec581 Mon Sep 17 00:00:00 2001 From: franck cuny Date: Sat, 11 Jan 2020 14:01:49 +0100 Subject: lexer: delete redundant test. The test `TestNextTokenBasic` was not testing anything that `TestNextTokenMonkey` was not already testing. Rename `TestNextTokenMonkey` to `TestNextToken` for clarity. --- users/fcuny/exp/monkey/pkg/lexer/lexer_test.go | 33 +------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'users/fcuny/exp/monkey/pkg/lexer') diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go index ba7fa07..22dbfcb 100644 --- a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go @@ -5,38 +5,7 @@ import ( "testing" ) -func TestNextTokenBasic(t *testing.T) { - input := `=+(){},;` - - tests := []struct { - expectedType token.TokenType - expectedLiteral string - }{ - {token.ASSIGN, "="}, - {token.PLUS, "+"}, - {token.LPAREN, "("}, - {token.RPAREN, ")"}, - {token.LBRACE, "{"}, - {token.RBRACE, "}"}, - {token.COMMA, ","}, - {token.SEMICOLON, ";"}, - } - - l := New(input) - - for i, tt := range tests { - tok := l.NextToken() - if tok.Type != tt.expectedType { - t.Fatalf("tests[%d] - tokentype wrong. expected=%q, got=%q", i, tt.expectedType, tok.Type) - } - - if tok.Literal != tt.expectedLiteral { - t.Fatalf("tests[%d] - tokenliteral wrong. 
expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) - } - } -} - -func TestNextTokenMonkey(t *testing.T) { +func TestNextToken(t *testing.T) { input := `let five = 5; let ten = 10; -- cgit v1.2.3 From a2991b978a309d4c3a9c480aad4e4e657ae82597 Mon Sep 17 00:00:00 2001 From: franck cuny Date: Sat, 11 Jan 2020 14:27:18 +0100 Subject: lexer: test the new keywords are parsed correctly. Ensure that the new keywords added (`if`, `else`, `true`, `false`, `return`) are parsed correctly. --- users/fcuny/exp/monkey/pkg/lexer/lexer_test.go | 28 +++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'users/fcuny/exp/monkey/pkg/lexer') diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go index 22dbfcb..df1b392 100644 --- a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go @@ -15,7 +15,13 @@ let add = fn(x, y) { let result = add(five, ten); !-/*5; -5 < 10 > 5; +10 > 5; + +if (5 < 10) { + return true; +} else { + return false; +} ` tests := []struct { @@ -68,12 +74,28 @@ let result = add(five, ten); {token.INT, "5"}, {token.SEMICOLON, ";"}, - {token.INT, "5"}, - {token.LT, "<"}, {token.INT, "10"}, {token.GT, ">"}, {token.INT, "5"}, {token.SEMICOLON, ";"}, + + {token.IF, "if"}, + {token.LPAREN, "("}, + {token.INT, "5"}, + {token.LT, "<"}, + {token.INT, "10"}, + {token.RPAREN, ")"}, + {token.LBRACE, "{"}, + {token.RETURN, "return"}, + {token.TRUE, "true"}, + {token.SEMICOLON, ";"}, + {token.RBRACE, "}"}, + {token.ELSE, "else"}, + {token.LBRACE, "{"}, + {token.RETURN, "return"}, + {token.FALSE, "false"}, + {token.SEMICOLON, ";"}, + {token.RBRACE, "}"}, } l := New(input) -- cgit v1.2.3 From 4fb91ad4622e099f798e01873bac914b64ed48f4 Mon Sep 17 00:00:00 2001 From: franck cuny Date: Sat, 11 Jan 2020 14:40:32 +0100 Subject: lexer: support tokens for equal and not equal. 
The tokens for equal (`==`) and not equal (`!=`) are composed of two characters. We introduce a new helper (`peekChar`) that we use when we encounter the token `=` or `!` to see if this is a token composed of two characters. Add some tests to ensure they are parsed correctly. --- users/fcuny/exp/monkey/pkg/lexer/lexer.go | 28 ++++++++++++++++++++++++-- users/fcuny/exp/monkey/pkg/lexer/lexer_test.go | 13 ++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'users/fcuny/exp/monkey/pkg/lexer') diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer.go b/users/fcuny/exp/monkey/pkg/lexer/lexer.go index d538cf5..06d526e 100644 --- a/users/fcuny/exp/monkey/pkg/lexer/lexer.go +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer.go @@ -56,6 +56,16 @@ func (l *Lexer) skipWhitespace() { } } +// peekChar returns the character at `readPosition` (which is the next character), +// but does not increment `readPosition` or `position`. +// This is needed to read tokens that are composed of two characters (e.g. `==`). +func (l *Lexer) peekChar() byte { + if l.readPosition >= len(l.input) { + return 0 + } + return l.input[l.readPosition] +} + // NextToken reads the next token from the lexer and returns the current token. 
func (l *Lexer) NextToken() token.Token { var tok token.Token @@ -64,13 +74,27 @@ func (l *Lexer) NextToken() token.Token { switch l.ch { case '=': - tok = newToken(token.ASSIGN, l.ch) + if l.peekChar() == '=' { + ch := l.ch + l.readChar() + literal := string(ch) + string(l.ch) + tok = token.Token{Type: token.EQ, Literal: literal} + } else { + tok = newToken(token.ASSIGN, l.ch) + } case '+': tok = newToken(token.PLUS, l.ch) case '-': tok = newToken(token.MINUS, l.ch) case '!': - tok = newToken(token.BANG, l.ch) + if l.peekChar() == '=' { + ch := l.ch + l.readChar() + literal := string(ch) + string(l.ch) + tok = token.Token{Type: token.NOT_EQ, Literal: literal} + } else { + tok = newToken(token.BANG, l.ch) + } case '*': tok = newToken(token.ASTERISK, l.ch) case '/': diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go index df1b392..fdea1d3 100644 --- a/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer_test.go @@ -22,6 +22,9 @@ if (5 < 10) { } else { return false; } + +10 == 10; +10 != 9; ` tests := []struct { @@ -96,6 +99,16 @@ if (5 < 10) { {token.FALSE, "false"}, {token.SEMICOLON, ";"}, {token.RBRACE, "}"}, + + {token.INT, "10"}, + {token.EQ, "=="}, + {token.INT, "10"}, + {token.SEMICOLON, ";"}, + + {token.INT, "10"}, + {token.NOT_EQ, "!="}, + {token.INT, "9"}, + {token.SEMICOLON, ";"}, } l := New(input) -- cgit v1.2.3 From 8437218bdaed90cab7374a752f8aaf128225aa1a Mon Sep 17 00:00:00 2001 From: Franck Cuny Date: Mon, 10 May 2021 19:21:39 -0700 Subject: lint: fix a few issues --- users/fcuny/exp/monkey/pkg/lexer/lexer.go | 1 + 1 file changed, 1 insertion(+) (limited to 'users/fcuny/exp/monkey/pkg/lexer') diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer.go b/users/fcuny/exp/monkey/pkg/lexer/lexer.go index 06d526e..3e98cf0 100644 --- a/users/fcuny/exp/monkey/pkg/lexer/lexer.go +++ b/users/fcuny/exp/monkey/pkg/lexer/lexer.go @@ -1,3 +1,4 @@ +// Package lexer provides 
a lexer for the monkey language. package lexer import "monkey/pkg/token" -- cgit v1.2.3