A lexer is a program that reads source code and converts it into a stream of tokens. It is the first stage of an interpreter.
Here is what using one looks like:
// for an input `let five = 5;`
input := `let five = 5;`
l := lexer.New(input)
// the lexer will produce this list of tokens
result := []tokens.Token{
{Type: tokens.LET, Literal: "let"},
{Type: tokens.IDENT, Literal: "five"},
{Type: tokens.ASSIGN, Literal: "="},
{Type: tokens.INT, Literal: "5"},
{Type: tokens.SEMICOLON, Literal: ";"},
{Type: tokens.EOF, Literal: ""},
}
input2 := "1 + 2 + 3;"
l2 := lexer.New(input2)
result2 := []tokens.Token{
{Type: tokens.INT, Literal: "1"},
{Type: tokens.PLUS, Literal: "+"},
{Type: tokens.INT, Literal: "2"},
{Type: tokens.PLUS, Literal: "+"},
{Type: tokens.INT, Literal: "3"},
{Type: tokens.SEMICOLON, Literal: ";"},
{Type: tokens.EOF, Literal: ""},
}
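You call `NextToken` repeatedly and the lexer hands back one token per call, so a loop that runs until `EOF` is the usual driver. A minimal sketch, assuming the `lexer` and `tokens` packages implemented below and `fmt` imported:

```go
// Print every token until the lexer reports EOF.
l3 := lexer.New("1 + 2 + 3;")
for tok := l3.NextToken(); tok.Type != tokens.EOF; tok = l3.NextToken() {
    fmt.Printf("%-10s %q\n", tok.Type, tok.Literal)
}
```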
Let’s see how to implement a sample lexer in Go.
- Define the tokens
package tokens
type TokenType string
const (
ILLEGAL = "ILLEGAL"
EOF = "EOF"
// Identifiers + literals
IDENT = "IDENT" // add, foobar, x, y, ...
INT = "INT"
// Operators
ASSIGN = "="
PLUS = "+"
MINUS = "-"
BANG = "!"
ASTERISK = "*"
SLASH = "/"
LT = "<"
GT = ">"
// Delimiters
COMMA = ","
SEMICOLON = ";"
LPAREN = "("
RPAREN = ")"
LBRACE = "{"
RBRACE = "}"
// Keywords
LET = "LET"
)
// Token is a single lexical unit: its type and the literal text it was read from
type Token struct {
Type TokenType
Literal string
}
// Keywords map
var keywords = map[string]TokenType{
"let": LET,
}
// LookupIdent looks up the identifier and returns the token type
func LookupIdent(ident string) TokenType {
if tok, ok := keywords[ident]; ok {
return tok
}
return IDENT
}
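`LookupIdent` is the piece that tells keywords apart from user-defined names: anything missing from the keywords map falls back to `IDENT`. For example:

```go
tokens.LookupIdent("let")  // tokens.LET, because "let" is in the keywords map
tokens.LookupIdent("five") // tokens.IDENT, the default for everything else
```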
- Define the lexer
package lexer
import "github.com/adharshmk96/interpreter-in-go/tokens"
type Lexer struct {
input string
position int // current position in input (points to current char)
readPosition int // current reading position in input (after current char)
ch byte // current char under examination
}
// New creates a new lexer
func New(input string) *Lexer {
l := &Lexer{input: input}
// Prime l.ch, l.position, and l.readPosition before the first NextToken call
l.readChar()
return l
}
// readChar reads the next character and advances the position in the input string
func (l *Lexer) readChar() {
if l.readPosition >= len(l.input) {
// End of input: 0 (the NUL byte) signals there is nothing left to read
l.ch = 0
} else {
// Read the next character
l.ch = l.input[l.readPosition]
}
// Move the position to the next character
l.position = l.readPosition
// Move the read position to the next character
l.readPosition += 1
}
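To see how the two positions move, here is a hand trace for the two-character input `ab` (written as if stepping through inside the `lexer` package, since `readChar` is unexported):

```go
l := New("ab") // New calls readChar once: ch = 'a', position = 0, readPosition = 1
l.readChar()   // ch = 'b', position = 1, readPosition = 2
l.readChar()   // readPosition (2) >= len(input) (2), so ch = 0: end of input
```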
// NextToken returns the next token
func (l *Lexer) NextToken() tokens.Token {
var tok tokens.Token
// Skip the whitespaces
l.skipWhitespace()
switch l.ch {
case '=':
tok = newToken(tokens.ASSIGN, l.ch)
case '+':
tok = newToken(tokens.PLUS, l.ch)
case '-':
tok = newToken(tokens.MINUS, l.ch)
case '!':
tok = newToken(tokens.BANG, l.ch)
case '*':
tok = newToken(tokens.ASTERISK, l.ch)
case '/':
tok = newToken(tokens.SLASH, l.ch)
case '<':
tok = newToken(tokens.LT, l.ch)
case '>':
tok = newToken(tokens.GT, l.ch)
case ';':
tok = newToken(tokens.SEMICOLON, l.ch)
case '(':
tok = newToken(tokens.LPAREN, l.ch)
case ')':
tok = newToken(tokens.RPAREN, l.ch)
case ',':
tok = newToken(tokens.COMMA, l.ch)
case '{':
tok = newToken(tokens.LBRACE, l.ch)
case '}':
tok = newToken(tokens.RBRACE, l.ch)
case 0:
tok.Literal = ""
tok.Type = tokens.EOF
	default:
		if isLetter(l.ch) {
			tok.Literal = l.readIdentifier()
			tok.Type = tokens.LookupIdent(tok.Literal)
			// readIdentifier has already advanced past the identifier,
			// so return early instead of falling through to readChar below
			return tok
		} else if isDigit(l.ch) {
			tok.Type = tokens.INT
			tok.Literal = l.readNumber()
			// readNumber has likewise already advanced past the digits
			return tok
		} else {
			tok = newToken(tokens.ILLEGAL, l.ch)
		}
	}
}
l.readChar()
return tok
}
// newToken creates a single-character token
func newToken(tokenType tokens.TokenType, ch byte) tokens.Token {
return tokens.Token{Type: tokenType, Literal: string(ch)}
}
// readIdentifier reads characters until the first non-letter and returns the identifier
func (l *Lexer) readIdentifier() string {
position := l.position
for isLetter(l.ch) {
l.readChar()
}
return l.input[position:l.position]
}
// readNumber reads a contiguous run of digits and returns it as a string
func (l *Lexer) readNumber() string {
position := l.position
for isDigit(l.ch) {
l.readChar()
}
return l.input[position:l.position]
}
// skipWhitespace skips spaces, tabs, and newlines between tokens
func (l *Lexer) skipWhitespace() {
for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
l.readChar()
}
}
// isLetter checks if the character is a letter
func isLetter(ch byte) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
}
// isDigit checks if the character is a digit
func isDigit(ch byte) bool {
return '0' <= ch && ch <= '9'
}
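Note that any character the switch does not recognize comes back as an `ILLEGAL` token rather than stopping the lexer, so malformed input surfaces as an ordinary token that a later stage can report. A quick sketch of that behavior:

```go
l := lexer.New("@")
tok := l.NextToken() // tok is {Type: tokens.ILLEGAL, Literal: "@"}
```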
- Use the lexer
So the lexer reads the input and converts it into tokens. Let's take a simple input and write a small test that compares the lexer's output against the tokens we expect.
import (
	"testing"
	"github.com/adharshmk96/interpreter-in-go/lexer"
	"github.com/adharshmk96/interpreter-in-go/tokens"
)
func TestLexer(t *testing.T) {
	input := `let five = 5;`
	l := lexer.New(input)
	expected := []tokens.Token{
		{Type: tokens.LET, Literal: "let"},
		{Type: tokens.IDENT, Literal: "five"},
		{Type: tokens.ASSIGN, Literal: "="},
		{Type: tokens.INT, Literal: "5"},
		{Type: tokens.SEMICOLON, Literal: ";"},
		{Type: tokens.EOF, Literal: ""},
	}
	for i, want := range expected {
		tok := l.NextToken()
		if tok.Type != want.Type {
			t.Errorf("token %d: expected type %q, got %q", i, want.Type, tok.Type)
		}
		if tok.Literal != want.Literal {
			t.Errorf("token %d: expected literal %q, got %q", i, want.Literal, tok.Literal)
		}
	}
}
We are testing the lexer with the input `let five = 5;`. The expected output is the list of tokens above, and the test reports an error whenever a token's type or literal differs from what the lexer actually produces.
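Assuming the test function lives in a `_test.go` file, it runs under Go's standard test runner:

```
go test ./...
```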