A lexer is a program that reads source code and converts it into a stream of tokens. It is the first stage of an interpreter.

Here's what using the lexer looks like:

input := `let five = 5;`
l := lexer.New(input)

// calling l.NextToken() repeatedly yields this sequence of tokens
result := []tokens.Token{
    {Type: tokens.LET, Literal: "let"},
    {Type: tokens.IDENT, Literal: "five"},
    {Type: tokens.ASSIGN, Literal: "="},
    {Type: tokens.INT, Literal: "5"},
    {Type: tokens.SEMICOLON, Literal: ";"},
    {Type: tokens.EOF, Literal: ""},
}

input2 := "1 + 2 + 3;"
l2 := lexer.New(input2)

result2 := []tokens.Token{
    {Type: tokens.INT, Literal: "1"},
    {Type: tokens.PLUS, Literal: "+"},
    {Type: tokens.INT, Literal: "2"},
    {Type: tokens.PLUS, Literal: "+"},
    {Type: tokens.INT, Literal: "3"},
    {Type: tokens.SEMICOLON, Literal: ";"},
    {Type: tokens.EOF, Literal: ""},
}

Let’s see how to implement a sample lexer in Go.

  1. Define the tokens
package tokens

type TokenType string

const (
    ILLEGAL = "ILLEGAL"
    EOF     = "EOF"

    // Identifiers + literals
    IDENT = "IDENT" // add, foobar, x, y, ...
    INT   = "INT"

    // Operators
    ASSIGN   = "="
    PLUS     = "+"
    MINUS    = "-"
    BANG     = "!"
    ASTERISK = "*"
    SLASH    = "/"
    LT       = "<"
    GT       = ">"

    // Delimiters (referenced by the lexer below)
    COMMA     = ","
    SEMICOLON = ";"
    LPAREN    = "("
    RPAREN    = ")"
    LBRACE    = "{"
    RBRACE    = "}"

    // Keywords
    LET = "LET"
)

// Token pairs a token type with the literal text it was read from
type Token struct {
    Type    TokenType
    Literal string
}

// keywords maps reserved words to their token types
var keywords = map[string]TokenType{
    "let": LET,
}

// LookupIdent returns the keyword token type for ident, or IDENT if it is not a keyword
func LookupIdent(ident string) TokenType {
    if tok, ok := keywords[ident]; ok {
        return tok
    }
    return IDENT
}
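
As a quick illustration (a throwaway snippet of my own, not part of the package), LookupIdent is what separates keywords from ordinary identifiers:

package main

import (
    "fmt"

    "github.com/adharshmk96/interpreter-in-go/tokens"
)

func main() {
    fmt.Println(tokens.LookupIdent("let"))  // LET (reserved word)
    fmt.Println(tokens.LookupIdent("five")) // IDENT (user-defined name)
}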
  2. Define the lexer
package lexer

import "github.com/adharshmk96/interpreter-in-go/tokens"

type Lexer struct {
    input        string
    position     int  // current position in input (points to current char)
    readPosition int  // current reading position in input (after current char)
    ch           byte // current char under examination
}

// New creates a new lexer
func New(input string) *Lexer {
    l := &Lexer{input: input}
    // Prime l.ch with the first character of the input
    l.readChar()
    return l
}

// readChar reads the next character and advances the position in the input string
func (l *Lexer) readChar() {
    if l.readPosition >= len(l.input) {
        // The end of the input
        l.ch = 0
    } else {
        // Read the next character
        l.ch = l.input[l.readPosition]
    }
    // Move the position to the next character
    l.position = l.readPosition
    // Move the read position to the next character
    l.readPosition += 1
}
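
// To make the cursor bookkeeping concrete, here is how the three
// fields evolve for the two-character input "ab" (a worked trace,
// not part of the original code):
//
//   after New():      position=0 readPosition=1 ch='a'
//   after readChar(): position=1 readPosition=2 ch='b'
//   after readChar(): position=2 readPosition=3 ch=0   (end-of-input sentinel)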

// NextToken returns the next token
func (l *Lexer) NextToken() tokens.Token {
    var tok tokens.Token

    // Skip any whitespace before the next token
    l.skipWhitespace()

    switch l.ch {
    case '=':
        tok = newToken(tokens.ASSIGN, l.ch)
    case '+':
        tok = newToken(tokens.PLUS, l.ch)
    case '-':
        tok = newToken(tokens.MINUS, l.ch)
    case '!':
        tok = newToken(tokens.BANG, l.ch)
    case '*':
        tok = newToken(tokens.ASTERISK, l.ch)
    case '/':
        tok = newToken(tokens.SLASH, l.ch)
    case '<':
        tok = newToken(tokens.LT, l.ch)
    case '>':
        tok = newToken(tokens.GT, l.ch)
    case ';':
        tok = newToken(tokens.SEMICOLON, l.ch)
    case '(':
        tok = newToken(tokens.LPAREN, l.ch)
    case ')':
        tok = newToken(tokens.RPAREN, l.ch)
    case ',':
        tok = newToken(tokens.COMMA, l.ch)
    case '{':
        tok = newToken(tokens.LBRACE, l.ch)
    case '}':
        tok = newToken(tokens.RBRACE, l.ch)
    case 0:
        tok.Literal = ""
        tok.Type = tokens.EOF
    default:
        if isLetter(l.ch) {
            tok.Literal = l.readIdentifier()
            tok.Type = tokens.LookupIdent(tok.Literal)
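            // readIdentifier has already advanced l.ch past the
            // identifier, so return early and skip readChar() below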
            return tok
        } else if isDigit(l.ch) {
            tok.Type = tokens.INT
            tok.Literal = l.readNumber()
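            // same early return: readNumber left l.ch on the
            // first non-digit character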
            return tok
        } else {
            tok = newToken(tokens.ILLEGAL, l.ch)
        }
    }

    l.readChar()
    return tok
}

// newToken builds a token from a single character
func newToken(tokenType tokens.TokenType, ch byte) tokens.Token {
    return tokens.Token{Type: tokenType, Literal: string(ch)}
}

// readIdentifier consumes consecutive letters and returns the identifier
func (l *Lexer) readIdentifier() string {
    position := l.position
    for isLetter(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

// readNumber consumes consecutive digits and returns them as a string
func (l *Lexer) readNumber() string {
    position := l.position
    for isDigit(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

// skipWhitespace advances past spaces, tabs, newlines, and carriage returns
func (l *Lexer) skipWhitespace() {
    for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
        l.readChar()
    }
}

// isLetter reports whether ch is an ASCII letter or underscore
// (the lexer works on bytes, so it is ASCII-only)
func isLetter(ch byte) bool {
    return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
}

// isDigit reports whether ch is an ASCII digit
func isDigit(ch byte) bool {
    return '0' <= ch && ch <= '9'
}
  3. Use the lexer

The lexer reads the input and converts it into tokens. Let's take a simple input and see it in action.

package lexer_test

import (
    "testing"

    "github.com/adharshmk96/interpreter-in-go/lexer"
    "github.com/adharshmk96/interpreter-in-go/tokens"
)

func TestLexer(t *testing.T) {
    input := `let five = 5;`
    l := lexer.New(input)

    result := []tokens.Token{
        {Type: tokens.LET, Literal: "let"},
        {Type: tokens.IDENT, Literal: "five"},
        {Type: tokens.ASSIGN, Literal: "="},
        {Type: tokens.INT, Literal: "5"},
        {Type: tokens.SEMICOLON, Literal: ";"},
        {Type: tokens.EOF, Literal: ""},
    }

    for _, expected := range result {
        tok := l.NextToken()
        if tok.Type != expected.Type {
            t.Errorf("expected type %s, got %s", expected.Type, tok.Type)
        }
        if tok.Literal != expected.Literal {
            t.Errorf("expected literal %q, got %q", expected.Literal, tok.Literal)
        }
    }
}

We test the lexer with the input `let five = 5;`: the expected output is the token list above, and for each token the lexer produces we compare its type and literal against the expectation.
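
For a quick manual check outside the test, you can also drive the lexer in a loop until it reports EOF. This driver is a minimal sketch of my own, not code from the original post:

package main

import (
    "fmt"

    "github.com/adharshmk96/interpreter-in-go/lexer"
    "github.com/adharshmk96/interpreter-in-go/tokens"
)

func main() {
    l := lexer.New("1 + 2 + 3;")

    // Pull tokens until the lexer signals EOF.
    for tok := l.NextToken(); tok.Type != tokens.EOF; tok = l.NextToken() {
        fmt.Printf("%-10s %q\n", tok.Type, tok.Literal)
    }
}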