A Parser is a program that reads the tokens and converts it into an abstract syntax tree (AST). It is the second part of an interpreter.

What the usage looks like

// for an input `let five = 5;`
input := `let five = 5;`
l := lexer.New(input)
p := parser.New(l)

// result is a list of statements
result := []ast.Statement{
    &ast.LetStatement{
        Token: token.Token{Type: token.LET, Literal: "let"},
        Name: &ast.Identifier{
            Token: token.Token{Type: token.IDENT, Literal: "five"},
            Value: "five",
        },
        Value: &ast.IntegerLiteral{
            Token: token.Token{Type: token.INT, Literal: "5"},
            Value: 5,
        },
    },
}

input2 := "1 + 2 + 3;"
l2 := lexer.New(input2)
p2 := parser.New(l2)

result2 := []ast.Statement{
    &ast.ExpressionStatement{
        Token: token.Token{Type: token.INT, Literal: "1"},
        Expression: &ast.InfixExpression{
            Token: token.Token{Type: token.PLUS, Literal: "+"},
            Left: &ast.IntegerLiteral{
                Token: token.Token{Type: token.INT, Literal: "1"},
                Value: 1,
            },
            Operator: "+",
            Right: &ast.InfixExpression{
                Token: token.Token{Type: token.PLUS, Literal: "+"},
                Left: &ast.IntegerLiteral{
                    Token: token.Token{Type: token.INT, Literal: "2"},
                    Value: 2,
                },
                Operator: "+",
                Right: &ast.IntegerLiteral{
                    Token: token.Token{Type: token.INT, Literal: "3"},
                    Value: 3,
                },
            },
        },
    },
}

The parser also needs to handle operator precedence and associativity. It is a complex part of an interpreter.

Let’s try a top-down parser for simple statements. The grammar for the parser is:

program = { statement } ;
statement = letStatement | returnStatement ;
letStatement = "let" , identifier , "=" , expression , ";" ;
returnStatement = "return" , expression , ";" ;
expression = identifier | integerLiteral ;
identifier = letter , { letter | digit } ;
integerLiteral = digit , { digit } ;
letter = "a" ... "z" | "A" ... "Z" ;
digit = "0" ... "9" ;

AST Definition

AST for the grammar is:

// Program is the root node of the AST
type Program struct {
    Statements []Statement
}

// Statement is a node in the AST
type Statement interface {
    statementNode()
}

// Expression is a node in the AST
type Expression interface {
    expressionNode()
}

// Identifier is for expressions like `five` in `let five = 5;`
type Identifier struct {
    Token token.Token
    Value string
}

// IntegerLiteral is for expressions like `5`
type IntegerLiteral struct {
    Token token.Token
    Value int64
}

// LetStatement is for statments like `let five = 5;`
type LetStatement struct {
    Token token.Token
    Name  *Identifier
    Value Expression
}

// ReturnStatement is for statements like `return 5;`
type ReturnStatement struct {
    Token       token.Token
    ReturnValue Expression
}

// ExpressionStatement is for statements like `5;`
type ExpressionStatement struct {
    Token      token.Token
    Expression Expression
}

Parser For simple statements

Let’s try a top-down parser for the grammar. Which builds the AST from the tokens. The parser will parse simple expressions like.

let five = 5;
return 5;

Here is an example of a parser in Go.

type Parser struct {
    l *lexer.Lexer

    curToken  token.Token
    peekToken token.Token
}

func New(l *lexer.Lexer) *Parser {
    p := &Parser{l: l}
    p.nextToken()
    p.nextToken()
    return p
}

// nextToken moves the curToken and peekToken
// It is used to go through tokens
func (p *Parser) nextToken() {
    p.curToken = p.peekToken
    p.peekToken = p.l.NextToken()
}

// ParseProgram parses the program
func (p *Parser) ParseProgram() *ast.Program {
    program := &ast.Program{}
    program.Statements = []ast.Statement{}

    for !p.curTokenIs(token.EOF) {
        stmt := p.parseStatement()
        if stmt != nil {
            program.Statements = append(program.Statements, stmt)
        }
        p.nextToken()
    }

    return program
}

// Parse different types of statements
func (p *Parser) parseStatement() ast.Statement {
    switch p.curToken.Type {
    case token.LET:
        return p.parseLetStatement()
    case token.RETURN:
        return p.parseReturnStatement()
    default:
        return nil
    }
}

// Parse let statement
func (p *Parser) parseLetStatement() *ast.LetStatement {
    stmt := &ast.LetStatement{Token: p.curToken}

    if !p.expectPeek(token.IDENT) {
        return nil
    }

    stmt.Name = &ast.Identifier{Token: p.curToken, Value: p.curToken.Literal}

    if !p.expectPeek(token.ASSIGN) {
        return nil
    }

    p.nextToken()

    stmt.Value = p.parseExpression(LOWEST)

    if p.peekTokenIs(token.SEMICOLON) {
        p.nextToken()
    }

    return stmt
}

// Parse return statement
func (p *Parser) parseReturnStatement() *ast.ReturnStatement {
    stmt := &ast.ReturnStatement{Token: p.curToken}

    p.nextToken()

    stmt.ReturnValue = p.parseExpression(LOWEST)

    if p.peekTokenIs(token.SEMICOLON) {
        p.nextToken()
    }

    return stmt
}

// prase expression statement
func (p *Parser) parseExpressionStatement() *ast.ExpressionStatement {
    stmt := &ast.ExpressionStatement{Token: p.curToken}
    stmt.Expression = p.parseExpression(LOWEST)

    if p.peekTokenIs(token.SEMICOLON) {
        p.nextToken()
    }

    return stmt
}

// Parse expression
func (p *Parser) parseExpression(precedence int) *ast.Expression {
    // ...
}

The result of the parser will be a Program which contains a list of statements.

Although this is a simple example, When parsing expressions we need to handle operator precedence and associativity. This part is what makes a parser complex.