A simple tiny expression parser

I mentioned in my previous post that I parse expressions in my mini language using a strange shunting yard algorithm.  It mixes explicit stack management and recursive decent.  Its operation makes intuitive sense to me which is why I use it, and the code to implement it happens to also be very short.

An expression parser is one of the hardest parts of a recursive descent parser to write, but with a little practice it doesn’t have to be hard at all.  I’m sure there are much better ways out there to implement an expression parser so I would be interested to see them if anyone has any good links.  Are they smaller then the ~100 lines (including comments) this one takes up?

The code presented below is almost identical in operation to the one from my mini parser.  I added a tiny Lexer and REPL loop to it so it can be used as a very simple infix to postfix expression evaluator.  I also used the C libraries setjmp() function to add very simple error handling.

It supports normal operators [+ – * /], parenthesis, the unary not operator [ ! ], and function calls. Only single character variables and literals are supported, so that the focus remains on the parser rather then the Lexer.  Spaces are not supported either.

Try a simple expression such as:
> A*B-C(1+3)
> 1*(3+4)
> 2*!A

In the event that wordpress feels the need to mangle my source code here is an alternative link which will maintain the formatting: http://pastebin.com/TMkT17C6

#include <assert.h>
#include <stdint.h>
#include <vector>
#include <setjmp.h>
#include <stdio.h>

typedef char token_e;

namespace {
// return the precedence of an operator
int op_prec(token_e o) {
	switch (o) {
	case ('&'): return  1;
	case ('+'): 
	case ('-'): return  2;
	case ('*'): 
	case ('/'): return  3;
	default:    return  0;
	}
}

// return true if token is an operator
bool is_op(token_e o) {
	return op_prec(o) != 0;
}

// return true if token is a variable identifier
bool is_var(token_e o) {
	return (o>='A' && o<='Z') || (o>='a' && o<='z');
} 

// return true if token is a numeric literal 
bool is_num(token_e o) { 
	return (o>='0' && o<='9');
}
} // namespace {}

struct parser_t {

	std::vector<token_e> op_;
	const char * s_;
	jmp_buf error_buf;

	void error() {
		longjmp (error_buf, 1);
	}

	parser_t(const char * stream)
		: op_()
		, s_(stream)
	{}

	char found_op () { return (is_op  (*s_)) ? *(s_++) : 0; }
	char found_var() { return (is_var (*s_)) ? *(s_++) : 0; }
	char found_num() { return (is_num (*s_)) ? *(s_++) : 0; }

	// check for specific token in the token stream
	bool found(char ch) {
		if (*s_ == ch) {
			s_++;
			return true;
		}
		return false;
	}

	// unconditionally expect a token in the token stream
	void expect(char ch) {
		if (*(s_++) != ch) {
			printf ("\nexpected '%c'\n", ch);
			error ();
		}
	}

	// ---- ---- ---- ---- ---- ---- ---- ---- <expression parser>

	void pop_op() {
		assert(op_.size() > 0);
		token_e t = op_.back();
		putchar (t);
		op_.pop_back();
	}

	void push_op(uint32_t tide, token_e tok) {
		while (op_.size() > tide) {
			if (op_prec (op_.back ()) > op_prec (tok))
				pop_op ();
			else
				break;
		}
		op_.push_back(tok);
	}

	void pop_all_op(uint32_t tide) {
		while (op_.size() > tide)
			pop_op();
	}

	void parse_lhs(uint32_t tide) {
		// parenthesis
		if (found('(')) {
			parse_expr ();
			expect (')');
		}
		// identifier
		else if (char var = found_var()) {
			// identifier name
			putchar (var);
			// index an object
			while (found('.')) {
				if (char ix = found_var()) {
					putchar (ix);
					putchar ('@');
				}
				else {
					printf ("\nexpecting identifier, got '%c'\n", *s_);
					error ();
				}
			}
			// function call
			if (found('(')) {
				//
				uint32_t num_args = 0;
				// argument list
				if (!found(')')) {
					do {
						++num_args;
						parse_expr();
					} while (found(','));
					expect (')');
				}
				// call instruction
				printf("<%d>",-num_args);
			}
		}
		// numeric literal
		else if (char num = found_num()) {
			putchar (num);
		} 
		// unexpected token
		else {
			printf ("\nunexpected token '%c'\n", *s_);
			error ();
		}
	}

	// parse an expression with extended control
	void parse_expr_ex(uint32_t tide) {
		bool unary_not = found ('!');
		parse_lhs(tide);
		if (unary_not) {
			// logical not instruction
			putchar('!');
		}
		if (char op = found_op()) {
			push_op (tide, op);
			parse_expr_ex(tide);
		}
	}

	// parse a full expression
	void parse_expr() {
		uint32_t tide = op_.size();
		parse_expr_ex(tide);
		pop_all_op(tide);
	}

	// ---- ---- ---- ---- ---- ---- ---- ---- </expression parser>

	// parse root
	void parse() {
		if (!setjmp(error_buf)) {
			parse_expr ();
		} else {
			printf ("unable to parse\n");
		}
	}
};

// supported operators: 
//      &       and
//      + -     add sub
//      * /     mul div
//      !       unary not
// example input:
//      A+B*C*(0-3)
//      A&!B
//      2+A(B,C+1)
int main() {
	char buffer[1024];
	// little REPL loop
	for (;;) {
		printf ("expr: ");
		fgets(buffer, sizeof(buffer), stdin);
		if (buffer[0] == '\0')
			break;
		parser_t parser(buffer);
		printf ("post: ");
		parser.parse();
		printf ("\n\n");
	}
	return 0;
}

One thought on “A simple tiny expression parser

  1. Pingback: A simpler tinier expression parser | The 8Bit Pimps Pixel Mine

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s