利用FLEX & BISON 快速实现简单的C 语言编译器前端
Flex
Flex是一个生成词法分析器的工具,它可以利用正则表达式来生成匹配相应字符串的C语言代码,其语法格式基本同Lex相同。
格式
LEX的源文件由三个部份组成,每个部分之间用顶行的 `%%' 分割,其格式如下:
定义部份
%%
规则部份
%%
用户附加C语言部份
手册->Flex, version 2.5
怎么用就不赘述了....
主要就是用这个词法生成器,给输入的源文件中的每一个关键字打上tag,以便后面的语法分析器能够识别.
GNU Bison
GNU bison 是属于 GNU 项目的一个语法分析器生成器。Bison 把一个关于“向前查看 从左到右 最右 \`LALR' 上下文无关文法的描述转化成可以分析该文法的 C 或 C++ 程序。它也可以为二义文法生成 “通用的 从左到右 最右 \`(GLR)' 语法分析器。
这里主要使用的就是普通的LALR...
手册->Bison 3.0.5
格式
与flex的格式大致相同...
声明部分
%%
语法规则
%%
C语言附加部分
源码
flex 部分
/* filename -> scanner.l */
%option noyywrap yylineno
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "parser.tab.h"
int old_status;
void yyerror(char *s, ...);
%}
%x COMMENT
%%
/* 下面都是正则表达式 */
int { return INT; }
float { return FLOAT; }
double { return DOUBLE; }
auto { return AUTO; }
break { return BREAK; }
case { return CASE; }
const { return CONST; }
else { return ELSE; }
for { return FOR; }
if { return IF; }
long { return LONG; }
return { return RETURN; }
short { return SHORT; }
signed { return SIGNED; }
char { return CHAR; }
unsigned { return UNSIGNED; }
/* user variable */
[a-zA-Z][a-zA-Z0-9_]* { yylval.strval = strdup(yytext); return IDENTITY; }
/* integer numbers */
-?[0-9]+ { yylval.intval = atoi(yytext); return INT_NUMBER; }
-?[0-9]+\.[0-9]+ { yylval.floatval = atof(yytext); return FLOAT_NUMBER; }
/* real numbers */
\"(\\.|\"\"|[^"\n"])*\" { yylval.strval = strdup(yytext); return STRING; }
/* C-type strings */
\"(\\.|[^"\n])*$ { yyerror("Unterminated string %s", yytext); }
/* operators */
[-+&~|^/%*(),.;!] { return yytext[0]; }
"&&" { return AND; }
"||" { return OR; }
"=" { return ASSIGN; }
/* comments */
"//".*;
"/*" { old_status = YY_START; BEGIN COMMENT; }
<COMMENT>"*/" { BEGIN old_status; }
/* space & tab */
[ \t\n]
/* prevent flex jam */
. { yyerror("something goes wrong...\n"); }
%%
Bison 部分
/* filename -> parser.y */
%require "3.0.4"
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
void yyerror(char *s, ...);
int yylex();
%}
%debug
%locations
%union {
int intval;
double floatval;
char *strval;
int subtok;
}
/* token 关键字 */
%token INT
%token FLOAT
%token DOUBLE
%token AUTO
%token BREAK
%token CASE
%token CONST
%token ELSE
%token FOR
%token IF
%token LONG
%token RETURN
%token SHORT
%token SIGNED
%token CHAR
%token UNSIGNED
%token <strval> IDENTITY
%token <intval> INT_NUMBER
%token <floatval> FLOAT_NUMBER
%token <strval> STRING
%left OR
%left AND
%left ASSIGN
%left '!'
%left '+' '-'
%left '*' '/' '%'
%type <intval> decl_stmt bin_expr factor expr sentence id_list assign_stmt
%start statement
%%
/* 语法规则 */
statement: sentence ';' { printf("STMT\n"); }
| statement sentence ';'
;
sentence: decl_stmt { $$ = $1; }
| assign_stmt { $$ = $1; }
| /* empty rule */ { $$ = 0; }
;
decl_stmt: type_dec id_list { printf("stmt_decl\n"); }
| type_dec assign_stmt { printf("stmt_decl & assignment\n"); }
;
id_list: IDENTITY { printf("id: %s\n", $1); $$ = $1; }
| id_list ',' IDENTITY { printf("id: %s\n", $3); $$ = $3; }
;
assign_stmt: IDENTITY ASSIGN expr { printf("id: %s\nASSIGNMENT\n", $1); }
;
expr: factor { $$ = $1; }
| bin_expr { $$ = $1; }
;
factor: INT_NUMBER { printf("VALUE: %d\n", $1); $$ = $1; }
| FLOAT_NUMBER { printf("VALUE: %d\n", $1); $$ = $1; }
| IDENTITY { printf("VALUE: %s\n", $1); $$ = $1; }
;
bin_expr: expr '+' expr { printf("PLUS.\n"); }
| expr '-' expr { printf("SUB.\n"); }
| expr '*' expr { printf("MUL.\n"); }
| expr '/' expr { printf("DIV.\n"); }
;
type_dec: INT { printf("TYPE:INT\n"); }
| FLOAT { printf("TYPE:FLOAT\n"); }
| SHORT { printf("TYPE:SHORT\n"); }
| LONG { printf("TYPE:LONG\n"); }
| UNSIGNED LONG { printf("TYPE:UNSIGNED\n"); }
;
%%
int main(int argc, const char *args[])
{
/* 将注释去掉就能看到stack具体是怎么工作的.. */
/* yydebug = 1; */
extern FILE *yyin;
if(argc > 1 && (yyin = fopen(args[1], "r")) == NULL) {
fprintf(stderr, "can not open %s\n", args[1]);
exit(1);
}
if(yyparse()) {
exit(-1);
}
return 0;
}
void yyerror(char *s, ...)
{
extern int yylineno;
va_list ap;
va_start(ap, s);
fprintf(stderr, "%d: error: ", yylineno);
vfprintf(stderr, s, ap);
fprintf(stderr, "\n");
}
当然还有Makefile
LEX = flex
YYAC = bison
CC = gcc
all: test
test: parser.tab.o scanner.o
$(CC) -o $@ parser.tab.o scanner.o
parser.tab.c parser.tab.h: parser.y
$(YYAC) -vd parser.y
scanner.c: scanner.l
$(LEX) -o $@ $<
scanner.o: scanner.c parser.tab.h
.PYHONY: clean
clean:
-@ rm parser.tab.c parser.tab.h scanner.c parser.output *.o
上面的bison代码中语法部分仅仅只写了赋值语句和声明语句的语法解析规则.连错误处理都没有,23333
也没有生成中间树..但这不重要..
打印的输出是逆波兰式`RPN'..
不打开debug模式
图先用老的吧,2333
int a;
TYPE:INT
id: a
stmt_decl
STMT
int a, b, c;
TYPE:INT
id: a
id: b
id: c
stmt_decl
int a = b + 1;
TYPE:INT
VALUE: b
VALUE: 1
PLUS.
id: a
ASSIGNMENT
stmt_decl & assignment
float a, b, c;
TYPE:FLOAT
id: a
id: b
id: c
stmt_decl
unsigned long a, b, c;
TYPE:UNSIGNED
id: a
id: b
id: c
stmt_decl
unsigned long a = b + c;
TYPE:UNSIGNED
VALUE: b
VALUE: c
PLUS.
id: a
ASSIGNMENT
stmt_decl & assignment
int a, b, c + 1;
TYPE:INT
id: a
id: b
id: c
stmt_decl
7: error: syntax error
打开debug模式
Starting parse
Entering state 0
Reading a token: int a, b, c;
Next token is token INT (1.1: )
Shifting token INT (1.1: )
Entering state 1
Reducing stack by rule 20 (line 95):
$1 = token INT (1.1: )
TYPE:INT
-> $$ = nterm type_dec (1.1: )
Stack now 0
Entering state 11
Reading a token: Next token is token IDENTITY (1.1: )
Shifting token IDENTITY (1.1: )
Entering state 17
Reading a token: Next token is token ',' (1.1: )
Reducing stack by rule 8 (line 73):
$1 = token IDENTITY (1.1: )
id: a
-> $$ = nterm id_list (1.1: )
Stack now 0 11
Entering state 18
Next token is token ',' (1.1: )
Shifting token ',' (1.1: )
Entering state 27
Reading a token: Next token is token IDENTITY (1.1: )
Shifting token IDENTITY (1.1: )
Entering state 32
Reducing stack by rule 9 (line 74):
$1 = nterm id_list (1.1: )
$2 = token ',' (1.1: )
$3 = token IDENTITY (1.1: )
id: b
-> $$ = nterm id_list (1.1: )
Stack now 0 11
Entering state 18
Reading a token: Next token is token ',' (1.1: )
Shifting token ',' (1.1: )
Entering state 27
Reading a token: Next token is token IDENTITY (1.1: )
Shifting token IDENTITY (1.1: )
Entering state 32
Reducing stack by rule 9 (line 74):
$1 = nterm id_list (1.1: )
$2 = token ',' (1.1: )
$3 = token IDENTITY (1.1: )
id: c
-> $$ = nterm id_list (1.1: )
Stack now 0 11
Entering state 18
Reading a token: Next token is token ';' (1.1: )
Reducing stack by rule 6 (line 69):
$1 = nterm type_dec (1.1: )
$2 = nterm id_list (1.1: )
stmt_decl
-> $$ = nterm decl_stmt (1.1: )
Stack now 0
Entering state 9
Reducing stack by rule 3 (line 63):
$1 = nterm decl_stmt (1.1: )
-> $$ = nterm sentence (1.1: )
Stack now 0
Entering state 8
Next token is token ';' (1.1: )
Shifting token ';' (1.1: )
Entering state 16
Reducing stack by rule 1 (line 59):
$1 = nterm sentence (1.1: )
$2 = token ';' (1.1: )
STMT
-> $$ = nterm statement (1.1: )
Stack now 0
Entering state 7
Reading a token: Now at end of input.
Shifting token $end (1.1: )
Entering state 14
Stack now 0 7 14
Cleanup: popping token $end (1.1: )
Cleanup: popping nterm statement (1.1: )
能够清楚的看到栈的状态....
CmakeList.txt
cmake在较新的版本已经可以直接使用了.之前的版本是不支持找flex & bison的,只能通过`add_custom_command' 来使用flex & bison...这非常的麻烦
通过看CMake的手册才知道了可以这样写....
cmake_minimum_required(VERSION 3.4.3)
project(tincompiler)
find_package(BISON)
find_package(FLEX)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
FIND_LIBRARY(LEX_LIB l)
FLEX_TARGET(scanner scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.c)
BISON_TARGET(parser parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.c)
ADD_FLEX_BISON_DEPENDENCY(scanner parser)
ADD_EXECUTABLE(tincompiler
${BISON_parser_OUTPUTS}
${FLEX_scanner_OUTPUTS})
如果使用CMake作为构建工具需要将 \`scanner.l' 里面的 \` #include "parser.tab.h" ' 改成 \` #include "parser.h" '
参考资料:
《flex & bison》(John Levine), O'Reilly Media
CMake Manual, FindFLEX
CMake Manual, FindBISON
《Compilers Principles, Techniques, & Tools》(龙书), (Alfred V. Aho / Monica S.Lam / Ravi Sethi / Jeffrey D. Ullman)