2018年7月

Flex

Flex是一个生成词法分析器的工具,它可以利用正则表达式来生成匹配相应字符串的C语言代码,其语法格式基本同Lex相同。

格式

LEX的源文件由三个部份组成,每个部分之间用顶行的 `%%' 分割,其格式如下:

定义部份

%%

规则部份

%%

用户附加C语言部份
手册->Flex, version 2.5
怎么用就不赘述了....

主要就是用这个词法生成器,给输入的源文件中的每一个关键字打上tag,以便后面的语法分析器能够识别.

GNU Bison

GNU bison 是属于 GNU 项目的一个语法分析器生成器。Bison 把一个关于“向前查看 从左到右 最右 \`LALR' 上下文无关文法的描述转化成可以分析该文法的 C 或 C++ 程序。它也可以为二义文法生成 “通用的 从左到右 最右 \`(GLR)' 语法分析器。
这里主要使用的就是普通的LALR...

手册->Bison 3.0.5

格式

与flex的格式大致相同...

声明部分
%%
语法规则
%%
C语言附加部分

源码

flex 部分

/* filename -> scanner.l */
%option noyywrap yylineno

%{
#include <stdio.h>   
#include <stdlib.h>
#include <string.h>
#include "parser.tab.h"
int old_status;
void yyerror(char *s, ...);

%}

%x COMMENT

%%
    /* 下面都是正则表达式 */
int         { return INT; }
float       { return FLOAT; }
double      { return DOUBLE; }
auto        { return AUTO; }
break       { return BREAK; }
case        { return CASE; }
const       { return CONST; }
else        { return ELSE; }
for         { return FOR; }
if          { return IF; }
long        { return LONG; }
return      { return RETURN; }
short       { return SHORT; }
signed      { return SIGNED; }
char        { return CHAR; }
unsigned    { return UNSIGNED; }
    
    /* user variable */
[a-zA-Z][a-zA-Z0-9_]*       { yylval.strval = strdup(yytext); return IDENTITY; }
    /* integer numbers */
-?[0-9]+                    { yylval.intval = atoi(yytext); return INT_NUMBER; }
-?[0-9]+\.[0-9]+            { yylval.floatval = atof(yytext); return FLOAT_NUMBER; }
    /* real numbers */
\"(\\.|\"\"|[^"\n"])*\"     { yylval.strval = strdup(yytext); return STRING; }
    /* C-type strings */
\"(\\.|[^"\n])*$            { yyerror("Unterminated string %s", yytext); }

    /* operators */
[-+&~|^/%*(),.;!]       { return yytext[0]; }
"&&"                    { return AND; }
"||"                    { return OR; }
"="                     { return ASSIGN; }

    /* comments */
"//".*;
"/*"                    { old_status = YY_START; BEGIN COMMENT; }
<COMMENT>"*/"           { BEGIN old_status; }
    /* space & tab */
[ \t\n]
    /* prevent flex jam */
.           { yyerror("something goes wrong...\n"); }

%%

Bison 部分

 /* filename -> parser.y */
%require "3.0.4"

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>

void yyerror(char *s, ...);
int yylex();
%}

%debug

%locations

%union {
    int intval;
    double floatval;
    char *strval;
    int subtok;
}

    /* token 关键字 */
%token INT
%token FLOAT
%token DOUBLE
%token AUTO
%token BREAK
%token CASE
%token CONST
%token ELSE
%token FOR
%token IF
%token LONG
%token RETURN
%token SHORT
%token SIGNED
%token CHAR
%token UNSIGNED
%token <strval> IDENTITY
%token <intval> INT_NUMBER
%token <floatval> FLOAT_NUMBER
%token <strval> STRING

%left OR
%left AND
%left ASSIGN
%left '!'
%left '+' '-'
%left '*' '/' '%'

%type <intval> decl_stmt bin_expr factor expr sentence id_list assign_stmt

%start statement

%%

/* 语法规则 */
statement: sentence ';'         { printf("STMT\n"); }
    | statement sentence ';'
    ;

sentence: decl_stmt     { $$ = $1; }
    | assign_stmt       { $$ = $1; }
    | /* empty rule */  { $$ = 0; }
    ;


decl_stmt: type_dec id_list { printf("stmt_decl\n"); }
    | type_dec assign_stmt  { printf("stmt_decl & assignment\n"); }
    ;

id_list: IDENTITY           { printf("id: %s\n", $1); $$ = $1; }
    | id_list ',' IDENTITY  { printf("id: %s\n", $3); $$ = $3; }
    ;

assign_stmt: IDENTITY ASSIGN expr    { printf("id: %s\nASSIGNMENT\n", $1); }
    ;

expr: factor    { $$ = $1; }
    | bin_expr  { $$ = $1; }
    ;

factor: INT_NUMBER      { printf("VALUE: %d\n", $1); $$ = $1; }
    | FLOAT_NUMBER      { printf("VALUE: %d\n", $1); $$ = $1; }
    | IDENTITY          { printf("VALUE: %s\n", $1); $$ = $1; }
    ;

bin_expr: expr '+' expr     { printf("PLUS.\n"); }
    | expr '-' expr         { printf("SUB.\n"); }
    | expr '*' expr         { printf("MUL.\n"); }
    | expr '/' expr         { printf("DIV.\n"); }
    ;

type_dec: INT       { printf("TYPE:INT\n"); }
    | FLOAT         { printf("TYPE:FLOAT\n"); }
    | SHORT         { printf("TYPE:SHORT\n"); }
    | LONG          { printf("TYPE:LONG\n"); }
    | UNSIGNED LONG { printf("TYPE:UNSIGNED\n"); }
    ;


%%

int main(int argc, const char *args[])
{
    /* 将注释去掉就能看到stack具体是怎么工作的.. */
    /* yydebug = 1; */

    extern FILE *yyin;
    if(argc > 1 && (yyin = fopen(args[1], "r")) == NULL) {
        fprintf(stderr, "can not open %s\n", args[1]);
        exit(1);
    }
    if(yyparse()) {
        exit(-1);
    }
    
    return 0;
}

void yyerror(char *s, ...)
{
    extern int yylineno;

    va_list ap;
    va_start(ap, s);

    fprintf(stderr, "%d: error: ", yylineno);
    vfprintf(stderr, s, ap);
    fprintf(stderr, "\n");
}

当然还有Makefile

LEX = flex
YYAC = bison
CC = gcc

all: test

test: parser.tab.o scanner.o
        $(CC) -o $@ parser.tab.o scanner.o

parser.tab.c parser.tab.h: parser.y
        $(YYAC) -vd parser.y

scanner.c: scanner.l
        $(LEX) -o $@ $<

scanner.o: scanner.c parser.tab.h

.PYHONY: clean

clean:
        -@ rm parser.tab.c parser.tab.h scanner.c parser.output *.o

上面的bison代码中语法部分仅仅只写了赋值语句和声明语句的语法解析规则.连错误处理都没有,23333
也没有生成中间树..但这不重要..
打印的输出是逆波兰式`RPN'..

不打开debug模式

图先用老的吧,2333

3808135993.png

int a;
TYPE:INT
id: a
stmt_decl
STMT
int a, b, c;
TYPE:INT
id: a
id: b
id: c
stmt_decl
int a = b + 1;
TYPE:INT
VALUE: b
VALUE: 1
PLUS.
id: a
ASSIGNMENT
stmt_decl & assignment
float a, b, c;
TYPE:FLOAT
id: a
id: b
id: c
stmt_decl
unsigned long a, b, c;
TYPE:UNSIGNED
id: a
id: b
id: c
stmt_decl
unsigned long a = b + c;
TYPE:UNSIGNED
VALUE: b
VALUE: c
PLUS.
id: a
ASSIGNMENT
stmt_decl & assignment
int a, b, c + 1;
TYPE:INT
id: a
id: b
id: c
stmt_decl
7: error: syntax error

打开debug模式

Starting parse
Entering state 0
Reading a token: int a, b, c; 
Next token is token INT (1.1: )
Shifting token INT (1.1: )
Entering state 1
Reducing stack by rule 20 (line 95):
   $1 = token INT (1.1: )
TYPE:INT
-> $$ = nterm type_dec (1.1: )
Stack now 0
Entering state 11
Reading a token: Next token is token IDENTITY (1.1: )
Shifting token IDENTITY (1.1: )
Entering state 17
Reading a token: Next token is token ',' (1.1: )
Reducing stack by rule 8 (line 73):
   $1 = token IDENTITY (1.1: )
id: a
-> $$ = nterm id_list (1.1: )
Stack now 0 11
Entering state 18
Next token is token ',' (1.1: )
Shifting token ',' (1.1: )
Entering state 27
Reading a token: Next token is token IDENTITY (1.1: )
Shifting token IDENTITY (1.1: )
Entering state 32
Reducing stack by rule 9 (line 74):
   $1 = nterm id_list (1.1: )
   $2 = token ',' (1.1: )
   $3 = token IDENTITY (1.1: )
id: b
-> $$ = nterm id_list (1.1: )
Stack now 0 11
Entering state 18
Reading a token: Next token is token ',' (1.1: )
Shifting token ',' (1.1: )
Entering state 27
Reading a token: Next token is token IDENTITY (1.1: )
Shifting token IDENTITY (1.1: )
Entering state 32
Reducing stack by rule 9 (line 74):
   $1 = nterm id_list (1.1: )
   $2 = token ',' (1.1: )
   $3 = token IDENTITY (1.1: )
id: c
-> $$ = nterm id_list (1.1: )
Stack now 0 11
Entering state 18
Reading a token: Next token is token ';' (1.1: )
Reducing stack by rule 6 (line 69):
   $1 = nterm type_dec (1.1: )
   $2 = nterm id_list (1.1: )
stmt_decl
-> $$ = nterm decl_stmt (1.1: )
Stack now 0
Entering state 9
Reducing stack by rule 3 (line 63):
   $1 = nterm decl_stmt (1.1: )
-> $$ = nterm sentence (1.1: )
Stack now 0
Entering state 8
Next token is token ';' (1.1: )
Shifting token ';' (1.1: )
Entering state 16
Reducing stack by rule 1 (line 59):
   $1 = nterm sentence (1.1: )
   $2 = token ';' (1.1: )
STMT
-> $$ = nterm statement (1.1: )
Stack now 0
Entering state 7
Reading a token: Now at end of input.
Shifting token $end (1.1: )
Entering state 14
Stack now 0 7 14
Cleanup: popping token $end (1.1: )
Cleanup: popping nterm statement (1.1: )

能够清楚的看到栈的状态....

CmakeList.txt

cmake在较新的版本已经可以直接使用了.之前的版本是不支持找flex & bison的,只能通过`add_custom_command' 来使用flex & bison...这非常的麻烦

通过看CMake的手册才知道了可以这样写....

cmake_minimum_required(VERSION 3.4.3)

project(tincompiler)

find_package(BISON)
find_package(FLEX)

INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

FIND_LIBRARY(LEX_LIB l)

FLEX_TARGET(scanner scanner.l   ${CMAKE_CURRENT_BINARY_DIR}/scanner.c)
BISON_TARGET(parser parser.y    ${CMAKE_CURRENT_BINARY_DIR}/parser.c)

ADD_FLEX_BISON_DEPENDENCY(scanner parser)

ADD_EXECUTABLE(tincompiler
        ${BISON_parser_OUTPUTS}
        ${FLEX_scanner_OUTPUTS})
如果使用CMake作为构建工具需要将 \`scanner.l' 里面的 \` #include "parser.tab.h" ' 改成 \` #include "parser.h" '

参考资料:

《flex & bison》(John Levine), O'Reilly Media
CMake Manual, FindFLEX
CMake Manual, FindBISON
《Compilers Principles, Techniques, & Tools》(龙书), (Alfred V. Aho / Monica S.Lam / Ravi Sethi / Jeffrey D. Ullman)