編譯器構造/案例研究 1B
這是一個正在進行中的案例研究,現在已連結到主頁面(完成後)。User:matt73 正在研究這個。 |
本案例研究的目的是提供一個使用 Lex 和 Yacc 在 C 中編寫的編譯器/直譯器前端的示例。使用直譯器是因為它允許在最少的額外工作(在前端構建之後)的情況下建立工作程式。透過用編譯器後端替換最後一階段,可以將此程式碼開發成編譯器。
程式碼按突出顯示建立編譯器過程的順序顯示,因此同一個檔案將在開發過程中多次顯示。
本案例研究開發了一個針對 Very Tiny Basic 的直譯器,該直譯器在本書的案例研究 1部分中進行了說明。
以下步驟將用於完成直譯器
- 詞法分析
- 語法分析
- 詞法分析與語義
- 抽象語法樹
- 生成抽象語法樹
- 解釋
為了簡潔和簡單起見,許多有用的編譯器/直譯器的重要功能已被省略,包括
- 處理錯誤
- 最佳化
此外,某些過程不適用於 Very Tiny Basic 這樣簡單的語言,例如名稱表不適用,因為只使用單個字元來命名變數。
要求
lex 檔案的第一稿透過在關聯的 C 操作中返回特定值來識別不同的標記。對於關鍵字和運算子來說這很簡單,但是識別符號、值和註釋則更復雜。
VTB.l - 版本 1
%{
%}
DIGIT [0-9]
LETTER [A-Z]
%%
"LET" {return LET;}
"IF" {return IF;}
"THEN" {return THEN;}
"PRINT" {return PRINT;}
"REM"[^\n]* {return REM;}
"GOTO" {return GOTO;}
"GOSUB" {return GOSUB;}
"RETURN" {return RETURN;}
"STOP" {return STOP;}
"FOR" {return FOR;}
"TO" {return TO;}
"STEP" {return STEP;}
"NEXT" {return NEXT;}
\"[^\"]\" {return STRING;}
"=" {return EQUAL;}
"<>" {return NEQUAL;}
%%
可以使用以下方法之一處理此檔案
lex VTB.l flex VTB.l
您可能想知道所有這些返回的值從何而來。它們將在 Yacc 語法檔案處理時建立。
與案例研究 1中的Very Tiny Basic - 規範相比,存在一些差異,例如 MULT 和 DIV 代替 MULDIV。這是因為我們需要區分這兩個值。LineNumber 和 WholeNumber 在詞法上是相同的,因此目前無法分離。定義標記的使用和類別留到下一階段。
VTB.y - 版本 1
%{
int yylex(void);
void yyerror(char* str)
{
/*This should handle errors!*/
}
%}
%union {
int number;
char *string;
char var;
}
%token <number> NUMBER
%token <string> STRING
%token <var> VAR_NUM
%token <var> VAR_STR
%token LET IF THEN PRINT REM GOTO GOSUB RETURN STOP
FOR EQUAL TO STEP NEXT SEPARATE
NEQUAL LT GT LTEQ GTEQ
ADD SUB MUL DIV
LPARAN RPARAN
%start lines
%%
lines: line lines
| /*Nothing*/
;
line: NUMBER statement
;
statement: LET variable EQUAL exp
| IF test THEN NUMBER
| PRINT printItem printList
| REM
| GOTO NUMBER
| GOSUB NUMBER
| RETURN
| STOP
| FOR variable EQUAL exp TO exp STEP exp
| FOR variable EQUAL exp TO exp
| NEXT variable
;
variable: VAR_NUM
| VAR_STR
;
printList: SEPARATE printItem
| /* Nothing */
;
printItem: exp
/* | STRING */
;
test: exp comparison exp
;
comparison: EQUAL
| NEQUAL
| LT
| GT
| LTEQ
| GTEQ
;
exp: addsub secondary secList
| secondary secList
;
secList: addsub secondary
| /* Nothing */
;
addsub : ADD
| SUB
;
secondary : primary priList
;
priList : muldiv primary
| /* Nothing */
;
muldiv : MUL
| DIV
;
primary : LPARAN exp RPARAN
| variable
| NUMBER
| STRING
;
在此版本的詞法分析器中,包含了由 yacc/bison 生成的標頭檔案。標頭檔案定義了返回值和用於儲存標記語義值的聯合。這是根據語法檔案的%token 宣告和%union 部分建立的。詞法分析器現在從某些型別的標記中提取值,並將這些值儲存在yylval 聯合中。
VTB.l - 版本 2
%{
#include "y.tab.h"
#include <string.h>
#include <malloc.h>
%}
DIGIT [0-9]
LETTER [A-Z]
%%
"LET" {return LET;}
"IF" {return IF;}
"THEN" {return THEN;}
"PRINT" {return PRINT;}
"REM"[^\n]* {return REM;}
"GOTO" {return GOTO;}
"GOSUB" {return GOSUB;}
"RETURN" {return RETURN;}
"STOP" {return STOP;}
"FOR" {return FOR;}
"TO" {return TO;}
"STEP" {return STEP;}
"NEXT" {return NEXT;}
\"[^\"]\" {yylval.string=malloc(strlen(yytext)+1);
strcpy(yylval.string,yytext); return STRING;}
"=" {return EQUAL;}
"<>" {return NEQUAL;}
"<" {return LT;}
">" {return GT;}
"<=" {return LTEQ;}
">=" {return GTEQ;}
{LETTER} {yylval.var=yytext[0]; return VAR_NUM;}
{LETTER}$ {yylval.var=yytext[0]; return VAR_STR;}
{DIGIT}+ {yylval.number=atoi(yytext); return NUMBER;}
"," |
";" {return SEPARATE;}
"+" {return ADD;}
"-" {return SUB;}
"*" {return MUL;}
"/" {return DIV;}
"(" {return LPARAN;}
")" {return RPARAN;}
. {/*This is an error!*/}
%%
抽象語法樹 是使用資料結構在記憶體中建立的程式碼的中間表示。語言的語法結構(已定義並已作為 YACC 語法檔案寫下)被轉換為樹結構。使用 YACC & C,這意味著大多數語法規則和標記都變成了節點。註釋是一個沒有放入樹中的示例。
在結構中,運算元的組合是清晰的,因此不需要在樹中存在括號之類的標記。對於結束程式碼塊的標記也是如此。這是可能的,因為語法檔案中的規則可以使用這些標記以正確的形狀建立樹。
抽象語法樹中組合的說明
(1+3)*4 1+3*4 * + / \ / \ 4 + 1 * / \ / \ 1 3 3 4
在此直譯器中,可以透過將主/次表示式結構摺疊來丟棄它們。這會增加程式碼的複雜性,因此目前沒有實現。Rem 語句也被保留(沒有註釋文字),因為 VTB 的定義意味著它們是 Goto 的有效目標。實際上,跳轉到不存在的行是未定義的,因此此直譯器將發出錯誤。
對於樹結構,使用指標和動態分配是必須的,否則它將是無限大的,因此會使用太多資源(仔細想想)。為了保持程式碼整潔,
/*
* Abstract Syntax Trees for Very Tiny Basic
*/
typedef struct statement_ *statement;
typedef struct exp_ *exp;
typedef struct variable_ *variable;
typedef exp printItem; /*Not required since it is always an expression.*/
typedef struct printList_ *printList;
typedef struct test_ *test;
typedef enum comparison_ comparison;
typedef struct secondary_ *secondary;
typedef struct secList_ *secList;
typedef struct primary_ *primary;
typedef struct priList_ *priList;
typedef enum muldiv_ muldiv;
typedef enum addsub_ addsub;
enum comparison_ {equal, nequal, lt, gt, lteq, gteq};
enum muldiv_ {mul,divd};
enum addsub_ {add, sub};
struct statement_ {
enum{ let, ifThen, print, rem, goto_, gosub, return_, stop, for_step, for_, next} type;
union {
struct {variable var; exp e;} let_val;
struct {test t; int target;} ifThen_val;
struct {printItem item; printList lst;} print_val;
/* REM has no value */
int goto_val;
int gosub_val;
/* RETURN has no value */
/* STOP has no value */
struct {variable var; exp assign; exp to; exp step;} for_val; /* Use for both.*/
variable next_val;
} u;
};
struct exp_ {
int fHasAddSub; /* Flag */
addsub as;
secondary sec;
secList list;
};
struct variable_ {
enum { stringV, integerV} type;
char v;
};
struct printList_ {
printItem item;
printList next;
};
struct test_ {
exp a, b;
comparison c;
};
struct secondary_ {
primary p;
priList list;
};
struct secList_ {
addsub as;
secondary s;
secList next;
};
struct primary_ {
enum {bracket, var, integerP, stringP} type;
union {
exp bracket_val;
variable var_val;
int integer_val;
char *string_val;
} u;
};
struct priList_ {
muldiv md;
primary p;
priList next;
};
statement LetStatement(variable var_, exp e_);
statement IfThenStatement(test t_, int target_);
statement PrintStatement(printItem item_, printList lst_);
statement RemStatement();
statement GotoStatement(int line);
statement GosubStatement(int line);
statement ReturnStatement();
statement StopStatement();
statement ForStatement(variable var_, exp assign_, exp to_, exp step_); /* Step can be NULL */
statement NextStatement(variable var);
exp AddSubExp(addsub as_, secondary sec_, secList list_);
exp PlainExp(secondary sec_, secList list_);
variable StringVariable(char v_);
variable IntegerVariable(char v_);
printItem NewPrintItem(exp e_);
printList NewPrintList(printItem item_, printList next_);
test NewTest(exp a_, exp b_, comparison c_);
secondary NewSecondary(primary p_, priList list_);
secList NewSecList(addsub as_, secondary s_, secList next_);
primary BracketPrimary(exp e_);
primary VarPrimary (variable v_);
primary IntegerPrimary(int n_);
primary StringPrimary(char *str_);
priList NewPriList(muldiv md_, primary p_, priList next_);
absyn.c 的目的是為抽象語法樹中使用到的所有結構提供初始化例程。此程式碼不太有趣,並且非常重複。
/*
* Abstract Syntax Trees for Very Tiny Basic
*/
#include <malloc.h>
#include "absyn.h"
#define safe_malloc malloc
statement LetStatement(variable var_, exp e_) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = let;
ret->u.let_val.var = var_;
ret->u.let_val.e = e_;
return ret;
}
statement IfThenStatement(test t_, int target_) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = ifThen;
ret->u.ifThen_val.t = t_;
ret->u.ifThen_val.target = target_;
return ret;
}
statement PrintStatement(printItem item_, printList lst_) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = print;
ret->u.print_val.item = item_;
ret->u.print_val.lst = lst_;
return ret;
}
statement RemStatement() {
statement ret = safe_malloc(sizeof(*ret));
ret->type = rem;
return ret;
}
statement GotoStatement(int line) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = goto_;
ret->u.goto_val = line;
return ret;
}
statement GosubStatement(int line) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = gosub;
ret->u.gosub_val = line;
return ret;
}
statement ReturnStatement() {
statement ret = safe_malloc(sizeof(*ret));
ret->type = return_;
return ret;
}
statement StopStatement(){
statement ret = safe_malloc(sizeof(*ret));
ret->type = stop;
return ret;
}
/* Step can be NULL */
statement ForStatement(variable var_, exp assign_, exp to_, exp step_) {
statement ret = safe_malloc(sizeof(*ret));
if(step_ == NULL) {
ret->type = for_;
} else {
ret->type = for_step;
}
ret->u.for_val.var = var_;
ret->u.for_val.assign = assign_;
ret->u.for_val.to = to_;
ret->u.for_val.step = step_;
return ret;
}
statement NextStatement(variable var) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = next;
ret->u.next_val = var;
return ret;
}
exp AddSubExp(addsub as_, secondary sec_, secList list_) {
exp ret = safe_malloc(sizeof(*ret));
ret->fHasAddSub = 1;
ret->as = as_;
ret->sec = sec_;
ret->list = list_;
return ret;
}
exp PlainExp(secondary sec_, secList list_) {
exp ret = safe_malloc(sizeof(*ret));
ret->fHasAddSub = 0;
ret->sec = sec_;
ret->list = list_;
return ret;
}
variable StringVariable(char v_) {
variable ret = safe_malloc(sizeof(*ret));
ret->type = stringV;
ret->v = v_;
return ret;
}
variable IntegerVariable(char v_) {
variable ret = safe_malloc(sizeof(*ret));
ret->type = integerV;
ret->v = v_;
return ret;
}
printItem NewPrintItem(exp e_) { return e_;}
printList NewPrintList(printItem item_, printList next_) {
printList ret = safe_malloc(sizeof(*ret));
ret->item = item_;
ret->next = next_;
return ret;
}
test NewTest(exp a_, exp b_, comparison c_) {
test ret = safe_malloc(sizeof(*ret));
ret->a = a_;
ret->b = b_;
ret->c = c_;
return ret;
}
secondary NewSecondary(primary p_, priList list_) {
secondary ret = safe_malloc(sizeof(*ret));
ret->p = p_;
ret->list = list_;
return ret;
}
secList NewSecList(addsub as_, secondary s_, secList next_) {
secList ret = safe_malloc(sizeof(*ret));
ret->as = as_;
ret->s = s_;
ret->next = next_;
return ret;
}
primary BracketPrimary(exp e_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = bracket;
ret->u.bracket_val = e_;
return ret;
}
primary VarPrimary (variable v_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = var;
ret->u.var_val = v_;
return ret;
}
primary IntegerPrimary(int n_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = integerP;
ret->u.integer_val = n_;
return ret;
}
primary StringPrimary(char *str_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = stringP;
ret->u.string_val = str_;
return ret;
}
priList NewPriList(muldiv md_, primary p_, priList next_) {
priList ret = safe_malloc(sizeof(*ret));
return ret;
}
由於我們使用的是標準 *nix 工具,而 *nix 上常用的構建系統是 make,因此為直譯器編寫 Makefile 非常有用。請記住,大多數編譯器/直譯器非常大,需要比本示例更高階的構建系統。它們可能需要 CVS、autoconf 和分佈在不同目錄中的許多 makefile。由於這個只有一個使用五個檔案,所以它非常簡單。
DEBUG_CFLAGS := -Wall -Wno-unknown-pragmas -Wno-format -g -DDEBUG -O0
DEBUG_LDFLAGS := -g
RELEASE_CFLAGS := -O3
RELEASE_LDFLAGS :=
#Comment out following line if you need optimization ;-)
DEBUG := YES
#Comment out following line if you're not using Windows/Dos
POSTFIX := .exe
ifeq (YES, ${DEBUG})
CFLAGS := ${DEBUG_CFLAGS}
LDFLAGS := ${DEBUG_LDFLAGS}
else
CFLAGS := ${RELEASE_CFLAGS}
LDFLAGS := ${RELEASE_LDFLAGS}
endif
.PHONY : all clean
all : vtbi${POSTFIX}
vtbi${POSTFIX} : vtbi.o y.tab.o lex.yy.o absyn.o
gcc ${LDFLAGS} ${CFLAGS} -o vtbi vtbi.o y.tab.o lex.yy.o absyn.o
clean :
-rm *.o y.tab.h y.tab.c lex.yy.c *.exe
y.tab.c : VTB.y
bison -yd VTB.y -r state
y.tab.h : y.tab.c
echo "y.tab.h created by Bison as well as y.tab.c"
lex.yy.c : VTB.l
flex VTB.l
%.o : %.c
gcc -c ${CFLAGS} $< -o $@
VTB.y - 版本 2 詞法分析器現在為標記提供值,並且抽象語法樹結構已編寫完成。接下來,更新語法檔案以從規則和語義值中構建樹。所有樹節點型別都被新增到聯合宣告中。必須為規則指定型別並返回正確的型別。
%{
#include <stdio.h>
#include "absyn.h"
int yylex(void);
void yyerror(char* str)
{
/*This should handle errors!*/
}
void addLine(int line, statement stm);
%}
%union {
/* Token Types */
int number;
char *string;
char var;
/* Abstract Syntax Tree Types */
statement stm;
exp exp;
variable variable;
printList printList;
test test;
comparison comp;
secondary sec;
secList secList;
primary pri;
priList priList;
muldiv muldiv;
addsub addsub;
}
%token <number> NUMBER
%token <string> STRING
%token <var> VAR_NUM
%token <var> VAR_STR
%token LET IF THEN PRINT REM GOTO GOSUB RETURN STOP
FOR EQUAL TO STEP NEXT SEPARATE
NEQUAL LT GT LTEQ GTEQ
ADD SUB MUL DIV
LPARAN RPARAN
%start lines
%type <stm> statement
%type <exp> exp
%type <exp> printItem
%type <variable> variable
%type <printList> printList
%type <test> test
%type <comp> comparison
%type <sec> secondary
%type <secList> secList
%type <pri> primary
%type <priList> priList
%type <muldiv> muldiv
%type <addsub> addsub
%%
lines: line lines
| /*Nothing*/
;
line: NUMBER statement {addLine($1, $2);}
;
statement: LET variable EQUAL exp {$$=LetStatement($2,$4);}
| IF test THEN NUMBER {$$=IfThenStatement($2, $4);}
| PRINT printItem printList {$$=PrintStatement($2, $3);}
| REM {$$=RemStatement();}
| GOTO NUMBER {$$=GotoStatement($2);}
| GOSUB NUMBER {$$=GosubStatement($2);}
| RETURN {$$=ReturnStatement();}
| STOP {$$=StopStatement();}
| FOR variable EQUAL exp TO exp STEP exp
{$$=ForStatement($2, $4, $6, $8);}
| FOR variable EQUAL exp TO exp
{$$=ForStatement($2, $4, $6, NULL);}
| NEXT variable {$$=NextStatement($2);}
;
variable: VAR_NUM {$$=IntegerVariable($1);}
| VAR_STR {$$=StringVariable($1);}
;
printList: SEPARATE printItem printList {$$=NewPrintList($2,$3);}
| /* Nothing */ {$$=NULL;}
;
printItem: exp {$$=NewPrintItem($1);}
/* | STRING */
;
test: exp comparison exp {$$=NewTest($1, $3, $2);}
;
comparison: EQUAL {$$=equal;}
| NEQUAL {$$=nequal;}
| LT {$$=lt;}
| GT {$$=gt;}
| LTEQ {$$=lteq;}
| GTEQ {$$=gteq;}
;
exp: addsub secondary secList {$$=AddSubExp($1, $2, $3);}
| secondary secList {$$=PlainExp($1, $2);}
;
secList: addsub secondary secList {$$=NewSecList($1, $2, $3);}
| /* Nothing */ {$$=NULL;}
;
addsub : ADD {$$=add;}
| SUB {$$=sub;}
;
secondary : primary priList {$$=NewSecondary($1, $2);}
;
priList : muldiv primary priList {$$=NewPriList($1, $2, $3);}
| /* Nothing */ {$$=NULL;}
;
muldiv : MUL {$$=mul;}
| DIV {$$=divd;}
;
primary : LPARAN exp RPARAN {$$=BracketPrimary($2);}
| variable {$$=VarPrimary($1);}
| NUMBER {$$=IntegerPrimary($1);}
| STRING {$$=StringPrimary($1);}
;
