/* 词法分析器 (Lexical Analyzer) — C implementation */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <limits.h>

/* Number of elements in a true array (does not work on pointers). */
#define NELEMS(arr) (sizeof(arr) / sizeof((arr)[0]))

/*
 * Minimal growable-array helpers.
 *
 * da_dim(name, type) defines the array pointer `name` plus two bookkeeping
 * ints: _qy_<name>_p (elements in use) and _qy_<name>_max (elements
 * allocated).  The other macros operate on those three objects by name.
 */
#define da_dim(name, type) \
    type *name = NULL; \
    int _qy_ ## name ## _p = 0; \
    int _qy_ ## name ## _max = 0

/* Drop the contents but keep the allocation so it can be reused. */
#define da_rewind(name) (_qy_ ## name ## _p = 0)

/*
 * Make room for at least one more element, growing in steps of 32.
 * The realloc() result is checked before it replaces the old pointer;
 * running out of memory terminates the program.
 */
#define da_redim(name) \
    do { \
        if (_qy_ ## name ## _p >= _qy_ ## name ## _max) { \
            void *grown_ = realloc(name, \
                (size_t)(_qy_ ## name ## _max + 32) * sizeof(name[0])); \
            if (grown_ == NULL) { \
                fputs("out of memory\n", stderr); \
                exit(1); \
            } \
            name = grown_; \
            _qy_ ## name ## _max += 32; \
        } \
    } while (0)

/* Append x, growing the array first if it is full. */
#define da_append(name, x) \
    do { \
        da_redim(name); \
        name[_qy_ ## name ## _p++] = (x); \
    } while (0)

/* Number of elements currently stored. */
#define da_len(name) (_qy_ ## name ## _p)

/* Token kinds.  run() keeps a name table indexed by these values. */
typedef enum {
    tk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq,
    tk_Gtr, tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While,
    tk_Print, tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma,
    tk_Ident, tk_Integer, tk_String
} TokenType;

/*
 * One scanned token: its kind, the line/column where it started, and a
 * payload whose active member depends on the kind.
 */
typedef struct {
    TokenType tok;
    int err_ln, err_col;
    union {
        int n;                  /* value for constants */
        char *text;             /* text for idents */
    };
} tok_s;

static FILE *source_fp, *dest_fp;
/* Current position in the input and the one-character lookahead. */
static int line = 1, col = 0, the_ch = ' ';
/* Shared scratch buffer holding the text of the identifier, number or
 * string being scanned; it is rewound at the start of each such token. */
da_dim(text, char);

tok_s gettok(void);

/*
 * Print "(line,col) error: <message>" to stdout and exit with status 1.
 * fmt and the following arguments are printf-style; the formatted message
 * is truncated to fit the local buffer rather than overflowing it.
 */
static void error(int err_line, int err_col, const char *fmt, ...) {
    char buf[1000];
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(buf, sizeof buf, fmt, ap);
    va_end(ap);
    printf("(%d,%d) error: %s\n", err_line, err_col, buf);
    exit(1);
}

/*
 * Read the next character from source_fp into the_ch and return it.
 * col is advanced for every character; a newline bumps line and resets
 * col to 0.
 */
static int next_ch(void) {
    the_ch = getc(source_fp);
    ++col;
    if (the_ch == '\n') {
        ++line;
        col = 0;
    }
    return the_ch;
}

/*
 * Scan the remainder of a character constant ('x', '\n' or '\\').
 * n is the character right after the opening quote (gettok passes the_ch).
 * Returns a tk_Integer token carrying the character's value.
 */
static tok_s char_lit(int n, int err_line, int err_col) {
    if (the_ch == '\'')
        error(err_line, err_col, "gettok: empty character constant");
    if (the_ch == '\\') {
        next_ch();
        if (the_ch == 'n')
            n = 10;
        else if (the_ch == '\\')
            n = '\\';
        else
            error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
    }
    if (next_ch() != '\'')
        error(err_line, err_col, "multi-character constant");
    next_ch();
    return (tok_s){tk_Integer, err_line, err_col, {n}};
}

/*
 * Called after a '/' has been consumed.  If the next character is not '*'
 * this is the division operator; otherwise skip the comment up to its
 * closing star-slash and return the token that follows it.
 */
static tok_s div_or_cmt(int err_line, int err_col) {
    if (the_ch != '*')
        return (tok_s){tk_Div, err_line, err_col, {0}};

    /* comment found */
    next_ch();
    for (;;) {
        if (the_ch == '*') {
            /* Do not advance again when the test fails: the character just
             * read may itself be a '*' that starts the real terminator. */
            if (next_ch() == '/') {
                next_ch();
                return gettok();
            }
        } else if (the_ch == EOF) {
            error(err_line, err_col, "EOF in comment");
        } else {
            next_ch();
        }
    }
}

/*
 * Scan a string literal.  the_ch is still the opening quote on entry and
 * start is that quote character.  The text between the quotes is collected
 * into the shared `text` buffer, so the returned pointer is only valid
 * until the next identifier, number or string is scanned.
 */
static tok_s string_lit(int start, int err_line, int err_col) {
    da_rewind(text);

    while (next_ch() != start) {
        if (the_ch == '\n')
            error(err_line, err_col, "EOL in string");
        if (the_ch == EOF)
            error(err_line, err_col, "EOF in string");
        da_append(text, (char)the_ch);
    }
    da_append(text, '\0');

    next_ch();
    return (tok_s){tk_String, err_line, err_col, {.text = text}};
}

/*
 * bsearch() comparator.  Both arguments are treated as pointers to a
 * `const char *`: the key is the address of the identifier pointer, and
 * each table element starts with its keyword string pointer.
 */
static int kwd_cmp(const void *p1, const void *p2) {
    return strcmp(*(const char *const *)p1, *(const char *const *)p2);
}

/*
 * Return the keyword token for ident, or tk_Ident when it is not a keyword.
 * The table must stay sorted by strcmp() order because it is binary-searched.
 */
static TokenType get_ident_type(const char *ident) {
    static const struct keyword {
        const char *s;
        TokenType sym;
    } kwds[] = {
        {"else",  tk_Else},
        {"if",    tk_If},
        {"print", tk_Print},
        {"putc",  tk_Putc},
        {"while", tk_While},
    };
    const struct keyword *kwp =
        bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp);

    return kwp == NULL ? tk_Ident : kwp->sym;
}

/*
 * Scan a run of letters, digits and underscores starting at the_ch.
 * A run that starts with a digit must consist of digits only and becomes a
 * tk_Integer; anything else is a keyword or tk_Ident whose text lives in
 * the shared `text` buffer.  An empty run means the_ch cannot start any
 * token and is reported as an error.
 */
static tok_s ident_or_int(int err_line, int err_col) {
    bool is_number = true;

    da_rewind(text);
    while (isalnum(the_ch) || the_ch == '_') {
        da_append(text, (char)the_ch);
        if (!isdigit(the_ch))
            is_number = false;
        next_ch();
    }
    if (da_len(text) == 0)
        error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch);
    da_append(text, '\0');

    if (isdigit((unsigned char)text[0])) {
        char *end;
        long value;

        if (!is_number)
            error(err_line, err_col, "invalid number: %s\n", text);

        /* Base 0: a leading 0 selects octal.  errno must be cleared first so
         * that ERANGE can only come from this call. */
        errno = 0;
        value = strtol(text, &end, 0);
        if (*end != '\0')       /* e.g. "09": not every digit was consumed */
            error(err_line, err_col, "invalid number: %s\n", text);
        if (errno == ERANGE || value > INT_MAX)   /* payload is an int */
            error(err_line, err_col, "Number exceeds maximum value");
        return (tok_s){tk_Integer, err_line, err_col, {(int)value}};
    }
    return (tok_s){get_ident_type(text), err_line, err_col, {.text = text}};
}

/*
 * Resolve a one- or two-character operator after its first character has
 * been consumed: if the_ch equals expect, consume it and return ifyes,
 * otherwise return ifno.  Passing tk_EOI as ifno means the first character
 * is not a token on its own, so a mismatch is an error.
 */
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) {
    if (the_ch == expect) {
        next_ch();
        return (tok_s){ifyes, err_line, err_col, {0}};
    }
    if (ifno == tk_EOI)
        error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
    return (tok_s){ifno, err_line, err_col, {0}};
}

/*
 * Return the next token from the input.  Leading white space is skipped;
 * the token's position is the line/column of its first character.
 */
tok_s gettok(void) {
    /* skip white space */
    while (isspace(the_ch))
        next_ch();

    int err_line = line;
    int err_col  = col;

    switch (the_ch) {
    case '{':  next_ch(); return (tok_s){tk_Lbrace, err_line, err_col, {0}};
    case '}':  next_ch(); return (tok_s){tk_Rbrace, err_line, err_col, {0}};
    case '(':  next_ch(); return (tok_s){tk_Lparen, err_line, err_col, {0}};
    case ')':  next_ch(); return (tok_s){tk_Rparen, err_line, err_col, {0}};
    case '+':  next_ch(); return (tok_s){tk_Add,    err_line, err_col, {0}};
    case '-':  next_ch(); return (tok_s){tk_Sub,    err_line, err_col, {0}};
    case '*':  next_ch(); return (tok_s){tk_Mul,    err_line, err_col, {0}};
    case '%':  next_ch(); return (tok_s){tk_Mod,    err_line, err_col, {0}};
    case ';':  next_ch(); return (tok_s){tk_Semi,   err_line, err_col, {0}};
    case ',':  next_ch(); return (tok_s){tk_Comma,  err_line, err_col, {0}};
    case '/':  next_ch(); return div_or_cmt(err_line, err_col);
    case '\'': next_ch(); return char_lit(the_ch, err_line, err_col);
    case '<':  next_ch(); return follow('=', tk_Leq, tk_Lss,    err_line, err_col);
    case '>':  next_ch(); return follow('=', tk_Geq, tk_Gtr,    err_line, err_col);
    case '=':  next_ch(); return follow('=', tk_Eq,  tk_Assign, err_line, err_col);
    case '!':  next_ch(); return follow('=', tk_Neq, tk_Not,    err_line, err_col);
    case '&':  next_ch(); return follow('&', tk_And, tk_EOI,    err_line, err_col);
    case '|':  next_ch(); return follow('|', tk_Or,  tk_EOI,    err_line, err_col);
    /* string_lit() consumes the opening quote itself. */
    case '"':  return string_lit(the_ch, err_line, err_col);
    case EOF:  return (tok_s){tk_EOI, err_line, err_col, {0}};
    default:   return ident_or_int(err_line, err_col);
    }
}

/*
 * Tokenize the whole input, writing one line per token to dest_fp:
 * line, column, token name padded to 15 columns, then the value for
 * integers, identifiers and strings.  Stops after emitting End_of_input
 * and closes dest_fp unless it is stdout.
 */
void run(void) {
    /* Indexed by TokenType; designators keep each name tied to its enum. */
    static const char *const token_names[] = {
        [tk_EOI]     = "End_of_input",
        [tk_Mul]     = "Op_multiply",
        [tk_Div]     = "Op_divide",
        [tk_Mod]     = "Op_mod",
        [tk_Add]     = "Op_add",
        [tk_Sub]     = "Op_subtract",
        [tk_Negate]  = "Op_negate",
        [tk_Not]     = "Op_not",
        [tk_Lss]     = "Op_less",
        [tk_Leq]     = "Op_lessequal",
        [tk_Gtr]     = "Op_greater",
        [tk_Geq]     = "Op_greaterequal",
        [tk_Eq]      = "Op_equal",
        [tk_Neq]     = "Op_notequal",
        [tk_Assign]  = "Op_assign",
        [tk_And]     = "Op_and",
        [tk_Or]      = "Op_or",
        [tk_If]      = "Keyword_if",
        [tk_Else]    = "Keyword_else",
        [tk_While]   = "Keyword_while",
        [tk_Print]   = "Keyword_print",
        [tk_Putc]    = "Keyword_putc",
        [tk_Lparen]  = "LeftParen",
        [tk_Rparen]  = "RightParen",
        [tk_Lbrace]  = "LeftBrace",
        [tk_Rbrace]  = "RightBrace",
        [tk_Semi]    = "Semicolon",
        [tk_Comma]   = "Comma",
        [tk_Ident]   = "Identifier",
        [tk_Integer] = "Integer",
        [tk_String]  = "String",
    };
    _Static_assert(NELEMS(token_names) == tk_String + 1,
                   "token_names must have one entry per TokenType");

    tok_s tok;

    do {
        tok = gettok();
        fprintf(dest_fp, "%5d  %5d %-15s",
                tok.err_ln, tok.err_col, token_names[tok.tok]);
        if (tok.tok == tk_Integer)
            fprintf(dest_fp, "  %4d", tok.n);
        else if (tok.tok == tk_Ident)
            fprintf(dest_fp, " %s", tok.text);
        else if (tok.tok == tk_String)
            fprintf(dest_fp, " \"%s\"", tok.text);
        fprintf(dest_fp, "\n");
    } while (tok.tok != tk_EOI);

    if (dest_fp != stdout)
        fclose(dest_fp);
}

/*
 * Bind *fp to the standard stream std when fn is empty, otherwise open the
 * file fn with the given fopen() mode.  Failure to open is fatal.
 */
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
    if (fn[0] == '\0')
        *fp = std;
    else if ((*fp = fopen(fn, mode)) == NULL)
        error(0, 0, "Can't open %s\n", fn);
}

/*
 * Usage: lex [source-file [output-file]]
 * A missing argument selects stdin / stdout respectively.
 */
int main(int argc, char *argv[]) {
    init_io(&source_fp, stdin,  "r",  argc > 1 ? argv[1] : "");
    init_io(&dest_fp,   stdout, "wb", argc > 2 ? argv[2] : "");
    run();
    return 0;
}


/*
 * Sample output, apparently from running the lexer on a test input
 * (columns: line, column, token name, optional value):
 *
 *     5     16 Keyword_print
 *     5     40 Op_subtract
 *     6     16 Keyword_putc
 *     6     40 Op_less
 *     7     16 Keyword_if
 *     7     40 Op_greater
 *     8     16 Keyword_else
 *     8     40 Op_lessequal
 *     9     16 Keyword_while
 *     9     40 Op_greaterequal
 *    10     16 LeftBrace
 *    10     40 Op_equal
 *    11     16 RightBrace
 *    11     40 Op_notequal
 *    12     16 LeftParen
 *    12     40 Op_and
 *    13     16 RightParen
 *    13     40 Op_or
 *    14     16 Op_subtract
 *    14     40 Semicolon
 *    15     16 Op_not
 *    15     40 Comma
 *    16     16 Op_multiply
 *    16     40 Op_assign
 *    17     16 Op_divide
 *    17     40 Integer            42
 *    18     16 Op_mod
 *    18     40 String          "String literal"
 *    19     16 Op_add
 *    19     40 Identifier      variable_name
 *    20     26 Integer            10
 *    21     26 Integer            92
 *    22     26 Integer            32
 *    23      1 End_of_input
 */