Pergunta

I need to write a simple tokenizer for the C language. It doesn't have to classify things and doesn't have to use any grammar. All it needs to do is print the words, characters, parentheses and other things separately. I'm using lex for it, but I need to read a C source file and then tokenize it. Below you can find my current code. I have three questions.

1) How can I correct the error message I get when I compile:

parser.l:47:1: warning: unknown conversion type character ‘=’ in format [-Wformat]

2) How can I make the lexer run on the source file that is passed as an argument?

3) How can I make the tokenizer print the names of variables and other unspecified things just as they are? That is, given int test, it prints int because that keyword is specified in the lexer; I also want it to print test separately, even though test is not specified in the lexer.

Here is my code:

%{
#include <stdio.h>
#include <stdlib.h>
%}

%%

"auto"          { printf("auto\t"); }
"break"         { printf("break\t"); }
"case"          { printf("case\t"); }
"char"          { printf("char\t"); }
"const"         { printf("const\t"); }
"continue"      { printf("continue\t"); }
"default"       { printf("default\t"); }
"do"            { printf("do\t"); }
"double"        { printf("double\t"); }
"else"          { printf("else\t"); }
"enum"          { printf("enum\t"); }
"extern"        { printf("extern\t"); }
"float"         { printf("float\t"); }
"for"           { printf("for\t"); }
"goto"          { printf("goto\t"); }
"if"            { printf("if\t"); }
"inline"        { printf("inline\t"); }
"int"           { printf("int\t"); }
"long"          { printf("long\t"); }
"printf"        { printf("printf\t"); }
"register"      { printf("register\t"); }
"restrict"      { printf("restrict\t"); }
"return"        { printf("return\t"); }
"short"         { printf("short\t"); }
"signed"        { printf("signed\t"); }
"sizeof"        { printf("sizeof\t"); }
"static"        { printf("static\t"); }
"struct"        { printf("struct\t"); }
"switch"        { printf("switch\t"); }
"typedef"       { printf("typedef\t"); }
"union"         { printf("union\t"); }
"unsigned"      { printf("unsigned\t"); }
"void"          { printf("void\t"); }
"volatile"      { printf("volatile\t"); }
"while"         { printf("while\t"); }
"+="            { printf("+=\t"); }
"-="            { printf("-=\t"); }
"*="            { printf("*=\t"); }
"/="            { printf("/=\t"); }
"%="            { printf("%=\t"); /* NOTE(review): this is line 47 of parser.l, the line the -Wformat warning points at: printf reads "%=" as a conversion specification */ }
"&="            { printf("&=\t"); }
"^="            { printf("^=\t"); }
"|="            { printf("|=\t"); }
"++"            { printf("++\t"); }
"--"            { printf("--\t"); }
"->"            { printf("->\t"); }
"&&"            { printf("&&\t"); }
"||"            { printf("||\t"); }
"<="            { printf("<=\t"); }
">="            { printf(">=\t"); }
"=="            { printf("==\t"); }
"!="            { printf("!=\t"); }
"{"         { printf("{\t"); }
"}"         { printf("}\t"); }
"="         { printf("=\t"); }
"("         { printf("(\t"); }
")"         { printf(")\t"); }
"["         { printf("[\t"); }
"]"         { printf("]\t"); }
"<"         { printf("<\t"); }
">"         { printf(">\t"); }
    /* Every rule above is a literal string: C keywords plus "printf", then operators and brackets; each action prints the matched text followed by a tab. No rule matches identifiers, numeric literals or whitespace. */

%%

/*
 * Program entry point: expects exactly one command-line argument, the name
 * of the file to tokenize, opens it, and runs the scanner via yylex().
 *
 * NOTE(review): `void main` is not one of the standard signatures for main;
 * the standard forms return int.
 */
void main(int argc, char** argv)
{
    /* Require exactly one argument besides the program name. */
    if(argc != 2)
    {
        printf("Usage: %s filename\n", argv[0]);
        exit(1);
    }

    char *filename = argv[1];
    FILE *f = fopen(filename, "r");

    if(f == NULL)
    {
        /* Report the failure; control then falls off the end of main. */
        fprintf(stderr, "Unable to open %s\n", filename);
    }
    else
    {
        /*
         * NOTE(review): f is never handed to the scanner (nothing assigns it
         * to yyin) and it is never closed, so yylex() does not read from the
         * file opened above.
         */
        yylex();
    }
}
Foi útil?

Solução

Q1. The warning you have is from this line:

"%="            { printf("%=\t"); }

You want:

"%="            { printf("%%=\t"); }

as you need to escape the '%'.

Q2. To get lex to read from a given file, you need to set yyin to that file before calling yylex() - see the question "In lex, how to make yyin point to a file with the main function in yacc?" for more details.

Q3. You will need to use regexps to match them. See https://www.cs.princeton.edu/~appel/modern/c/software/flex/flex.html for example.

Outras dicas

/*
 * Entry point for the scanner.
 *
 * When a file name is given as the first argument, the file is opened and
 * installed as yyin so that yylex() tokenizes it. With no argument, yyin is
 * left untouched and the scanner reads its default input (stdin for
 * lex/flex-generated scanners).
 *
 * Returns 0 after the input has been scanned; exits with status 1 if the
 * named file cannot be opened.
 */
int main(int argc, char **argv)
{
    FILE *file = NULL;

    if (argc > 1)
    {
        file = fopen(argv[1], "r");
        if (!file)
        {
            fprintf(stderr, "Could not open %s\n", argv[1]);
            exit(1);
        }
        /* Redirect the scanner from its default input to the opened file. */
        yyin = file;
    }

    yylex();

    /* Only close what was opened here; fclose(NULL) is not allowed. */
    if (file)
    {
        fclose(file);
    }
    return 0;
}

To print identifiers you can use the regular expression below

[_a-zA-Z][_a-zA-Z0-9]*       { /* C identifier: a letter or '_' first, then letters, digits or '_' */ printf("identifier\t%s\t",yytext); }

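Put this rule after all the other rules. When two rules match text of the same length, lex uses the one listed first, so placing the identifier rule last lets keywords such as int be reported by their own rules instead of always being printed as identifiers.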

Licenciado em: CC-BY-SA com atribuição
Não afiliado a StackOverflow
scroll top