How can I add float parsing capability to a C lexical analyzer?

https://stackoverflow.com/questions/22623796

20-06-2023
|

Question

I am trying to add floating-point support to this simple lexical analyzer that I've written in C, for C (among other things). I have some ideas on how to do this, but they are all incomplete solutions. They mainly involve adding an if statement to the "Parse integer literals" case, but the lexer will still stop at the period and report it as a separate period token, because the while statement only continues for digits. I thought about adding an OR to that while statement, but I am not entirely sure how to specify that the extra character must be a period only. Here is the code:

    /* front.c */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <conio.h>
/*Global Declarations */
/*variables*/
int charClass;      /* class of nextChar: LETTER, DIGIT, UNKNOWN or EOF */
char lexeme [100];  /* NUL-terminated text of the lexeme being built */
char nextChar;      /* most recently read input character */
int lexLen;         /* number of characters currently stored in lexeme */
int token;          /* not referenced by the code shown in this file */
int nextToken;      /* code of the token most recently recognised */
/* fopen() is already declared by <stdio.h>.  The former `*fopen()`
   redeclaration was redundant and, having no prototype, conflicts with
   the real declaration under C23, so only the stream pointer remains. */
FILE *in_fp;

/*function declarations*/
/* Real prototypes (rather than empty parentheses) let the compiler
   check the arguments of every call; lookup is declared too so that it
   can be called from anywhere in the file. */
void addChar(void);
void getChar(void);
void getNonBlank(void);
int lookup(char ch);
int lex(void);

/*Character classes */
/* Values stored in charClass; EOF from <stdio.h> is used as a fourth class. */
#define LETTER 0
#define DIGIT 1
#define UNKNOWN 99

/*token codes*/
/* Values stored in nextToken.  Each code must be a distinct integer. */
#define INT_LIT 10
#define IDENT 11
/* Floating-point literal.  The original `#define FLOAT` expanded to
   nothing and so could not be assigned to nextToken; it now has a real
   value, and FLOAT is kept as an alias for the old name. */
#define FLOAT_LIT 12
#define FLOAT FLOAT_LIT
#define ASSIGN_OP 20
#define ADD_OP 21
#define SUB_OP 22
#define MULT_OP 23
#define DIV_OP 24
#define LEFT_PAREN 25
#define RIGHT_PAREN 26
#define MOD_OP 27
#define SEMICOL 28
#define COMMA 29
#define EXCLAMATION_MARK 30
#define AT_SIGN 31
#define POUND_SIGN 32
#define DOLLAR_SIGN 33
#define CARAT_SIGN 34
#define AMPERSAND 35
#define PERIOD_MARK 36
#define LESSTHAN_SIGN 37
#define GREATERTHAN_SIGN 38

#define QUESTION_MARK 39
#define LEFT_SQUAREBRACKET 40
#define RIGHT_SQUAREBRACKET 41
#define LEFT_CURLYBRACKET 42
#define RIGHT_CURLYBRACKET 43
#define BACKSLASH 44
#define VERTICALBAR 45

#define SINGLE_QUOTE 46

#define DOUBLE_QUOTE 47
#define COLON 48

#define UNDERSCORE 49
#define TILDE 50
#define GRAVE_ACCENT 51




/*********************/
/*main driver
   Opens "front.in" and calls lex() repeatedly until nextToken is EOF.
   Returns 0 after processing the file, 1 if it cannot be opened. */
int main(void)
{
/*Open the input data file and process its contents*/
    in_fp = fopen("front.in", "r");
    if (in_fp == NULL)
    {
        printf("ERROR - cannot open front.in \n");
        return 1;
    }

    /* Read the first character so that lex() starts with a valid
       nextChar/charClass pair. */
    getChar();
    do
    {
        lex();
    } while (nextToken != EOF);

    /* Release the input stream; the original never closed it. */
    fclose(in_fp);
    return 0;
}

/***************************/
/*lookup - a function to lookup operators and parentheses
        and return the token

   ch: the character to classify.

   Appends the current input character to lexeme through addChar()
   (which stores nextChar, not ch), sets nextToken to the code that
   matches ch and returns that code.  A character with no case below
   produces EOF, which is also the value that ends the loop in main. */
int lookup(char ch)
{
    /* Every case of the original switch, including default, started
       with addChar(), so the call is made once here instead. */
    addChar();

    switch (ch)
    {
        case '=':  nextToken = ASSIGN_OP;           break;
        case '(':  nextToken = LEFT_PAREN;          break;
        case ')':  nextToken = RIGHT_PAREN;         break;
        case '+':  nextToken = ADD_OP;              break;
        case '-':  nextToken = SUB_OP;              break;
        case '*':  nextToken = MULT_OP;             break;
        case '/':  nextToken = DIV_OP;              break;
        case '%':  nextToken = MOD_OP;              break;
        case ';':  nextToken = SEMICOL;             break;
        case ':':  nextToken = COLON;               break;
        case '"':  nextToken = DOUBLE_QUOTE;        break;
        case ',':  nextToken = COMMA;               break;
        case '.':  nextToken = PERIOD_MARK;         break;
        case '!':  nextToken = EXCLAMATION_MARK;    break;
        case '@':  nextToken = AT_SIGN;             break;
        case '#':  nextToken = POUND_SIGN;          break;
        case '$':  nextToken = DOLLAR_SIGN;         break;
        case '^':  nextToken = CARAT_SIGN;          break;
        case '&':  nextToken = AMPERSAND;           break;
        case '<':  nextToken = LESSTHAN_SIGN;       break;
        case '>':  nextToken = GREATERTHAN_SIGN;    break;
        case '?':  nextToken = QUESTION_MARK;       break;
        case '[':  nextToken = LEFT_SQUAREBRACKET;  break;
        case ']':  nextToken = RIGHT_SQUAREBRACKET; break;
        case '{':  nextToken = LEFT_CURLYBRACKET;   break;
        case '}':  nextToken = RIGHT_CURLYBRACKET;  break;
        case '\'': nextToken = SINGLE_QUOTE;        break;
        case '|':  nextToken = VERTICALBAR;         break;
        case '_':  nextToken = UNDERSCORE;          break;
        case '~':  nextToken = TILDE;               break;
        case '`':  nextToken = GRAVE_ACCENT;        break;
        case '\\': nextToken = BACKSLASH;           break;

        /* Unrecognised character: reported as EOF, as in the original. */
        default:   nextToken = EOF;                 break;
    }
    return nextToken;
}

/*****************************/
/* addChar - append nextChar to lexeme and keep lexeme NUL-terminated.
   When lexLen is already above 98 nothing is stored and an error
   message is printed instead. */
void addChar()
{
    /* Index 98 is the last slot that still leaves room for the
       terminator in the 100-byte buffer. */
    if (lexLen > 98)
    {
        printf("Error - lexeme is too long \n");
        return;
    }

    lexeme[lexLen] = nextChar;
    lexLen += 1;
    lexeme[lexLen] = '\0';
}

/**********************************/
/* getChar- a function to get the next character of
            input and determine its character class

   Reads one character from in_fp, stores it in nextChar and sets
   charClass to LETTER, DIGIT, UNKNOWN, or EOF at end of input. */
void getChar()
{
    /* getc() returns an int so that EOF can be told apart from every
       valid byte.  Comparing after narrowing to char either never
       matches EOF (unsigned char) or mistakes byte 0xFF for it. */
    int c = getc(in_fp);

    nextChar = (char)c;

    if (c == EOF)
    {
        charClass = EOF;
    }
    /* c is an unsigned char value here, which is what the <ctype.h>
       functions require; a negative char would be undefined behaviour. */
    else if (isalpha(c))
    {
        charClass = LETTER;
    }
    else if (isdigit(c))
    {
        charClass = DIGIT;
    }
    else
    {
        charClass = UNKNOWN;
    }
}

/********************************************/
/* getNonBlank - a function to call getChar until it
                    returns a non-whitespace character */
void getNonBlank()
{
    while (isspace(nextChar))
        getChar();
}

/*******************************/
/* lex - a simple lexical analyzer for arithmetic
        expressions

   Skips blanks, collects the next lexeme into lexeme, stores its token
   code in nextToken, prints both and returns the token code. */
int lex()
{
    lexLen = 0;
    getNonBlank();

    switch (charClass)
    {
        /* Identifier: a letter followed by any letters or digits. */
        case LETTER:
            do
            {
                addChar();
                getChar();
            } while (charClass == LETTER || charClass == DIGIT);
            nextToken = IDENT;
            break;

        /* Integer literal: one or more digits. */
        case DIGIT:
            do
            {
                addChar();
                getChar();
            } while (charClass == DIGIT);
            nextToken = INT_LIT;
            break;

        /* Anything else is a single-character token classified by
           lookup, which also sets nextToken. */
        case UNKNOWN:
            lookup(nextChar);
            getChar();
            break;

        /* End of input: report the text "EOF" as the lexeme. */
        case EOF:
            nextToken = EOF;
            lexeme[0] = 'E'; lexeme[1] = 'O'; lexeme[2] = 'F';
            lexeme[3] = '\0';
            break;

        default:
            break;
    }

    printf("Next token is:  %d, Next lexeme is %s\n", nextToken, lexeme);
    return nextToken;
}

I was thinking that maybe I could use something like "charClass.ch == '.'" in the while statement, as an extension of "charClass == DIGIT" via || (or), but I think I might be getting mixed up with another language or doing it wrong. I might not be, but it's kind of hard to test this program properly at the moment.

Here is the specific part I think I need to change to get float:

    /*Parse integer literals and ?Floats?*/
    case DIGIT:
        addChar();
        getChar();
        while (charClass == DIGIT)
        {
            addChar();
            getChar();
        }
        nextToken = INT_LIT;
        break;

Solution

/*Parse integer literals and ?Floats?*/
    case DIGIT:
        addChar();
        getChar();
        while (charClass == DIGIT)
        {
            addChar();
            getChar();
        }

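At this point you already know what nextChar is. Test the character itself (nextChar == '.'), not charClass: a period is classified only as UNKNOWN, so charClass cannot tell it apart from any other punctuation. If it's a dot, write some more code to consume it and all the following digits, and set nextToken to FLOAT_LIT (a token code defined alongside INT_LIT, with its own numeric value). Otherwise fall through to this: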

        nextToken = INT_LIT;
        break;

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow