/*
 * lex.c — PL/0 Compiler: Lexical Analyzer (COP3402, Fall 2025)
 *
 * Reads a PL/0 source file, tokenizes it, and prints the source
 * program, the lexeme table, and the token list.
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>


// Constants
#define MAX_IDENT_LEN 11   // Longest legal identifier (longer ones become error tokens)
#define MAX_NUM_LEN 5      // Longest legal number literal (longer ones become error tokens)
#define MAX_TOKENS 500     // Capacity of the fixed-size token list
#define num_reserved 15    // Entry count of RESERVED[] below; NOTE(review): lowercase macro name is unconventional — matches existing usage, so left as-is

// Token types
// Token types
// Numeric values are part of the external contract: they are printed by
// print_lexeme_table/print_token_list and consumed by the parser, so they
// must not be renumbered.
typedef enum {
skipsym = 1 ,            // Skip (also used to tag error tokens; see add_error_token)
identsym = 2,            // Identifier
numbersym = 3,           // Number
plussym = 4,             // +
minussym = 5,            // -
multsym = 6,             // *
slashsym = 7,            // /
eqsym = 8,               // =
neqsym = 9,              // <>
lessym = 10,             // <
leqsym = 11,             // <=
gtrsym = 12,             // >
geqsym = 13,             // >=
lparentsym = 14,         // (
rparentsym = 15,         // )
commasym = 16,           // ,
semicolonsym = 17,       // ;
periodsym = 18,          // .
becomessym = 19,         // :=
beginsym = 20,           // begin
endsym = 21,             // end
ifsym = 22,              // if
fisym = 23,              // fi
thensym = 24,            // then
whilesym = 25,           // while
dosym = 26,              // do
callsym = 27,            // call
constsym = 28,           // const
varsym = 29,             // var
procsym = 30,            // procedure        
writesym = 31,           // write
readsym = 32,            // read
elsesym = 33,            // else
evensym  = 34            // even
}TokenType;

// Struct to hold token and its lexeme
typedef struct {
    char lexeme[25];   // Lexeme string
    TokenType token;
    char error_msg[50];
}Token;

// Struct to hold list of tokens
typedef struct{
    Token tokens[MAX_TOKENS];
    int count;
}TokenList;

// Structure to hold reserved words and their tokens
typedef struct {
    char *word;
    TokenType token;
}ReservedWord;

// List of reserved words and their corresponding tokens
ReservedWord RESERVED[] = {             
    {"begin", beginsym},
    {"end", endsym},
    {"if", ifsym},
    {"fi", fisym},
    {"then", thensym},
    {"while", whilesym},
    {"do", dosym},
    {"call", callsym},
    {"const", constsym},
    {"var", varsym},
    {"procedure", procsym},
    {"write", writesym},
    {"read", readsym},
    {"else", elsesym},
    {"even", evensym}
};


// Function prototypes
int push_token(TokenType t);
int is_invisible(char c);
TokenType get_reserved_token(const char *word);
void init_token_list(TokenList *list);
void add_token(TokenList *list, const char *lexeme, TokenType token);
void add_error_token(TokenList *list, const char *lexeme, const char *error_msg);
void print_source_program(const char *filename);
void print_lexeme_table(TokenList *list);
void print_token_list(TokenList *list);
void scan_file(const char *filename, TokenList *list);


//----------------------------- Main ----------------------------
//----------------------------- Main ----------------------------

// Driver: validate arguments, scan the given source file, and print
// the source program, lexeme table, and token list in that order.
int main(int argc, char **argv){

    // Exactly one argument (the source file path) is required
    if (argc != 2) {
        printf( "Usage: %s <source file>\n", argv[0] );
        return 1;
    }

    const char *source_path = argv[1];

    // Token storage, emptied before scanning
    TokenList token_list;
    init_token_list(&token_list);

    // Echo the input, then tokenize it
    print_source_program(source_path);
    scan_file(source_path, &token_list);

    // Report results
    print_lexeme_table(&token_list);
    print_token_list(&token_list);

    return 0;
}



//----------------------------- Functions ----------------------------

// Check if character is invisible (whitespace)
// Return nonzero if c is an "invisible" (whitespace) character that the
// scanner should skip: space, carriage return, tab, or newline.
// Note: deliberately narrower than isspace() — '\v' and '\f' are not skipped.
int is_invisible(char c){
    switch (c) {
        case ' ':
        case '\r':
        case '\t':
        case '\n':
            return 1;
        default:
            return 0;
    }
}

// Initialize the token list by setting initial count to 0
// Initialize the token list by setting initial count to 0.
// The tokens[] array itself is left uninitialized; only entries
// below count are ever read.
void init_token_list(TokenList *list) {
    list->count = 0;
}

// Add a token to the token list
void add_token(TokenList *list, const char *lexeme, TokenType token){

    // Error check for token list overflow
    if (list->count >= MAX_TOKENS) {
        printf( "Token list limit exceeded\n" );
        return;
    }
    
    // Copy lexeme and token type into the list
    strcpy(list->tokens[list->count].lexeme, lexeme);
    list->tokens[list->count].token = token;
    
    // Set error message to empty
    list->tokens[list->count].error_msg[0] = '\0';
    
    // Increment token count
    list->count++;
}

// Add an error token to the token list
void add_error_token(TokenList *list, const char *lexeme, const char *error_msg) {
    if (list->count >= MAX_TOKENS) {
        printf("Token list limit exceeded\n");
        return;
    }
    // Copy lexeme and set token type to 'skipsym' for errors
    strcpy(list->tokens[list->count].lexeme, lexeme);
    list->tokens[list->count].token = skipsym;
    
    // Copy the error message
    strcpy(list->tokens[list->count].error_msg, error_msg);
    
    // Increment token count
    list->count++;
}

// Scan fileand populate the token list
// Scan the source file and populate the token list.
//
// Recognizes: /* ... */ comments (skipped; an unterminated comment is
// skipped to EOF), identifiers/reserved words, unsigned number literals,
// and the PL/0 special symbols. Over-length identifiers/numbers and
// unrecognized characters are recorded as error tokens rather than
// aborting the scan.
void scan_file(const char *filename, TokenList *list){

    // Open the file
    FILE *file = fopen(filename, "r");
    // Error check for file opening
    if (!file) {
        printf( "Error opening file\n" );
        return;
    }

    char buffer[50];                    // Buffer to hold lexemes
    int pos = 0;                        // Logical lexeme length (may exceed stored length)
    int c;                              // Current character (int so EOF is representable)

    // Read characters until EOF
    while ((c = fgetc(file)) != EOF) {
        // Skip invisible characters
        if (is_invisible(c)) {
            continue;
        }

        // Comment handling: '/' is either the start of "/*" or division
        if (c == '/') {
            int next_c = fgetc(file);
            if (next_c == '*') {
                int prev = 0;
                // Inside a comment; skip until the closing "*/" or EOF
                while ((c = fgetc(file)) != EOF){
                    if (prev == '*' && c == '/') {
                        break;
                    }
                    prev = c;
                }
                continue;
            }
            else {
                // Not a comment: push the lookahead back and emit '/'
                if (next_c != EOF) {
                    ungetc(next_c, file);
                }
                add_token(list, "/", slashsym);
                continue;
            }
        }
        // Identifier and reserved word handling
        else if (isalpha(c)) {

            buffer[0] = c;
            pos = 1;

            // Read the rest of the identifier.
            // BUG FIX: the store is now capped at the buffer capacity —
            // previously buffer[pos++] could write past buffer[49] for a
            // 50+ character identifier (stack overflow). pos keeps the
            // true length so the length check below still fires.
            while ((c = fgetc(file)) != EOF && (isalnum(c))) {
                if (pos < (int)sizeof(buffer) - 1) {
                    buffer[pos] = c;
                }
                pos++;
            }
            // Null-terminate at the end of what was actually stored
            buffer[pos < (int)sizeof(buffer) ? pos : (int)sizeof(buffer) - 1] = '\0';

            if (c != EOF) {
                // Push back the character that ended the identifier
                ungetc(c, file);
            }

            // Check identifier length and add token
            if (pos > MAX_IDENT_LEN) {
                add_error_token(list, buffer, "Identifier too long");       // Add error token
            }
            else {
                TokenType token = get_reserved_token(buffer);
                add_token(list, buffer, token);                             // Add identifier or reserved word token
            }
        }
        // Number handling
        else if (isdigit(c)) {
            buffer[0] = c;
            pos = 1;
            // Read the rest of the number (same bounded-store fix as above)
            while ((c = fgetc(file)) != EOF && isdigit(c)) {
                if (pos < (int)sizeof(buffer) - 1) {
                    buffer[pos] = c;
                }
                pos++;
            }
            // Null-terminate the stored portion
            buffer[pos < (int)sizeof(buffer) ? pos : (int)sizeof(buffer) - 1] = '\0';

            // Push the terminating character back (guarded; ungetc(EOF)
            // is a failed no-op anyway, this just matches the other paths)
            if (c != EOF) {
                ungetc(c, file);
            }

            // Check for number length and add token
            if (pos > MAX_NUM_LEN) {
                add_error_token(list, buffer, "Number too long");   // Add error token
            }
            else {
                add_token(list, buffer, numbersym);                 // Add number token
            }
        }

        // Special symbols handling
        else {
            switch (c) {
                // Arithmetic operators
                case '+':
                    add_token(list, "+", plussym);                  // Add + token
                    break;
                case '-':
                    add_token(list, "-", minussym);                 // Add - token
                    break;
                case '*':
                    add_token(list, "*", multsym);                  // Add * token
                    break;
                case '=':
                    add_token(list, "=", eqsym);                    // Add = token
                    break;
                case '<': {
                    int next_c = fgetc(file);
                    // Check for <= or <>
                    if (next_c == '>') {
                        add_token(list, "<>", neqsym);              // Add <> token
                    }
                    else if (next_c == '=') {
                        add_token(list, "<=", leqsym);              // Add <= token
                    }
                    // Single <
                    else {
                        add_token(list, "<", lessym);               // Add < token
                        if (next_c != EOF) {
                            // Push the lookahead back if not EOF
                            ungetc(next_c, file);
                        }
                    }
                    break;
                }
                case '>': {
                    int next_c = fgetc(file);
                    // Check for >=
                    if (next_c == '=') {
                        add_token(list, ">=", geqsym);              // Add >= token
                    }
                    // Single >
                    else {
                        add_token(list, ">", gtrsym);               // Add > token
                        if (next_c != EOF) {
                            // Push the lookahead back if not EOF
                            ungetc(next_c, file);
                        }
                    }
                    break;
                }
                // Parentheses, commas, semicolons, periods
                case '(':
                    add_token(list, "(", lparentsym);               // Add left parenthesis token
                    break;
                case ')':
                    add_token(list, ")", rparentsym);               // Add right parenthesis token
                    break;
                case ',':
                    add_token(list, ",", commasym);                 // Add comma token
                    break;
                case ';':
                    add_token(list, ";", semicolonsym);             // Add semicolon token
                    break;
                case '.':
                    add_token(list, ".", periodsym);                // Add period token
                    break;
                case ':': {
                    // ':' is only legal as the first half of ":="
                    int next_c = fgetc(file);
                    if (next_c == '=') {
                        add_token(list, ":=", becomessym);          // Add := token
                    }
                    // Lone ':' is an invalid character
                    else {
                        char invalid_char[2] = {c, '\0'};

                        if (next_c != EOF) {
                            // Push the lookahead back if not EOF
                            ungetc(next_c, file);
                        }

                        add_error_token(list, invalid_char, "Invalid character");  // Add error token
                    }
                    break;
                }
                // Default case for invalid characters
                default: {
                    char invalid_char[2] = {c, '\0'};
                    add_error_token(list, invalid_char, "Invalid character");       // Add error token
                }
            }

        }
    }

    // Close the file
    fclose(file);
}

// Print the source program
void print_source_program(const char *filename){
    // Open the file
    FILE *file = fopen(filename, "r");

    // Error check
    if (!file) {
        printf( "Error opening file\n" );
        return;
    }

    printf("Source Program:\n\n");
    char ch;

    // Read and print each character until EOF
    while ((ch = fgetc(file)) != EOF) {
        putchar(ch);
    }
    
    // New lines for formatting
    printf("\n\n");
    
    // Close the file
    fclose(file);
}

// Print the lexeme table
void print_lexeme_table(TokenList *list){
    // Print table header
    printf("Lexeme Table:\n\n");
    printf("Lexeme\t\tToken Type\n");

    // Iterate through tokens and print them
    for (int i = 0; i < list->count; i++) {
        Token *t = &list->tokens[i];
        
        if (t->token == skipsym && t->error_msg[0] != '\0') {
            // Error token
            printf("%-10s\t%s\n", t->lexeme, t->error_msg);
        } 
        else {
            // Regular token
            printf("%-10s\t%d\n", t->lexeme, t->token);
        }
    }
    printf("\n");
}

// Print the token list
void print_token_list(TokenList *list){
    printf("Token List:\n\n");
    for (int i = 0; i < list->count; i++) {
        // Print token type
        printf("%d ", list->tokens[i].token);
        
        // Print lexeme for identifiers and numbers
        if (list->tokens[i].token == identsym || list->tokens[i].token == numbersym) {
            printf("%s ", list->tokens[i].lexeme);
        }
    }
    printf("\n");
}

// Get the token type for a reserved word or identifier
TokenType get_reserved_token(const char *word){
    for (int i = 0; i < num_reserved; i++) {
        // Iterate through reserved words and compare
        if (strcmp(word, RESERVED[i].word) == 0) {
            return RESERVED[i].token;
        }
    }
    // Not a reserved word, it's an identifier
    return identsym;
}