diff options
| author | Martial Simon <msimon_fr@hotmail.com> | 2025-09-15 01:07:58 +0200 |
|---|---|---|
| committer | Martial Simon <msimon_fr@hotmail.com> | 2025-09-15 01:07:58 +0200 |
| commit | 967be9e750221ab2ab783f95df79bb26d290a45e (patch) | |
| tree | 6802900a5e975f9f68b169f0f503f040056d6952 /21sh/ll-expr/src/lexer | |
Diffstat (limited to '21sh/ll-expr/src/lexer')
| -rw-r--r-- | 21sh/ll-expr/src/lexer/lexer.c | 104 | ||||
| -rw-r--r-- | 21sh/ll-expr/src/lexer/lexer.h | 61 | ||||
| -rw-r--r-- | 21sh/ll-expr/src/lexer/token.h | 25 |
3 files changed, 190 insertions, 0 deletions
diff --git a/21sh/ll-expr/src/lexer/lexer.c b/21sh/ll-expr/src/lexer/lexer.c
new file mode 100644
index 0000000..3b3d29f
--- /dev/null
+++ b/21sh/ll-expr/src/lexer/lexer.c
@@ -0,0 +1,131 @@
+#include "lexer.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Associates one input character with the token type it produces. */
+struct template
+{
+    char value;
+    enum token_type type;
+};
+
+/* Single-character tokens; end of input and numbers are handled separately. */
+static const struct template templates[] = {
+    { '+', TOKEN_PLUS }, { '-', TOKEN_MINUS },    { '*', TOKEN_MUL },
+    { '/', TOKEN_DIV },  { '(', TOKEN_LEFT_PAR }, { ')', TOKEN_RIGHT_PAR }
+};
+
+struct lexer *lexer_new(const char *input)
+{
+    struct lexer *lexer = malloc(sizeof(*lexer));
+    if (!lexer)
+        return NULL;
+    lexer->input = input;
+    lexer->pos = 0;
+    /* Placeholder until the first token is lexed. */
+    lexer->current_tok.type = TOKEN_ERROR;
+    lexer->current_tok.value = 0;
+    return lexer;
+}
+
+void lexer_free(struct lexer *lexer)
+{
+    free(lexer);
+}
+
+/*
+ * Reads the run of decimal digits starting at `str` without modifying it.
+ * Returns the length of the run and stores the parsed number in `*value`.
+ * `*in_range` is set to 0 (and `*value` to 0) when the number does not fit
+ * in a ssize_t, 1 otherwise.
+ */
+static size_t scan_number(const char *str, ssize_t *value, int *in_range)
+{
+    /* ssize_t is the signed counterpart of size_t. */
+    const ssize_t limit = (ssize_t)(SIZE_MAX / 2);
+    ssize_t acc = 0;
+    size_t len = 0;
+
+    *in_range = 1;
+    while (str[len] >= '0' && str[len] <= '9')
+    {
+        ssize_t digit = str[len] - '0';
+        /* Equivalent to acc * 10 + digit > limit, without overflowing. */
+        if (acc > (limit - digit) / 10)
+            *in_range = 0;
+        else
+            acc = acc * 10 + digit;
+        len++;
+    }
+    *value = *in_range ? acc : 0;
+    return len;
+}
+
+/*
+ * Skips spaces, lexes the token found at the current position, records it
+ * in lexer->current_tok and returns it.
+ * `*len` receives the number of characters the token spans: 0 for TOKEN_EOF
+ * (the position must never move past the terminator), the whole digit run
+ * for numbers, 1 for everything else.
+ */
+static struct token scan_token(struct lexer *lexer, size_t *len)
+{
+    while (lexer->input[lexer->pos] == ' ')
+        lexer->pos++;
+
+    const char *cur = lexer->input + lexer->pos;
+    struct token tok = { .type = TOKEN_ERROR, .value = 0 };
+    *len = 1;
+
+    if (*cur == '\0')
+    {
+        tok.type = TOKEN_EOF;
+        *len = 0;
+    }
+    else if (*cur >= '0' && *cur <= '9')
+    {
+        int in_range;
+        *len = scan_number(cur, &tok.value, &in_range);
+        if (in_range)
+            tok.type = TOKEN_NUMBER;
+        else
+            fprintf(stderr, "lexer: number out of range\n");
+    }
+    else
+    {
+        size_t count = sizeof(templates) / sizeof(templates[0]);
+        for (size_t i = 0; i < count; i++)
+        {
+            if (*cur == templates[i].value)
+                tok.type = templates[i].type;
+        }
+        if (tok.type == TOKEN_ERROR)
+            fprintf(stderr, "lexer: invalid token %c\n", *cur);
+    }
+
+    lexer->current_tok = tok;
+    return tok;
+}
+
+struct token lexer_next_token(struct lexer *lexer)
+{
+    size_t len;
+    return scan_token(lexer, &len);
+}
+
+struct token lexer_peek(struct lexer *lexer)
+{
+    return lexer_next_token(lexer);
+}
+
+struct token lexer_pop(struct lexer *lexer)
+{
+    size_t len;
+    struct token tok = scan_token(lexer, &len);
+    /* Advance by what was actually read: "007" is three characters. */
+    lexer->pos += len;
+    return tok;
+}
diff --git a/21sh/ll-expr/src/lexer/lexer.h b/21sh/ll-expr/src/lexer/lexer.h
new file mode 100644
index 0000000..40a7cc9
--- /dev/null
+++ b/21sh/ll-expr/src/lexer/lexer.h
@@ -0,0 +1,69 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#include <stddef.h>
+
+#include "token.h"
+
+/**
+ * \page Lexer
+ *
+ * The lexer cuts some input text into blocks called tokens.
+ *
+ * This process is done **on demand**: the lexer doesn't read the
+ * input more than it needs, and only creates a token when lexer_peek
+ * or lexer_pop is called.
+ *
+ * "2 + 3" will produce 3 tokens, followed by TOKEN_EOF:
+ * - TOKEN_NUMBER { .value = 2 }
+ * - TOKEN_PLUS
+ * - TOKEN_NUMBER { .value = 3 }
+ */
+
+struct lexer
+{
+    const char *input; // The input data
+    size_t pos; // The current offset inside the input data
+    struct token current_tok; // The most recently lexed token
+};
+
+/**
+ * \brief Creates a new lexer given an input string.
+ *
+ * The lexer keeps a pointer to the input and never modifies it; the string
+ * must stay valid for as long as the lexer is used.
+ * Returns NULL if the allocation fails.
+ */
+struct lexer *lexer_new(const char *input);
+
+/**
+ * \brief Frees the given lexer, but not its input.
+ */
+void lexer_free(struct lexer *lexer);
+
+/**
+ * \brief Returns a token from the input string.
+ *
+ * This function skips spaces, then goes through the input string character
+ * by character and builds a token without consuming it. If the input is
+ * invalid (unknown character, or number that does not fit in a ssize_t), a
+ * message is printed on stderr and a TOKEN_ERROR token is returned.
+ */
+struct token lexer_next_token(struct lexer *lexer);
+
+/**
+ * \brief Returns the next token, but doesn't move forward: calling lexer_peek
+ * multiple times in a row always returns the same result.
+ * This function is meant to help the parser check if the next token matches
+ * some rule.
+ */
+struct token lexer_peek(struct lexer *lexer);
+
+/**
+ * \brief Returns the next token, and removes it from the stream:
+ * calling lexer_pop in a loop will iterate over all tokens until EOF.
+ * Once the end of the input is reached, every call returns TOKEN_EOF.
+ */
+struct token lexer_pop(struct lexer *lexer);
+
+#endif /* !LEXER_H */
diff --git a/21sh/ll-expr/src/lexer/token.h b/21sh/ll-expr/src/lexer/token.h
new file mode 100644
index 0000000..b0866fc
--- /dev/null
+++ b/21sh/ll-expr/src/lexer/token.h
@@ -0,0 +1,26 @@
+#ifndef TOKEN_H
+#define TOKEN_H
+
+#include <unistd.h> // ssize_t
+
+enum token_type
+{
+    TOKEN_PLUS, // '+'
+    TOKEN_MINUS, // '-'
+    TOKEN_MUL, // '*'
+    TOKEN_DIV, // '/'
+    TOKEN_NUMBER, // "[0-9]+"
+    TOKEN_LEFT_PAR, // '('
+    TOKEN_RIGHT_PAR, // ')'
+    TOKEN_EOF, // end of input marker
+    TOKEN_ERROR // it is not a real token, it is returned in case of invalid
+                // input
+};
+
+struct token
+{
+    enum token_type type; // The kind of token
+    ssize_t value; // If the token is a number, its value, 0 otherwise
+};
+
+#endif /* !TOKEN_H */
