summaryrefslogtreecommitdiff
path: root/21sh/ll-expr/src/lexer
diff options
context:
space:
mode:
authorMartial Simon <msimon_fr@hotmail.com>2025-09-15 01:07:58 +0200
committerMartial Simon <msimon_fr@hotmail.com>2025-09-15 01:07:58 +0200
commit967be9e750221ab2ab783f95df79bb26d290a45e (patch)
tree6802900a5e975f9f68b169f0f503f040056d6952 /21sh/ll-expr/src/lexer
add: added projectsHEADmain
Diffstat (limited to '21sh/ll-expr/src/lexer')
-rw-r--r--21sh/ll-expr/src/lexer/lexer.c104
-rw-r--r--21sh/ll-expr/src/lexer/lexer.h61
-rw-r--r--21sh/ll-expr/src/lexer/token.h25
3 files changed, 190 insertions, 0 deletions
diff --git a/21sh/ll-expr/src/lexer/lexer.c b/21sh/ll-expr/src/lexer/lexer.c
new file mode 100644
index 0000000..3b3d29f
--- /dev/null
+++ b/21sh/ll-expr/src/lexer/lexer.c
@@ -0,0 +1,104 @@
+#include "lexer.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/**
+ * \brief Allocates a lexer that reads from \p input, starting at offset 0.
+ *
+ * The input pointer is stored as-is; no copy of the string is made.
+ *
+ * \param input the NUL-terminated text to tokenize.
+ * \return the new lexer, or NULL if the allocation failed.
+ */
+struct lexer *lexer_new(const char *input)
+{
+    struct lexer *new = malloc(sizeof *new);
+    if (new == NULL)
+        return NULL;
+
+    new->input = input;
+    new->pos = 0;
+    // The cached token is returned by value later on: give both of its
+    // fields a defined content so no indeterminate value is ever copied.
+    new->current_tok.type = TOKEN_EOF;
+    new->current_tok.value = 0;
+    return new;
+}
+
+/**
+ * \brief Releases the lexer structure.
+ *
+ * Only the struct itself is passed to free(); the input pointer it holds
+ * is left untouched.
+ */
+void lexer_free(struct lexer *lexer)
+{
+ free(lexer);
+}
+
+/* Pairs one input character with the token type it stands for. */
+struct template
+{
+    char value;
+    enum token_type type;
+};
+
+/*
+ * Table of the single-character tokens, the end-of-input marker included.
+ * The final TOKEN_ERROR entry appears to serve as an end-of-table marker:
+ * its character (0) is already claimed by the TOKEN_EOF entry before it.
+ */
+struct template templates[] = {
+    { .value = '+', .type = TOKEN_PLUS },
+    { .value = '-', .type = TOKEN_MINUS },
+    { .value = '*', .type = TOKEN_MUL },
+    { .value = '/', .type = TOKEN_DIV },
+    { .value = '(', .type = TOKEN_LEFT_PAR },
+    { .value = ')', .type = TOKEN_RIGHT_PAR },
+    { .value = '\0', .type = TOKEN_EOF },
+    { .value = 0, .type = TOKEN_ERROR },
+};
+
+/**
+ * \brief Converts the run of decimal digits found at the lexer's current
+ * position into an integer.
+ *
+ * The input is only read, never written to, so it may live in read-only
+ * storage (a string literal for instance). The lexer position is not
+ * modified.
+ *
+ * \return the parsed value, or 0 when the current character is not a digit.
+ * Values too large for a long are clamped to LONG_MAX by strtol.
+ */
+static ssize_t parse_number(struct lexer *l)
+{
+    const char *start = l->input + l->pos;
+
+    // strtol would also accept leading blanks and a sign: refuse anything
+    // that does not begin with a digit so only "[0-9]+" is ever converted.
+    if (*start < '0' || *start > '9')
+        return 0;
+
+    // Conversion stops by itself at the first non-digit character, hence
+    // there is no need to temporarily NUL-terminate the input.
+    return strtol(start, NULL, 10);
+}
+
+/**
+ * \brief Builds the token found at the current position.
+ *
+ * Leading spaces are skipped (which moves lexer->pos), then the current
+ * character is looked up in the templates table. A digit yields a
+ * TOKEN_NUMBER; any other character yields TOKEN_ERROR and a message on
+ * stderr. The token itself is not consumed: pos stays on its first
+ * character.
+ */
+struct token lexer_next_token(struct lexer *lexer)
+{
+    while (lexer->input[lexer->pos] == ' ')
+        lexer->pos++;
+
+    const char cur = lexer->input[lexer->pos];
+
+    // Scan every entry that precedes the TOKEN_ERROR entry of the table.
+    for (const struct template *tpl = templates; tpl->type != TOKEN_ERROR;
+         tpl++)
+    {
+        if (tpl->value == cur)
+        {
+            lexer->current_tok.type = tpl->type;
+            return lexer->current_tok;
+        }
+    }
+
+    if (cur >= '0' && cur <= '9')
+    {
+        lexer->current_tok.type = TOKEN_NUMBER;
+        lexer->current_tok.value = parse_number(lexer);
+        return lexer->current_tok;
+    }
+
+    lexer->current_tok.type = TOKEN_ERROR;
+    fprintf(stderr, "lexer: invalid token %c\n", cur);
+    return lexer->current_tok;
+}
+
+/**
+ * \brief Returns the upcoming token without consuming it.
+ *
+ * This simply forwards to lexer_next_token, which leaves the position on
+ * the first character of the token it reports.
+ */
+struct token lexer_peek(struct lexer *lexer)
+{
+    struct token upcoming = lexer_next_token(lexer);
+    return upcoming;
+}
+
+/**
+ * \brief Returns the length of the run of decimal digits starting at \p s.
+ */
+static size_t count_digits(const char *s)
+{
+    size_t len = 0;
+    while (s[len] >= '0' && s[len] <= '9')
+        len++;
+    return len;
+}
+
+/**
+ * \brief Returns the next token and moves the position past it.
+ *
+ * - TOKEN_NUMBER: the position skips every digit character of the number.
+ *   The length is measured on the input text rather than derived from the
+ *   parsed value, so leading zeros ("007") are fully consumed.
+ * - TOKEN_EOF: the position is left on the terminating NUL, so popping
+ *   again keeps returning TOKEN_EOF instead of reading past the string.
+ * - any other token (TOKEN_ERROR included): one character is consumed.
+ */
+struct token lexer_pop(struct lexer *lexer)
+{
+    // lexer_next_token skips leading spaces, so on return lexer->pos is on
+    // the first character of the token.
+    struct token res = lexer_next_token(lexer);
+
+    if (res.type == TOKEN_NUMBER)
+        lexer->pos += count_digits(lexer->input + lexer->pos);
+    else if (res.type != TOKEN_EOF)
+        lexer->pos++;
+
+    return res;
+}
diff --git a/21sh/ll-expr/src/lexer/lexer.h b/21sh/ll-expr/src/lexer/lexer.h
new file mode 100644
index 0000000..40a7cc9
--- /dev/null
+++ b/21sh/ll-expr/src/lexer/lexer.h
@@ -0,0 +1,61 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#include "token.h"
+
+/**
+ * \page Lexer
+ *
+ * The lexer cuts some input text into blocks called tokens.
+ *
+ * This process is done **on demand**: the lexer doesn't read more of the
+ * input than it needs, and only creates a token when lexer_peek or
+ * lexer_pop is called and no token is available.
+ *
+ * "2 + 3" will produce 3 tokens:
+ * - TOKEN_NUMBER { .value = 2 }
+ * - TOKEN_PLUS
+ * - TOKEN_NUMBER { .value = 3 }
+ */
+
+/**
+ * \brief State of a lexer: the text being read, the read offset, and the
+ * last token that was built.
+ */
+struct lexer
+{
+ const char *input; // The input data (not freed by lexer_free)
+ size_t pos; // The current offset inside the input data
+ struct token current_tok; // The next token, if processed
+};
+
+/**
+ * \brief Creates a new lexer given an input string.
+ *
+ * The string is not copied: the lexer keeps the pointer it is given, so
+ * the string has to stay valid for as long as the lexer is used.
+ */
+struct lexer *lexer_new(const char *input);
+
+/**
+ * \brief Frees the given lexer, but not its input.
+ */
+void lexer_free(struct lexer *lexer);
+
+/**
+ * \brief Returns a token from the input string.
+ *
+ * This function goes through the input string character by character and
+ * builds a token. lexer_peek and lexer_pop both call it. If the input is
+ * invalid, a message is printed on stderr and a TOKEN_ERROR token is
+ * returned.
+ */
+struct token lexer_next_token(struct lexer *lexer);
+
+/**
+ * \brief Returns the next token, but doesn't move past it: calling
+ * lexer_peek multiple times in a row always returns the same result.
+ *
+ * This function is meant to help the parser check whether the next token
+ * matches some rule.
+ */
+struct token lexer_peek(struct lexer *lexer);
+
+/**
+ * \brief Returns the next token, and removes it from the stream:
+ * calling lexer_pop in a loop iterates over all tokens until TOKEN_EOF
+ * is returned.
+ */
+struct token lexer_pop(struct lexer *lexer);
+
+#endif /* !LEXER_H */
diff --git a/21sh/ll-expr/src/lexer/token.h b/21sh/ll-expr/src/lexer/token.h
new file mode 100644
index 0000000..b0866fc
--- /dev/null
+++ b/21sh/ll-expr/src/lexer/token.h
@@ -0,0 +1,25 @@
+#ifndef TOKEN_H
+#define TOKEN_H
+
+#include <unistd.h>
+
+/**
+ * \brief The kinds of token the lexer can return.
+ */
+enum token_type
+{
+ TOKEN_PLUS, // '+'
+ TOKEN_MINUS, // '-'
+ TOKEN_MUL, // '*'
+ TOKEN_DIV, // '/'
+ TOKEN_NUMBER, // "[0-9]+"
+ TOKEN_LEFT_PAR, // '('
+ TOKEN_RIGHT_PAR, // ')'
+ TOKEN_EOF, // end of input marker
+ TOKEN_ERROR // not a real token: returned in case of invalid
+ // input
+};
+
+/**
+ * \brief A single token: its kind, plus a numeric payload that is only
+ * meaningful when the kind is TOKEN_NUMBER.
+ */
+struct token
+{
+ enum token_type type; // The kind of token
+ ssize_t value; // If the token is a number, its value
+};
+#endif /* !TOKEN_H */