diff options
| author | Martial Simon <msimon_fr@hotmail.com> | 2025-09-15 01:07:58 +0200 |
|---|---|---|
| committer | Martial Simon <msimon_fr@hotmail.com> | 2025-09-15 01:07:58 +0200 |
| commit | 967be9e750221ab2ab783f95df79bb26d290a45e (patch) | |
| tree | 6802900a5e975f9f68b169f0f503f040056d6952 /42sh/src/lexer/lexer.c | |
Diffstat (limited to '42sh/src/lexer/lexer.c')
| -rw-r--r-- | 42sh/src/lexer/lexer.c | 359 |
1 files changed, 359 insertions, 0 deletions
diff --git a/42sh/src/lexer/lexer.c b/42sh/src/lexer/lexer.c new file mode 100644 index 0000000..eac77ab --- /dev/null +++ b/42sh/src/lexer/lexer.c @@ -0,0 +1,359 @@ +#include "lexer.h" + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "expansion.h" +#include "utils.h" + +#define NEWLINEESCAPE (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n') +#define SEPARATORS " \n;&|<>()" +#define ISSEPARATOR(I) strchr(SEPARATORS, (int)I) != NULL + +struct lexer *lexer_new(struct string *input) +{ + struct lexer *new = malloc(sizeof(struct lexer)); + new->input = input; + new->pos = 0; + new->processed = 0; + new->current_tok.value = NULL; + new->current_tok.type = TOKEN_EOF; + return new; +} + +void lexer_free(struct lexer *lexer) +{ + free(lexer); +} + +static void yeet_comment(struct lexer *l) +{ + while (l->input->data[l->pos] && l->input->data[l->pos] != '\n') + l->pos++; +} + +static int create_ionumber(struct lexer *l) +{ + struct string *str = string_create(NULL); + int i; + for (i = l->pos; isdigit(l->input->data[i]); i++) + string_pushc(str, l->input->data[i]); + if (l->input->data[i] == '<' || l->input->data[i] == '>') + { + l->current_tok.type = TOKEN_IONUMBER; + l->current_tok.value = str; + return 1; + } + else + { + string_free(str); + return 0; + } +} + +static int look_for_next(char *in, int i, char c) +{ + int escaped = 0; + while (in[i] && (in[i] != c || (escaped && in[i] == c))) + { + if (in[i] == '\\') + escaped ^= 1; + else + escaped = 0; + i++; + } + return i; +} + +static struct string *substitution(char *in, int i) +{ + char split = in[i]; + in[i] = '\0'; + struct string *new_input = string_create(in); + + if (split == '$') + { + i++; + } + + i++; + int error = 0; + struct string *substitute = + expand_substitution(in, &i, &error, (split == '`') ? '`' : ')'); + + if (error) + { + string_free(new_input); + return NULL; + } + + string_catenate(new_input, substitute); + + struct string *end_input = string_create(in + i + 1); + string_catenate(new_input, end_input); + + return new_input; +} + +static char extract_word(struct lexer *l, int *end) +{ + char *in = l->input->data; + int i = l->pos; + // True if we are currently inside quotes + int escaped = 0; + + while (in[i]) + { + if (!escaped && ((in[i] == '$' && in[i + 1] == '(') || in[i] == '`')) + { + struct string *new_input = substitution(in, i); + if (new_input == NULL) + { + l->current_tok.type = TOKEN_ERROR; + return '\0'; + } + + string_free(l->input); + l->input = new_input; + in = l->input->data; + } + // Checking that we are NOT in a quote and have a separator + if (!escaped && ISSEPARATOR(in[i])) + break; + if (!escaped && in[i] == '\'') + { + i++; + while (in[i] && in[i] != '\'') + i++; + if (!in[i]) + { + l->current_tok.type = TOKEN_ERROR; + return '\0'; + } + } + else if (!escaped && in[i] == '"') + { + i = look_for_next(in, i + 1, in[i]); + if (!in[i]) + { + l->current_tok.type = TOKEN_ERROR; + return '\0'; + } + } + else if (in[i] == '\\') + { + escaped ^= 1; + } + else + escaped = 0; + i++; + } + + char tmp = in[i]; + in[i] = '\0'; + *end = i; + return tmp; +} + +static enum token_type word_or_ass(struct token t) +{ + char *word = t.value->data; + if (isdigit(word[0])) + { + return TOKEN_WORD; + } + for (int i = 0; word[i]; i++) + { + if (word[i] == '=') + { + return TOKEN_ASS_WORD; + } + if (word[i] != '_' && !isalnum(word[i])) + { + return TOKEN_WORD; + } + } + return TOKEN_WORD; +} + +static void lex_word(struct lexer *l) +{ + int i = 0; + char tmp = extract_word(l, &i); + if (l->current_tok.type == TOKEN_ERROR) + return; + char *in = l->input->data; + + l->current_tok.type = TOKEN_ERROR; + // Identifies reserved words + for (int n = 0; reserved_words[n].word != NULL; n++) + { + if (STRINGS_ARE_EQUAL(in + l->pos, reserved_words[n].word)) + l->current_tok.type = reserved_words[n].type; + } + + // If we couldn't identify a reserved word + if (l->current_tok.type == TOKEN_ERROR) + { + struct string *pp = string_create(in + l->pos); + if (pp == NULL) + { + l->current_tok.type = TOKEN_ERROR; + in[i] = tmp; + return; + } + l->current_tok.value = pp; + // Set the token type + l->current_tok.type = word_or_ass(l->current_tok); + } + in[i] = tmp; +} + +static struct token set_ttype(struct lexer *lexer, enum token_type type) +{ + lexer->current_tok.type = type; + return lexer->current_tok; +} + +static struct token lex_and_or(struct lexer *l) +{ + if (l->input->data[l->pos] == '|' && l->input->data[l->pos + 1] == '|') + l->current_tok.type = TOKEN_OR; + else if (l->input->data[l->pos] == '|' && l->input->data[l->pos + 1] != '|') + l->current_tok.type = TOKEN_PIPE; + else if (l->input->data[l->pos] == '&' && l->input->data[l->pos + 1] == '&') + l->current_tok.type = TOKEN_AND; + else + l->current_tok.type = TOKEN_ERROR; + return l->current_tok; +} + +static struct token lex_redirect(struct lexer *l) +{ + l->current_tok.type = TOKEN_REDIR; + struct string *val = (l->current_tok.value = string_create(NULL)); + string_pushc(val, l->input->data[l->pos]); + if ((l->input->data[l->pos + 1] == '>' || l->input->data[l->pos + 1] == '&') + || (l->input->data[l->pos] == '>' && l->input->data[l->pos + 1] == '|')) + string_pushc(val, l->input->data[l->pos + 1]); + return l->current_tok; +} + +static void lex_special(struct lexer *l) +{ + // If the first char is a digit and we recognized a number followed by a + // redir + if (isdigit(l->input->data[l->pos]) && create_ionumber(l)) + // return immediately + return; + lex_word(l); +} + +struct token lexer_next_token(struct lexer *lexer) +{ + if (lexer->pos >= lexer->input->length) + { + lexer->current_tok.type = TOKEN_EOF; + lexer->current_tok.value = NULL; + return lexer->current_tok; + } + char *in = lexer->input->data; + + for (; in[lexer->pos] && (in[lexer->pos] == ' ' || NEWLINEESCAPE); + lexer->pos++) + { + if (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n') + lexer->pos++; + continue; + } + + switch (in[lexer->pos]) + { + case ';': + return set_ttype(lexer, TOKEN_SEMICOLON); + case '\n': + return set_ttype(lexer, TOKEN_NEWLINE); + case '\0': + return set_ttype(lexer, TOKEN_EOF); + case '(': + return set_ttype(lexer, TOKEN_PAR_LEFT); + case ')': + return set_ttype(lexer, TOKEN_PAR_RIGHT); + case '{': + return set_ttype(lexer, TOKEN_CURLY_LEFT); + case '}': + return set_ttype(lexer, TOKEN_CURLY_RIGHT); + case '|': + /* FALLTHROUGH */ + case '&': + return lex_and_or(lexer); + case '<': + /* FALLTHROUGH */ + case '>': + return lex_redirect(lexer); + case '#': + yeet_comment(lexer); + return lexer_next_token(lexer); + default: + lex_special(lexer); + return lexer->current_tok; + } +} + +static void move_pos(struct lexer *lexer) +{ + enum token_type t = lexer->current_tok.type; + if (t == TOKEN_EOF) + { + return; + } + if (t == TOKEN_IF || t == TOKEN_FI || t == TOKEN_IN || t == TOKEN_DO + || t == TOKEN_OR || t == TOKEN_AND) + lexer->pos += 2; + else if (t == TOKEN_FOR) + lexer->pos += 3; + else if (t == TOKEN_ELSE || t == TOKEN_ELIF || t == TOKEN_THEN + || t == TOKEN_DONE) + lexer->pos += 4; + else if (t == TOKEN_WHILE || t == TOKEN_UNTIL) + lexer->pos += 5; + else if (t == TOKEN_WORD || t == TOKEN_IONUMBER || t == TOKEN_REDIR + || t == TOKEN_ASS_WORD) + lexer->pos += lexer->current_tok.value->length; + else + lexer->pos++; +} + +struct token lexer_peek(struct lexer *lexer) +{ + if (lexer->processed) + return lexer->current_tok; + lexer->processed = 1; + struct token res = lexer_next_token(lexer); + + move_pos(lexer); + + return res; +} + +struct token lexer_pop(struct lexer *lexer) +{ + struct token res; + + if (lexer->processed) + { + res = lexer->current_tok; + lexer->processed = 0; + return res; + } + + res = lexer_next_token(lexer); + + move_pos(lexer); + + lexer->processed = 0; + + return res; +} |
