1 files changed, 359 insertions, 0 deletions
diff --git a/42sh/src/lexer/lexer.c b/42sh/src/lexer/lexer.c
new file mode 100644
index 0000000..eac77ab
--- /dev/null
+++ b/42sh/src/lexer/lexer.c
@@ -0,0 +1,359 @@
+#include "lexer.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "expansion.h"
+#include "utils.h"
+
+// True when the cursor sits on a backslash immediately followed by a newline.
+// Expands in place: `in` (input buffer) and `lexer` must be in scope.
+#define NEWLINEESCAPE (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n')
+// Characters that end an unquoted word.
+#define SEPARATORS " \n;&|<>()"
+// Non-zero when I is one of SEPARATORS. strchr also matches the terminating
+// NUL of SEPARATORS, so ISSEPARATOR('\0') is true.
+// Argument and expansion are parenthesized so the macro composes safely with
+// surrounding operators (`!`, `==`, ...).
+#define ISSEPARATOR(I) (strchr(SEPARATORS, (int)(I)) != NULL)
+
+// Allocates a lexer reading from `input`, with its cursor at the start and
+// an empty TOKEN_EOF as current token.
+// `input` is stored as-is (the pointer is kept, the string is not copied).
+// Returns NULL when the allocation fails.
+struct lexer *lexer_new(struct string *input)
+{
+    struct lexer *new = malloc(sizeof(*new));
+    if (new == NULL)
+        return NULL;
+
+    new->input = input;
+    new->pos = 0;
+    new->processed = 0;
+    new->current_tok.value = NULL;
+    new->current_tok.type = TOKEN_EOF;
+    return new;
+}
+
+// Releases the lexer structure itself. Only the struct is freed here: the
+// input string and the current token's value are left untouched.
+void lexer_free(struct lexer *lexer)
+{
+    free(lexer);
+}
+
+// Moves the cursor to the end of the current line: it stops on the newline
+// (which is not consumed) or on the end of the input.
+static void yeet_comment(struct lexer *l)
+{
+    const char *text = l->input->data;
+
+    for (;;)
+    {
+        char c = text[l->pos];
+        if (c == '\0' || c == '\n')
+            return;
+        l->pos++;
+    }
+}
+
+// Tries to lex an IO number at the cursor: a run of digits immediately
+// followed by a redirection operator ('<' or '>').
+// On success, stores a TOKEN_IONUMBER whose value holds the digits in
+// l->current_tok and returns 1. Otherwise returns 0 and leaves the current
+// token untouched. The cursor (l->pos) is never moved here.
+static int create_ionumber(struct lexer *l)
+{
+    const char *in = l->input->data;
+    int end = l->pos;
+
+    // <ctype.h> functions need a value representable as unsigned char:
+    // passing a plain (possibly negative) char is undefined behavior.
+    while (isdigit((unsigned char)in[end]))
+        end++;
+
+    // Only digits glued to a redirection form an IO number. Checking before
+    // allocating avoids building a string just to throw it away.
+    if (in[end] != '<' && in[end] != '>')
+        return 0;
+
+    struct string *digits = string_create(NULL);
+    for (int i = l->pos; i < end; i++)
+        string_pushc(digits, in[i]);
+
+    l->current_tok.type = TOKEN_IONUMBER;
+    l->current_tok.value = digits;
+    return 1;
+}
+
+// Returns the index of the first `c` found in `in` at or after index `i`
+// that is not escaped by a backslash (backslashes are paired up starting
+// from `i`). Returns the index of the terminating NUL when there is none.
+static int look_for_next(char *in, int i, char c)
+{
+    int backslash_pending = 0;
+
+    for (; in[i] != '\0'; i++)
+    {
+        if (in[i] == c && !backslash_pending)
+            break;
+        // A backslash escapes the next character, unless it is itself
+        // escaped; any other character clears the pending escape.
+        backslash_pending = (in[i] == '\\') && !backslash_pending;
+    }
+    return i;
+}
+
+// Expands the command substitution starting at in[i] (either "$(" or a
+// backtick) and returns a newly created string made of the text before the
+// substitution, its expansion, and the text after it.
+// `in` is modified: in[i] is overwritten with NUL and is not restored.
+// Returns NULL when expand_substitution reports an error.
+// NOTE(review): `expansion` and `tail` are never released here; whether
+// string_catenate takes ownership of its second argument is not visible from
+// this file -- confirm.
+static struct string *substitution(char *in, int i)
+{
+    const char opener = in[i];
+    const char closer = (opener == '`') ? '`' : ')';
+
+    // Cut the buffer at the substitution so that the prefix can be copied
+    // as a plain C string.
+    in[i] = '\0';
+    struct string *result = string_create(in);
+
+    // Step over the opener: two characters for "$(", one for a backtick.
+    i += (opener == '$') ? 2 : 1;
+
+    int failed = 0;
+    struct string *expansion = expand_substitution(in, &i, &failed, closer);
+    if (failed)
+    {
+        string_free(result);
+        return NULL;
+    }
+    string_catenate(result, expansion);
+
+    // expand_substitution updated i; the remaining text is taken from one
+    // character past it (presumably just after the closing delimiter).
+    struct string *tail = string_create(in + i + 1);
+    string_catenate(result, tail);
+
+    return result;
+}
+
+// Finds the end of the word that starts at the cursor (l->pos).
+// Scans until an unescaped, unquoted separator (see SEPARATORS) or the end of
+// the input, stepping over '...' and "..." sections and expanding every
+// unescaped $(...) or `...` met on the way (which replaces l->input).
+// On success the character ending the word is overwritten with NUL, its
+// index is stored in *end, and the overwritten character is returned so the
+// caller can put it back.
+// On failure (substitution error or unterminated quote) the current token
+// type is set to TOKEN_ERROR, '\0' is returned and *end is not written.
+static char extract_word(struct lexer *l, int *end)
+{
+    char *buf = l->input->data;
+    // True when the previous character was an unescaped backslash
+    int backslash = 0;
+    int idx;
+
+    for (idx = l->pos; buf[idx] != '\0'; idx++)
+    {
+        if (!backslash
+            && (buf[idx] == '`' || (buf[idx] == '$' && buf[idx + 1] == '(')))
+        {
+            struct string *expanded = substitution(buf, idx);
+            if (expanded == NULL)
+            {
+                l->current_tok.type = TOKEN_ERROR;
+                return '\0';
+            }
+
+            // The expansion lives in a new string: swap it in and keep
+            // scanning from the same index.
+            string_free(l->input);
+            l->input = expanded;
+            buf = expanded->data;
+        }
+
+        // An escaped character is taken literally, whatever it is
+        if (backslash)
+        {
+            backslash = 0;
+            continue;
+        }
+
+        if (ISSEPARATOR(buf[idx]))
+            break;
+
+        if (buf[idx] == '\\')
+        {
+            backslash = 1;
+        }
+        else if (buf[idx] == '\'')
+        {
+            // No escapes inside single quotes: jump to the closing quote
+            char *closing = strchr(buf + idx + 1, '\'');
+            if (closing == NULL)
+            {
+                l->current_tok.type = TOKEN_ERROR;
+                return '\0';
+            }
+            idx = (int)(closing - buf);
+        }
+        else if (buf[idx] == '"')
+        {
+            // Jump to the closing double quote, honoring backslashes
+            idx = look_for_next(buf, idx + 1, '"');
+            if (buf[idx] == '\0')
+            {
+                l->current_tok.type = TOKEN_ERROR;
+                return '\0';
+            }
+        }
+    }
+
+    // Temporarily terminate the word in place
+    char saved = buf[idx];
+    buf[idx] = '\0';
+    *end = idx;
+    return saved;
+}
+
+// Classifies a word token: TOKEN_ASS_WORD when it has the shape NAME=...,
+// where NAME is a non-empty run of letters, digits and underscores that does
+// not start with a digit; TOKEN_WORD otherwise.
+static enum token_type word_or_ass(struct token t)
+{
+    const char *word = t.value->data;
+
+    // A name cannot start with a digit. The cast matters: <ctype.h>
+    // functions are undefined for negative plain-char values.
+    if (isdigit((unsigned char)word[0]))
+        return TOKEN_WORD;
+
+    for (int i = 0; word[i] != '\0'; i++)
+    {
+        // "=value" has an empty name: it is a plain word, not an
+        // assignment.
+        if (word[i] == '=')
+            return (i > 0) ? TOKEN_ASS_WORD : TOKEN_WORD;
+        if (word[i] != '_' && !isalnum((unsigned char)word[i]))
+            return TOKEN_WORD;
+    }
+    return TOKEN_WORD;
+}
+
+// Lexes the word starting at the cursor into l->current_tok.
+// A reserved word gets its dedicated token type (the token value is left as
+// it was); any other word gets a newly created string as value and is typed
+// TOKEN_WORD or TOKEN_ASS_WORD. On failure the type is TOKEN_ERROR.
+// The cursor is not moved.
+static void lex_word(struct lexer *l)
+{
+    // extract_word reports failure only by setting TOKEN_ERROR. Clear any
+    // error left over from the previous token first; otherwise a successful
+    // extraction would be mistaken for a failure and the input would stay
+    // cut at the end of the word.
+    l->current_tok.type = TOKEN_WORD;
+
+    int end = 0;
+    char saved = extract_word(l, &end);
+    if (l->current_tok.type == TOKEN_ERROR)
+        return;
+    // Read the buffer only now: extract_word may have replaced l->input
+    char *in = l->input->data;
+
+    // TOKEN_ERROR doubles as "no reserved word matched" below
+    l->current_tok.type = TOKEN_ERROR;
+    for (int n = 0; reserved_words[n].word != NULL; n++)
+    {
+        if (STRINGS_ARE_EQUAL(in + l->pos, reserved_words[n].word))
+            l->current_tok.type = reserved_words[n].type;
+    }
+
+    // Not a reserved word: keep its text and classify it
+    if (l->current_tok.type == TOKEN_ERROR)
+    {
+        struct string *word = string_create(in + l->pos);
+        if (word != NULL)
+        {
+            l->current_tok.value = word;
+            l->current_tok.type = word_or_ass(l->current_tok);
+        }
+    }
+
+    // Put back the character extract_word replaced with NUL
+    in[end] = saved;
+}
+
+// Stamps `type` on the lexer's current token and returns a copy of that
+// token. The token's value is left as it was.
+static struct token set_ttype(struct lexer *lexer, enum token_type type)
+{
+    struct token *tok = &lexer->current_tok;
+
+    tok->type = type;
+    return *tok;
+}
+
+// Types the operator at the cursor: "||" is TOKEN_OR, a lone '|' is
+// TOKEN_PIPE, "&&" is TOKEN_AND; anything else (including a lone '&') is
+// TOKEN_ERROR. Returns a copy of the current token; the cursor is not moved.
+static struct token lex_and_or(struct lexer *l)
+{
+    const char *cur = l->input->data + l->pos;
+    enum token_type type = TOKEN_ERROR;
+
+    if (cur[0] == '|')
+        type = (cur[1] == '|') ? TOKEN_OR : TOKEN_PIPE;
+    else if (cur[0] == '&' && cur[1] == '&')
+        type = TOKEN_AND;
+
+    l->current_tok.type = type;
+    return l->current_tok;
+}
+
+// Builds a TOKEN_REDIR whose value is the redirection operator found at the
+// cursor. The operator is two characters long when the first one is followed
+// by '>' or '&', or when it is ">|"; otherwise it is the single character
+// under the cursor. Returns a copy of the current token; the cursor is not
+// moved.
+static struct token lex_redirect(struct lexer *l)
+{
+    const char *cur = l->input->data + l->pos;
+    struct string *op = string_create(NULL);
+
+    l->current_tok.type = TOKEN_REDIR;
+    l->current_tok.value = op;
+    string_pushc(op, cur[0]);
+
+    int two_chars =
+        cur[1] == '>' || cur[1] == '&' || (cur[0] == '>' && cur[1] == '|');
+    if (two_chars)
+        string_pushc(op, cur[1]);
+
+    return l->current_tok;
+}
+
+// Lexes whatever is not an operator: an IO number when the cursor is on
+// digits directly followed by a redirection, a word otherwise. The result
+// is left in l->current_tok.
+static void lex_special(struct lexer *l)
+{
+    // Cast before calling isdigit: <ctype.h> functions are undefined for
+    // negative plain-char values (e.g. bytes of a UTF-8 sequence).
+    unsigned char first = (unsigned char)l->input->data[l->pos];
+
+    // Digits followed by a redirection were turned into an IO number:
+    // nothing else to do.
+    if (isdigit(first) && create_ionumber(l))
+        return;
+    lex_word(l);
+}
+
+// Lexes the token located at the cursor, stores it in lexer->current_tok
+// and returns a copy of it. Leading blanks, line continuations (backslash
+// followed by a newline) and comments are skipped first, which moves the
+// cursor to the first character of the token; the cursor is NOT moved past
+// the token itself.
+// NOTE(review): token types that carry no payload do not reset
+// current_tok.value, so it keeps whatever the previous token stored there.
+struct token lexer_next_token(struct lexer *lexer)
+{
+    if (lexer->pos >= lexer->input->length)
+    {
+        lexer->current_tok.type = TOKEN_EOF;
+        lexer->current_tok.value = NULL;
+        return lexer->current_tok;
+    }
+    char *in = lexer->input->data;
+
+    // Skip blanks and line continuations. Tabs are blanks too: without this
+    // a tab-indented command would be lexed with the tab glued to its first
+    // word. (A tab in the middle of a word is still not a separator, see
+    // SEPARATORS.)
+    for (;;)
+    {
+        if (in[lexer->pos] == ' ' || in[lexer->pos] == '\t')
+            lexer->pos++;
+        else if (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n')
+            lexer->pos += 2;
+        else
+            break;
+    }
+
+    switch (in[lexer->pos])
+    {
+    case ';':
+        return set_ttype(lexer, TOKEN_SEMICOLON);
+    case '\n':
+        return set_ttype(lexer, TOKEN_NEWLINE);
+    case '\0':
+        return set_ttype(lexer, TOKEN_EOF);
+    case '(':
+        return set_ttype(lexer, TOKEN_PAR_LEFT);
+    case ')':
+        return set_ttype(lexer, TOKEN_PAR_RIGHT);
+    case '{':
+        return set_ttype(lexer, TOKEN_CURLY_LEFT);
+    case '}':
+        return set_ttype(lexer, TOKEN_CURLY_RIGHT);
+    case '|':
+    /* FALLTHROUGH */
+    case '&':
+        return lex_and_or(lexer);
+    case '<':
+    /* FALLTHROUGH */
+    case '>':
+        return lex_redirect(lexer);
+    case '#':
+        // Drop the comment, then lex what follows it
+        yeet_comment(lexer);
+        return lexer_next_token(lexer);
+    default:
+        lex_special(lexer);
+        return lexer->current_tok;
+    }
+}
+
+// Moves the cursor past the current token. Tokens carrying a value advance
+// by the length of that value; reserved words and two-character operators
+// advance by a fixed width; TOKEN_EOF does not move; every other token type
+// advances by one character.
+// NOTE(review): the fixed widths presumably mirror the spelling of the
+// entries in reserved_words ("if", "for", "then", "while", ...) -- confirm
+// when adding a reserved word.
+static void move_pos(struct lexer *lexer)
+{
+    switch (lexer->current_tok.type)
+    {
+    case TOKEN_EOF:
+        return;
+    case TOKEN_IF:
+    case TOKEN_FI:
+    case TOKEN_IN:
+    case TOKEN_DO:
+    case TOKEN_OR:
+    case TOKEN_AND:
+        lexer->pos += 2;
+        break;
+    case TOKEN_FOR:
+        lexer->pos += 3;
+        break;
+    case TOKEN_ELSE:
+    case TOKEN_ELIF:
+    case TOKEN_THEN:
+    case TOKEN_DONE:
+        lexer->pos += 4;
+        break;
+    case TOKEN_WHILE:
+    case TOKEN_UNTIL:
+        lexer->pos += 5;
+        break;
+    case TOKEN_WORD:
+    case TOKEN_IONUMBER:
+    case TOKEN_REDIR:
+    case TOKEN_ASS_WORD:
+        lexer->pos += lexer->current_tok.value->length;
+        break;
+    default:
+        lexer->pos++;
+        break;
+    }
+}
+
+// Returns the upcoming token without consuming it. The first call lexes the
+// token, moves the cursor past it and raises `processed`; while that flag is
+// set, further calls return the cached current token.
+struct token lexer_peek(struct lexer *lexer)
+{
+    struct token tok;
+
+    if (!lexer->processed)
+    {
+        lexer->processed = 1;
+        tok = lexer_next_token(lexer);
+        move_pos(lexer);
+    }
+    else
+    {
+        tok = lexer->current_tok;
+    }
+
+    return tok;
+}
+
+// Consumes and returns the upcoming token. When a token was already peeked
+// (`processed` set) that cached token is returned; otherwise a new one is
+// lexed and the cursor is moved past it. In both cases `processed` is
+// cleared so that the next peek/pop lexes a new token.
+struct token lexer_pop(struct lexer *lexer)
+{
+    struct token tok;
+
+    if (lexer->processed)
+    {
+        tok = lexer->current_tok;
+    }
+    else
+    {
+        tok = lexer_next_token(lexer);
+        move_pos(lexer);
+    }
+
+    lexer->processed = 0;
+    return tok;
+}