summary | refs | log | tree | commit | diff
path: root/42sh/src/lexer/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to '42sh/src/lexer/lexer.c')
-rw-r--r-- 42sh/src/lexer/lexer.c 359
1 files changed, 359 insertions, 0 deletions
diff --git a/42sh/src/lexer/lexer.c b/42sh/src/lexer/lexer.c
new file mode 100644
index 0000000..eac77ab
--- /dev/null
+++ b/42sh/src/lexer/lexer.c
@@ -0,0 +1,359 @@
+#include "lexer.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "expansion.h"
+#include "utils.h"
+
+// True when the lexer cursor sits on a backslash immediately followed by a
+// newline. Expects `in` (the input buffer) and `lexer` to be in scope at the
+// expansion site.
+#define NEWLINEESCAPE (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n')
+// Characters that end an unquoted word.
+#define SEPARATORS " \n;&|<>()"
+// Non-zero when I is one of SEPARATORS. strchr() also matches the string's
+// terminator, so ISSEPARATOR('\0') is true as well. Both the argument and the
+// whole expansion are parenthesized so the macro composes safely inside larger
+// expressions such as !ISSEPARATOR(c).
+#define ISSEPARATOR(I) (strchr(SEPARATORS, (int)(I)) != NULL)
+
+/*
+ * Allocate a lexer that reads from `input`.
+ *
+ * The new lexer starts at position 0, has no peeked token (`processed` is 0)
+ * and its current token is TOKEN_EOF with a NULL value. Only the `input`
+ * pointer is stored; the string itself is not copied.
+ *
+ * Returns the new lexer, or NULL when the allocation fails.
+ */
+struct lexer *lexer_new(struct string *input)
+{
+    struct lexer *lexer = malloc(sizeof *lexer);
+    if (lexer == NULL)
+        return NULL;
+
+    lexer->input = input;
+    lexer->pos = 0;
+    lexer->processed = 0;
+    lexer->current_tok.value = NULL;
+    lexer->current_tok.type = TOKEN_EOF;
+    return lexer;
+}
+
+/*
+ * Release a lexer obtained from lexer_new().
+ *
+ * Only the lexer structure itself is freed here; the input string and the
+ * value of the current token are left untouched.
+ */
+void lexer_free(struct lexer *lexer)
+{
+    free(lexer);
+}
+
+/*
+ * Skip the rest of a comment: advance l->pos until it rests on a newline or
+ * on the end of the input. The newline itself is not consumed.
+ */
+static void yeet_comment(struct lexer *l)
+{
+    const char *text = l->input->data;
+
+    for (; text[l->pos] != '\0'; l->pos++)
+    {
+        if (text[l->pos] == '\n')
+            break;
+    }
+}
+
+/*
+ * Try to read an IO number (a run of digits directly followed by '<' or '>')
+ * starting at l->pos.
+ *
+ * On success the current token becomes TOKEN_IONUMBER, its value holds the
+ * digits, and 1 is returned. Otherwise the current token is left untouched
+ * and 0 is returned; this includes the case where the value string cannot be
+ * allocated. l->pos is never modified.
+ */
+static int create_ionumber(struct lexer *l)
+{
+    const char *in = l->input->data;
+    int end = l->pos;
+
+    // <ctype.h> functions require a value representable as unsigned char.
+    while (isdigit((unsigned char)in[end]))
+        end++;
+
+    // Decide before allocating: most digit runs are plain words.
+    if (in[end] != '<' && in[end] != '>')
+        return 0;
+
+    struct string *digits = string_create(NULL);
+    if (digits == NULL)
+        return 0;
+
+    for (int i = l->pos; i < end; i++)
+        string_pushc(digits, in[i]);
+
+    l->current_tok.type = TOKEN_IONUMBER;
+    l->current_tok.value = digits;
+    return 1;
+}
+
+/*
+ * Starting at index `i`, return the index of the first occurrence of `c` in
+ * `in` that is not preceded by an odd run of backslashes. If there is none,
+ * the index of the terminating NUL is returned.
+ */
+static int look_for_next(char *in, int i, char c)
+{
+    int after_backslash = 0;
+
+    for (; in[i] != '\0'; i++)
+    {
+        if (in[i] == c && !after_backslash)
+            break;
+
+        // A backslash escapes the next character; two in a row cancel out.
+        after_backslash = (in[i] == '\\') ? !after_backslash : 0;
+    }
+    return i;
+}
+
+/*
+ * Build a new input string in which the command substitution opening at
+ * in[i] ("$(" or "`") is replaced by the result of expand_substitution().
+ *
+ * The new string is: the text before index i, then the expansion, then the
+ * text starting one character past the index expand_substitution() leaves in
+ * `i` (presumably the closing delimiter -- confirm against expansion.h).
+ *
+ * Note that in[i] is overwritten with a NUL and not restored.
+ * Returns NULL when expand_substitution() reports an error.
+ * NOTE(review): whether string_catenate() takes ownership of its second
+ * argument is not visible here; if it does not, the expansion and the tail
+ * string are never freed.
+ */
+static struct string *substitution(char *in, int i)
+{
+    const char opener = in[i];
+    const char closer = (opener == '`') ? '`' : ')';
+
+    // Cut the buffer at the opener so the prefix can be copied as a C string.
+    in[i] = '\0';
+    struct string *result = string_create(in);
+
+    // Step over "$(" (two characters) or "`" (one character).
+    i += (opener == '$') ? 2 : 1;
+
+    int failed = 0;
+    struct string *expanded = expand_substitution(in, &i, &failed, closer);
+    if (failed)
+    {
+        string_free(result);
+        return NULL;
+    }
+
+    string_catenate(result, expanded);
+
+    struct string *tail = string_create(in + i + 1);
+    string_catenate(result, tail);
+
+    return result;
+}
+
+/*
+ * Find the end of the word starting at l->pos and NUL-terminate it in place.
+ *
+ * Scanning stops at the first unescaped separator or at the end of input.
+ * Single-quoted and double-quoted spans are skipped as a whole. An unescaped
+ * "$(" or "`" outside quotes triggers substitution(), after which l->input is
+ * freed and replaced by the expanded string and scanning resumes at the same
+ * index.
+ *
+ * On success, *end receives the index of the overwritten character and that
+ * character is returned so the caller can restore it. On failure (unclosed
+ * quote or failed substitution) the current token type is set to
+ * TOKEN_ERROR, '\0' is returned and *end is left untouched.
+ */
+static char extract_word(struct lexer *l, int *end)
+{
+    char *text = l->input->data;
+    int cur = l->pos;
+    // Non-zero when the previous character was an unescaped backslash
+    int backslash = 0;
+
+    for (; text[cur] != '\0'; cur++)
+    {
+        if (!backslash
+            && ((text[cur] == '$' && text[cur + 1] == '(') || text[cur] == '`'))
+        {
+            struct string *expanded = substitution(text, cur);
+            if (expanded == NULL)
+            {
+                l->current_tok.type = TOKEN_ERROR;
+                return '\0';
+            }
+
+            // Continue on the expanded input, at the same index
+            string_free(l->input);
+            l->input = expanded;
+            text = expanded->data;
+        }
+
+        // An unescaped separator ends the word
+        if (!backslash && ISSEPARATOR(text[cur]))
+            break;
+
+        if (!backslash && text[cur] == '\'')
+        {
+            // Jump to the closing single quote; nothing is special inside
+            char *closing = strchr(text + cur + 1, '\'');
+            if (closing == NULL)
+            {
+                l->current_tok.type = TOKEN_ERROR;
+                return '\0';
+            }
+            cur = (int)(closing - text);
+        }
+        else if (!backslash && text[cur] == '"')
+        {
+            // Jump to the closing double quote, honouring backslash escapes
+            cur = look_for_next(text, cur + 1, '"');
+            if (text[cur] == '\0')
+            {
+                l->current_tok.type = TOKEN_ERROR;
+                return '\0';
+            }
+        }
+        else if (text[cur] == '\\')
+        {
+            backslash = !backslash;
+        }
+        else
+        {
+            backslash = 0;
+        }
+    }
+
+    const char overwritten = text[cur];
+    text[cur] = '\0';
+    *end = cur;
+    return overwritten;
+}
+
+/*
+ * Classify a word token as a plain word or an assignment word.
+ *
+ * TOKEN_ASS_WORD is returned when the text before the first '=' is a
+ * non-empty name made only of letters, digits and underscores, and not
+ * starting with a digit. Everything else is TOKEN_WORD; in particular a word
+ * that begins with '=' has an empty name and is a plain word.
+ *
+ * t.value must be non-NULL.
+ */
+static enum token_type word_or_ass(struct token t)
+{
+    const char *word = t.value->data;
+
+    // <ctype.h> functions require a value representable as unsigned char.
+    if (isdigit((unsigned char)word[0]))
+        return TOKEN_WORD;
+
+    for (int i = 0; word[i] != '\0'; i++)
+    {
+        if (word[i] == '=')
+            return (i > 0) ? TOKEN_ASS_WORD : TOKEN_WORD;
+        if (word[i] != '_' && !isalnum((unsigned char)word[i]))
+            return TOKEN_WORD;
+    }
+    return TOKEN_WORD;
+}
+
+/*
+ * Lex the word starting at l->pos into l->current_tok.
+ *
+ * If the word matches an entry of reserved_words, the token takes that
+ * entry's type and its value is left as it was. Otherwise a new string
+ * holding the word becomes the token value and the type is decided by
+ * word_or_ass(). The type is TOKEN_ERROR when extract_word() fails or the
+ * value string cannot be created.
+ *
+ * l->pos is not advanced. The input buffer is NUL-terminated at the end of
+ * the word while it is inspected and restored before returning.
+ */
+static void lex_word(struct lexer *l)
+{
+    // extract_word() only ever writes TOKEN_ERROR into the type, so clear any
+    // error left over from a previous token first. Otherwise a stale
+    // TOKEN_ERROR would be mistaken for a failure below and the NUL written at
+    // the end of the word would never be undone.
+    l->current_tok.type = TOKEN_WORD;
+
+    int end = 0;
+    char saved = extract_word(l, &end);
+    if (l->current_tok.type == TOKEN_ERROR)
+        return;
+
+    // Re-read the buffer: extract_word() may have replaced l->input.
+    char *in = l->input->data;
+    const char *word = in + l->pos;
+
+    // TOKEN_ERROR doubles as "no reserved word matched" for the lookup below
+    l->current_tok.type = TOKEN_ERROR;
+    for (int n = 0; reserved_words[n].word != NULL; n++)
+    {
+        if (STRINGS_ARE_EQUAL(word, reserved_words[n].word))
+            l->current_tok.type = reserved_words[n].type;
+    }
+
+    if (l->current_tok.type == TOKEN_ERROR)
+    {
+        struct string *value = string_create(word);
+        if (value != NULL)
+        {
+            l->current_tok.value = value;
+            l->current_tok.type = word_or_ass(l->current_tok);
+        }
+        // else: the type stays TOKEN_ERROR
+    }
+
+    // Put back the character extract_word() overwrote
+    in[end] = saved;
+}
+
+/*
+ * Set the type of the lexer's current token and return a copy of that token.
+ * The token value is left unchanged.
+ */
+static struct token set_ttype(struct lexer *lexer, enum token_type type)
+{
+    struct token *tok = &lexer->current_tok;
+
+    tok->type = type;
+    return *tok;
+}
+
+/*
+ * Lex an operator starting with '|' or '&' at l->pos.
+ *
+ * "||" gives TOKEN_OR, a '|' followed by anything else gives TOKEN_PIPE and
+ * "&&" gives TOKEN_AND. Any other input gives TOKEN_ERROR. Only the token
+ * type is written; l->pos is not advanced.
+ */
+static struct token lex_and_or(struct lexer *l)
+{
+    const char *op = l->input->data + l->pos;
+    enum token_type type = TOKEN_ERROR;
+
+    if (op[0] == '|')
+        type = (op[1] == '|') ? TOKEN_OR : TOKEN_PIPE;
+    else if (op[0] == '&' && op[1] == '&')
+        type = TOKEN_AND;
+
+    l->current_tok.type = type;
+    return l->current_tok;
+}
+
+/*
+ * Lex a redirection operator starting at l->pos ('<' or '>').
+ *
+ * The token value receives the operator text: the first character, plus the
+ * second one when it is '>' or '&', or when the operator is ">|". The token
+ * type is TOKEN_REDIR. If the value string cannot be created, the type is
+ * TOKEN_ERROR and the value is NULL. l->pos is not advanced.
+ */
+static struct token lex_redirect(struct lexer *l)
+{
+    const char *op = l->input->data + l->pos;
+
+    struct string *val = string_create(NULL);
+    l->current_tok.value = val;
+    if (val == NULL)
+    {
+        // Same policy as lex_word(): an allocation failure is a lexing error
+        l->current_tok.type = TOKEN_ERROR;
+        return l->current_tok;
+    }
+
+    l->current_tok.type = TOKEN_REDIR;
+    string_pushc(val, op[0]);
+    if (op[1] == '>' || op[1] == '&' || (op[0] == '>' && op[1] == '|'))
+        string_pushc(val, op[1]);
+    return l->current_tok;
+}
+
+/*
+ * Lex a token that is not a single-character operator: an IO number when the
+ * input starts with digits directly followed by a redirection, otherwise a
+ * word (see lex_word()).
+ */
+static void lex_special(struct lexer *l)
+{
+    // <ctype.h> functions require a value representable as unsigned char.
+    const unsigned char first = (unsigned char)l->input->data[l->pos];
+
+    // Digits followed by '<' or '>' form an IO number; nothing more to do
+    if (isdigit(first) && create_ionumber(l))
+        return;
+
+    lex_word(l);
+}
+
+/*
+ * Recognize the token starting at the lexer's current position and store it
+ * in lexer->current_tok, which is also returned.
+ *
+ * Leading spaces and backslash-newline pairs are skipped (advancing
+ * lexer->pos), and a '#' at the start of a token discards the rest of the
+ * line. Apart from that, lexer->pos is left on the first character of the
+ * token; move_pos() is what steps over it.
+ */
+struct token lexer_next_token(struct lexer *lexer)
+{
+    if (lexer->pos >= lexer->input->length)
+    {
+        lexer->current_tok.type = TOKEN_EOF;
+        lexer->current_tok.value = NULL;
+        return lexer->current_tok;
+    }
+
+    char *in = lexer->input->data;
+
+    // Skip blanks: a space is one character, a backslash-newline pair is two
+    for (;;)
+    {
+        if (in[lexer->pos] == ' ')
+            lexer->pos += 1;
+        else if (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n')
+            lexer->pos += 2;
+        else
+            break;
+    }
+
+    const char c = in[lexer->pos];
+
+    if (c == '#')
+    {
+        yeet_comment(lexer);
+        return lexer_next_token(lexer);
+    }
+    if (c == '|' || c == '&')
+        return lex_and_or(lexer);
+    if (c == '<' || c == '>')
+        return lex_redirect(lexer);
+
+    // Single-character tokens
+    switch (c)
+    {
+    case '\0':
+        return set_ttype(lexer, TOKEN_EOF);
+    case '\n':
+        return set_ttype(lexer, TOKEN_NEWLINE);
+    case ';':
+        return set_ttype(lexer, TOKEN_SEMICOLON);
+    case '(':
+        return set_ttype(lexer, TOKEN_PAR_LEFT);
+    case ')':
+        return set_ttype(lexer, TOKEN_PAR_RIGHT);
+    case '{':
+        return set_ttype(lexer, TOKEN_CURLY_LEFT);
+    case '}':
+        return set_ttype(lexer, TOKEN_CURLY_RIGHT);
+    default:
+        break;
+    }
+
+    // Anything else is an IO number or a word
+    lex_special(lexer);
+    return lexer->current_tok;
+}
+
+/*
+ * Advance lexer->pos past the current token.
+ *
+ * The step depends on the token type: nothing for TOKEN_EOF, a fixed length
+ * for the operators and reserved words listed below, the length of the token
+ * value for words, assignment words, IO numbers and redirections, and one
+ * character for every other type.
+ */
+static void move_pos(struct lexer *lexer)
+{
+    switch (lexer->current_tok.type)
+    {
+    case TOKEN_EOF:
+        return;
+
+    case TOKEN_IF:
+    case TOKEN_FI:
+    case TOKEN_IN:
+    case TOKEN_DO:
+    case TOKEN_OR:
+    case TOKEN_AND:
+        lexer->pos += 2;
+        break;
+
+    case TOKEN_FOR:
+        lexer->pos += 3;
+        break;
+
+    case TOKEN_ELSE:
+    case TOKEN_ELIF:
+    case TOKEN_THEN:
+    case TOKEN_DONE:
+        lexer->pos += 4;
+        break;
+
+    case TOKEN_WHILE:
+    case TOKEN_UNTIL:
+        lexer->pos += 5;
+        break;
+
+    case TOKEN_WORD:
+    case TOKEN_IONUMBER:
+    case TOKEN_REDIR:
+    case TOKEN_ASS_WORD:
+        // These token types carry their text in the token value
+        lexer->pos += lexer->current_tok.value->length;
+        break;
+
+    default:
+        lexer->pos++;
+        break;
+    }
+}
+
+/*
+ * Return the next token without consuming it.
+ *
+ * The first call lexes the token, moves the position past it and marks the
+ * lexer as `processed`; further calls return the stored current token until
+ * lexer_pop() clears the flag.
+ */
+struct token lexer_peek(struct lexer *lexer)
+{
+    if (!lexer->processed)
+    {
+        lexer->processed = 1;
+        struct token fresh = lexer_next_token(lexer);
+
+        move_pos(lexer);
+        return fresh;
+    }
+
+    return lexer->current_tok;
+}
+
+/*
+ * Return the next token and consume it.
+ *
+ * If a token was already lexed by lexer_peek(), that stored token is
+ * returned; otherwise a new token is lexed and the position is moved past
+ * it. In both cases the `processed` flag is cleared.
+ */
+struct token lexer_pop(struct lexer *lexer)
+{
+    struct token res;
+
+    if (lexer->processed)
+    {
+        res = lexer->current_tok;
+    }
+    else
+    {
+        res = lexer_next_token(lexer);
+        move_pos(lexer);
+    }
+
+    lexer->processed = 0;
+    return res;
+}