#include "lexer.h"

/*
 * NOTE(review): the four system header names were missing from the text under
 * review. The ones below are reconstructed from the identifiers this file
 * uses (isdigit/isalnum, NULL, malloc/free, strchr) -- confirm against the
 * project's original include list.
 */
#include <ctype.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#include "expansion.h"
#include "utils.h"

/*
 * True when the lexer sits on a backslash immediately followed by a newline.
 * Expects a `char *in` and a `struct lexer *lexer` in scope at the use site.
 */
#define NEWLINEESCAPE (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n')

/* Characters that end an unquoted, unescaped word. */
#define SEPARATORS " \n;&|<>()"

/*
 * Non-zero when character I ends a word. strchr() also matches the string
 * terminator, so '\0' is reported as a separator too.
 */
#define ISSEPARATOR(I) (strchr(SEPARATORS, (int)(I)) != NULL)

/**
 * Allocate a lexer reading from `input`, positioned at offset 0 with no
 * token buffered.
 *
 * @param input string to tokenize; stored as-is in the lexer.
 * @return the new lexer, or NULL when the allocation fails.
 */
struct lexer *lexer_new(struct string *input)
{
    struct lexer *lx = malloc(sizeof *lx);
    if (lx == NULL)
        return NULL;

    lx->input = input;
    lx->pos = 0;
    lx->processed = 0;
    lx->current_tok.value = NULL;
    lx->current_tok.type = TOKEN_EOF;
    return lx;
}

/**
 * Release the lexer structure itself. The input string and the current
 * token value are not freed here.
 */
void lexer_free(struct lexer *lexer)
{
    free(lexer);
}

/* Advance the position up to (not past) the next newline or end of input. */
static void yeet_comment(struct lexer *l)
{
    const char *in = l->input->data;

    while (in[l->pos] != '\0' && in[l->pos] != '\n')
        l->pos++;
}

/**
 * Try to lex an IO number: a run of digits immediately followed by '<' or
 * '>'.
 *
 * @return 1 when the current token has been set (TOKEN_IONUMBER holding the
 *         digits, or TOKEN_ERROR if the token string could not be created),
 *         0 when the digits are not followed by a redirection character and
 *         the caller should lex a regular word instead.
 */
static int create_ionumber(struct lexer *l)
{
    const char *in = l->input->data;
    struct string *digits = string_create(NULL);

    /* lex_word() treats string_create() as fallible; do the same here. */
    if (digits == NULL) {
        l->current_tok.type = TOKEN_ERROR;
        return 1;
    }

    int i;
    /* <ctype.h> functions require a value representable as unsigned char. */
    for (i = l->pos; isdigit((unsigned char)in[i]); i++)
        string_pushc(digits, in[i]);

    if (in[i] != '<' && in[i] != '>') {
        string_free(digits);
        return 0;
    }

    l->current_tok.type = TOKEN_IONUMBER;
    l->current_tok.value = digits;
    return 1;
}

/**
 * Starting at index `i`, find the first occurrence of `c` that is not
 * preceded by an odd number of backslashes.
 *
 * @return the index of that character, or the index of the terminating NUL
 *         when no such character exists.
 */
static int look_for_next(const char *in, int i, char c)
{
    int escaped = 0;

    while (in[i] != '\0' && (in[i] != c || escaped)) {
        /* A backslash toggles the escape state; anything else clears it. */
        escaped = (in[i] == '\\') ? !escaped : 0;
        i++;
    }
    return i;
}

/**
 * Build a new input string in which the command substitution starting at
 * `in[i]` (either "$(" or a backquote) is replaced by its expansion.
 *
 * `in[i]` is overwritten with '\0' so that the text before the substitution
 * can be passed to string_create() on its own; it is not restored.
 *
 * @return the rebuilt string (prefix + expansion + remainder), or NULL when
 *         the prefix string cannot be created or expand_substitution()
 *         reports an error.
 *
 * NOTE(review): `substitute` and `end_input` are never freed here. Whether
 * that leaks depends on whether string_catenate() takes ownership of its
 * second argument -- confirm. `end_input` is also not checked for NULL.
 */
static struct string *substitution(char *in, int i)
{
    char split = in[i];
    in[i] = '\0';

    struct string *new_input = string_create(in);
    if (new_input == NULL)
        return NULL;

    /* Skip the opener: two characters for "$(", one for a backquote. */
    i += (split == '$') ? 2 : 1;

    int error = 0;
    struct string *substitute =
        expand_substitution(in, &i, &error, (split == '`') ? '`' : ')');
    if (error) {
        string_free(new_input);
        return NULL;
    }
    string_catenate(new_input, substitute);

    /* Presumably `i` now indexes the closing delimiter, hence the +1. */
    struct string *end_input = string_create(in + i + 1);
    string_catenate(new_input, end_input);
    return new_input;
}

/**
 * Find the end of the word starting at the lexer position and temporarily
 * terminate the input there.
 *
 * Unescaped "$(" and backquote substitutions are expanded in place by
 * replacing the lexer's input string (the previous one is freed). Single-
 * and double-quoted sections are skipped as a whole.
 *
 * @param end receives the index that was overwritten with '\0'; left
 *            untouched on error.
 * @return the character that was overwritten, so the caller can restore it.
 *         On an unterminated quote or a failed substitution the current
 *         token type is set to TOKEN_ERROR and '\0' is returned.
 */
static char extract_word(struct lexer *l, int *end)
{
    char *in = l->input->data;
    int i = l->pos;
    // True when the previous character was an unescaped backslash
    int escaped = 0;

    while (in[i]) {
        if (!escaped
            && ((in[i] == '$' && in[i + 1] == '(') || in[i] == '`')) {
            struct string *new_input = substitution(in, i);
            if (new_input == NULL) {
                l->current_tok.type = TOKEN_ERROR;
                return '\0';
            }
            /* The expansion now starts at index i of the new buffer. */
            string_free(l->input);
            l->input = new_input;
            in = l->input->data;
        }

        // An unescaped separator ends the word
        if (!escaped && ISSEPARATOR(in[i]))
            break;

        if (!escaped && in[i] == '\'') {
            /* No escapes inside single quotes: scan to the closing quote. */
            i++;
            while (in[i] && in[i] != '\'')
                i++;
            if (!in[i]) {
                l->current_tok.type = TOKEN_ERROR;
                return '\0';
            }
        } else if (!escaped && in[i] == '"') {
            i = look_for_next(in, i + 1, in[i]);
            if (!in[i]) {
                l->current_tok.type = TOKEN_ERROR;
                return '\0';
            }
        } else if (in[i] == '\\') {
            escaped ^= 1;
        } else {
            escaped = 0;
        }
        i++;
    }

    char tmp = in[i];
    in[i] = '\0';
    *end = i;
    return tmp;
}

/**
 * Classify a word token as a plain word or an assignment word.
 *
 * The word is an assignment when a non-empty run of alphanumerics and
 * underscores, not starting with a digit, is followed by '='. A word that
 * starts with '=' has an empty name and is therefore a plain word.
 */
static enum token_type word_or_ass(struct token t)
{
    const char *word = t.value->data;

    if (isdigit((unsigned char)word[0]))
        return TOKEN_WORD;

    for (int i = 0; word[i]; i++) {
        if (word[i] == '=')
            return (i > 0) ? TOKEN_ASS_WORD : TOKEN_WORD;
        if (word[i] != '_' && !isalnum((unsigned char)word[i]))
            return TOKEN_WORD;
    }
    return TOKEN_WORD;
}

/**
 * Lex the word at the lexer position into the current token.
 *
 * A word matching an entry of `reserved_words` gets that entry's type and
 * leaves the token value untouched. Any other word gets a freshly created
 * string value and the type chosen by word_or_ass(). The token type is
 * TOKEN_ERROR when the word cannot be extracted or its string cannot be
 * created.
 */
static void lex_word(struct lexer *l)
{
    int i = 0;
    char tmp = extract_word(l, &i);
    if (l->current_tok.type == TOKEN_ERROR)
        return;

    /* extract_word() may have replaced the input buffer; fetch it again. */
    char *in = l->input->data;

    /* TOKEN_ERROR doubles as "no reserved word matched yet". */
    l->current_tok.type = TOKEN_ERROR;
    for (int n = 0; reserved_words[n].word != NULL; n++) {
        if (STRINGS_ARE_EQUAL(in + l->pos, reserved_words[n].word))
            l->current_tok.type = reserved_words[n].type;
    }

    if (l->current_tok.type == TOKEN_ERROR) {
        struct string *pp = string_create(in + l->pos);
        if (pp != NULL) {
            l->current_tok.value = pp;
            l->current_tok.type = word_or_ass(l->current_tok);
        }
        /* On failure the type simply stays TOKEN_ERROR. */
    }

    /* Undo the temporary termination done by extract_word(). */
    in[i] = tmp;
}

/* Set the current token's type and return a copy of the current token. */
static struct token set_ttype(struct lexer *lexer, enum token_type type)
{
    lexer->current_tok.type = type;
    return lexer->current_tok;
}

/**
 * Lex an operator starting with '|' or '&': "||" is TOKEN_OR, a lone '|' is
 * TOKEN_PIPE, "&&" is TOKEN_AND, and anything else (including a lone '&')
 * is TOKEN_ERROR.
 */
static struct token lex_and_or(struct lexer *l)
{
    const char *p = l->input->data + l->pos;

    if (p[0] == '|')
        l->current_tok.type = (p[1] == '|') ? TOKEN_OR : TOKEN_PIPE;
    else if (p[0] == '&' && p[1] == '&')
        l->current_tok.type = TOKEN_AND;
    else
        l->current_tok.type = TOKEN_ERROR;
    return l->current_tok;
}

/**
 * Lex a redirection operator starting with '<' or '>'.
 *
 * The token value holds the operator text: the first character, plus the
 * second one when it is '>' or '&', or when the pair is ">|". The token is
 * TOKEN_ERROR (with a NULL value) when the value string cannot be created.
 */
static struct token lex_redirect(struct lexer *l)
{
    const char *p = l->input->data + l->pos;
    struct string *val = string_create(NULL);

    l->current_tok.value = val;
    if (val == NULL) {
        l->current_tok.type = TOKEN_ERROR;
        return l->current_tok;
    }

    l->current_tok.type = TOKEN_REDIR;
    string_pushc(val, p[0]);
    if (p[1] == '>' || p[1] == '&' || (p[0] == '>' && p[1] == '|'))
        string_pushc(val, p[1]);
    return l->current_tok;
}

/*
 * Lex anything that is not a single-character token or an operator: an IO
 * number when the text is digits followed by a redirection, a word otherwise.
 */
static void lex_special(struct lexer *l)
{
    if (isdigit((unsigned char)l->input->data[l->pos]) && create_ionumber(l))
        return;

    lex_word(l);
}

/**
 * Lex the token at the current position without advancing past it.
 *
 * Leading spaces and backslash-newline pairs are skipped, and a '#' at the
 * start of a token discards the rest of the line. The result is also stored
 * in `lexer->current_tok`.
 */
struct token lexer_next_token(struct lexer *lexer)
{
    if (lexer->pos >= lexer->input->length) {
        lexer->current_tok.type = TOKEN_EOF;
        lexer->current_tok.value = NULL;
        return lexer->current_tok;
    }

    char *in = lexer->input->data;

    /* A space is one character wide, a line continuation two. */
    while (in[lexer->pos] == ' ' || NEWLINEESCAPE)
        lexer->pos += (in[lexer->pos] == ' ') ? 1 : 2;

    switch (in[lexer->pos]) {
    case ';':
        return set_ttype(lexer, TOKEN_SEMICOLON);
    case '\n':
        return set_ttype(lexer, TOKEN_NEWLINE);
    case '\0':
        return set_ttype(lexer, TOKEN_EOF);
    case '(':
        return set_ttype(lexer, TOKEN_PAR_LEFT);
    case ')':
        return set_ttype(lexer, TOKEN_PAR_RIGHT);
    case '{':
        return set_ttype(lexer, TOKEN_CURLY_LEFT);
    case '}':
        return set_ttype(lexer, TOKEN_CURLY_RIGHT);
    case '|':
        /* FALLTHROUGH */
    case '&':
        return lex_and_or(lexer);
    case '<':
        /* FALLTHROUGH */
    case '>':
        return lex_redirect(lexer);
    case '#':
        yeet_comment(lexer);
        return lexer_next_token(lexer);
    default:
        lex_special(lexer);
        return lexer->current_tok;
    }
}

/**
 * Advance the position past the current token.
 *
 * Tokens that carry their text advance by the length of their value.
 * Keywords and two-character operators advance by a hard-coded width, and
 * every other token (TOKEN_ERROR included) advances by one character.
 * TOKEN_EOF does not move.
 *
 * NOTE(review): the keyword widths presumably mirror the spellings stored
 * in `reserved_words` -- verify when adding a keyword.
 */
static void move_pos(struct lexer *lexer)
{
    switch (lexer->current_tok.type) {
    case TOKEN_EOF:
        return;

    case TOKEN_IF:
    case TOKEN_FI:
    case TOKEN_IN:
    case TOKEN_DO:
    case TOKEN_OR:
    case TOKEN_AND:
        lexer->pos += 2;
        break;

    case TOKEN_FOR:
        lexer->pos += 3;
        break;

    case TOKEN_ELSE:
    case TOKEN_ELIF:
    case TOKEN_THEN:
    case TOKEN_DONE:
        lexer->pos += 4;
        break;

    case TOKEN_WHILE:
    case TOKEN_UNTIL:
        lexer->pos += 5;
        break;

    case TOKEN_WORD:
    case TOKEN_IONUMBER:
    case TOKEN_REDIR:
    case TOKEN_ASS_WORD:
        lexer->pos += lexer->current_tok.value->length;
        break;

    default:
        lexer->pos++;
        break;
    }
}

/**
 * Return the next token without consuming it.
 *
 * The first call lexes the token, moves the position past it and marks it
 * as buffered. Further calls return the buffered token until lexer_pop()
 * consumes it.
 */
struct token lexer_peek(struct lexer *lexer)
{
    if (lexer->processed)
        return lexer->current_tok;

    lexer->processed = 1;
    struct token res = lexer_next_token(lexer);
    move_pos(lexer);
    return res;
}

/**
 * Return the next token and consume it: the buffered token when
 * lexer_peek() has already produced one, a freshly lexed token otherwise.
 */
struct token lexer_pop(struct lexer *lexer)
{
    struct token res = lexer_peek(lexer);

    lexer->processed = 0;
    return res;
}