From 967be9e750221ab2ab783f95df79bb26d290a45e Mon Sep 17 00:00:00 2001 From: Martial Simon Date: Mon, 15 Sep 2025 01:07:58 +0200 Subject: add: added projects --- 42sh/src/lexer/Makefile.am | 15 ++ 42sh/src/lexer/expansion.c | 386 +++++++++++++++++++++++++++++++++++++++++++++ 42sh/src/lexer/expansion.h | 13 ++ 42sh/src/lexer/lexer.c | 359 +++++++++++++++++++++++++++++++++++++++++ 42sh/src/lexer/lexer.h | 48 ++++++ 42sh/src/lexer/token.h | 54 +++++++ 42sh/src/lexer/utils.h | 23 +++ 7 files changed, 898 insertions(+) create mode 100644 42sh/src/lexer/Makefile.am create mode 100644 42sh/src/lexer/expansion.c create mode 100644 42sh/src/lexer/expansion.h create mode 100644 42sh/src/lexer/lexer.c create mode 100644 42sh/src/lexer/lexer.h create mode 100644 42sh/src/lexer/token.h create mode 100644 42sh/src/lexer/utils.h (limited to '42sh/src/lexer') diff --git a/42sh/src/lexer/Makefile.am b/42sh/src/lexer/Makefile.am new file mode 100644 index 0000000..4eda36d --- /dev/null +++ b/42sh/src/lexer/Makefile.am @@ -0,0 +1,15 @@ +lib_LIBRARIES = liblexer.a + +liblexer_a_SOURCES = \ + token.h \ + utils.h \ + lexer.h \ + lexer.c \ + expansion.h \ + expansion.c + +liblexer_a_CPPFLAGS = -I$(top_srcdir)/src + +liblexer_a_CFLAGS = -std=c99 -Werror -Wall -Wextra -Wvla -pedantic + +noinst_LIBRARIES = liblexer.a diff --git a/42sh/src/lexer/expansion.c b/42sh/src/lexer/expansion.c new file mode 100644 index 0000000..d648009 --- /dev/null +++ b/42sh/src/lexer/expansion.c @@ -0,0 +1,386 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils/env.h" + +#define BUFFER_SIZE 51 + +#define ERROR_CHECK(MSG) \ + if (str[*i] == '\0') \ + { \ + string_free(input); \ + return clean_exit(MSG, error); \ + } + +#define DQUOTEESCAPED "$\\\n`\"" +// Identifies double-quote escaped characters +#define ISDQUOTEESCAPED(C) strchr(DQUOTEESCAPED, (int)C) + +#define SPECIAL "@*?$#" +// Identifies special variable names +#define ISSPECIAL(C) strchr(SPECIAL, (int)C) + +// error = 1 indicates a missing } +// error = 0 and NULL returned indicates an unrecognized var name +// error = 0 and anything else than NULL returned is the var name +static struct string *get_var_name(char *str, int *error) +{ + struct string *res = string_create(NULL); + int i = 0; + if (str[i] == '{') + { + while (str[i] && str[i] != '}') + { + if (str[i] == '\\' && str[i + 1] == '}') + i++; + string_pushc(res, str[i]); + i++; + } + if (!str[i]) + { + string_free(res); + *error = 1; + return NULL; + } + *error = 0; + return res; + } + else if (ISSPECIAL(str[i]) || isdigit(str[i])) + { + string_pushc(res, str[i]); + *error = 0; + return res; + } + else if (!isalpha(str[i])) + { + *error = 0; + string_free(res); + return NULL; + } + else + { + while (isalnum(str[i]) || str[i] == '_') + string_pushc(res, str[i++]); + return res; + } +} + +// Useful to automate the same exit process accross the few functions +// that often do this +static struct string *clean_exit(char *txt, int *error) +{ + fprintf(stderr, "%s", txt); + *error = 1; + return NULL; +} + +// Creates the fork() in order to make a subshell for the command expansion +// (cf section 2.6.3 of the SCL) +// Only called in expand_substitution() +static struct string *fork_subshell(struct string *input, int j, char *str, + int *error) +{ + int fds[2]; + if (pipe(fds) == -1) + { + return clean_exit("pipe() faild to create 2 fds\n", error); + } + + struct string *output; + pid_t child = fork(); + + if (child == -1) + { + // Fork not working + return clean_exit("fork() faild to produce a children\n", error); + } + else if (child) + { + // Parent process + close(fds[1]); + + output = string_create(NULL); + + char buff[BUFFER_SIZE]; + + buff[BUFFER_SIZE - 1] = 0; + + int r; + + int status; + waitpid(child, &status, 0); + + // Check if the child terminated normally + if (!WIFEXITED(status)) + { + close(fds[0]); + string_free(output); + return clean_exit("Child process failed miserably\n", error); + } + + while ((r = read(fds[0], buff, 50))) + { + buff[r] = 0; + if (!string_pushstr(output, buff)) + { + string_free(output); + return clean_exit("Failed to transfer from pipe\n", error); + } + } + + close(fds[0]); + return output; + } + else + { + // Child process + str += j; + close(fds[0]); + + if (dup2(fds[1], STDOUT_FILENO) == -1) + { + // We are forced to return NULL + // how are we going to know if something wnet wrong ? + exit(-1); + } + + _process_input(input); + + close(fds[1]); + exit(1); + } +} + +static int look_for_next(char *in, int i, char c) +{ + int escaped = 0; + while (in[i] && (in[i] != c || (escaped && in[i] == c))) + { + if (in[i] == '\\') + escaped ^= 1; + else + escaped = 0; + i++; + } + return i; +} + +// Removes all the characters at the end of the string obtained by +// the command substitution (Also section 2.6.3 of the SCL) +static void trimming_newline(struct string *txt) +{ + if (!txt->length) + { + return; + } + char *str = txt->data; + size_t len = txt->length; + + size_t i = len - 1; + while (str[i] == '\n') + { + str[i] = 0; + len--; + } + + // I am scared and I know this isn't useful but just in case + txt->data = str; + txt->length = len; +} + +// Performs the substitution (forks and get back the stdout) +struct string *expand_substitution(char *str, int *i, int *error, char delim) +{ + int j = *i; + struct string *input = string_create(NULL); + if (input == NULL) + { + return clean_exit("Could not create string for input\n", error); + } + + if (delim == '`') + { + *i = look_for_next(str, j, delim); + + ERROR_CHECK("Could not match `\n") + + str[*i] = '\0'; + } + // Sadly, there is no other way around this + else + { + int escaped = 0; + int par_count = 1; + + while (str[*i] != 0) + { + if (str[*i] == '\\') + { + escaped ^= 1; + } + else if (str[*i] == '\'') + { + (*i) += 1; + while (str[*i] != '\0' && str[*i] != '\'') + { + (*i) += 1; + } + ERROR_CHECK("Missing matching '\n") + } + else if ((str[*i] == '\"' || str[*i] == '`') && !escaped) + { + (*i) += 1; + *i = look_for_next(str, *i, str[(*i) - 1]); + + ERROR_CHECK("Missing matching `\n") + } + else if (str[*i] == '(' && !escaped) + { + par_count++; + } + else if (str[*i] == delim && !escaped) + { + par_count--; + if (!par_count) + { + str[*i] = 0; + break; + } + } + else + { + escaped = 0; + } + + (*i)++; + } + } + + string_pushstr(input, str + j); + struct string *output = fork_subshell(input, j, str, error); + string_free(input); + + trimming_newline(output); + str[*i] = delim; + return output; +} + +static int expand_var(struct string *res, char *input, int i) +{ + // Will only be called after a '$' was read + + int e = 0; + struct string *name = get_var_name(input + i + 1, &e); + + if (e) + { + string_free(name); + fprintf(stderr, "Missing } in variable expansion\n"); + return -1; + } + else if (name == NULL) + { + string_pushc(res, input[i]); + i++; + } + else + { + // Get the value associated to the name + char *value = env_get(name->data); + // Concatenate the strings if the variable has a value + if (value) + string_pushstr(res, value); + if (input[++i] == '{') + i += 2; + i += name->length; + string_free(name); + } + return i; +} + +static int expand_dquotes(char *input, int i, struct string *res) +{ + while (input[i] != '"') + { + if (input[i] == '$') + { + if ((i = i + expand_var(res, input, i)) == -1) + { + string_free(res); + return -1; + } + continue; + } + if ((input[i] == '`' || (input[i] == '$' && input[i + 1] == '('))) + { + int e = 0; + i += (input[i] == '$' ? 2 : 1); + struct string *output = + expand_substitution(input, &i, &e, input[i]); + if (!e) + { + string_free(res); + return -1; + } + + // +1 for the last parenthesis/backquote + i++; + string_catenate(res, output); + continue; + } + if (input[i] == '\\' && ISDQUOTEESCAPED(input[i + 1])) + i++; + string_pushc(res, input[i]); + i++; + } + return i; +} + +struct string *expand_word(struct string *word) +{ + char *input = word->data; + int escape = 0; + struct string *res = string_create(NULL); + for (int i = 0; input[i]; i++) + { + if (!escape && input[i] == '\'') + { + while (input[++i] != '\'') + string_pushc(res, input[i]); + } + else if (!escape && input[i] == '"') + { + i++; + + if ((i = expand_dquotes(input, i, res)) == -1) + return NULL; + } + else if (!escape && input[i] == '\\') + escape ^= 1; + else + { + // We don't care if we are after a backslash, we just include this + // char + if (input[i] == '$' && !escape) + { + if ((i = i + expand_var(res, input, i)) == -1) + { + string_free(res); + return NULL; + } + continue; + } + string_pushc(res, input[i]); + escape = 0; + } + } + + // string_free(word); + return res; +} diff --git a/42sh/src/lexer/expansion.h b/42sh/src/lexer/expansion.h new file mode 100644 index 0000000..4729cb8 --- /dev/null +++ b/42sh/src/lexer/expansion.h @@ -0,0 +1,13 @@ +#ifndef EXPANSION_H +#define EXPANSION_H + +#include +#include + +#include "helper.h" + +struct string *expand_substitution(char *str, int *i, int *error, char delim); + +struct string *expand_word(struct string *word); + +#endif /* ! EXPANSION_H */ diff --git a/42sh/src/lexer/lexer.c b/42sh/src/lexer/lexer.c new file mode 100644 index 0000000..eac77ab --- /dev/null +++ b/42sh/src/lexer/lexer.c @@ -0,0 +1,359 @@ +#include "lexer.h" + +#include +#include +#include +#include + +#include "expansion.h" +#include "utils.h" + +#define NEWLINEESCAPE (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n') +#define SEPARATORS " \n;&|<>()" +#define ISSEPARATOR(I) strchr(SEPARATORS, (int)I) != NULL + +struct lexer *lexer_new(struct string *input) +{ + struct lexer *new = malloc(sizeof(struct lexer)); + new->input = input; + new->pos = 0; + new->processed = 0; + new->current_tok.value = NULL; + new->current_tok.type = TOKEN_EOF; + return new; +} + +void lexer_free(struct lexer *lexer) +{ + free(lexer); +} + +static void yeet_comment(struct lexer *l) +{ + while (l->input->data[l->pos] && l->input->data[l->pos] != '\n') + l->pos++; +} + +static int create_ionumber(struct lexer *l) +{ + struct string *str = string_create(NULL); + int i; + for (i = l->pos; isdigit(l->input->data[i]); i++) + string_pushc(str, l->input->data[i]); + if (l->input->data[i] == '<' || l->input->data[i] == '>') + { + l->current_tok.type = TOKEN_IONUMBER; + l->current_tok.value = str; + return 1; + } + else + { + string_free(str); + return 0; + } +} + +static int look_for_next(char *in, int i, char c) +{ + int escaped = 0; + while (in[i] && (in[i] != c || (escaped && in[i] == c))) + { + if (in[i] == '\\') + escaped ^= 1; + else + escaped = 0; + i++; + } + return i; +} + +static struct string *substitution(char *in, int i) +{ + char split = in[i]; + in[i] = '\0'; + struct string *new_input = string_create(in); + + if (split == '$') + { + i++; + } + + i++; + int error = 0; + struct string *substitute = + expand_substitution(in, &i, &error, (split == '`') ? '`' : ')'); + + if (error) + { + string_free(new_input); + return NULL; + } + + string_catenate(new_input, substitute); + + struct string *end_input = string_create(in + i + 1); + string_catenate(new_input, end_input); + + return new_input; +} + +static char extract_word(struct lexer *l, int *end) +{ + char *in = l->input->data; + int i = l->pos; + // True if we are currently inside quotes + int escaped = 0; + + while (in[i]) + { + if (!escaped && ((in[i] == '$' && in[i + 1] == '(') || in[i] == '`')) + { + struct string *new_input = substitution(in, i); + if (new_input == NULL) + { + l->current_tok.type = TOKEN_ERROR; + return '\0'; + } + + string_free(l->input); + l->input = new_input; + in = l->input->data; + } + // Checking that we are NOT in a quote and have a separator + if (!escaped && ISSEPARATOR(in[i])) + break; + if (!escaped && in[i] == '\'') + { + i++; + while (in[i] && in[i] != '\'') + i++; + if (!in[i]) + { + l->current_tok.type = TOKEN_ERROR; + return '\0'; + } + } + else if (!escaped && in[i] == '"') + { + i = look_for_next(in, i + 1, in[i]); + if (!in[i]) + { + l->current_tok.type = TOKEN_ERROR; + return '\0'; + } + } + else if (in[i] == '\\') + { + escaped ^= 1; + } + else + escaped = 0; + i++; + } + + char tmp = in[i]; + in[i] = '\0'; + *end = i; + return tmp; +} + +static enum token_type word_or_ass(struct token t) +{ + char *word = t.value->data; + if (isdigit(word[0])) + { + return TOKEN_WORD; + } + for (int i = 0; word[i]; i++) + { + if (word[i] == '=') + { + return TOKEN_ASS_WORD; + } + if (word[i] != '_' && !isalnum(word[i])) + { + return TOKEN_WORD; + } + } + return TOKEN_WORD; +} + +static void lex_word(struct lexer *l) +{ + int i = 0; + char tmp = extract_word(l, &i); + if (l->current_tok.type == TOKEN_ERROR) + return; + char *in = l->input->data; + + l->current_tok.type = TOKEN_ERROR; + // Identifies reserved words + for (int n = 0; reserved_words[n].word != NULL; n++) + { + if (STRINGS_ARE_EQUAL(in + l->pos, reserved_words[n].word)) + l->current_tok.type = reserved_words[n].type; + } + + // If we couldn't identify a reserved word + if (l->current_tok.type == TOKEN_ERROR) + { + struct string *pp = string_create(in + l->pos); + if (pp == NULL) + { + l->current_tok.type = TOKEN_ERROR; + in[i] = tmp; + return; + } + l->current_tok.value = pp; + // Set the token type + l->current_tok.type = word_or_ass(l->current_tok); + } + in[i] = tmp; +} + +static struct token set_ttype(struct lexer *lexer, enum token_type type) +{ + lexer->current_tok.type = type; + return lexer->current_tok; +} + +static struct token lex_and_or(struct lexer *l) +{ + if (l->input->data[l->pos] == '|' && l->input->data[l->pos + 1] == '|') + l->current_tok.type = TOKEN_OR; + else if (l->input->data[l->pos] == '|' && l->input->data[l->pos + 1] != '|') + l->current_tok.type = TOKEN_PIPE; + else if (l->input->data[l->pos] == '&' && l->input->data[l->pos + 1] == '&') + l->current_tok.type = TOKEN_AND; + else + l->current_tok.type = TOKEN_ERROR; + return l->current_tok; +} + +static struct token lex_redirect(struct lexer *l) +{ + l->current_tok.type = TOKEN_REDIR; + struct string *val = (l->current_tok.value = string_create(NULL)); + string_pushc(val, l->input->data[l->pos]); + if ((l->input->data[l->pos + 1] == '>' || l->input->data[l->pos + 1] == '&') + || (l->input->data[l->pos] == '>' && l->input->data[l->pos + 1] == '|')) + string_pushc(val, l->input->data[l->pos + 1]); + return l->current_tok; +} + +static void lex_special(struct lexer *l) +{ + // If the first char is a digit and we recognized a number followed by a + // redir + if (isdigit(l->input->data[l->pos]) && create_ionumber(l)) + // return immediately + return; + lex_word(l); +} + +struct token lexer_next_token(struct lexer *lexer) +{ + if (lexer->pos >= lexer->input->length) + { + lexer->current_tok.type = TOKEN_EOF; + lexer->current_tok.value = NULL; + return lexer->current_tok; + } + char *in = lexer->input->data; + + for (; in[lexer->pos] && (in[lexer->pos] == ' ' || NEWLINEESCAPE); + lexer->pos++) + { + if (in[lexer->pos] == '\\' && in[lexer->pos + 1] == '\n') + lexer->pos++; + continue; + } + + switch (in[lexer->pos]) + { + case ';': + return set_ttype(lexer, TOKEN_SEMICOLON); + case '\n': + return set_ttype(lexer, TOKEN_NEWLINE); + case '\0': + return set_ttype(lexer, TOKEN_EOF); + case '(': + return set_ttype(lexer, TOKEN_PAR_LEFT); + case ')': + return set_ttype(lexer, TOKEN_PAR_RIGHT); + case '{': + return set_ttype(lexer, TOKEN_CURLY_LEFT); + case '}': + return set_ttype(lexer, TOKEN_CURLY_RIGHT); + case '|': + /* FALLTHROUGH */ + case '&': + return lex_and_or(lexer); + case '<': + /* FALLTHROUGH */ + case '>': + return lex_redirect(lexer); + case '#': + yeet_comment(lexer); + return lexer_next_token(lexer); + default: + lex_special(lexer); + return lexer->current_tok; + } +} + +static void move_pos(struct lexer *lexer) +{ + enum token_type t = lexer->current_tok.type; + if (t == TOKEN_EOF) + { + return; + } + if (t == TOKEN_IF || t == TOKEN_FI || t == TOKEN_IN || t == TOKEN_DO + || t == TOKEN_OR || t == TOKEN_AND) + lexer->pos += 2; + else if (t == TOKEN_FOR) + lexer->pos += 3; + else if (t == TOKEN_ELSE || t == TOKEN_ELIF || t == TOKEN_THEN + || t == TOKEN_DONE) + lexer->pos += 4; + else if (t == TOKEN_WHILE || t == TOKEN_UNTIL) + lexer->pos += 5; + else if (t == TOKEN_WORD || t == TOKEN_IONUMBER || t == TOKEN_REDIR + || t == TOKEN_ASS_WORD) + lexer->pos += lexer->current_tok.value->length; + else + lexer->pos++; +} + +struct token lexer_peek(struct lexer *lexer) +{ + if (lexer->processed) + return lexer->current_tok; + lexer->processed = 1; + struct token res = lexer_next_token(lexer); + + move_pos(lexer); + + return res; +} + +struct token lexer_pop(struct lexer *lexer) +{ + struct token res; + + if (lexer->processed) + { + res = lexer->current_tok; + lexer->processed = 0; + return res; + } + + res = lexer_next_token(lexer); + + move_pos(lexer); + + lexer->processed = 0; + + return res; +} diff --git a/42sh/src/lexer/lexer.h b/42sh/src/lexer/lexer.h new file mode 100644 index 0000000..8d8cdf5 --- /dev/null +++ b/42sh/src/lexer/lexer.h @@ -0,0 +1,48 @@ +#ifndef LEXER_H +#define LEXER_H + +#include +#include +#include + +// True if C could be used as a word +#define ISWORD(C) \ + C == TOKEN_WORD || C == TOKEN_THEN || C == TOKEN_ELIF || C == TOKEN_ELSE \ + || C == TOKEN_IF || C == TOKEN_WHILE || C == TOKEN_UNTIL \ + || C == TOKEN_DO || C == TOKEN_DONE || C == TOKEN_FOR || C == TOKEN_IN \ + || C == TOKEN_NEG || C == TOKEN_FI || C == TOKEN_CURLY_LEFT \ + || C == TOKEN_CURLY_RIGHT + +struct lexer +{ + struct string *input; // input data + size_t pos; // the current offset inside the input data + char processed; + struct token current_tok; // next (if processed) token +}; + +// Creates a new lexer given an input string +struct lexer *lexer_new(struct string *input); + +// Frees the given lexer, not its input +void lexer_free(struct lexer *lexer); + +// Returns a token from the input string +// If the token is a WORD, copies the word to the current_tok.value field +struct token lexer_next_token(struct lexer *lexer); + +/* +** Processes the next token if necessary +** (previous call to lexer_pop or first call) +*/ +// Returns the next token +struct token lexer_peek(struct lexer *lexer); + +/* +** Processes the next token if necessary +** (previous call to lexer_pop or first call) +*/ +// Returns the next token and moves the cursor forward +struct token lexer_pop(struct lexer *lexer); + +#endif /* ! LEXER_H */ diff --git a/42sh/src/lexer/token.h b/42sh/src/lexer/token.h new file mode 100644 index 0000000..89d772a --- /dev/null +++ b/42sh/src/lexer/token.h @@ -0,0 +1,54 @@ +#ifndef TOKEN_H +#define TOKEN_H + +#include + +enum token_type +{ + // STEP 1 + TOKEN_NEWLINE, + TOKEN_EOF, + TOKEN_ERROR, + TOKEN_WORD, + TOKEN_IF, + TOKEN_THEN, + TOKEN_ELIF, + TOKEN_ELSE, + TOKEN_SEMICOLON, + TOKEN_FI, + TOKEN_HASHTAG, + + // STEP 2 + TOKEN_REDIR, + TOKEN_PIPE, + TOKEN_NEG, + TOKEN_WHILE, + TOKEN_UNTIL, + TOKEN_DO, + TOKEN_FOR, + TOKEN_DONE, + TOKEN_AND, + TOKEN_OR, + TOKEN_ESCAPE, + TOKEN_ASS_WORD, + TOKEN_DOUBLEQUOTE, + TOKEN_DOLLAR, + TOKEN_IN, + TOKEN_IONUMBER, + + // STEP 3 + TOKEN_PAR_RIGHT, + TOKEN_PAR_LEFT, + TOKEN_CURLY_RIGHT, + TOKEN_CURLY_LEFT, + + // STEP 4 +}; + +struct token +{ + enum token_type type; + struct string *value; +}; + +#endif /* ! TOKEN_H */ diff --git a/42sh/src/lexer/utils.h b/42sh/src/lexer/utils.h new file mode 100644 index 0000000..3edd83f --- /dev/null +++ b/42sh/src/lexer/utils.h @@ -0,0 +1,23 @@ +#ifndef UTILS_H +#define UTILS_H + +#include + +struct reserved_word +{ + const char *word; + enum token_type type; +}; + +static struct reserved_word reserved_words[] = { + { "if", TOKEN_IF }, { "then", TOKEN_THEN }, + { "elif", TOKEN_ELIF }, { "else", TOKEN_ELSE }, + { "fi", TOKEN_FI }, { "while", TOKEN_WHILE }, + { "until", TOKEN_UNTIL }, { "do", TOKEN_DO }, + { "done", TOKEN_DONE }, { "for", TOKEN_FOR }, + { "in", TOKEN_IN }, { "!", TOKEN_NEG }, + { "}", TOKEN_CURLY_RIGHT }, { "{", TOKEN_CURLY_LEFT }, + { NULL, TOKEN_ERROR } +}; + +#endif /* ! UTILS_H */ -- cgit v1.2.3