From eec3b4853b2d82f08983ae2a3cfa58b37d4d3217 Mon Sep 17 00:00:00 2001 From: ghostlyzsh Date: Fri, 2 Aug 2024 09:47:37 -0500 Subject: [PATCH] convert lexer to use utf-8 character, bitwise operations --- kabel/grammar.ebnf | 6 ++- kabel/src/lexer.rs | 101 ++++++++++++++++++++++---------------------- kabel/src/parser.rs | 86 ++++++++++++++++++++++++++++++++++--- 3 files changed, 137 insertions(+), 56 deletions(-) diff --git a/kabel/grammar.ebnf b/kabel/grammar.ebnf index da8f0f1581b4df92b7153eb8fde713803c7ef081..7ea3f6d94ad9416147853d0a9b1b55d344079193 100644 --- a/kabel/grammar.ebnf +++ b/kabel/grammar.ebnf @@ -26,7 +26,11 @@ assignment = ( identifier , "=" , assignment ) | logical_or; logical_or = logical_and { , "||" , logical_and } ; -logical_and = equality { , "&&" , equality } ; +logical_and = bit_and { , "&&" , bit_and } ; + +bit_and = bit_xor { , "&" , bit_xor } ; +bit_xor = bit_or { , "^" , bit_or } ; +bit_or = equality { , "|" , equality } ; equality = comparison { , ( "==" | "!=" ) , comparison } ; diff --git a/kabel/src/lexer.rs b/kabel/src/lexer.rs index 50afd0eb97f323cd50130ef484834224685f40cd..0971b4458078d9d5f41e30757c083ae0665e5ec2 100644 --- a/kabel/src/lexer.rs +++ b/kabel/src/lexer.rs @@ -6,13 +6,13 @@ use crate::{ }; pub struct Lexer { - input: Vec, + input: Vec, start: usize, current: usize, line: usize, line_start: usize, column: usize, - c: u8, + c: char, pub errors: Vec, pub output: Vec, } @@ -20,13 +20,13 @@ pub struct Lexer { impl Lexer { pub fn new(input: String) -> Self { Self { - input: input.as_bytes().to_vec(), + input: input.chars().collect(), start: 0, current: 0, line: 0, line_start: 0, column: 0, - c: 0x00, + c: '\0', errors: Vec::new(), output: Vec::new(), } @@ -35,21 +35,21 @@ impl Lexer { pub fn next_token(&mut self) -> bool { self.read_char(); match self.c { - b'+' => { + '+' => { self.output.push(token!(self, TokenType::Plus)); self.start = self.current; } - b'-' => { + '-' => { self.output.push(token!(self, 
TokenType::Minus)); self.start = self.current; } - b'*' => { + '*' => { self.output.push(token!(self, TokenType::Star)); self.start = self.current; } - b'/' => { - if self.peek() == b'/' { - while self.peek() != b'\n' && self.current < self.input.len() { + '/' => { + if self.peek() == '/' { + while self.peek() != '\n' && self.current < self.input.len() { self.read_char(); } self.start = self.current; @@ -58,44 +58,48 @@ impl Lexer { self.start = self.current; } } - b'(' => { + '(' => { self.output.push(token!(self, TokenType::LeftParen)); self.start = self.current; } - b')' => { + ')' => { self.output.push(token!(self, TokenType::RightParen)); self.start = self.current; } - b'{' => { + '{' => { self.output.push(token!(self, TokenType::LeftBrace)); self.start = self.current; } - b'}' => { + '}' => { self.output.push(token!(self, TokenType::RightBrace)); self.start = self.current; } - b'[' => { + '[' => { self.output.push(token!(self, TokenType::LeftSquare)); self.start = self.current; } - b']' => { + ']' => { self.output.push(token!(self, TokenType::RightSquare)); self.start = self.current; } - b'.' => { + '.' 
=> { self.output.push(token!(self, TokenType::Period)); self.start = self.current; } - b',' => { + ',' => { self.output.push(token!(self, TokenType::Comma)); self.start = self.current; } - b';' => { + ';' => { self.output.push(token!(self, TokenType::Semicolon)); self.start = self.current; } - b'|' => { - if self.peek() == b'|' { + '^' => { + self.output.push(token!(self, TokenType::Caret)); + self.start = self.current; + } + '|' => { + if self.peek() == '|' { self.read_char(); self.output.push(token!(self, TokenType::OrOr)); self.start = self.current; @@ -104,8 +108,8 @@ impl Lexer { self.start = self.current; } } - b'&' => { - if self.peek() == b'&' { + '&' => { + if self.peek() == '&' { self.read_char(); self.output.push(token!(self, TokenType::AndAnd)); self.start = self.current; @@ -114,8 +118,8 @@ impl Lexer { self.start = self.current; } } - b'=' => { - if self.peek() == b'=' { + '=' => { + if self.peek() == '=' { self.read_char(); self.output.push(token!(self, TokenType::EqualEqual)); self.start = self.current; @@ -124,8 +128,8 @@ impl Lexer { self.start = self.current; } } - b'!' => { - if self.peek() == b'=' { + '!' 
=> { + if self.peek() == '=' { self.read_char(); self.output.push(token!(self, TokenType::BangEqual)); self.start = self.current; @@ -134,8 +138,8 @@ impl Lexer { self.start = self.current; } } - b'>' => { - if self.peek() == b'=' { + '>' => { + if self.peek() == '=' { self.read_char(); self.output.push(token!(self, TokenType::GreaterEqual)); self.start = self.current; @@ -144,8 +148,8 @@ impl Lexer { self.start = self.current; } } - b'<' => { - if self.peek() == b'=' { + '<' => { + if self.peek() == '=' { self.read_char(); self.output.push(token!(self, TokenType::LessEqual)); self.start = self.current; @@ -154,18 +158,16 @@ impl Lexer { self.start = self.current; } } - b'"' => { + '"' => { let mut contents = String::new(); - while self.read_char() != b'"' { - if self.c == 0x05 { + while self.read_char() != '"' { + if self.c == '\0'{ self.errors.push(KabelError::new( ErrorKind::UnexpectedEof, "File ended before closing quote".to_string(), self.line, self.column, - from_utf8(&self.input[self.start..self.current]) - .unwrap() - .to_string(), + self.input[self.start..self.current].iter().collect(), )); return false; } @@ -174,20 +176,20 @@ impl Lexer { self.start = self.current; self.output.push(token!(self, TokenType::Str(contents))); } - b'\n' => { + '\n' => { self.line += 1; self.line_start = self.current; self.column = 0; self.start = self.current; } - b' ' | b'\r' | b'\t' => { + ' ' | '\r' | '\t' => { self.start = self.current; } - 0x05 => return false, + '\0' => return false, c => { - if c.is_ascii_alphabetic() { + if c.is_ascii_alphabetic() || c == '_' { let mut content = (c as char).to_string(); - while self.peek().is_ascii_alphanumeric() || self.c == b'_' { + while self.peek().is_ascii_alphanumeric() || self.c == '_' { content.push(self.c as char); self.read_char(); } @@ -199,7 +201,7 @@ impl Lexer { number.push(self.c as char); self.read_char(); } - if self.c == b'.' { + if self.c == '.' 
{ number.push('.'); while self.read_char().is_ascii_digit() { number.push(self.c as char); @@ -215,9 +217,7 @@ impl Lexer { format!("Stray \"{0}\"", c as char), self.line, self.column, - from_utf8(&self.input[self.line_start..self.current]) - .unwrap() - .to_string(), + self.input[self.line_start..self.current].iter().collect(), )); } } @@ -225,9 +225,9 @@ impl Lexer { true } - pub fn read_char(&mut self) -> u8 { + pub fn read_char(&mut self) -> char{ if self.current >= self.input.len() { - self.c = 0x05; // EOF + self.c = '\0'; // EOF return self.c; } self.c = self.input[self.current]; @@ -235,9 +235,9 @@ impl Lexer { self.column += 1; return self.c; } - pub fn peek(&mut self) -> u8 { + pub fn peek(&mut self) -> char { if self.current >= self.input.len() { - self.c = 0x05; // EOF + self.c = '\0'; // EOF return self.c; } self.c = self.input[self.current]; @@ -282,6 +282,7 @@ pub enum TokenType { AndAnd, Or, OrOr, + Caret, Ident(String), Str(String), diff --git a/kabel/src/parser.rs b/kabel/src/parser.rs index b32224e09531f4593beee13c8964a036e110954f..8ab92e99c68a51e589b0d50031b938ae2a0466a4 100644 --- a/kabel/src/parser.rs +++ b/kabel/src/parser.rs @@ -166,7 +166,7 @@ impl Parser { } } - pub fn for_statement(&mut self) -> Result { + pub fn for_statement(&mut self) -> Result { let for_ident = self.read_token()?; let left_paren = self.read_token()?; if let TokenType::LeftParen = left_paren.token_type { @@ -196,7 +196,12 @@ impl Parser { if let TokenType::RightParen = right_paren.token_type { let block = self.block()?; return Ok(AST { - ast_type: ASTType::For(Box::new(expression1), Box::new(expression2), Box::new(expression3), Box::new(block.clone())), + ast_type: ASTType::For( + Box::new(expression1), + Box::new(expression2), + Box::new(expression3), + Box::new(block.clone()), + ), start: for_ident.start, end: block.end, line: for_ident.line, @@ -449,11 +454,11 @@ impl Parser { Ok(left) } pub fn logical_and(&mut self) -> Result { - let mut left = self.equality()?; + 
let mut left = self.bit_and()?; while self.current < self.input.len() && self.peek()?.token_type == TokenType::AndAnd { self.read_token()?; - let right = self.equality()?; + let right = self.bit_and()?; left = AST { ast_type: ASTType::Binary( Box::new(left.clone()), @@ -469,6 +474,69 @@ impl Parser { Ok(left) } + pub fn bit_and(&mut self) -> Result { + let mut left = self.bit_xor()?; + + while self.current < self.input.len() && self.peek()?.token_type == TokenType::And { + self.read_token()?; + let right = self.bit_xor()?; + left = AST { + ast_type: ASTType::Binary( + Box::new(left.clone()), + BinOp::BitAnd, + Box::new(right.clone()), + ), + start: left.start, + end: right.end, + line: left.line, + column: left.column, + }; + } + + Ok(left) + } + pub fn bit_xor(&mut self) -> Result { + let mut left = self.bit_or()?; + + while self.current < self.input.len() && self.peek()?.token_type == TokenType::Caret { + self.read_token()?; + let right = self.bit_or()?; + left = AST { + ast_type: ASTType::Binary( + Box::new(left.clone()), + BinOp::BitXor, + Box::new(right.clone()), + ), + start: left.start, + end: right.end, + line: left.line, + column: left.column, + }; + } + + Ok(left) + } + pub fn bit_or(&mut self) -> Result { + let mut left = self.equality()?; + + while self.current < self.input.len() && self.peek()?.token_type == TokenType::Or { + self.read_token()?; + let right = self.equality()?; + left = AST { + ast_type: ASTType::Binary( + Box::new(left.clone()), + BinOp::BitOr, + Box::new(right.clone()), + ), + start: left.start, + end: right.end, + line: left.line, + column: left.column, + }; + } + + Ok(left) + } pub fn equality(&mut self) -> Result { let mut left = self.comparison()?; @@ -917,7 +985,12 @@ pub enum ASTType { Return(Option>), // expression Loop(Box), // block While(Box, Box), // condition, block - For(Box>, Box>, Box>, Box), // expr1, expr2, expr3, block + For( + Box>, + Box>, + Box>, + Box, + ), // expr1, expr2, expr3, block Break, Continue, If(Box, 
Box, Option>), // condition, block, else/else if @@ -958,6 +1031,9 @@ pub enum BinOp { Le, Or, And, + BitAnd, + BitXor, + BitOr, Assign, }