From a67d4cb273de802572b5f08115aa5450f011b4b8 Mon Sep 17 00:00:00 2001
From: Victor Timofei
Date: Tue, 5 Sep 2023 00:10:14 +0300
Subject: [PATCH] Add `let` statement parser

---
 .gitignore    |   1 +
 Cargo.toml    |   8 ++
 src/ast.rs    |  69 ++++++++++++++
 src/lexer.rs  | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs   |  11 +++
 src/parser.rs | 172 ++++++++++++++++++++++++++++++++++
 src/repl.rs   |  30 ++++++
 src/token.rs  |  61 ++++++++++++
 8 files changed, 603 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 src/ast.rs
 create mode 100644 src/lexer.rs
 create mode 100644 src/main.rs
 create mode 100644 src/parser.rs
 create mode 100644 src/repl.rs
 create mode 100644 src/token.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..f0c3c0a
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "monkeyrs"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/src/ast.rs b/src/ast.rs
new file mode 100644
index 0000000..bfdaa2b
--- /dev/null
+++ b/src/ast.rs
@@ -0,0 +1,69 @@
+use std::{fmt::Debug, rc::Rc};
+
+use crate::token::Token;
+
+// Base trait for all AST nodes; Debug gives us a printable tree for tests.
+pub trait Node: Debug {}
+
+pub trait Statement: Node {
+    fn statement_node(&self);
+}
+
+pub trait Expression: Node {
+    fn expression_node(&self);
+}
+
+// Root of every parsed program: an ordered list of statements.
+pub struct Program {
+    pub statements: Vec<Rc<dyn Statement>>,
+}
+
+impl Node for Program {}
+
+impl Debug for Program {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Vec<")?;
+        for statement in &self.statements {
+            write!(f, "Box<")?;
+            statement.fmt(f)?;
+            write!(f, ">, ")?;
+        }
+        write!(f, ">")?;
+
+        Ok(())
+    }
+}
+
+// `let <name> = <value>;` — token is always Token::Let.
+#[derive(Debug)]
+pub struct LetStatement {
+    pub token: Token,
+    pub name: Identifier,
+    pub value: Box<dyn Expression>,
+}
+
+impl Node for LetStatement {}
+
+impl Statement for LetStatement {
+    fn statement_node(&self) {}
+}
+
+#[derive(Debug)]
+pub struct Identifier {
+    pub token: Token, // always a Token::Ident(...)
+}
+
+impl Node for Identifier {}
+
+impl Expression for Identifier {
+    fn expression_node(&self) {}
+}
+
+// Placeholder until expression parsing lands; must never be evaluated.
+#[derive(Debug)]
+pub struct DummyExpression {}
+
+impl Node for DummyExpression {}
+
+impl Expression for DummyExpression {
+    fn expression_node(&self) {
+        panic!("this is dummy");
+    }
+}
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..775c082
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,251 @@
+use crate::token::Token;
+
+pub struct Lexer {
+    source: String,
+    position: usize, // index of `ch` in `source` (char index)
+    read_pos: usize, // index of the next char to read
+    ch: char,        // current char; '\0' signals end of input
+}
+
+impl Lexer {
+    pub fn new(source: String) -> Self {
+        let mut lexer = Self {
+            source,
+            position: 0,
+            read_pos: 0,
+            ch: char::from_u32(0).unwrap(),
+        };
+
+        lexer.read_char();
+
+        lexer
+    }
+
+    fn read_char(&mut self) {
+        self.ch = if self.read_pos >= self.source.chars().count() {
+            char::from_u32(0).unwrap()
+        } else {
+            self.source.chars().nth(self.read_pos).unwrap()
+        };
+
+        self.position = self.read_pos;
+        self.read_pos += 1;
+    }
+
+    fn read_identifier(&mut self) -> Token {
+        let pos = self.position;
+
+        while is_letter(self.ch) {
+            self.read_char();
+        }
+
+        let literal = &self.source[pos..self.position];
+
+        return Token::lookup_ident(literal);
+    }
+
+    fn skip_whitespace(&mut self) {
+        while self.ch == ' ' || self.ch == '\t' || self.ch == '\n' || self.ch == '\r' {
+            self.read_char();
+        }
+    }
+
+    fn read_number(&mut self) -> Token {
+        let pos = self.position;
+
+        while is_digit(self.ch) {
+            self.read_char();
+        }
+
+        Token::Int(self.source[pos..self.position].parse().unwrap())
+    }
+
+    fn peek_char(&self) -> char {
+        if self.read_pos >= self.source.chars().count() {
+            char::from_u32(0).unwrap()
+        } else {
+            self.source.chars().nth(self.read_pos).unwrap()
+        }
+    }
+}
+
+impl Iterator for Lexer {
+    type Item = Token;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        use Token::*;
+
+        self.skip_whitespace();
+
+        let token = match self.ch {
+            '=' => {
+                if self.peek_char() == '=' {
+                    self.read_char();
+                    Eq
+                } else {
+                    Assign
+                }
+            }
+            ';' => Semicolon,
+            '(' => Lparen,
+            ')' => Rparen,
+            ',' => Comma,
+            '+' => Plus,
+            '-' => Minus,
+            '!' => {
+                if self.peek_char() == '=' {
+                    self.read_char();
+                    NotEq
+                } else {
+                    Bang
+                }
+            }
+            '/' => Slash,
+            '*' => Asterisk,
+            '<' => Lt,
+            '>' => Gt,
+            '{' => Lbrace,
+            '}' => Rbrace,
+            c if c as u32 == 0 => EOF,
+            _ => {
+                let tok = if is_letter(self.ch) {
+                    self.read_identifier()
+                } else if is_digit(self.ch) {
+                    self.read_number()
+                } else {
+                    self.read_char(); Illegal // must advance past the bad char or we yield Illegal forever
+                };
+                return Some(tok); // read_identifier/read_number already advanced
+            }
+        };
+
+        self.read_char();
+
+        Some(token)
+    }
+}
+
+fn is_letter(ch: char) -> bool {
+    'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
+}
+
+fn is_digit(ch: char) -> bool {
+    '0' <= ch && ch <= '9'
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::lexer::Lexer;
+    use crate::token::Token;
+
+    #[test]
+    fn test_next_token() {
+        let input = "let five = 5;\
+                     let ten = 10;\
+                     \
+                     let add = fn(x, y) {\
+                     x + y;\
+                     };\
+                     \
+                     let result = add(five, ten);\
+                     !-/*5;
+            5 < 10 > 5;
+            if (5 < 10) {
+                return true;
+            } else {
+                return false;
+            }
+            10 == 10;
+            10 != 9;
+            "
+        .to_string();
+
+        use Token::*;
+
+        let tests = vec![
+            Let,
+            Ident("five".to_string()),
+            Assign,
+            Int(5),
+            Semicolon,
+            Let,
+            Ident("ten".to_string()),
+            Assign,
+            Int(10),
+            Semicolon,
+            Let,
+            Ident("add".to_string()),
+            Assign,
+            Function,
+            Lparen,
+            Ident("x".to_string()),
+            Comma,
+            Ident("y".to_string()),
+            Rparen,
+            Lbrace,
+            Ident("x".to_string()),
+            Plus,
+            Ident("y".to_string()),
+            Semicolon,
+            Rbrace,
+            Semicolon,
+            Let,
+            Ident("result".to_string()),
+            Assign,
+            Ident("add".to_string()),
+            Lparen,
+            Ident("five".to_string()),
+            Comma,
+            Ident("ten".to_string()),
+            Rparen,
+            Semicolon,
+            Bang,
+            Minus,
+            Slash,
+            Asterisk,
+            Int(5),
+            Semicolon,
+            Int(5),
+            Lt,
+            Int(10),
+            Gt,
+            Int(5),
+            Semicolon,
+            If,
+            Lparen,
+            Int(5),
+            Lt,
+            Int(10),
+            Rparen,
+            Lbrace,
+            Return,
+            True,
+            Semicolon,
+            Rbrace,
+            Else,
+            Lbrace,
+            Return,
+            False,
+            Semicolon,
+            Rbrace,
+            Int(10),
+            Eq,
+            Int(10),
+            Semicolon,
+            Int(10),
+            NotEq,
+            Int(9),
+            Semicolon,
+            EOF,
+        ];
+
+        let mut lexer_it = Lexer::new(input);
+
+        for (i, tt) in tests.iter().enumerate() {
+            let token = lexer_it.next();
+
+            println!("{i}");
+            assert_eq!(*tt, token.unwrap());
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..d205d35
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,11 @@
+mod ast;
+mod lexer;
+mod repl;
+mod token;
+mod parser;
+
+fn main() {
+    let stdout = std::io::stdout();
+    let stdin = std::io::stdin();
+    repl::start(stdout, stdin);
+}
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..53cb7b0
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,172 @@
+use std::rc::Rc;
+
+use crate::{lexer::Lexer, token::Token, ast::{Program, Statement, LetStatement, Identifier, DummyExpression}};
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[derive(Debug)]
+enum Error {
+    UnexpectedToken{
+        expected: Token,
+        actual: Option<Token>, // None means the input ended early
+    },
+}
+
+// Recursive-descent parser; holds a one-token lookahead over the lexer.
+struct Parser {
+    lexer: Lexer,
+    cur_token: Option<Token>,
+    peek_token: Option<Token>,
+}
+
+impl Parser {
+    pub fn new(mut lexer: Lexer) -> Self {
+        let cur_token = lexer.next();
+        let peek_token = lexer.next();
+        Self { lexer, cur_token, peek_token }
+    }
+
+    pub fn parse(&mut self) -> Result<Program> {
+        let mut program = Program {
+            statements: Vec::new(),
+        };
+
+
+        let mut done = Some(());
+
+        while done.is_some() {
+            let stmt = self.parse_statement();
+            program.statements.push(stmt?);
+
+            done = self.next(); // None once cur_token reaches EOF
+        }
+
+        Ok(program)
+    }
+
+    fn parse_statement(&mut self) -> Result<Rc<dyn Statement>> {
+        match &self.cur_token {
+            Some(token) => {
+                use Token::*;
+                match token {
+                    Let => self.parse_let_statement(),
+                    t => unimplemented!("{t:?} statement token not impl"),
+                }
+            }
+            None => unreachable!()
+        }
+    }
+
+    fn parse_let_statement(&mut self) -> Result<Rc<dyn Statement>> {
+        let token = self.cur_token.clone().unwrap();
+
+        self.expect_peek(&Token::Ident("".to_string()))?;
+
+        let name = Identifier {
+            token: self.cur_token.clone().unwrap(),
+        };
+
+        self.expect_peek(&Token::Assign)?;
+
+        while !self.cur_token_is(&Token::Semicolon) && !self.cur_token_is(&Token::EOF) { // EOF guard: don't spin forever on a missing ';'
+            self.next();
+        }
+
+        Ok(Rc::new(LetStatement {
+            token,
+            name,
+            value: Box::new(DummyExpression {}), // expression parsing not implemented yet
+        }))
+    }
+
+    // Advance only if the lookahead has the expected token *type* (payload ignored).
+    fn expect_peek(&mut self, token: &Token) -> Result<()> {
+        if self.peek_token.is_none() {
+            return Err(Error::UnexpectedToken { expected: token.clone(), actual: None });
+        }
+
+        let peek_token = self.peek_token.clone().unwrap();
+
+        if token.is_same_type(&peek_token) {
+            self.next();
+            Ok(())
+        } else {
+            Err(Error::UnexpectedToken { expected: token.clone(), actual: Some(peek_token) })
+        }
+    }
+
+    fn peek_token_is(&self, token: &Token) -> bool {
+        if self.peek_token.is_none() {
+            return false;
+        }
+
+        self.peek_token.clone().unwrap().is_same_type(token)
+    }
+
+    fn cur_token_is(&self, token: &Token) -> bool {
+        if self.cur_token.is_none() {
+            return false;
+        }
+
+        self.cur_token.clone().unwrap().is_same_type(token)
+    }
+}
+
+impl Iterator for Parser {
+    type Item = ();
+
+    // Shift the token window one step; None signals the driver loop to stop.
+    fn next(&mut self) -> Option<Self::Item> {
+        let peek_token = self.lexer.next();
+
+
+        self.cur_token = self.peek_token.clone();
+        self.peek_token = peek_token;
+
+        match self.cur_token {
+            None | Some(Token::EOF) => None,
+            Some(_) => Some(()),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::rc::Rc;
+
+    use crate::{lexer::Lexer, ast::Statement};
+
+    use super::Parser;
+
+    #[test]
+    fn let_statements() {
+        let source = "let x = 5;\
+            let y = 10;\
+            let foobar = 838383;\
+            ".to_string();
+
+        let lexer = Lexer::new(source);
+
+        let mut parser = Parser::new(lexer);
+
+        let program = parser.parse().unwrap();
+
+        assert_eq!(program.statements.len(), 3);
+
+        let expected_identifiers = vec![
+            "x",
+            "y",
+            "foobar",
+        ];
+        let mut statements_iter = program.statements.iter();
+        for tt in expected_identifiers {
+            let statement = statements_iter.next().unwrap();
+
+            test_let_statement(statement.clone(), tt);
+        }
+    }
+
+    fn test_let_statement(stmt: Rc<dyn Statement>, name: &str) {
+        assert_eq!(
+            format!("{stmt:?}"),
+            format!("LetStatement {{ token: Let, name: Identifier {{ token: Ident(\"{name}\") }}, value: DummyExpression }}"),
+        );
+    }
+
+}
diff --git a/src/repl.rs b/src/repl.rs
new file mode 100644
index 0000000..7c2966b
--- /dev/null
+++ b/src/repl.rs
@@ -0,0 +1,30 @@
+use std::io::{BufRead, BufReader, Read, Write};
+
+use crate::{lexer::Lexer, token::Token};
+
+const PROMPT: &str = ">> ";
+
+// Read-lex-print loop: tokenizes each input line until EOF on the reader.
+pub fn start(mut w: impl Write, r: impl Read) {
+    let mut reader = BufReader::new(r);
+    loop {
+        write!(w, "{PROMPT}").unwrap();
+        w.flush().unwrap();
+
+        let mut line = String::new();
+        reader.read_line(&mut line).unwrap();
+
+        if line.len() == 0 {
+            writeln!(w, "").unwrap();
+            return;
+        }
+
+        let lex = Lexer::new(line);
+
+        for token in lex {
+            if token == Token::EOF {
+                break; // lexer keeps yielding EOF, so stop explicitly
+            }
+            writeln!(w, "{token:?}").unwrap();
+        }
+    }
+}
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..503e615
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,61 @@
+#[derive(Debug, PartialEq, Clone)]
+pub enum Token {
+    Illegal,
+    EOF,
+
+    Ident(String),
+    Int(i64),
+
+    Assign,
+    Plus,
+    Minus,
+    Bang,
+    Asterisk,
+    Slash,
+
+    Lt,
+    Gt,
+    Eq,
+    NotEq,
+
+    Comma,
+    Semicolon,
+
+    Lparen,
+    Rparen,
+    Lbrace,
+    Rbrace,
+
+    Function,
+    Let,
+    True,
+    False,
+    If,
+    Else,
+    Return,
+}
+
+impl Token {
+    // Type-level equality: Ident/Int match regardless of payload.
+    pub fn is_same_type(&self, other: &Token) -> bool {
+        use Token::*;
+        match self {
+            Ident(_) => if let Ident(_) = other { true } else { false },
+            Int(_) => if let Int(_) = other { true } else { false },
+            tok => tok == other,
+        }
+    }
+
+    pub fn lookup_ident(ident: &str) -> Token {
+        use Token::*;
+        match ident {
+            "fn" => Function,
+            "let" => Let,
+            "true" => True,
+            "false" => False,
+            "if" => If,
+            "else" => Else,
+            "return" => Return,
+            ident => Ident(ident.to_string()),
+        }
+    }
+}