From 169870428a9074d2429e6ec3dac64e84bb0392e3 Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 3 Nov 2017 23:00:54 -0400 Subject: Implement skeleton of lexer --- .gitignore | 3 ++ Cargo.lock | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 8 +++++ src/lexer.rs | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 13 ++++++++ 5 files changed, 214 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/lexer.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fa50122 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +target/ +**/*.rs.bk +*~ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..7a58190 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,89 @@ +[[package]] +name = "aho-corasick" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lazy_static" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libc" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memchr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.0 
(registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "taiga" +version = "0.1.0" +dependencies = [ + "lazy_static 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)", + "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "unreachable" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-ranges" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum aho-corasick 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "500909c4f87a9e52355b26626d890833e9e1d53ac566db76c36faa984b889699" +"checksum lazy_static 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "c9e5e58fa1a4c3b915a561a78a22ee0cac6ab97dca2504428bc1cb074375f8d5" +"checksum libc 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "5ba3df4dcb460b9dfbd070d41c94c19209620c191b0340b929ce748a2bcd42d2" +"checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a" +"checksum regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1731164734096285ec2a5ec7fea5248ae2f5485b3feeb0115af4fda2183b2d1b" +"checksum regex-syntax 0.4.1 
(registry+https://github.com/rust-lang/crates.io-index)" = "ad890a5eef7953f55427c50575c680c42841653abd2b028b68cd223d157f62db"
+"checksum thread_local 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "1697c4b57aeeb7a536b647165a2825faddffb1d3bad386d507709bd51a90bb14"
+"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56"
+"checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122"
+"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..14d5b5a
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "taiga"
+version = "0.1.0"
+authors = ["David Li "]
+
+[dependencies]
+lazy_static = "0.2.9"
+regex = "0.2"
\ No newline at end of file
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..dd89fe5
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,133 @@
+//! Lexer for a small `let ... in ... end` expression language.
+//!
+//! `Lexer` iterates over a borrowed input string and yields one `Token` per call.
+
+use regex::Regex;
+
+/// What a token is, plus the payload carried by literals and names.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum TokenKind {
+    /// The keyword `let`.
+    Let,
+    /// The keyword `in`.
+    In,
+    /// The keyword `end`.
+    End,
+    /// The `=` sign.
+    Equals,
+    /// An unsigned decimal integer literal.
+    Integer(u64),
+    /// An identifier that is not one of the keywords.
+    Name(String),
+}
+
+/// A token together with the byte range `start..end` it was read from.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct Token {
+    /// What was recognised.
+    pub kind: TokenKind,
+    /// Byte offset of the token's first byte in the input.
+    pub start: usize,
+    /// Byte offset just past the token's last byte in the input.
+    pub end: usize,
+}
+
+/// Iterator that splits an input string into `Token`s.
+pub struct Lexer<'a> {
+    input: &'a str,
+    /// Byte offset into `input` at which the next token is looked for.
+    position: usize,
+}
+
+/// Converts matched text into a token kind; `None` means the text is not a valid token.
+type Builder = fn(&str) -> Option<TokenKind>;
+
+/// Builder for `EQUALS` matches.
+fn equals_kind(_: &str) -> Option<TokenKind> {
+    Some(TokenKind::Equals)
+}
+
+/// Builder for `INTEGER` matches; `None` if the literal does not fit in a `u64`.
+fn integer_kind(digits: &str) -> Option<TokenKind> {
+    digits.parse::<u64>().ok().map(TokenKind::Integer)
+}
+
+/// Builder for `NAME` matches.
+///
+/// Keywords are recognised here, on the complete word, so that identifiers which
+/// merely start with a keyword (`letter`, `index`, `ending`) stay single names.
+fn word_kind(word: &str) -> Option<TokenKind> {
+    Some(match word {
+        "let" => TokenKind::Let,
+        "in" => TokenKind::In,
+        "end" => TokenKind::End,
+        _ => TokenKind::Name(word.to_owned()),
+    })
+}
+
+lazy_static! {
+    static ref WHITESPACE: Regex = Regex::new(r"^\s+").unwrap();
+    static ref EQUALS: Regex = Regex::new(r"^=").unwrap();
+    static ref INTEGER: Regex = Regex::new(r"^[0-9]+").unwrap();
+    static ref NAME: Regex = Regex::new(r"^[_[:alpha:]][_[:alpha:]0-9]*").unwrap();
+
+    // Token patterns in the order they are tried, each paired with its builder.
+    static ref TOKENS: [(&'static Regex, Builder); 3] = [
+        (&EQUALS, equals_kind),
+        (&INTEGER, integer_kind),
+        (&NAME, word_kind),
+    ];
+}
+
+impl<'a> Lexer<'a> {
+    /// Creates a lexer that starts reading at the beginning of `input`.
+    pub fn new(input: &'a str) -> Lexer<'a> {
+        // blog.matthewcheok.com/writing-a-lexer-in-swift
+        Lexer { input, position: 0 }
+    }
+
+    /// Moves `position` past any whitespace found at the current position.
+    fn skip_whitespace(&mut self) {
+        if let Some(gap) = WHITESPACE.find(&self.input[self.position..]) {
+            self.position += gap.end();
+        }
+    }
+}
+
+impl<'a> Iterator for Lexer<'a> {
+    type Item = Token;
+
+    /// Returns the next token, or `None` once the input is exhausted.
+    ///
+    /// Text that matches no token pattern, and integer literals too large for `u64`,
+    /// also end the stream: `position` jumps to the end of the input and `None` is
+    /// returned from then on.
+    fn next(&mut self) -> Option<Token> {
+        self.skip_whitespace();
+
+        let input = self.input;
+        let start = self.position;
+        let rest = &input[start..];
+        if rest.is_empty() {
+            return None;
+        }
+
+        for &(pattern, build) in TOKENS.iter() {
+            if let Some(found) = pattern.find(rest) {
+                // A match the builder rejects is treated like unrecognised text.
+                if let Some(kind) = build(found.as_str()) {
+                    self.position = start + found.end();
+                    return Some(Token {
+                        kind,
+                        start: start + found.start(),
+                        end: start + found.end(),
+                    });
+                }
+                break;
+            }
+        }
+
+        self.position = input.len();
+        None
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 
0000000..5e071b0
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,15 @@
+//! Demo binary: lexes a hard-coded sample expression and prints every token.
+
+#[macro_use] extern crate lazy_static;
+extern crate regex;
+
+pub mod lexer;
+
+/// Prints a greeting, then the `Debug` form of each token lexed from a sample input.
+fn main() {
+    println!("Hello, world!");
+    let source = "let x = 5 in x end end ";
+    for token in lexer::Lexer::new(source) {
+        println!("{:?}", token);
+    }
+}
-- 
cgit v1.2.3