From 169870428a9074d2429e6ec3dac64e84bb0392e3 Mon Sep 17 00:00:00 2001
From: David Li
Date: Fri, 3 Nov 2017 23:00:54 -0400
Subject: Implement skeleton of lexer

---
 src/lexer.rs | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs  |  13 ++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 src/lexer.rs
 create mode 100644 src/main.rs

(limited to 'src')

diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..dd89fe5
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,101 @@
+use regex::Regex;
+
+#[derive(Clone,Debug)]
+pub enum TokenKind {
+    Let,
+    In,
+    End,
+    Equals,
+    Integer(u64),
+    Name(String),
+}
+
+#[derive(Debug)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub start: usize,
+    pub end: usize,
+}
+
+pub struct Lexer<'a> {
+    input: &'a str,
+    position: usize,
+}
+
+lazy_static! {
+    static ref WHITESPACE: Regex = Regex::new(r"^\s+").unwrap();
+    static ref LET: Regex = Regex::new(r"^let").unwrap();
+    static ref IN: Regex = Regex::new(r"^in").unwrap();
+    static ref END: Regex = Regex::new(r"^end").unwrap();
+    static ref EQUALS: Regex = Regex::new(r"^=").unwrap();
+    static ref INTEGER: Regex = Regex::new(r"^[0-9]+").unwrap();
+    static ref NAME: Regex = Regex::new(r"^[_[:alpha:]][_[:alpha:]0-9]*").unwrap();
+
+    static ref TOKENS: [(&'static Regex, fn(&str) -> TokenKind); 6] = [
+        (&LET, {fn temp(_: &str) -> TokenKind {
+            TokenKind::Let
+        } temp}),
+        (&IN, {fn temp(_: &str) -> TokenKind {
+            TokenKind::In
+        } temp}),
+        (&END, {fn temp(_: &str) -> TokenKind {
+            TokenKind::End
+        } temp}),
+        (&EQUALS, {fn temp(_: &str) -> TokenKind {
+            TokenKind::Equals
+        } temp}),
+        (&INTEGER, {fn temp(num: &str) -> TokenKind {
+            TokenKind::Integer(num.parse::<u64>().unwrap())
+        } temp}),
+        (&NAME, {fn temp(name: &str) -> TokenKind {
+            TokenKind::Name(name.to_owned())
+        } temp}),
+    ];
+}
+
+impl<'a> Lexer<'a> {
+    pub fn new(input: &'a str) -> Lexer<'a> {
+        // blog.matthewcheok.com/writing-a-lexer-in-swift
+        Lexer {
+            input: input,
+            position: 0,
+        }
+    }
+
+    fn skip_whitespace(&mut self) {
+        if let Some(result) = WHITESPACE.find(&self.input[self.position..]) {
+            self.position += result.end();
+        }
+    }
+}
+
+impl<'a> Iterator for Lexer<'a> {
+    type Item = Token;
+
+    fn next(&mut self) -> Option<Token> {
+        if self.position >= self.input.len() {
+            None
+        }
+        else {
+            self.skip_whitespace();
+            if self.position >= self.input.len() {
+                return None;
+            }
+
+            for &(regexp, builder) in TOKENS.iter() {
+                if let Some(result) = regexp.find(&self.input[self.position..]) {
+                    let position = self.position;
+                    self.position += result.end();
+                    return Some(Token {
+                        kind: builder(result.as_str()),
+                        start: position + result.start(),
+                        end: position + result.end(),
+                    });
+                }
+            }
+
+            self.position = self.input.len();
+            None
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..5e071b0
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,13 @@
+#[macro_use] extern crate lazy_static;
+extern crate regex;
+
+pub mod lexer;
+
+fn main() {
+    println!("Hello, world!");
+    let s = "let x = 5 in x end end ";
+    let lex = lexer::Lexer::new(s);
+    for token in lex {
+        println!("{:?}", token);
+    }
+}
--
cgit v1.2.3