//! Lexer for the doot language.

use chumsky::prelude::*;
use ordered_float::OrderedFloat;
use std::fmt;

/// Token types produced by the lexer.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Token {
    // Literals
    Int(i64),
    Float(OrderedFloat<f64>),
    Str(String),
    Bool(bool),

    // Identifiers
    Ident(String),

    // Keywords
    Let,
    Fn,
    AsyncFn,
    If,
    Else,
    Then,
    For,
    In,
    Match,
    Struct,
    Enum,
    Type,
    Import,
    As,
    Dotfile,
    Package,
    Secret,
    Encrypted,
    Hook,
    BeforeDeploy,
    AfterDeploy,
    BeforePackage,
    AfterPackage,
    Macro,
    Await,
    Return,
    When,

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    Eq,
    EqEq,
    NotEq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    And,
    Or,
    Not,
    Pipe,
    DoublePipe,
    DoubleColon,
    Arrow,
    FatArrow,
    Dot,
    DotDot,
    QuestionQuestion,

    // Delimiters
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Colon,
    Semicolon,
    Newline,

    // Special
    Tilde,
    At,
    Hash,
    Bang,
    Indent(usize),
    Dedent,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Int(n) => write!(f, "{}", n),
            Token::Float(n) => write!(f, "{}", n),
            Token::Str(s) => write!(f, "\"{}\"", s),
            Token::Bool(b) => write!(f, "{}", b),
            Token::Ident(s) => write!(f, "{}", s),
            Token::Let => write!(f, "let"),
            Token::Fn => write!(f, "fn"),
            Token::AsyncFn => write!(f, "async fn"),
            Token::If => write!(f, "if"),
            Token::Else => write!(f, "else"),
            Token::Then => write!(f, "then"),
            Token::For => write!(f, "for"),
            Token::In => write!(f, "in"),
            Token::Match => write!(f, "match"),
            Token::Struct => write!(f, "struct"),
            Token::Enum => write!(f, "enum"),
            Token::Type => write!(f, "type"),
            Token::Import => write!(f, "import"),
            Token::As => write!(f, "as"),
            Token::Dotfile => write!(f, "dotfile"),
            Token::Package => write!(f, "package"),
            Token::Secret => write!(f, "secret"),
            Token::Encrypted => write!(f, "encrypted"),
            Token::Hook => write!(f, "hook"),
            Token::BeforeDeploy => write!(f, "before_deploy"),
            Token::AfterDeploy => write!(f, "after_deploy"),
            Token::BeforePackage => write!(f, "before_package"),
            Token::AfterPackage => write!(f, "after_package"),
            Token::Macro => write!(f, "macro"),
            Token::Await => write!(f, "await"),
            Token::Return => write!(f, "return"),
            Token::When => write!(f, "when"),
            Token::Plus => write!(f, "+"),
            Token::Minus => write!(f, "-"),
            Token::Star => write!(f, "*"),
            Token::Slash => write!(f, "/"),
            Token::Percent => write!(f, "%"),
            Token::Eq => write!(f, "="),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Lt => write!(f, "<"),
            Token::Gt => write!(f, ">"),
            Token::LtEq => write!(f, "<="),
            Token::GtEq => write!(f, ">="),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::Not => write!(f, "!"),
            Token::Pipe => write!(f, "|"),
            Token::DoublePipe => write!(f, "||"),
            Token::DoubleColon => write!(f, "::"),
            Token::Arrow => write!(f, "->"),
            Token::FatArrow => write!(f, "=>"),
            Token::Dot => write!(f, "."),
            Token::DotDot => write!(f, ".."),
            Token::QuestionQuestion => write!(f, "??"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::Comma => write!(f, ","),
            Token::Colon => write!(f, ":"),
            Token::Semicolon => write!(f, ";"),
            Token::Newline => write!(f, "\\n"),
            Token::Tilde => write!(f, "~"),
            Token::At => write!(f, "@"),
            Token::Hash => write!(f, "#"),
            Token::Bang => write!(f, "!"),
            // Synthetic layout tokens have no source form; render placeholders.
            Token::Indent(n) => write!(f, "<indent:{}>", n),
            Token::Dedent => write!(f, "<dedent>"),
        }
    }
}
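// Illustrative checks for the `Display` impl above: most tokens render as
// their source text, while the `<indent:n>`/`<dedent>` placeholder forms are
// a debugging convention of this module, not doot syntax.
#[cfg(test)]
mod display_sketch {
    use super::*;

    #[test]
    fn tokens_display_as_source_text() {
        assert_eq!(Token::FatArrow.to_string(), "=>");
        assert_eq!(Token::Str("hi".into()).to_string(), "\"hi\"");
        assert_eq!(Token::Indent(4).to_string(), "<indent:4>");
        assert_eq!(Token::Dedent.to_string(), "<dedent>");
    }
}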
/// Source location range.
pub type Span = std::ops::Range<usize>;

/// Token with source location.
#[derive(Clone, Debug)]
pub struct Spanned<T> {
    pub node: T,
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Creates a new spanned token.
    pub fn new(node: T, span: Span) -> Self {
        Self { node, span }
    }
}
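// A minimal sketch of how `Spanned` values relate back to the input: the
// spans produced by `map_with_span` in `lexer()` below are character offsets,
// so for ASCII source they can slice the original string directly.
#[cfg(test)]
mod span_sketch {
    use super::*;

    #[test]
    fn spans_index_into_ascii_source() {
        let src = "let x";
        let tokens = Lexer::lex(src).unwrap();
        assert_eq!(&src[tokens[0].span.clone()], "let");
        assert_eq!(&src[tokens[1].span.clone()], "x");
    }
}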
/// Tokenizes doot source code.
pub struct Lexer;

impl Lexer {
    /// Returns the token parser combinator.
    pub fn lexer() -> impl chumsky::Parser<char, Vec<Spanned<Token>>, Error = Simple<char>> {
        let octal = just("0o")
            .ignore_then(text::digits(8))
            .map(|s: String| Token::Int(i64::from_str_radix(&s, 8).unwrap_or(0)));

        let hex = just("0x")
            .ignore_then(text::digits(16))
            .map(|s: String| Token::Int(i64::from_str_radix(&s, 16).unwrap_or(0)));

        let decimal = text::int(10).map(|s: String| Token::Int(s.parse().unwrap()));

        // Radix-prefixed forms come before plain decimal so the leading `0`
        // of `0x`/`0o` is not consumed as its own integer.
        let int = octal.or(hex).or(decimal);

        let float = text::int(10).then(just('.').then(text::digits(10))).map(
            |(a, (_, b)): (String, (char, String))| {
                let f: f64 = format!("{}.{}", a, b).parse().unwrap();
                Token::Float(OrderedFloat(f))
            },
        );

        let escape = just('\\').ignore_then(
            just('\\')
                .or(just('/'))
                .or(just('"'))
                .or(just('n').to('\n'))
                .or(just('r').to('\r'))
                .or(just('t').to('\t')),
        );

        let string = just('"')
            .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
            .then_ignore(just('"'))
            .collect::<String>()
            .map(Token::Str);

        // Heredoc: >>>...<<<
        let heredoc = just(">>>")
            .ignore_then(take_until(just("<<<")))
            .map(|(chars, _): (Vec<char>, _)| {
                let s: String = chars.into_iter().collect();
                // Trim a single leading newline if present, so the body can
                // start on the line after `>>>`.
                let s = s.strip_prefix('\n').unwrap_or(&s);
                Token::Str(s.to_string())
            });

        let keyword_or_ident = text::ident().map(|s: String| match s.as_str() {
            "let" => Token::Let,
            "fn" => Token::Fn,
            // `async fn` is assembled later; `async` lexes as an identifier.
            "async" => Token::Ident("async".to_string()),
            "if" => Token::If,
            "else" => Token::Else,
            "then" => Token::Then,
            "for" => Token::For,
            "in" => Token::In,
            "match" => Token::Match,
            "struct" => Token::Struct,
            "enum" => Token::Enum,
            "type" => Token::Type,
            "import" => Token::Import,
            "as" => Token::As,
            "dotfile" => Token::Dotfile,
            "package" => Token::Package,
            "secret" => Token::Secret,
            "encrypted" => Token::Encrypted,
            "hook" => Token::Hook,
            "before_deploy" => Token::BeforeDeploy,
            "after_deploy" => Token::AfterDeploy,
            "before_package" => Token::BeforePackage,
            "after_package" => Token::AfterPackage,
            "macro" => Token::Macro,
            "await" => Token::Await,
            "return" => Token::Return,
            "when" => Token::When,
            "true" => Token::Bool(true),
            "false" => Token::Bool(false),
            _ => Token::Ident(s),
        });

        // Multi-character operators must precede their single-character prefixes.
        let op = choice((
            just("??").to(Token::QuestionQuestion),
            just("=>").to(Token::FatArrow),
            just("->").to(Token::Arrow),
            just("::").to(Token::DoubleColon),
            just("..").to(Token::DotDot),
            just("==").to(Token::EqEq),
            just("!=").to(Token::NotEq),
            just("<=").to(Token::LtEq),
            just(">=").to(Token::GtEq),
            just("&&").to(Token::And),
            just("||").to(Token::Or),
            just('+').to(Token::Plus),
            just('-').to(Token::Minus),
            just('*').to(Token::Star),
            just('/').to(Token::Slash),
            just('%').to(Token::Percent),
            just('=').to(Token::Eq),
            just('<').to(Token::Lt),
            just('>').to(Token::Gt),
            just('!').to(Token::Bang),
            just('|').to(Token::Pipe),
            just('.').to(Token::Dot),
        ));

        let delim = choice((
            just('(').to(Token::LParen),
            just(')').to(Token::RParen),
            just('[').to(Token::LBracket),
            just(']').to(Token::RBracket),
            just('{').to(Token::LBrace),
            just('}').to(Token::RBrace),
            just(',').to(Token::Comma),
            just(':').to(Token::Colon),
            just(';').to(Token::Semicolon),
            just('~').to(Token::Tilde),
            just('@').to(Token::At),
            just('#').to(Token::Hash),
        ));

        let comment = just('#').then(none_of("\n").repeated()).ignored();

        let whitespace = just(' ').or(just('\t')).repeated().at_least(1).ignored();

        let newline = just('\n').to(Token::Newline);

        // `float` is tried before `int` so `3.14` is not split at the dot.
        let token = choice((
            float,
            int,
            heredoc,
            string,
            keyword_or_ident,
            op,
            delim,
            newline,
        ))
        .map_with_span(Spanned::new);

        token
            .padded_by(comment.repeated())
            .padded_by(whitespace.repeated())
            .repeated()
            .then_ignore(end())
    }

    /// Tokenizes the input string with indentation processing.
    #[tracing::instrument(skip_all)]
    pub fn lex(input: &str) -> Result<Vec<Spanned<Token>>, Vec<Simple<char>>> {
        let tokens = Self::lexer().parse(input)?;
        Ok(Self::process_indentation(tokens))
    }

    /// Converts changes in leading whitespace into indent/dedent tokens.
    #[tracing::instrument(level = "trace", skip_all)]
    fn process_indentation(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
        let mut result = Vec::new();
        let mut indent_stack = vec![0usize];
        let mut at_line_start = true;
        let mut line_start_pos = 0;

        for token in tokens {
            match &token.node {
                Token::Newline => {
                    result.push(token.clone());
                    at_line_start = true;
                    line_start_pos = token.span.end;
                }
                _ if at_line_start => {
                    let span_start = token.span.start;
                    // Indentation is the column of the first token on the line.
                    let current_indent = span_start.saturating_sub(line_start_pos);
                    let last_indent = *indent_stack.last().unwrap();

                    if current_indent > last_indent {
                        indent_stack.push(current_indent);
                        result.push(Spanned::new(
                            Token::Indent(current_indent),
                            span_start..span_start,
                        ));
                    } else {
                        while indent_stack.len() > 1
                            && current_indent < *indent_stack.last().unwrap()
                        {
                            indent_stack.pop();
                            result.push(Spanned::new(Token::Dedent, span_start..span_start));
                        }
                    }

                    at_line_start = false;
                    result.push(token);
                }
                _ => {
                    result.push(token);
                }
            }
        }

        // Close any indentation still open at end of input.
        let end = result.last().map(|t| t.span.end).unwrap_or(0);
        while indent_stack.len() > 1 {
            indent_stack.pop();
            result.push(Spanned::new(Token::Dedent, end..end));
        }

        result
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tokens() {
        let input = "let x = 42";
        let tokens = Lexer::lex(input).unwrap();
        assert!(matches!(tokens[0].node, Token::Let));
        assert!(matches!(tokens[1].node, Token::Ident(ref s) if s == "x"));
        assert!(matches!(tokens[2].node, Token::Eq));
        assert!(matches!(tokens[3].node, Token::Int(42)));
    }

    #[test]
    fn test_string_literal() {
        let input = r#""hello world""#;
        let tokens = Lexer::lex(input).unwrap();
        assert!(matches!(tokens[0].node, Token::Str(ref s) if s == "hello world"));
    }

    #[test]
    fn test_operators() {
        let input = "a ?? b => c";
        let tokens = Lexer::lex(input).unwrap();
        assert!(matches!(tokens[1].node, Token::QuestionQuestion));
        assert!(matches!(tokens[3].node, Token::FatArrow));
    }
}
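// A supplementary sketch of the behaviours documented above: heredoc
// newline trimming, radix-prefixed integers, and the indent/dedent synthesis
// in `process_indentation`. Token positions in the assertions assume the
// whitespace/comment padding applied in `lexer()`.
#[cfg(test)]
mod behaviour_sketch {
    use super::*;

    #[test]
    fn heredoc_trims_one_leading_newline() {
        let tokens = Lexer::lex(">>>\nhello<<<").unwrap();
        assert!(matches!(tokens[0].node, Token::Str(ref s) if s == "hello"));
    }

    #[test]
    fn radix_prefixed_integers() {
        let tokens = Lexer::lex("0xff 0o17").unwrap();
        assert!(matches!(tokens[0].node, Token::Int(255)));
        assert!(matches!(tokens[1].node, Token::Int(15)));
    }

    #[test]
    fn indentation_emits_indent_and_dedent() {
        // Expected stream: If, Ident(x), Newline, Indent(2), Ident(y),
        // Newline, Dedent, Ident(z).
        let tokens = Lexer::lex("if x\n  y\nz").unwrap();
        assert!(matches!(tokens[3].node, Token::Indent(2)));
        assert!(matches!(tokens[6].node, Token::Dedent));
    }
}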