// doot/crates/doot-lang/src/lexer.rs
//! Lexer for the doot language.
use chumsky::prelude::*;
use ordered_float::OrderedFloat;
use std::fmt;
/// Token types produced by the lexer.
///
/// Floats are wrapped in `OrderedFloat` so the enum can derive `Eq` and
/// `Hash` (needed for use as a parser input symbol).
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Token {
    // Literals
    Int(i64),
    Float(OrderedFloat<f64>),
    Str(String),
    Bool(bool),
    // Identifiers and keywords
    Ident(String),
    // Keywords
    Let,
    Fn,
    // NOTE(review): never produced by the lexer itself — "async" lexes as a
    // plain ident; presumably the parser pairs it with `fn`. Confirm.
    AsyncFn,
    If,
    Else,
    Then,
    For,
    In,
    Match,
    Struct,
    Enum,
    Type,
    Import,
    As,
    Dotfile,
    Package,
    Secret,
    Encrypted,
    Hook,
    // Hook phases (lexed from `before_deploy`, `after_deploy`, ...).
    BeforeDeploy,
    AfterDeploy,
    BeforePackage,
    AfterPackage,
    Macro,
    Await,
    Return,
    When,
    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    Eq,
    EqEq,
    NotEq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    And,
    Or,
    // NOTE(review): `Not` and `DoublePipe` are not emitted by the lexer
    // (`!` lexes as `Bang`, `||` as `Or`) — confirm they are still needed.
    Not,
    Pipe,
    DoublePipe,
    DoubleColon,
    Arrow,
    FatArrow,
    Dot,
    DotDot,
    QuestionQuestion,
    // Delimiters
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Colon,
    Semicolon,
    // Significant: newlines separate statements and drive indentation.
    Newline,
    // Special
    Tilde,
    At,
    // NOTE(review): unreachable from the lexer — `#` is consumed as a line
    // comment before the delimiter parser runs. Confirm intended.
    Hash,
    Bang,
    // Synthesized by `Lexer::process_indentation`, not by the char parser.
    Indent(usize),
    Dedent,
}
impl fmt::Display for Token {
    /// Renders the token as it would appear in source: string literals are
    /// re-quoted, `Newline` prints as the two characters `\n`, and the
    /// synthetic `Indent`/`Dedent` tokens use readable placeholders.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Variants carrying data are formatted directly and return early;
        // every other variant maps to a fixed string written at the end.
        let text = match self {
            Token::Int(n) => return write!(f, "{}", n),
            Token::Float(n) => return write!(f, "{}", n),
            Token::Str(s) => return write!(f, "\"{}\"", s),
            Token::Bool(b) => return write!(f, "{}", b),
            Token::Ident(s) => return f.write_str(s),
            Token::Indent(n) => return write!(f, "<indent {}>", n),
            Token::Let => "let",
            Token::Fn => "fn",
            Token::AsyncFn => "async fn",
            Token::If => "if",
            Token::Else => "else",
            Token::Then => "then",
            Token::For => "for",
            Token::In => "in",
            Token::Match => "match",
            Token::Struct => "struct",
            Token::Enum => "enum",
            Token::Type => "type",
            Token::Import => "import",
            Token::As => "as",
            Token::Dotfile => "dotfile",
            Token::Package => "package",
            Token::Secret => "secret",
            Token::Encrypted => "encrypted",
            Token::Hook => "hook",
            Token::BeforeDeploy => "before_deploy",
            Token::AfterDeploy => "after_deploy",
            Token::BeforePackage => "before_package",
            Token::AfterPackage => "after_package",
            Token::Macro => "macro",
            Token::Await => "await",
            Token::Return => "return",
            Token::When => "when",
            Token::Plus => "+",
            Token::Minus => "-",
            Token::Star => "*",
            Token::Slash => "/",
            Token::Percent => "%",
            Token::Eq => "=",
            Token::EqEq => "==",
            Token::NotEq => "!=",
            Token::Lt => "<",
            Token::Gt => ">",
            Token::LtEq => "<=",
            Token::GtEq => ">=",
            Token::And => "&&",
            Token::Or => "||",
            Token::Not => "!",
            Token::Pipe => "|",
            Token::DoublePipe => "||",
            Token::DoubleColon => "::",
            Token::Arrow => "->",
            Token::FatArrow => "=>",
            Token::Dot => ".",
            Token::DotDot => "..",
            Token::QuestionQuestion => "??",
            Token::LParen => "(",
            Token::RParen => ")",
            Token::LBracket => "[",
            Token::RBracket => "]",
            Token::LBrace => "{",
            Token::RBrace => "}",
            Token::Comma => ",",
            Token::Colon => ":",
            Token::Semicolon => ";",
            Token::Newline => "\\n",
            Token::Tilde => "~",
            Token::At => "@",
            Token::Hash => "#",
            Token::Bang => "!",
            Token::Dedent => "<dedent>",
        };
        f.write_str(text)
    }
}
/// Source location range.
///
/// Half-open byte-offset range (`start..end`) into the original source text.
pub type Span = std::ops::Range<usize>;
/// Token with source location.
#[derive(Clone, Debug)]
pub struct Spanned<T> {
    /// The wrapped value (typically a [`Token`]).
    pub node: T,
    /// Where `node` appeared in the source.
    pub span: Span,
}
impl<T> Spanned<T> {
    /// Creates a new spanned token.
    pub fn new(node: T, span: Span) -> Self {
        Self { node, span }
    }
}
/// Tokenizes doot source code.
pub struct Lexer;
impl Lexer {
/// Returns the token parser combinator.
pub fn lexer() -> impl chumsky::Parser<char, Vec<Spanned<Token>>, Error = Simple<char>> {
let octal = just("0o")
.ignore_then(text::digits(8))
.map(|s: String| Token::Int(i64::from_str_radix(&s, 8).unwrap_or(0)));
let hex = just("0x")
.ignore_then(text::digits(16))
.map(|s: String| Token::Int(i64::from_str_radix(&s, 16).unwrap_or(0)));
let decimal = text::int(10).map(|s: String| Token::Int(s.parse().unwrap()));
let int = octal.or(hex).or(decimal);
let float = text::int(10).then(just('.').then(text::digits(10))).map(
|(a, (_, b)): (String, (char, String))| {
let f: f64 = format!("{}.{}", a, b).parse().unwrap();
Token::Float(OrderedFloat(f))
},
);
let escape = just('\\').ignore_then(
just('\\')
.or(just('/'))
.or(just('"'))
.or(just('n').to('\n'))
.or(just('r').to('\r'))
.or(just('t').to('\t')),
);
let string = just('"')
.ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
.then_ignore(just('"'))
.collect::<String>()
.map(Token::Str);
// Heredoc: >>>...<<<
let heredoc =
just(">>>")
.ignore_then(take_until(just("<<<")))
.map(|(chars, _): (Vec<char>, _)| {
let s: String = chars.into_iter().collect();
// Trim leading newline if present
let s = s.strip_prefix('\n').unwrap_or(&s);
Token::Str(s.to_string())
});
let keyword_or_ident = text::ident().map(|s: String| match s.as_str() {
"let" => Token::Let,
"fn" => Token::Fn,
"async" => Token::Ident("async".to_string()),
"if" => Token::If,
"else" => Token::Else,
"then" => Token::Then,
"for" => Token::For,
"in" => Token::In,
"match" => Token::Match,
"struct" => Token::Struct,
"enum" => Token::Enum,
"type" => Token::Type,
"import" => Token::Import,
"as" => Token::As,
"dotfile" => Token::Dotfile,
"package" => Token::Package,
"secret" => Token::Secret,
"encrypted" => Token::Encrypted,
"hook" => Token::Hook,
"before_deploy" => Token::BeforeDeploy,
"after_deploy" => Token::AfterDeploy,
"before_package" => Token::BeforePackage,
"after_package" => Token::AfterPackage,
"macro" => Token::Macro,
"await" => Token::Await,
"return" => Token::Return,
"when" => Token::When,
"true" => Token::Bool(true),
"false" => Token::Bool(false),
_ => Token::Ident(s),
});
let op = choice((
just("??").to(Token::QuestionQuestion),
just("=>").to(Token::FatArrow),
just("->").to(Token::Arrow),
just("::").to(Token::DoubleColon),
just("..").to(Token::DotDot),
just("==").to(Token::EqEq),
just("!=").to(Token::NotEq),
just("<=").to(Token::LtEq),
just(">=").to(Token::GtEq),
just("&&").to(Token::And),
just("||").to(Token::Or),
just('+').to(Token::Plus),
just('-').to(Token::Minus),
just('*').to(Token::Star),
just('/').to(Token::Slash),
just('%').to(Token::Percent),
just('=').to(Token::Eq),
just('<').to(Token::Lt),
just('>').to(Token::Gt),
just('!').to(Token::Bang),
just('|').to(Token::Pipe),
just('.').to(Token::Dot),
));
let delim = choice((
just('(').to(Token::LParen),
just(')').to(Token::RParen),
just('[').to(Token::LBracket),
just(']').to(Token::RBracket),
just('{').to(Token::LBrace),
just('}').to(Token::RBrace),
just(',').to(Token::Comma),
just(':').to(Token::Colon),
just(';').to(Token::Semicolon),
just('~').to(Token::Tilde),
just('@').to(Token::At),
just('#').to(Token::Hash),
));
let comment = just('#').then(none_of("\n").repeated()).ignored();
let whitespace = just(' ').or(just('\t')).repeated().at_least(1).ignored();
let newline = just('\n').to(Token::Newline);
let token = choice((
float,
int,
heredoc,
string,
keyword_or_ident,
op,
delim,
newline,
))
.map_with_span(Spanned::new);
token
.padded_by(comment.repeated())
.padded_by(whitespace.repeated())
.repeated()
.then_ignore(end())
}
/// Tokenizes the input string with indentation processing.
#[tracing::instrument(skip_all)]
pub fn lex(input: &str) -> Result<Vec<Spanned<Token>>, Vec<Simple<char>>> {
let tokens = Self::lexer().parse(input)?;
Ok(Self::process_indentation(tokens))
}
/// Converts whitespace into indent/dedent tokens.
#[tracing::instrument(level = "trace", skip_all)]
fn process_indentation(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
let mut result = Vec::new();
let mut indent_stack = vec![0usize];
let mut at_line_start = true;
let mut line_start_pos = 0;
for token in tokens {
match &token.node {
Token::Newline => {
result.push(token.clone());
at_line_start = true;
line_start_pos = token.span.end;
}
_ if at_line_start => {
let span_start = token.span.start;
let current_indent = span_start.saturating_sub(line_start_pos);
let last_indent = *indent_stack.last().unwrap();
if current_indent > last_indent {
indent_stack.push(current_indent);
result.push(Spanned::new(
Token::Indent(current_indent),
span_start..span_start,
));
} else {
while indent_stack.len() > 1
&& current_indent < *indent_stack.last().unwrap()
{
indent_stack.pop();
result.push(Spanned::new(Token::Dedent, span_start..span_start));
}
}
at_line_start = false;
result.push(token);
}
_ => {
result.push(token);
}
}
}
let end = result.last().map(|t| t.span.end).unwrap_or(0);
while indent_stack.len() > 1 {
indent_stack.pop();
result.push(Spanned::new(Token::Dedent, end..end));
}
result
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input`, panicking on failure, and strips spans for easy
    /// comparison against expected token kinds.
    fn kinds(input: &str) -> Vec<Token> {
        Lexer::lex(input)
            .unwrap()
            .into_iter()
            .map(|spanned| spanned.node)
            .collect()
    }

    #[test]
    fn test_basic_tokens() {
        let toks = kinds("let x = 42");
        assert_eq!(toks[0], Token::Let);
        assert_eq!(toks[1], Token::Ident("x".into()));
        assert_eq!(toks[2], Token::Eq);
        assert_eq!(toks[3], Token::Int(42));
    }

    #[test]
    fn test_string_literal() {
        let toks = kinds(r#""hello world""#);
        assert_eq!(toks[0], Token::Str("hello world".into()));
    }

    #[test]
    fn test_operators() {
        let toks = kinds("a ?? b => c");
        assert_eq!(toks[1], Token::QuestionQuestion);
        assert_eq!(toks[3], Token::FatArrow);
    }
}
}