427 lines
13 KiB
Rust
427 lines
13 KiB
Rust
//! Lexer for the doot language.
|
|
|
|
use chumsky::prelude::*;
|
|
use ordered_float::OrderedFloat;
|
|
use std::fmt;
|
|
|
|
/// Token types produced by the lexer.
///
/// Payload-carrying variants hold literals and identifiers; all other
/// variants are unit markers for keywords, operators, delimiters, and
/// layout (`Newline`, `Indent`, `Dedent`).
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Token {
    // Literals
    Int(i64),
    /// Floating-point literal; `OrderedFloat` is what lets this enum
    /// derive `Eq` and `Hash` despite holding an `f64`.
    Float(OrderedFloat<f64>),
    /// String literal contents (quotes and escapes already processed).
    Str(String),
    Bool(bool),

    // Identifiers and keywords
    Ident(String),

    // Keywords
    Let,
    Fn,
    /// NOTE(review): never emitted by the lexer itself — "async" lexes as
    /// an identifier; presumably the parser combines `async` + `fn` into
    /// this token. Verify against the parser.
    AsyncFn,
    If,
    Else,
    Then,
    For,
    In,
    Match,
    Struct,
    Enum,
    Type,
    Import,
    As,
    Dotfile,
    Package,
    Secret,
    Encrypted,
    Hook,
    BeforeDeploy,
    AfterDeploy,
    BeforePackage,
    AfterPackage,
    Macro,
    Await,
    Return,
    When,

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    Eq,
    EqEq,
    NotEq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    And,
    Or,
    /// NOTE(review): the lexer maps `!` to `Bang`, so `Not` is never
    /// produced during lexing — confirm which one the parser expects.
    Not,
    Pipe,
    /// NOTE(review): the lexer maps `||` to `Or`, so `DoublePipe` is never
    /// produced during lexing.
    DoublePipe,
    DoubleColon,
    Arrow,
    FatArrow,
    Dot,
    DotDot,
    QuestionQuestion,

    // Delimiters
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Colon,
    Semicolon,
    /// Newlines are significant: they drive indentation processing.
    Newline,

    // Special
    Tilde,
    At,
    /// NOTE(review): `#` also starts a line comment, which the lexer strips
    /// as padding, so a bare `Hash` token appears to be unreachable.
    Hash,
    Bang,
    /// Indentation increased; payload is the new indent width in columns.
    Indent(usize),
    /// Indentation decreased by one stack level.
    Dedent,
}
|
|
|
|
impl fmt::Display for Token {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
match self {
|
|
Token::Int(n) => write!(f, "{}", n),
|
|
Token::Float(n) => write!(f, "{}", n),
|
|
Token::Str(s) => write!(f, "\"{}\"", s),
|
|
Token::Bool(b) => write!(f, "{}", b),
|
|
Token::Ident(s) => write!(f, "{}", s),
|
|
Token::Let => write!(f, "let"),
|
|
Token::Fn => write!(f, "fn"),
|
|
Token::AsyncFn => write!(f, "async fn"),
|
|
Token::If => write!(f, "if"),
|
|
Token::Else => write!(f, "else"),
|
|
Token::Then => write!(f, "then"),
|
|
Token::For => write!(f, "for"),
|
|
Token::In => write!(f, "in"),
|
|
Token::Match => write!(f, "match"),
|
|
Token::Struct => write!(f, "struct"),
|
|
Token::Enum => write!(f, "enum"),
|
|
Token::Type => write!(f, "type"),
|
|
Token::Import => write!(f, "import"),
|
|
Token::As => write!(f, "as"),
|
|
Token::Dotfile => write!(f, "dotfile"),
|
|
Token::Package => write!(f, "package"),
|
|
Token::Secret => write!(f, "secret"),
|
|
Token::Encrypted => write!(f, "encrypted"),
|
|
Token::Hook => write!(f, "hook"),
|
|
Token::BeforeDeploy => write!(f, "before_deploy"),
|
|
Token::AfterDeploy => write!(f, "after_deploy"),
|
|
Token::BeforePackage => write!(f, "before_package"),
|
|
Token::AfterPackage => write!(f, "after_package"),
|
|
Token::Macro => write!(f, "macro"),
|
|
Token::Await => write!(f, "await"),
|
|
Token::Return => write!(f, "return"),
|
|
Token::When => write!(f, "when"),
|
|
Token::Plus => write!(f, "+"),
|
|
Token::Minus => write!(f, "-"),
|
|
Token::Star => write!(f, "*"),
|
|
Token::Slash => write!(f, "/"),
|
|
Token::Percent => write!(f, "%"),
|
|
Token::Eq => write!(f, "="),
|
|
Token::EqEq => write!(f, "=="),
|
|
Token::NotEq => write!(f, "!="),
|
|
Token::Lt => write!(f, "<"),
|
|
Token::Gt => write!(f, ">"),
|
|
Token::LtEq => write!(f, "<="),
|
|
Token::GtEq => write!(f, ">="),
|
|
Token::And => write!(f, "&&"),
|
|
Token::Or => write!(f, "||"),
|
|
Token::Not => write!(f, "!"),
|
|
Token::Pipe => write!(f, "|"),
|
|
Token::DoublePipe => write!(f, "||"),
|
|
Token::DoubleColon => write!(f, "::"),
|
|
Token::Arrow => write!(f, "->"),
|
|
Token::FatArrow => write!(f, "=>"),
|
|
Token::Dot => write!(f, "."),
|
|
Token::DotDot => write!(f, ".."),
|
|
Token::QuestionQuestion => write!(f, "??"),
|
|
Token::LParen => write!(f, "("),
|
|
Token::RParen => write!(f, ")"),
|
|
Token::LBracket => write!(f, "["),
|
|
Token::RBracket => write!(f, "]"),
|
|
Token::LBrace => write!(f, "{{"),
|
|
Token::RBrace => write!(f, "}}"),
|
|
Token::Comma => write!(f, ","),
|
|
Token::Colon => write!(f, ":"),
|
|
Token::Semicolon => write!(f, ";"),
|
|
Token::Newline => write!(f, "\\n"),
|
|
Token::Tilde => write!(f, "~"),
|
|
Token::At => write!(f, "@"),
|
|
Token::Hash => write!(f, "#"),
|
|
Token::Bang => write!(f, "!"),
|
|
Token::Indent(n) => write!(f, "<indent {}>", n),
|
|
Token::Dedent => write!(f, "<dedent>"),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Source location range: a half-open `start..end` offset range into the
/// lexed input (offsets as produced by the chumsky parser — presumably
/// character indices; TODO confirm char vs. byte offsets).
pub type Span = std::ops::Range<usize>;
|
|
|
|
/// Token with source location.
///
/// Generic so it can wrap either raw tokens or later AST nodes.
#[derive(Clone, Debug)]
pub struct Spanned<T> {
    /// The wrapped value (typically a [`Token`]).
    pub node: T,
    /// Where in the input `node` was found.
    pub span: Span,
}
|
|
|
|
impl<T> Spanned<T> {
|
|
/// Creates a new spanned token.
|
|
pub fn new(node: T, span: Span) -> Self {
|
|
Self { node, span }
|
|
}
|
|
}
|
|
|
|
/// Tokenizes doot source code.
///
/// Stateless namespace type: all functionality lives in associated
/// functions (see `Lexer::lex`).
pub struct Lexer;
|
|
|
|
impl Lexer {
|
|
/// Returns the token parser combinator.
|
|
pub fn lexer() -> impl chumsky::Parser<char, Vec<Spanned<Token>>, Error = Simple<char>> {
|
|
let octal = just("0o")
|
|
.ignore_then(text::digits(8))
|
|
.map(|s: String| Token::Int(i64::from_str_radix(&s, 8).unwrap_or(0)));
|
|
|
|
let hex = just("0x")
|
|
.ignore_then(text::digits(16))
|
|
.map(|s: String| Token::Int(i64::from_str_radix(&s, 16).unwrap_or(0)));
|
|
|
|
let decimal = text::int(10).map(|s: String| Token::Int(s.parse().unwrap()));
|
|
|
|
let int = octal.or(hex).or(decimal);
|
|
|
|
let float = text::int(10).then(just('.').then(text::digits(10))).map(
|
|
|(a, (_, b)): (String, (char, String))| {
|
|
let f: f64 = format!("{}.{}", a, b).parse().unwrap();
|
|
Token::Float(OrderedFloat(f))
|
|
},
|
|
);
|
|
|
|
let escape = just('\\').ignore_then(
|
|
just('\\')
|
|
.or(just('/'))
|
|
.or(just('"'))
|
|
.or(just('n').to('\n'))
|
|
.or(just('r').to('\r'))
|
|
.or(just('t').to('\t')),
|
|
);
|
|
|
|
let string = just('"')
|
|
.ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
|
|
.then_ignore(just('"'))
|
|
.collect::<String>()
|
|
.map(Token::Str);
|
|
|
|
// Heredoc: >>>...<<<
|
|
let heredoc =
|
|
just(">>>")
|
|
.ignore_then(take_until(just("<<<")))
|
|
.map(|(chars, _): (Vec<char>, _)| {
|
|
let s: String = chars.into_iter().collect();
|
|
// Trim leading newline if present
|
|
let s = s.strip_prefix('\n').unwrap_or(&s);
|
|
Token::Str(s.to_string())
|
|
});
|
|
|
|
let keyword_or_ident = text::ident().map(|s: String| match s.as_str() {
|
|
"let" => Token::Let,
|
|
"fn" => Token::Fn,
|
|
"async" => Token::Ident("async".to_string()),
|
|
"if" => Token::If,
|
|
"else" => Token::Else,
|
|
"then" => Token::Then,
|
|
"for" => Token::For,
|
|
"in" => Token::In,
|
|
"match" => Token::Match,
|
|
"struct" => Token::Struct,
|
|
"enum" => Token::Enum,
|
|
"type" => Token::Type,
|
|
"import" => Token::Import,
|
|
"as" => Token::As,
|
|
"dotfile" => Token::Dotfile,
|
|
"package" => Token::Package,
|
|
"secret" => Token::Secret,
|
|
"encrypted" => Token::Encrypted,
|
|
"hook" => Token::Hook,
|
|
"before_deploy" => Token::BeforeDeploy,
|
|
"after_deploy" => Token::AfterDeploy,
|
|
"before_package" => Token::BeforePackage,
|
|
"after_package" => Token::AfterPackage,
|
|
"macro" => Token::Macro,
|
|
"await" => Token::Await,
|
|
"return" => Token::Return,
|
|
"when" => Token::When,
|
|
"true" => Token::Bool(true),
|
|
"false" => Token::Bool(false),
|
|
_ => Token::Ident(s),
|
|
});
|
|
|
|
let op = choice((
|
|
just("??").to(Token::QuestionQuestion),
|
|
just("=>").to(Token::FatArrow),
|
|
just("->").to(Token::Arrow),
|
|
just("::").to(Token::DoubleColon),
|
|
just("..").to(Token::DotDot),
|
|
just("==").to(Token::EqEq),
|
|
just("!=").to(Token::NotEq),
|
|
just("<=").to(Token::LtEq),
|
|
just(">=").to(Token::GtEq),
|
|
just("&&").to(Token::And),
|
|
just("||").to(Token::Or),
|
|
just('+').to(Token::Plus),
|
|
just('-').to(Token::Minus),
|
|
just('*').to(Token::Star),
|
|
just('/').to(Token::Slash),
|
|
just('%').to(Token::Percent),
|
|
just('=').to(Token::Eq),
|
|
just('<').to(Token::Lt),
|
|
just('>').to(Token::Gt),
|
|
just('!').to(Token::Bang),
|
|
just('|').to(Token::Pipe),
|
|
just('.').to(Token::Dot),
|
|
));
|
|
|
|
let delim = choice((
|
|
just('(').to(Token::LParen),
|
|
just(')').to(Token::RParen),
|
|
just('[').to(Token::LBracket),
|
|
just(']').to(Token::RBracket),
|
|
just('{').to(Token::LBrace),
|
|
just('}').to(Token::RBrace),
|
|
just(',').to(Token::Comma),
|
|
just(':').to(Token::Colon),
|
|
just(';').to(Token::Semicolon),
|
|
just('~').to(Token::Tilde),
|
|
just('@').to(Token::At),
|
|
just('#').to(Token::Hash),
|
|
));
|
|
|
|
let comment = just('#').then(none_of("\n").repeated()).ignored();
|
|
|
|
let whitespace = just(' ').or(just('\t')).repeated().at_least(1).ignored();
|
|
|
|
let newline = just('\n').to(Token::Newline);
|
|
|
|
let token = choice((
|
|
float,
|
|
int,
|
|
heredoc,
|
|
string,
|
|
keyword_or_ident,
|
|
op,
|
|
delim,
|
|
newline,
|
|
))
|
|
.map_with_span(Spanned::new);
|
|
|
|
token
|
|
.padded_by(comment.repeated())
|
|
.padded_by(whitespace.repeated())
|
|
.repeated()
|
|
.then_ignore(end())
|
|
}
|
|
|
|
/// Tokenizes the input string with indentation processing.
|
|
#[tracing::instrument(skip_all)]
|
|
pub fn lex(input: &str) -> Result<Vec<Spanned<Token>>, Vec<Simple<char>>> {
|
|
let tokens = Self::lexer().parse(input)?;
|
|
Ok(Self::process_indentation(tokens))
|
|
}
|
|
|
|
/// Converts whitespace into indent/dedent tokens.
|
|
#[tracing::instrument(level = "trace", skip_all)]
|
|
fn process_indentation(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
|
|
let mut result = Vec::new();
|
|
let mut indent_stack = vec![0usize];
|
|
let mut at_line_start = true;
|
|
let mut line_start_pos = 0;
|
|
|
|
for token in tokens {
|
|
match &token.node {
|
|
Token::Newline => {
|
|
result.push(token.clone());
|
|
at_line_start = true;
|
|
line_start_pos = token.span.end;
|
|
}
|
|
_ if at_line_start => {
|
|
let span_start = token.span.start;
|
|
let current_indent = span_start.saturating_sub(line_start_pos);
|
|
let last_indent = *indent_stack.last().unwrap();
|
|
|
|
if current_indent > last_indent {
|
|
indent_stack.push(current_indent);
|
|
result.push(Spanned::new(
|
|
Token::Indent(current_indent),
|
|
span_start..span_start,
|
|
));
|
|
} else {
|
|
while indent_stack.len() > 1
|
|
&& current_indent < *indent_stack.last().unwrap()
|
|
{
|
|
indent_stack.pop();
|
|
result.push(Spanned::new(Token::Dedent, span_start..span_start));
|
|
}
|
|
}
|
|
|
|
at_line_start = false;
|
|
result.push(token);
|
|
}
|
|
_ => {
|
|
result.push(token);
|
|
}
|
|
}
|
|
}
|
|
|
|
let end = result.last().map(|t| t.span.end).unwrap_or(0);
|
|
while indent_stack.len() > 1 {
|
|
indent_stack.pop();
|
|
result.push(Spanned::new(Token::Dedent, end..end));
|
|
}
|
|
|
|
result
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A `let` binding lexes into keyword, identifier, `=`, and integer.
    #[test]
    fn test_basic_tokens() {
        let toks = Lexer::lex("let x = 42").unwrap();
        assert_eq!(toks[0].node, Token::Let);
        assert_eq!(toks[1].node, Token::Ident("x".to_string()));
        assert_eq!(toks[2].node, Token::Eq);
        assert_eq!(toks[3].node, Token::Int(42));
    }

    /// Quotes are stripped from string literals.
    #[test]
    fn test_string_literal() {
        let toks = Lexer::lex(r#""hello world""#).unwrap();
        assert_eq!(toks[0].node, Token::Str("hello world".to_string()));
    }

    /// Two-character operators lex as single tokens.
    #[test]
    fn test_operators() {
        let toks = Lexer::lex("a ?? b => c").unwrap();
        assert_eq!(toks[1].node, Token::QuestionQuestion);
        assert_eq!(toks[3].node, Token::FatArrow);
    }
}
|