Initial commit
This commit is contained in:
6
aurac_lexer/Cargo.toml
Normal file
6
aurac_lexer/Cargo.toml
Normal file
@@ -0,0 +1,6 @@
|
||||
[package]
name = "aurac_lexer"
version = "0.1.0"
edition = "2021"

[dependencies]
|
||||
203
aurac_lexer/src/lexer.rs
Normal file
203
aurac_lexer/src/lexer.rs
Normal file
@@ -0,0 +1,203 @@
|
||||
use crate::token::{Span, Token, TokenKind};
|
||||
use std::str::CharIndices;
|
||||
|
||||
/// Hand-written lexer for `aurac` source text. Produces `Token`s on demand
/// via `next_token`, including synthetic Indent/Dedent tokens derived from
/// leading spaces (significant whitespace).
pub struct Lexer<'a> {
    // Full source text; token payloads (idents, numbers) borrow from it.
    input: &'a str,
    // Iterator over (byte offset, char) pairs of `input`.
    chars: CharIndices<'a>,
    // One-character lookahead: the next unconsumed (offset, char).
    current: Option<(usize, char)>,
    // 1-based line of `current`.
    line: usize,
    // 1-based column of `current`; column 1 marks the start of a line.
    column: usize,
    // Stack of active indentation widths; starts as [0] (base level).
    indent_stack: Vec<usize>,
    // Dedent tokens still owed after a multi-level indentation drop.
    pending_dedents: usize,
    // Set once an Eof token has been produced.
    emitted_eof: bool,
}
|
||||
|
||||
impl<'a> Lexer<'a> {
|
||||
pub fn new(input: &'a str) -> Self {
|
||||
let mut chars = input.char_indices();
|
||||
let current = chars.next();
|
||||
Self {
|
||||
input,
|
||||
chars,
|
||||
current,
|
||||
line: 1,
|
||||
column: 1,
|
||||
indent_stack: vec![0], // base level indentation
|
||||
pending_dedents: 0,
|
||||
emitted_eof: false,
|
||||
}
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> Option<(usize, char)> {
|
||||
let result = self.current;
|
||||
if let Some((_, c)) = result {
|
||||
if c == '\n' {
|
||||
self.line += 1;
|
||||
self.column = 1;
|
||||
} else {
|
||||
self.column += 1;
|
||||
}
|
||||
}
|
||||
self.current = self.chars.next();
|
||||
result
|
||||
}
|
||||
|
||||
fn peek(&self) -> Option<(usize, char)> {
|
||||
self.current
|
||||
}
|
||||
|
||||
pub fn next_token(&mut self) -> Token<'a> {
|
||||
if self.pending_dedents > 0 {
|
||||
self.pending_dedents -= 1;
|
||||
return Token::new(
|
||||
TokenKind::Dedent,
|
||||
Span { line: self.line, column: 1, offset: self.current.map(|(o,_)| o).unwrap_or(self.input.len()), len: 0 }
|
||||
);
|
||||
}
|
||||
|
||||
if let Some((offset, c)) = self.current {
|
||||
// Indentation mapping at the start of lines
|
||||
if self.column == 1 && c != '\n' && c != '\r' {
|
||||
let mut spaces = 0;
|
||||
while let Some((_, pc)) = self.current {
|
||||
if pc == ' ' {
|
||||
spaces += 1;
|
||||
self.advance();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if self.current.map_or(false, |(_, c)| c == '\n' || c == '\r') {
|
||||
// Empty/whitespace-only line: proceed to standard token matching
|
||||
// which will hit the '\n' matcher below.
|
||||
} else {
|
||||
let current_indent = *self.indent_stack.last().unwrap_or(&0);
|
||||
|
||||
if spaces > current_indent {
|
||||
self.indent_stack.push(spaces);
|
||||
return Token::new(
|
||||
TokenKind::Indent,
|
||||
Span { line: self.line, column: 1, offset, len: spaces }
|
||||
);
|
||||
} else if spaces < current_indent {
|
||||
let mut dedents = 0;
|
||||
while let Some(&last) = self.indent_stack.last() {
|
||||
if last > spaces {
|
||||
self.indent_stack.pop();
|
||||
dedents += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if dedents > 0 {
|
||||
self.pending_dedents = dedents - 1;
|
||||
return Token::new(
|
||||
TokenKind::Dedent,
|
||||
Span { line: self.line, column: 1, offset, len: spaces }
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normal matching path
|
||||
let (start_offset, c) = self.advance().unwrap();
|
||||
let start_col = self.column - 1;
|
||||
|
||||
match c {
|
||||
' ' | '\r' => self.next_token(),
|
||||
'\n' => Token::new(TokenKind::Newline, Span { line: self.line - 1, column: start_col, offset: start_offset, len: 1 }),
|
||||
':' => {
|
||||
if self.peek().map(|(_, pc)| pc) == Some(':') {
|
||||
let _ = self.advance();
|
||||
Token::new(TokenKind::DoubleColon, Span { line: self.line, column: start_col, offset: start_offset, len: 2 })
|
||||
} else {
|
||||
Token::new(TokenKind::Colon, Span { line: self.line, column: start_col, offset: start_offset, len: 1 })
|
||||
}
|
||||
}
|
||||
'-' => {
|
||||
if self.peek().map(|(_, pc)| pc) == Some('>') {
|
||||
let _ = self.advance();
|
||||
Token::new(TokenKind::Arrow, Span { line: self.line, column: start_col, offset: start_offset, len: 2 })
|
||||
} else {
|
||||
Token::new(TokenKind::Minus, Span { line: self.line, column: start_col, offset: start_offset, len: 1 })
|
||||
}
|
||||
}
|
||||
'+' => Token::new(TokenKind::Plus, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'*' => Token::new(TokenKind::Star, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'/' => Token::new(TokenKind::Slash, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'(' => Token::new(TokenKind::OpenParen, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
')' => Token::new(TokenKind::CloseParen, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'{' => Token::new(TokenKind::OpenBrace, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'}' => Token::new(TokenKind::CloseBrace, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
',' => Token::new(TokenKind::Comma, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'=' => {
|
||||
if self.peek().map(|(_, pc)| pc) == Some('=') {
|
||||
let _ = self.advance();
|
||||
Token::new(TokenKind::EqualEqual, Span { line: self.line, column: start_col, offset: start_offset, len: 2 })
|
||||
} else {
|
||||
Token::new(TokenKind::Equal, Span { line: self.line, column: start_col, offset: start_offset, len: 1 })
|
||||
}
|
||||
}
|
||||
'|' => Token::new(TokenKind::Pipe, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'>' => Token::new(TokenKind::Greater, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
'<' => Token::new(TokenKind::Less, Span { line: self.line, column: start_col, offset: start_offset, len: 1 }),
|
||||
_ if c.is_alphabetic() => {
|
||||
while let Some((_, pc)) = self.peek() {
|
||||
if pc.is_alphanumeric() || pc == '_' {
|
||||
self.advance();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let end_offset = self.current.map(|(o, _)| o).unwrap_or(self.input.len());
|
||||
let ident_str = &self.input[start_offset..end_offset];
|
||||
|
||||
let kind = match ident_str {
|
||||
"struct" => TokenKind::Struct,
|
||||
"fn" => TokenKind::Fn,
|
||||
"pure" => TokenKind::Pure,
|
||||
"actor" => TokenKind::Actor,
|
||||
"let" => TokenKind::Let,
|
||||
"if" => TokenKind::If,
|
||||
"else" => TokenKind::Else,
|
||||
"match" => TokenKind::Match,
|
||||
"return" => TokenKind::Return,
|
||||
"type" => TokenKind::Type,
|
||||
"gpu" => TokenKind::Gpu,
|
||||
"i8" | "i16" | "i32" | "i64" | "u8" | "u16" | "u32" | "u64" | "f32" | "f64" | "bool" | "str" => TokenKind::BaseType(ident_str),
|
||||
_ => TokenKind::Ident(ident_str),
|
||||
};
|
||||
Token::new(kind, Span { line: self.line, column: start_col, offset: start_offset, len: end_offset - start_offset })
|
||||
}
|
||||
_ if c.is_ascii_digit() => {
|
||||
while let Some((_, pc)) = self.peek() {
|
||||
if pc.is_ascii_digit() || pc == '.' {
|
||||
self.advance();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let end_offset = self.current.map(|(o, _)| o).unwrap_or(self.input.len());
|
||||
let num_str = &self.input[start_offset..end_offset];
|
||||
Token::new(TokenKind::Number(num_str), Span { line: self.line, column: start_col, offset: start_offset, len: end_offset - start_offset })
|
||||
}
|
||||
_ => Token::new(TokenKind::Error(c), Span { line: self.line, column: start_col, offset: start_offset, len: c.len_utf8() }),
|
||||
}
|
||||
} else {
|
||||
if self.indent_stack.len() > 1 {
|
||||
self.indent_stack.pop();
|
||||
Token::new(TokenKind::Dedent, Span { line: self.line, column: self.column, offset: self.input.len(), len: 0 })
|
||||
} else if !self.emitted_eof {
|
||||
self.emitted_eof = true;
|
||||
Token::new(TokenKind::Eof, Span { line: self.line, column: self.column, offset: self.input.len(), len: 0 })
|
||||
} else {
|
||||
Token::new(TokenKind::Eof, Span { line: self.line, column: self.column, offset: self.input.len(), len: 0 })
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
43
aurac_lexer/src/lib.rs
Normal file
43
aurac_lexer/src/lib.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
pub mod lexer;
|
||||
pub mod token;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::lexer::Lexer;
    use super::token::TokenKind::*;

    /// Lexes an indented struct declaration and verifies the exact token
    /// stream, including the synthetic Indent/Dedent/Newline tokens.
    #[test]
    fn test_struct_indentation() {
        let input = "struct Position:\n x: f32\n y: f32\n";
        let mut lexer = Lexer::new(input);

        // Pull token kinds until (and including) the first Eof.
        let mut finished = false;
        let actual_tokens: Vec<_> = std::iter::from_fn(|| {
            if finished {
                return None;
            }
            let kind = lexer.next_token().kind;
            finished = kind == Eof;
            Some(kind)
        })
        .collect();

        let expected_tokens = vec![
            Struct,
            Ident("Position"),
            Colon,
            Newline,
            Indent,
            Ident("x"),
            Colon,
            BaseType("f32"),
            Newline,
            Ident("y"),
            Colon,
            BaseType("f32"),
            Newline,
            Dedent,
            Eof,
        ];

        assert_eq!(actual_tokens, expected_tokens);
    }
}
|
||||
45
aurac_lexer/src/token.rs
Normal file
45
aurac_lexer/src/token.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
/// Source location of a token: 1-based line/column, plus the byte offset
/// and byte length into the original input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub line: usize,
    pub column: usize,
    pub offset: usize,
    pub len: usize,
}

/// Every lexeme the lexer can produce. String-carrying variants borrow
/// from the input (lifetime `'a`) so lexing stays zero-copy.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind<'a> {
    // Keywords.
    Struct,
    Fn,
    Pure,
    Actor,
    Let,
    If,
    Else,
    Match,
    Return,
    Type,
    Gpu,

    // Identifiers and literals, sliced out of the source text.
    Ident(&'a str),
    Number(&'a str),
    StringLit(&'a str),

    // Built-in base type names (e.g. "i32", "f64", "bool").
    BaseType(&'a str),

    // Symbols and operators.
    Colon,
    DoubleColon,
    Comma,
    Arrow,
    Equal,
    Pipe,
    Plus,
    Minus,
    Star,
    Slash,
    OpenParen,
    CloseParen,
    OpenBrace,
    CloseBrace,
    OpenAngle,
    CloseAngle,
    Greater,
    Less,
    EqualEqual,

    // Layout tokens produced by significant whitespace.
    Indent,
    Dedent,
    Newline,

    /// End of input.
    Eof,
    /// A character the lexer could not match.
    Error(char),
}

/// A lexeme paired with its source location.
#[derive(Debug, Clone, PartialEq)]
pub struct Token<'a> {
    pub kind: TokenKind<'a>,
    pub span: Span,
}

impl<'a> Token<'a> {
    /// Bundles a kind and a span into a token.
    pub fn new(kind: TokenKind<'a>, span: Span) -> Self {
        Self { kind, span }
    }
}
|
||||
Reference in New Issue
Block a user