slang_frontend/
lexer.rs

use slang_error::{CompileResult, CompilerError, ErrorCode, LineInfo};
use crate::token::{Token, Tokentype};

pub struct LexerResult<'a> {
    /// The list of tokens generated from the input
    pub tokens: Vec<Token>,
    /// The line information for the tokens
    pub line_info: LineInfo<'a>,
}

/// Lexer state for tracking position during tokenization
struct LexerState<'a> {
    /// Source text being tokenized
    input: &'a str,
    /// Iterator over source characters
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    /// Current byte position in the source
    current_pos: usize,
    /// Current line number (1-based)
    current_line: usize,
    /// Number of tokens on the current line
    tokens_on_current_line: usize,
    /// Tokens generated so far
    tokens: Vec<Token>,
    /// Per-line token counts as (line number, token count) pairs
    line_tokens: Vec<(u16, u16)>,
    /// Collected lexer errors
    errors: Vec<CompilerError>,
}

impl<'a> LexerState<'a> {
    /// Creates a new lexer state for the given input
    ///
    /// ### Arguments
    /// * `input` - The source code to tokenize
    ///
    /// ### Returns
    /// A new `LexerState` positioned at the start of the input
    fn new(input: &'a str) -> Self {
        LexerState {
            input,
            chars: input.chars().peekable(),
            current_pos: 0,
            current_line: 1,
            tokens_on_current_line: 0,
            tokens: Vec::new(),
            line_tokens: Vec::new(),
            errors: Vec::new(),
        }
    }

    /// Advances to the next character in the input, tracking the byte position
    ///
    /// ### Returns
    /// The consumed character, or `None` at end of input
    fn advance(&mut self) -> Option<char> {
        let c = self.chars.next();
        if let Some(ch) = c {
            // Count UTF-8 bytes rather than chars so `current_pos` stays a
            // valid byte offset into `input` (it is used to slice `input`
            // in `add_error`).
            self.current_pos += ch.len_utf8();
        }
        c
    }

    /// Peeks at the next character without consuming it
    ///
    /// ### Returns
    /// A reference to the next character, or `None` at end of input
    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

    /// Adds a token to the token list
    ///
    /// ### Arguments
    /// * `token_type` - The type of token to add
    /// * `lexeme` - The string representation of the token
    /// * `start_pos` - The starting byte position of the token in the input
    fn add_token(&mut self, token_type: Tokentype, lexeme: String, start_pos: usize) {
        self.tokens.push(Token::new(token_type, lexeme, start_pos));
        self.tokens_on_current_line += 1;
    }

    /// Adds an error to the error list
    ///
    /// ### Arguments
    /// * `error_code` - The error code for this error
    /// * `message` - The error message
    /// * `start_pos` - The starting byte position of the error
    /// * `token_length` - The length of the problematic token, if known
    fn add_error(&mut self, error_code: ErrorCode, message: String, start_pos: usize, token_length: Option<usize>) {
        // Calculate column position from start_pos
        let line_start = self.input[..start_pos].rfind('\n').map_or(0, |pos| pos + 1);
        let column = start_pos - line_start + 1;

        self.errors.push(CompilerError::new(
            error_code,
            message,
            self.current_line,
            column,
            start_pos,
            token_length,
        ));
    }

    /// Records a line break, flushing the current line's token count into
    /// `line_tokens` and advancing to the next line
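    ///
    /// For example, two tokens on line 1 followed by a blank line and three
    /// tokens on line 3 end up as `line_tokens = [(1, 2), (3, 3)]`.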
    fn record_line_break(&mut self) {
        if self.tokens_on_current_line > 0 {
            self.line_tokens
                .push((self.current_line as u16, self.tokens_on_current_line as u16));
        }
        self.current_line += 1;
        self.tokens_on_current_line = 0;
    }

    /// Finishes tokenization, appending the EOF token and returning the result
    ///
    /// ### Returns
    /// A `CompileResult` with the tokens and line information, or the
    /// collected lexer errors if any were recorded
    fn finish(mut self) -> CompileResult<LexerResult<'a>> {
        // Record the token count for the last line, if it has any tokens
        if self.tokens_on_current_line > 0 {
            self.line_tokens
                .push((self.current_line as u16, self.tokens_on_current_line as u16));
        }
        self.tokens
            .push(Token::new(Tokentype::Eof, "".to_string(), self.current_pos));
        let mut info = LineInfo::new(self.input);
        info.per_line = self.line_tokens;

        // If any errors were collected, report them instead of the tokens
        if !self.errors.is_empty() {
            return Err(self.errors);
        }

        Ok(LexerResult {
            tokens: self.tokens,
            line_info: info,
        })
    }
}

/// Converts source code text into a sequence of tokens with line information
///
/// ### Arguments
///
/// * `input` - The source code to tokenize
///
/// ### Returns
///
/// A `CompileResult` containing a `LexerResult` (tokens and line information) or the collected lexer errors
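///
/// ### Example
///
/// A minimal sketch (marked `ignore` since it assumes crate-level doctest
/// setup); the expected count includes the trailing `Eof` token appended by
/// `finish`:
///
/// ```ignore
/// let result = tokenize("let x = 42;").unwrap();
/// assert_eq!(result.tokens.len(), 6); // let, x, =, 42, ;, Eof
/// ```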
pub fn tokenize(input: &str) -> CompileResult<LexerResult> {
    let mut state = LexerState::new(input);

    while let Some(&c) = state.peek() {
        let token_start_pos = state.current_pos;

        match c {
            c if c.is_whitespace() => handle_whitespace(&mut state),
            // Identifiers may start with a letter or an underscore, matching
            // the continuation rule in `handle_identifier`
            c if c.is_alphabetic() || c == '_' => handle_identifier(&mut state, token_start_pos),
            c if c.is_ascii_digit() => handle_number(&mut state, token_start_pos),
            '"' => handle_string(&mut state),
            ':' => handle_simple_token(&mut state, Tokentype::Colon, ":", token_start_pos),
            '+' => handle_simple_token(&mut state, Tokentype::Plus, "+", token_start_pos),
            '-' => handle_dash(&mut state, token_start_pos),
            '*' => handle_simple_token(&mut state, Tokentype::Multiply, "*", token_start_pos),
            '/' => handle_slash(&mut state, token_start_pos),
            '=' => handle_equals(&mut state, token_start_pos),
            '<' => handle_less_than(&mut state, token_start_pos),
            '>' => handle_greater_than(&mut state, token_start_pos),
            '!' => handle_exclamation(&mut state, token_start_pos),
            ';' => handle_simple_token(&mut state, Tokentype::Semicolon, ";", token_start_pos),
            '{' => handle_simple_token(&mut state, Tokentype::LeftBrace, "{", token_start_pos),
            '}' => handle_simple_token(&mut state, Tokentype::RightBrace, "}", token_start_pos),
            ',' => handle_simple_token(&mut state, Tokentype::Comma, ",", token_start_pos),
            '(' => handle_simple_token(&mut state, Tokentype::LeftParen, "(", token_start_pos),
            ')' => handle_simple_token(&mut state, Tokentype::RightParen, ")", token_start_pos),
            '&' => handle_ampersand(&mut state, token_start_pos),
            '|' => handle_pipe(&mut state, token_start_pos),
            _ => handle_invalid_char(&mut state, token_start_pos),
        }
    }

    state.finish()
}

/// Handles whitespace characters in the input
///
/// ### Arguments
/// * `state` - The current lexer state
fn handle_whitespace(state: &mut LexerState) {
    let c = state.advance().unwrap();

    if c == '\n' {
        state.record_line_break();
    }
}

/// Handles identifiers and keywords
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the identifier in the input
fn handle_identifier(state: &mut LexerState, start_pos: usize) {
    let mut identifier = String::new();

    while let Some(&c) = state.peek() {
        if c.is_alphanumeric() || c == '_' {
            identifier.push(c);
            state.advance();
        } else {
            break;
        }
    }

    let token_type = match identifier.as_str() {
        "let" => Tokentype::Let,
        "mut" => Tokentype::Mut,
        "struct" => Tokentype::Struct,
        "fn" => Tokentype::Fn,
        "return" => Tokentype::Return,
        "if" => Tokentype::If,
        "else" => Tokentype::Else,
        "true" | "false" => Tokentype::BooleanLiteral,
        _ => Tokentype::Identifier,
    };

    state.add_token(token_type, identifier, start_pos);
}

/// Handles numeric literals (integers and floating point)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the number in the input
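///
/// For example, `42` lexes as an `IntegerLiteral`, while `3.14` and `2e10`
/// lex as `FloatLiteral`s (an exponent implies a float).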
fn handle_number(state: &mut LexerState, start_pos: usize) {
    let mut number = String::new();
    let mut is_float = false;
    let mut has_exponent = false;

    while let Some(&c) = state.peek() {
        if c.is_ascii_digit() {
            number.push(c);
            state.advance();
        } else if c == '.' {
            // Allow only one decimal point, and none after an exponent
            if is_float || has_exponent {
                break;
            }
            is_float = true;
            number.push(c);
            state.advance();
        } else if (c == 'e' || c == 'E') && !has_exponent {
            // An exponent makes the literal a float; consume at most one
            // exponent marker plus an optional sign. Validating that digits
            // follow is left to later phases.
            has_exponent = true;
            is_float = true;
            number.push(c);
            state.advance();
            if let Some(&next_c) = state.peek() {
                if next_c == '+' || next_c == '-' {
                    number.push(next_c);
                    state.advance();
                }
            }
        } else {
            break;
        }
    }

    let token_type = if is_float {
        Tokentype::FloatLiteral
    } else {
        Tokentype::IntegerLiteral
    };

    state.add_token(token_type, number, start_pos);
}

/// Handles string literals
///
/// ### Arguments
/// * `state` - The current lexer state
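///
/// The stored lexeme is the string's contents without the surrounding quotes;
/// an unterminated string is reported as an error rather than a token.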
fn handle_string(state: &mut LexerState) {
    let start_pos = state.current_pos;
    state.advance(); // consume opening quote
    let mut string = String::new();
    let mut closed = false;

    while let Some(&c) = state.peek() {
        if c == '"' {
            state.advance();
            closed = true;
            break;
        } else if c == '\n' {
            // Record the line break so per-line token counts stay accurate
            // across multi-line strings
            state.record_line_break();
            string.push(c);
            state.advance();
        } else {
            string.push(c);
            state.advance();
        }
    }

    if !closed {
        let error_message = "Expected closing quote for string literal".to_string();
        let invalid_lexeme = format!("\"{}", string);
        state.add_error(
            ErrorCode::ExpectedClosingQuote,
            error_message,
            start_pos,
            Some(invalid_lexeme.len()),
        );
    } else {
        state.add_token(Tokentype::StringLiteral, string, start_pos);
    }
}

/// Handles simple one-character tokens
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `token_type` - The type of token to add
/// * `lexeme` - The string representation of the token
/// * `start_pos` - The starting position of the token in the input
fn handle_simple_token(
    state: &mut LexerState,
    token_type: Tokentype,
    lexeme: &str,
    start_pos: usize,
) {
    state.advance();
    state.add_token(token_type, lexeme.to_string(), start_pos);
}

/// Handles the dash character (minus or arrow)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the dash in the input
fn handle_dash(state: &mut LexerState, start_pos: usize) {
    state.advance();
    if state.peek() == Some(&'>') {
        state.advance();
        state.add_token(Tokentype::Arrow, "->".to_string(), start_pos);
    } else {
        state.add_token(Tokentype::Minus, "-".to_string(), start_pos);
    }
}

/// Handles the slash character (divide or comments)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the slash in the input
fn handle_slash(state: &mut LexerState, start_pos: usize) {
    state.advance();

    if state.peek() == Some(&'/') {
        handle_line_comment(state);
    } else if state.peek() == Some(&'*') {
        handle_block_comment(state);
    } else {
        state.add_token(Tokentype::Divide, "/".to_string(), start_pos);
    }
}

/// Handles single-line comments
///
/// ### Arguments
/// * `state` - The current lexer state
fn handle_line_comment(state: &mut LexerState) {
    state.advance(); // consume the second '/'

    while let Some(&c) = state.peek() {
        if c == '\n' {
            state.advance();
            state.record_line_break();
            break;
        }
        state.advance();
    }
}

/// Handles multi-line block comments, including nested `/* ... */` pairs
///
/// ### Arguments
/// * `state` - The current lexer state
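///
/// Nesting is tracked with a counter, so `/* outer /* inner */ still a
/// comment */` is consumed as a single comment. An unterminated block comment
/// is silently consumed to the end of the input.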
fn handle_block_comment(state: &mut LexerState) {
    state.advance(); // consume the '*' of the opening "/*"

    let mut nesting = 1;
    while nesting > 0 {
        let c = match state.peek() {
            Some(&c) => c,
            None => break, // unterminated block comment: stop at end of input
        };

        match c {
            '\n' => {
                state.record_line_break();
                state.advance();
            }
            '*' => {
                state.advance();
                if state.peek() == Some(&'/') {
                    state.advance();
                    nesting -= 1;
                }
            }
            '/' => {
                state.advance();
                if state.peek() == Some(&'*') {
                    state.advance();
                    nesting += 1;
                }
            }
            _ => {
                state.advance();
            }
        }
    }
}

/// Handles the equals character (assignment or equality)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the equals sign in the input
fn handle_equals(state: &mut LexerState, start_pos: usize) {
    state.advance();
    if state.peek() == Some(&'=') {
        state.advance();
        state.add_token(Tokentype::EqualEqual, "==".to_string(), start_pos);
    } else {
        state.add_token(Tokentype::Equal, "=".to_string(), start_pos);
    }
}

/// Handles the less-than character (`<` or `<=`)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the operator in the input
fn handle_less_than(state: &mut LexerState, start_pos: usize) {
    state.advance();
    if state.peek() == Some(&'=') {
        state.advance();
        state.add_token(Tokentype::LessEqual, "<=".to_string(), start_pos);
    } else {
        state.add_token(Tokentype::Less, "<".to_string(), start_pos);
    }
}

/// Handles the greater-than character (`>` or `>=`)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the operator in the input
fn handle_greater_than(state: &mut LexerState, start_pos: usize) {
    state.advance();
    if state.peek() == Some(&'=') {
        state.advance();
        state.add_token(Tokentype::GreaterEqual, ">=".to_string(), start_pos);
    } else {
        state.add_token(Tokentype::Greater, ">".to_string(), start_pos);
    }
}

/// Handles the exclamation mark (`!` or `!=`)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the operator in the input
fn handle_exclamation(state: &mut LexerState, start_pos: usize) {
    state.advance();
    if state.peek() == Some(&'=') {
        state.advance();
        state.add_token(Tokentype::NotEqual, "!=".to_string(), start_pos);
    } else {
        state.add_token(Tokentype::Not, "!".to_string(), start_pos);
    }
}

/// Handles the ampersand character (`&&`; a lone `&` is an invalid token)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the ampersand in the input
fn handle_ampersand(state: &mut LexerState, start_pos: usize) {
    state.advance();
    if state.peek() == Some(&'&') {
        state.advance();
        state.add_token(Tokentype::And, "&&".to_string(), start_pos);
    } else {
        state.add_token(Tokentype::Invalid, "&".to_string(), start_pos);
    }
}

/// Handles the pipe character (`||`; a lone `|` is an invalid token)
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the pipe in the input
fn handle_pipe(state: &mut LexerState, start_pos: usize) {
    state.advance();
    if state.peek() == Some(&'|') {
        state.advance();
        state.add_token(Tokentype::Or, "||".to_string(), start_pos);
    } else {
        state.add_token(Tokentype::Invalid, "|".to_string(), start_pos);
    }
}

/// Handles invalid characters by emitting an `Invalid` token
///
/// ### Arguments
/// * `state` - The current lexer state
/// * `start_pos` - The starting position of the invalid character in the input
fn handle_invalid_char(state: &mut LexerState, start_pos: usize) {
    let invalid_char = state.advance().unwrap();
    state.add_token(Tokentype::Invalid, invalid_char.to_string(), start_pos);
}
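
// A minimal test sketch for the lexer. Assumptions: `tokenize` returns
// `Err(Vec<CompilerError>)` for lexer errors (as `finish` does above), and
// token counts include the trailing `Eof` token. No `Token` field names are
// assumed here, only the length of the token stream.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_a_simple_declaration() {
        let result = tokenize("let x = 42;").expect("lexing should succeed");
        // let, x, =, 42, ;, Eof
        assert_eq!(result.tokens.len(), 6);
    }

    #[test]
    fn reports_an_unterminated_string() {
        let errors = tokenize("\"abc").expect_err("lexing should fail");
        assert_eq!(errors.len(), 1);
    }
}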