1use slang_error::{CompileResult, CompilerError, LineInfo, ErrorCode};
2use crate::token::{Token, Tokentype};
3
/// Successful lexer output: the token stream plus the per-line token
/// bookkeeping used for diagnostics.
pub struct LexerResult<'a> {
    /// Tokens in source order, terminated by a single `Eof` token
    /// (appended in `LexerState::finish`).
    pub tokens: Vec<Token>,
    /// Line metadata borrowing the original input text.
    pub line_info: LineInfo<'a>,
}
10
/// Mutable lexing state threaded through `tokenize` and the `handle_*`
/// helper functions.
struct LexerState<'a> {
    /// The complete source text being lexed.
    input: &'a str,
    /// Peekable character iterator over `input`.
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    /// Offset of the next unread character (advanced by `advance`).
    current_pos: usize,
    /// 1-based line number of the current position.
    current_line: usize,
    /// Number of tokens emitted so far on `current_line`; flushed into
    /// `line_tokens` by `record_line_break` / `finish`.
    tokens_on_current_line: usize,
    /// All tokens emitted so far.
    tokens: Vec<Token>,
    /// `(line number, token count)` pairs for lines that produced tokens.
    line_tokens: Vec<(u16, u16)>,
    /// Errors collected during lexing; any entry makes `finish` return `Err`.
    errors: Vec<CompilerError>,
}
30
31impl<'a> LexerState<'a> {
32 fn new(input: &'a str) -> Self {
40 LexerState {
41 input,
42 chars: input.chars().peekable(),
43 current_pos: 0,
44 current_line: 1,
45 tokens_on_current_line: 0,
46 tokens: Vec::new(),
47 line_tokens: Vec::new(),
48 errors: Vec::new(),
49 }
50 }
51
52 fn advance(&mut self) -> Option<char> {
57 let c = self.chars.next();
58 if c.is_some() {
59 self.current_pos += 1;
60 }
61 c
62 }
63
64 fn peek(&mut self) -> Option<&char> {
69 self.chars.peek()
70 }
71
72 fn add_token(&mut self, token_type: Tokentype, lexeme: String, start_pos: usize) {
80 self.tokens.push(Token::new(token_type, lexeme, start_pos));
81 self.tokens_on_current_line += 1;
82 }
83
84 fn add_error(&mut self, error_code: ErrorCode, message: String, start_pos: usize, token_length: Option<usize>) {
93 let line_start = self.input[..start_pos].rfind('\n').map_or(0, |pos| pos + 1);
95 let column = start_pos - line_start + 1;
96
97 self.errors.push(CompilerError::new(
98 error_code,
99 message,
100 self.current_line,
101 column,
102 start_pos,
103 token_length,
104 ));
105 }
106
107 fn record_line_break(&mut self) {
112 if self.tokens_on_current_line > 0 {
113 self.line_tokens
114 .push((self.current_line as u16, self.tokens_on_current_line as u16));
115 }
116 self.current_line += 1;
117 self.tokens_on_current_line = 0;
118 }
119
120 fn finish(mut self) -> CompileResult<LexerResult<'a>> {
125 if self.tokens_on_current_line > 0 {
127 self.line_tokens
128 .push((self.current_line as u16, self.tokens_on_current_line as u16));
129 }
130 self.tokens
131 .push(Token::new(Tokentype::Eof, "".to_string(), self.current_pos));
132 let mut info = LineInfo::new(self.input);
133 info.per_line = self.line_tokens;
134
135 if !self.errors.is_empty() {
137 return Err(self.errors);
138 }
139
140 Ok(LexerResult {
141 tokens: self.tokens,
142 line_info: info,
143 })
144 }
145}
146
147pub fn tokenize(input: &str) -> CompileResult<LexerResult> {
157 let mut state = LexerState::new(input);
158
159 while let Some(&c) = state.peek() {
160 let token_start_pos = state.current_pos;
161
162 match c {
163 c if c.is_whitespace() => handle_whitespace(&mut state),
164 c if c.is_alphabetic() => handle_identifier(&mut state, token_start_pos),
165 c if c.is_ascii_digit() => handle_number(&mut state, token_start_pos),
166 '"' => handle_string(&mut state),
167 ':' => handle_simple_token(&mut state, Tokentype::Colon, ":", token_start_pos),
168 '+' => handle_simple_token(&mut state, Tokentype::Plus, "+", token_start_pos),
169 '-' => handle_dash(&mut state, token_start_pos),
170 '*' => handle_simple_token(&mut state, Tokentype::Multiply, "*", token_start_pos),
171 '/' => handle_slash(&mut state, token_start_pos),
172 '=' => handle_equals(&mut state, token_start_pos),
173 '<' => handle_less_than(&mut state, token_start_pos),
174 '>' => handle_greater_than(&mut state, token_start_pos),
175 '!' => handle_exclamation(&mut state, token_start_pos),
176 ';' => handle_simple_token(&mut state, Tokentype::Semicolon, ";", token_start_pos),
177 '{' => handle_simple_token(&mut state, Tokentype::LeftBrace, "{", token_start_pos),
178 '}' => handle_simple_token(&mut state, Tokentype::RightBrace, "}", token_start_pos),
179 ',' => handle_simple_token(&mut state, Tokentype::Comma, ",", token_start_pos),
180 '(' => handle_simple_token(&mut state, Tokentype::LeftParen, "(", token_start_pos),
181 ')' => handle_simple_token(&mut state, Tokentype::RightParen, ")", token_start_pos),
182 '&' => handle_ampersand(&mut state, token_start_pos),
183 '|' => handle_pipe(&mut state, token_start_pos),
184 _ => handle_invalid_char(&mut state, token_start_pos),
185 }
186 }
187
188 state.finish()
189}
190
191fn handle_whitespace(state: &mut LexerState) {
196 let c = state.advance().unwrap();
197
198 if c == '\n' {
199 state.record_line_break();
200 }
201}
202
203fn handle_identifier(state: &mut LexerState, start_pos: usize) {
209 let mut identifier = String::new();
210
211 while let Some(&c) = state.peek() {
212 if c.is_alphanumeric() || c == '_' {
213 identifier.push(c);
214 state.advance();
215 } else {
216 break;
217 }
218 }
219
220 let token_type = match identifier.as_str() {
221 "let" => Tokentype::Let,
222 "mut" => Tokentype::Mut,
223 "struct" => Tokentype::Struct,
224 "fn" => Tokentype::Fn,
225 "return" => Tokentype::Return,
226 "if" => Tokentype::If,
227 "else" => Tokentype::Else,
228 "true" | "false" => Tokentype::BooleanLiteral,
229 _ => Tokentype::Identifier,
230 };
231
232 state.add_token(token_type, identifier, start_pos);
233}
234
235fn handle_number(state: &mut LexerState, start_pos: usize) {
241 let mut number = String::new();
242 let mut is_float = false;
243
244 while let Some(&c) = state.peek() {
245 if c.is_ascii_digit() {
246 number.push(c);
247 state.advance();
248 } else if c == '.' {
249 if is_float {
250 break;
251 }
252 is_float = true;
253 number.push(c);
254 state.advance();
255 } else if c == 'e' || c == 'E' {
256 number.push(c);
257 state.advance();
258 if let Some(&next_c) = state.peek() {
259 if next_c == '+' || next_c == '-' {
260 number.push(next_c);
261 state.advance();
262 }
263 }
264 } else {
265 break;
266 }
267 }
268
269 let token_type = if is_float {
270 Tokentype::FloatLiteral
271 } else {
272 Tokentype::IntegerLiteral
273 };
274
275 state.add_token(token_type, number, start_pos);
276}
277
278fn handle_string(state: &mut LexerState) {
283 let start_pos = state.current_pos;
284 state.advance(); let mut string = String::new();
286 let mut closed = false;
287
288 while let Some(&c) = state.peek() {
289 if c == '"' {
290 state.advance();
291 closed = true;
292 break;
293 } else if c == '\n' {
294 state.current_line += 1;
295 string.push(c);
296 state.advance();
297 } else {
298 string.push(c);
299 state.advance();
300 }
301 }
302
303 if !closed {
304 let error_message = "Expected closing quote for string literal".to_string();
305 let invalid_lexeme = format!("\"{}",string);
306 state.add_error(
307 ErrorCode::ExpectedClosingQuote,
308 error_message,
309 start_pos,
310 Some(invalid_lexeme.len())
311 );
312 } else {
313 state.add_token(Tokentype::StringLiteral, string, start_pos);
314 }
315}
316
317fn handle_simple_token(
325 state: &mut LexerState,
326 token_type: Tokentype,
327 lexeme: &str,
328 start_pos: usize,
329) {
330 state.advance();
331 state.add_token(token_type, lexeme.to_string(), start_pos);
332}
333
334fn handle_dash(state: &mut LexerState, start_pos: usize) {
340 state.advance();
341 if state.peek() == Some(&'>') {
342 state.advance();
343 state.add_token(Tokentype::Arrow, "->".to_string(), start_pos);
344 } else {
345 state.add_token(Tokentype::Minus, "-".to_string(), start_pos);
346 }
347}
348
349fn handle_slash(state: &mut LexerState, start_pos: usize) {
355 state.advance();
356
357 if state.peek() == Some(&'/') {
358 handle_line_comment(state);
359 } else if state.peek() == Some(&'*') {
360 handle_block_comment(state);
361 } else {
362 state.add_token(Tokentype::Divide, "/".to_string(), start_pos);
363 }
364}
365
366fn handle_line_comment(state: &mut LexerState) {
371 state.advance();
372
373 while let Some(&c) = state.peek() {
374 if c == '\n' {
375 state.advance();
376 state.record_line_break();
377 break;
378 }
379 state.advance();
380 }
381}
382
/// Discards a block comment, honoring nested "/* ... */" pairs. The
/// leading '/' was consumed by `handle_slash`; this consumes the '*'.
fn handle_block_comment(state: &mut LexerState) {
    state.advance();

    // Number of "/*" openers not yet matched by a "*/".
    let mut nesting = 1;
    while nesting > 0 {
        // NOTE(review): an unterminated block comment is silently
        // ignored here (no error recorded) — confirm that is intended.
        if state.peek().is_none() {
            break;
        }

        // Count line breaks inside the comment so later tokens get
        // correct line numbers.
        if let Some(&c) = state.peek() {
            if c == '\n' {
                state.record_line_break();
            }
        }

        if state.peek() == Some(&'*') {
            state.advance();
            // "*/" closes one nesting level.
            if state.peek() == Some(&'/') {
                state.advance();
                nesting -= 1;
                continue;
            }
        } else if state.peek() == Some(&'/') {
            state.advance();
            // "/*" opens a new nesting level.
            if state.peek() == Some(&'*') {
                state.advance();
                nesting += 1;
                continue;
            }
        } else {
            state.advance();
        }
    }
}
421
422fn handle_equals(state: &mut LexerState, start_pos: usize) {
428 state.advance();
429 if state.peek() == Some(&'=') {
430 state.advance();
431 state.add_token(Tokentype::EqualEqual, "==".to_string(), start_pos);
432 } else {
433 state.add_token(Tokentype::Equal, "=".to_string(), start_pos);
434 }
435}
436
437fn handle_less_than(state: &mut LexerState, start_pos: usize) {
443 state.advance();
444 if state.peek() == Some(&'=') {
445 state.advance();
446 state.add_token(Tokentype::LessEqual, "<=".to_string(), start_pos);
447 } else {
448 state.add_token(Tokentype::Less, "<".to_string(), start_pos);
449 }
450}
451
452fn handle_greater_than(state: &mut LexerState, start_pos: usize) {
458 state.advance();
459 if state.peek() == Some(&'=') {
460 state.advance();
461 state.add_token(Tokentype::GreaterEqual, ">=".to_string(), start_pos);
462 } else {
463 state.add_token(Tokentype::Greater, ">".to_string(), start_pos);
464 }
465}
466
467fn handle_exclamation(state: &mut LexerState, start_pos: usize) {
473 state.advance();
474 if state.peek() == Some(&'=') {
475 state.advance();
476 state.add_token(Tokentype::NotEqual, "!=".to_string(), start_pos);
477 } else {
478 state.add_token(Tokentype::Not, "!".to_string(), start_pos);
479 }
480}
481
482fn handle_ampersand(state: &mut LexerState, start_pos: usize) {
488 state.advance();
489 if state.peek() == Some(&'&') {
490 state.advance();
491 state.add_token(Tokentype::And, "&&".to_string(), start_pos);
492 } else {
493 state.add_token(Tokentype::Invalid, "&".to_string(), start_pos);
494 }
495}
496
497fn handle_pipe(state: &mut LexerState, start_pos: usize) {
503 state.advance();
504 if state.peek() == Some(&'|') {
505 state.advance();
506 state.add_token(Tokentype::Or, "||".to_string(), start_pos);
507 } else {
508 state.add_token(Tokentype::Invalid, "|".to_string(), start_pos);
509 }
510}
511
512fn handle_invalid_char(state: &mut LexerState, start_pos: usize) {
518 let invalid_char = state.advance().unwrap();
519 state.add_token(Tokentype::Invalid, invalid_char.to_string(), start_pos);
520}