Gitbook: https://tigercosmos.github.io/lets-build-dbms/

I wrote a scanner that turns the input into tokens. The code is in `sql/lexer.rs`.

The algorithm is simple. The scanner reads the message character by character and keeps two cursors, `cursor_l` and `cursor_r`, which mark the start and the end of the word currently being read. When the incoming character is a separator or a delimiter, the scanner checks the word selected by the two cursors. If the word is a keyword, the keyword is added as the token. Otherwise, the word is added as an identifier with its name.

A few small parts still need to be fixed, but the function is almost done.

If we have a message:

1
select customername, contactname, address from customers where address is null;

then the scanner will get:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
[
Symbol { name: "select", len: 6, token: Select, group: Keyword },
Symbol { name: "customername", len: 12, token: Identifier, group: Identifier },
Symbol { name: ",", len: 1, token: Comma, group: Delimiter },
Symbol { name: "", len: 0, token: Identifier, group: Identifier },
Symbol { name: "contactname", len: 11, token: Identifier, group: Identifier },
Symbol { name: ",", len: 1, token: Comma, group: Delimiter },
Symbol { name: "", len: 0, token: Identifier, group: Identifier },
Symbol { name: "address", len: 7, token: Identifier, group: Identifier },
Symbol { name: "from", len: 4, token: From, group: Keyword },
Symbol { name: "customers", len: 9, token: Identifier, group: Identifier },
Symbol { name: "where", len: 5, token: Where, group: Keyword },
Symbol { name: "address", len: 7, token: Identifier, group: Identifier },
Symbol { name: "is", len: 2, token: Identifier, group: Identifier },
Symbol { name: "null", len: 4, token: Identifier, group: Identifier },
Symbol { name: ";", len: 1, token: Semicolon, group: Delimiter }
]

`is null` should be recognized as a single token; I will fix that later.

As you can see, we get the tokens, and we can use these tokens in the next step.

!FILENAME sql/lexer.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
use sql::symbol;

/// Lexical scanner that splits a SQL message into a list of symbols.
#[derive(Debug, Clone)]
pub struct Scanner<'a> {
/// The text being scanned (stored lower-cased and trimmed by `Scanner::new`).
message: String,
/// Symbols recognised so far, in the order they were pushed.
tokens: Vec<symbol::Symbol<'a>>,
/// Cursor pair selecting the word currently being read from `message`.
pos: Pos,
}

/// Pair of cursors used to slice the current word out of the scanned message.
#[derive(Debug, Clone)]
struct Pos {
/// Start offset of the word currently being read.
cursor_l: usize,
/// End offset (exclusive) of the word currently being read.
cursor_r: usize,
}

impl<'a> Scanner<'a> {
    /// Creates a scanner over `message`.
    ///
    /// The message is lower-cased and stripped of leading/trailing whitespace
    /// before it is stored. Both cursors start at offset 0 and no tokens have
    /// been produced yet.
    pub fn new(message: &str) -> Scanner {
        Scanner {
            message: message.to_lowercase().trim().to_string(),
            tokens: vec![],
            pos: Pos {
                cursor_l: 0,
                cursor_r: 0,
            },
        }
    }

    /// Scans the stored message and returns a copy of the collected symbols.
    ///
    /// The message is walked character by character. Letters and digits extend
    /// the current word. A separator (space, tab, CR, LF) or a delimiter
    /// (`(`, `)`, `,`, `;`) closes the current word, which is pushed as the
    /// matching entry of `symbol::SYMBOLS` if there is one and as an identifier
    /// otherwise. Delimiters are then pushed as symbols of their own, whether
    /// or not a word precedes them. A word that is still open when the input
    /// ends is pushed as well.
    ///
    /// Progress is logged to standard output.
    ///
    /// # Panics
    ///
    /// Panics if `symbol::Symbol::match_delimiter` does not yield a symbol for
    /// a character that `is_delimiter` accepts.
    pub fn scan_tokens(&'a mut self) -> Vec<symbol::Symbol<'a>> {
        println!("Starting scanning message: {}", self.message);
        // `char_indices` yields byte offsets, so the cursors stay valid slice
        // bounds even when the message contains multi-byte characters.
        for (offset, ch) in self.message.char_indices() {
            let next = offset + ch.len_utf8();
            if is_letter_or_number(ch) {
                self.pos.cursor_r = next;
                continue;
            }
            match ch {
                ' ' | '\t' | '\r' | '\n' | '(' | ')' | ',' | ';' => {
                    if self.pos.cursor_l != self.pos.cursor_r {
                        // Both cursors lie on char boundaries, so this cannot panic.
                        let word = &self.message[self.pos.cursor_l..self.pos.cursor_r];
                        println!("encounter `{}`, last word is {}", ch, word);
                        self.tokens.push(Self::word_symbol(word));
                    }
                    // A delimiter is a token by itself, even when no word
                    // directly precedes it (e.g. a leading `(` or `) ;`).
                    if is_delimiter(ch) {
                        self.tokens.push(
                            symbol::Symbol::match_delimiter(ch)
                                .expect("every delimiter accepted by is_delimiter has a symbol"),
                        );
                    }
                    // Start the next word right after `ch`.
                    self.pos.cursor_r = next;
                    self.pos.cursor_l = next;
                }
                _ => {
                    // Unrecognised character (e.g. `_`, `*`, `=`): there is no
                    // token for it yet, so it stays inside the current word.
                    // Advancing the cursor keeps the slice bounds aligned with
                    // the input. TODO: report this as a lexing error.
                    self.pos.cursor_r = next;
                }
            }
        }
        // The message is trimmed, so it usually ends in the middle of a word
        // unless it is terminated by a delimiter; flush that last word.
        if self.pos.cursor_l != self.pos.cursor_r {
            let word = &self.message[self.pos.cursor_l..self.pos.cursor_r];
            self.tokens.push(Self::word_symbol(word));
            self.pos.cursor_l = self.pos.cursor_r;
        }
        self.tokens.clone()
    }

    /// Maps a word to its keyword symbol, or to an identifier symbol when the
    /// word is not present in `symbol::SYMBOLS`.
    fn word_symbol(word: &'a str) -> symbol::Symbol<'a> {
        match symbol::SYMBOLS.get(word) {
            // either keyword
            Some(token) => token.clone(),
            // or identifier
            None => symbol::sym(
                word,
                symbol::Token::Identifier,
                symbol::Group::Identifier,
            ),
        }
    }
}

/// Returns `true` when `ch` is an ASCII letter (`a`-`z`, `A`-`Z`) or an
/// ASCII decimal digit (`0`-`9`); every other character yields `false`.
fn is_letter_or_number(ch: char) -> bool {
    ch.is_ascii_alphanumeric()
}

/// Returns `true` when `ch` is one of the delimiter characters
/// `(`, `)`, `,` or `;`.
fn is_delimiter(ch: char) -> bool {
    matches!(ch, '(' | ')' | ',' | ';')
}