Phasor
2.2.0
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1
#include "
Lexer.hpp
"
2
#include <cctype>
3
#include <sstream>
4
#include <stdexcept>
5
6
namespace
Phasor
7
{
8
9
Lexer::Lexer
(
const
std::string &
source
) :
source
(
source
)
10
{
11
}
12
13
void
Lexer::skipShebang
()
14
{
15
if
(
position
== 0 &&
peek
() ==
'#'
&&
16
position
+ 1 <
source
.length() &&
source
[
position
+ 1] ==
'!'
)
17
{
18
while
(!
isAtEnd
() &&
peek
() !=
'\n'
)
19
{
20
advance
();
21
}
22
}
23
}
24
25
std::vector<Token>
Lexer::tokenize
()
26
{
27
std::vector<Token> tokens;
28
skipShebang
();
29
while
(!
isAtEnd
())
30
{
31
skipWhitespace
();
32
if
(
isAtEnd
())
33
break
;
34
tokens.push_back(
scanToken
());
35
}
36
tokens.push_back({
TokenType::EndOfFile
,
""
,
line
,
column
});
37
return
tokens;
38
}
39
40
char
Lexer::peek
()
41
{
42
if
(
isAtEnd
())
43
return
'\0'
;
44
return
source
[
position
];
45
}
46
47
char
Lexer::advance
()
48
{
49
char
c =
source
[
position
++];
50
column
++;
51
if
(c ==
'\n'
)
52
{
53
line
++;
54
column
= 1;
55
}
56
return
c;
57
}
58
59
bool
Lexer::isAtEnd
()
60
{
61
return
position
>=
source
.length();
62
}
63
64
void
Lexer::skipWhitespace
()
65
{
66
while
(!
isAtEnd
())
67
{
68
char
c =
peek
();
69
if
(std::isspace(
static_cast<
unsigned
char
>
(c)))
70
{
71
advance
();
72
}
73
else
if
(c ==
'/'
&&
position
+ 1 <
source
.length() &&
source
[
position
+ 1] ==
'/'
)
74
{
75
// Skip single-line comment
76
while
(!
isAtEnd
() &&
peek
() !=
'\n'
)
77
{
78
advance
();
79
}
80
}
81
else
82
{
83
break
;
84
}
85
}
86
}
87
88
// Scans exactly one token starting at the current position.
//
// Dispatch order, based on the first character:
//   1. alphabetic            -> identifier()  (identifier or keyword)
//   2. digit                 -> number()
//   3. '"'                   -> string()
//   4. '`'                   -> complexString()
//   5. two-character operator (++ -- == != -> <= >= && ||) -> Symbol
//   6. single-character symbol                              -> Symbol
//   7. anything else                                        -> Unknown
//
// Symbol and Unknown tokens carry the line/column of their FIRST character,
// matching what string() and complexString() report. Previously the position
// was read after advance(), i.e. one or two columns past the token start.
//
// tokenize() only calls this while input remains.
// NOTE(review): identifier() accepts '_' in later positions, but a leading
// '_' is not routed there and ends up as an Unknown token — confirm that
// this is the intended language rule.
Token Lexer::scanToken()
{
    const char c = peek();
    if (std::isalpha(static_cast<unsigned char>(c)))
        return identifier();
    if (std::isdigit(static_cast<unsigned char>(c)))
        return number();
    if (c == '"')
        return string();
    if (c == '`')
        return complexString();

    // Remember where the token starts before consuming anything.
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    // Multi-character operators. Every entry is a distinct pair, so the
    // order of the table does not affect which one matches.
    static const char *const twoCharOperators[] = {"++", "--", "==", "!=", "->",
                                                   "<=", ">=", "&&", "||"};
    if (position + 1 < source.length())
    {
        const char next = source[position + 1];
        for (const char *op : twoCharOperators)
        {
            if (c == op[0] && next == op[1])
            {
                advance();
                advance();
                return {TokenType::Symbol, std::string(op), tokenLine, tokenColumn};
            }
        }
    }

    // Single-character symbols (parentheses, operators, punctuation, etc.)
    static const std::string singleCharSymbols = "()+-*/%<>=!&|.{}:;,[]";
    const bool isSymbol = singleCharSymbols.find(c) != std::string::npos;

    advance();
    return {isSymbol ? TokenType::Symbol : TokenType::Unknown, std::string(1, c), tokenLine,
            tokenColumn};
}
166
167
// Scans an identifier or keyword starting at the current position.
// Consumes a run of alphanumeric characters and underscores. If the
// resulting text exactly matches one of the reserved words below, a Keyword
// token is produced; otherwise an Identifier token.
//
// The token carries the line/column of its first character, matching
// string() and complexString(). Previously the column was read after the
// whole lexeme had been consumed.
Token Lexer::identifier()
{
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    const size_t start = position;
    while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_')
        advance();
    const std::string text = source.substr(start, position - start);

    // Reserved words of the language.
    static const std::vector<std::string> keywords = {
        "var",    "const", "fn",     "class",  "if",       "else",     "while",    "for",
        "return", "true",  "false",  "null",   "import",   "export",   "async",    "await",
        "throw",  "try",   "catch",  "match",  "enum",     "template", "operator", "unsafe",
        "spawn",  "print", "struct", "break",  "continue", "switch",   "case",     "default"};

    TokenType type = TokenType::Identifier;
    for (const std::string &kw : keywords)
    {
        if (text == kw)
        {
            type = TokenType::Keyword;
            break;
        }
    }

    return {type, text, tokenLine, tokenColumn};
}
191
192
// Scans a numeric literal: one or more digits, optionally followed by a
// fractional part ('.' plus at least one digit). A '.' that is not followed
// by a digit is left unconsumed, so "1." lexes as the number "1" and then a
// separate '.' token. The token text is the raw source slice.
//
// The token carries the line/column of its first digit, matching string()
// and complexString(). Previously the column was read after the whole
// literal had been consumed.
Token Lexer::number()
{
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    const size_t start = position;
    while (std::isdigit(static_cast<unsigned char>(peek())))
        advance();

    const bool hasFraction = peek() == '.' && position + 1 < source.length() &&
                             std::isdigit(static_cast<unsigned char>(source[position + 1]));
    if (hasFraction)
    {
        advance(); // the '.'
        while (std::isdigit(static_cast<unsigned char>(peek())))
            advance();
    }

    return {TokenType::Number, source.substr(start, position - start), tokenLine, tokenColumn};
}
206
207
// Maps a single hexadecimal digit character to its numeric value (0-15).
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'; any other character yields -1.
static int hexValue(char c)
{
    int value = -1;
    if ('0' <= c && c <= '9')
        value = c - '0';
    else if ('a' <= c && c <= 'f')
        value = (c - 'a') + 10;
    else if ('A' <= c && c <= 'F')
        value = (c - 'A') + 10;
    return value;
}
217
218
// Scans a double-quoted string literal and decodes its escape sequences.
//
// On success, returns a String token whose text is the decoded contents
// (without the quotes). Returns an Unknown token with empty text when the
// literal is malformed: a raw newline inside the string, end of input before
// the closing quote, a backslash or \x escape cut off by end of input, or a
// \x escape whose two following characters are not both hex digits.
// Either way the token is positioned at the opening quote.
//
// Recognized escapes: \n \t \r \0 \b \f \v \xHH, plus \\ \" \' which stand
// for themselves. Any other escaped character is kept as-is.
Token Lexer::string()
{
    const size_t startLine = line;
    const size_t startColumn = column;

    // All failure paths produce the same token.
    const auto malformed = [&]() -> Token {
        return {TokenType::Unknown, std::string(), startLine, startColumn};
    };

    std::string decoded;
    advance(); // Skip opening quote

    while (!isAtEnd())
    {
        const char ch = advance();

        if (ch == '"')
            return {TokenType::String, decoded, startLine, startColumn};

        // Raw newline inside a string is treated as unterminated/error.
        if (ch == '\n')
            return malformed();

        if (ch != '\\')
        {
            decoded += ch;
            continue;
        }

        // Backslash as the very last character of the input.
        if (isAtEnd())
            return malformed();

        const char esc = advance();
        // Default covers \\, \", \' and unknown escapes: the escaped
        // character is appended unchanged.
        char literal = esc;
        switch (esc)
        {
        case 'n':
            literal = '\n';
            break;
        case 't':
            literal = '\t';
            break;
        case 'r':
            literal = '\r';
            break;
        case '0':
            literal = '\0';
            break;
        case 'b':
            literal = '\b';
            break;
        case 'f':
            literal = '\f';
            break;
        case 'v':
            literal = '\v';
            break;
        case 'x': {
            // Hex escape sequence: \xHH — exactly two hex digits.
            if (isAtEnd())
                return malformed();
            const char highDigit = advance();
            if (isAtEnd())
                return malformed();
            const char lowDigit = advance();

            const int high = hexValue(highDigit);
            const int low = hexValue(lowDigit);
            if (high < 0 || low < 0)
                return malformed();
            literal = static_cast<char>((high << 4) | low);
            break;
        }
        default:
            break;
        }
        decoded += literal;
    }

    // Ran out of input before the closing quote.
    return malformed();
}
311
312
// Scans a backtick-delimited string as raw text.
// No escape processing and no ${} interpolation is performed; every
// character up to the closing backtick, newlines included, is copied
// verbatim. Returns a String token on success, or an Unknown token with
// empty text if input ends before the closing backtick. Either way the token
// is positioned at the opening backtick.
Token Lexer::complexString()
{
    const size_t startLine = line;
    const size_t startColumn = column;

    std::string raw;
    advance(); // Skip opening backtick

    while (!isAtEnd())
    {
        const char ch = advance();
        if (ch != '`')
        {
            raw += ch;
            continue;
        }
        // Closing backtick
        return {TokenType::String, raw, startLine, startColumn};
    }

    // Unterminated literal
    return {TokenType::Unknown, std::string(), startLine, startColumn};
}
339
}
// namespace Phasor
Lexer.hpp
Phasor::Lexer::scanToken
Token scanToken()
Definition
Lexer.cpp:88
Phasor::Lexer::Lexer
Lexer(const std::string &source)
Definition
Lexer.cpp:9
Phasor::Lexer::complexString
Token complexString()
Definition
Lexer.cpp:312
Phasor::Lexer::number
Token number()
Definition
Lexer.cpp:192
Phasor::Lexer::string
Token string()
Definition
Lexer.cpp:218
Phasor::Lexer::source
std::string source
Definition
Lexer.hpp:36
Phasor::Lexer::tokenize
std::vector< Token > tokenize()
Definition
Lexer.cpp:25
Phasor::Lexer::position
size_t position
Definition
Lexer.hpp:37
Phasor::Lexer::column
size_t column
Definition
Lexer.hpp:39
Phasor::Lexer::skipWhitespace
void skipWhitespace()
Definition
Lexer.cpp:64
Phasor::Lexer::peek
char peek()
Definition
Lexer.cpp:40
Phasor::Lexer::skipShebang
void skipShebang()
Definition
Lexer.cpp:13
Phasor::Lexer::identifier
Token identifier()
Definition
Lexer.cpp:167
Phasor::Lexer::isAtEnd
bool isAtEnd()
Definition
Lexer.cpp:59
Phasor::Lexer::advance
char advance()
Definition
Lexer.cpp:47
Phasor::Lexer::line
size_t line
Definition
Lexer.hpp:38
Phasor
The Phasor Programming Language and Runtime.
Definition
AST.hpp:8
Phasor::hexValue
static int hexValue(char c)
Definition
Lexer.cpp:207
Phasor::TokenType::Symbol
@ Symbol
Definition
Lexer.hpp:15
Phasor::TokenType::Keyword
@ Keyword
Definition
Lexer.hpp:14
Phasor::TokenType::String
@ String
Definition
Lexer.hpp:13
Phasor::TokenType::Identifier
@ Identifier
Definition
Lexer.hpp:11
Phasor::TokenType::Unknown
@ Unknown
Definition
Lexer.hpp:17
Phasor::TokenType::EndOfFile
@ EndOfFile
Definition
Lexer.hpp:16
Phasor::TokenType::Number
@ Number
Definition
Lexer.hpp:12
Phasor::Token
Definition
Lexer.hpp:21
src
Backend
Lexer
Lexer.cpp
Generated by
1.16.1