Phasor
3.3.0
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1
#include "
Lexer.hpp
"
2
#include <cctype>
3
#include <sstream>
4
#include <algorithm>
5
#include <stdexcept>
6
#include <utility>
7
8
namespace
Phasor
9
{
10
11
// Constructs a lexer over the given source text.
// The text is moved into the lexer, and every carriage return ('\r') is then
// removed so that no '\r' remains in the stored source.
Lexer::Lexer(std::string source) : source(std::move(source))
{
    std::string &text = this->source;
    text.erase(std::remove(text.begin(), text.end(), '\r'), text.end());
}
15
16
void
Lexer::skipShebang
()
17
{
18
if
(
position
== 0 &&
peek
() ==
'#'
&&
position
+ 1 <
source
.length() &&
source
[
position
+ 1] ==
'!'
)
19
{
20
while
(!
isAtEnd
() &&
peek
() !=
'\n'
)
21
{
22
advance
();
23
}
24
}
25
}
26
27
std::vector<Token>
Lexer::tokenize
()
28
{
29
std::vector<Token> tokens;
30
skipShebang
();
31
while
(!
isAtEnd
())
32
{
33
skipWhitespace
();
34
if
(
isAtEnd
())
35
{
36
break
;
37
}
38
tokens.push_back(
scanToken
());
39
}
40
tokens.push_back({
TokenType::EndOfFile
,
""
,
line
,
column
});
41
return
tokens;
42
}
43
44
char
Lexer::peek
()
45
{
46
if
(
isAtEnd
())
47
{
48
return
'\0'
;
49
}
50
return
source
[
position
];
51
}
52
53
char
Lexer::advance
()
54
{
55
char
c =
source
[
position
++];
56
column
++;
57
if
(c ==
'\n'
)
58
{
59
line
++;
60
column
= 1;
61
}
62
return
c;
63
}
64
65
bool
Lexer::isAtEnd
()
66
{
67
return
position
>=
source
.length();
68
}
69
70
void
Lexer::skipWhitespace
()
71
{
72
while
(!
isAtEnd
())
73
{
74
char
c =
peek
();
75
if
(std::isspace(
static_cast<
unsigned
char
>
(c)) != 0)
76
{
77
advance
();
78
}
79
else
if
(c ==
'/'
&&
position
+ 1 <
source
.length() &&
source
[
position
+ 1] ==
'/'
)
80
{
81
// Skip single-line comment
82
while
(!
isAtEnd
() &&
peek
() !=
'\n'
)
83
{
84
advance
();
85
}
86
}
87
else
88
{
89
break
;
90
}
91
}
92
}
93
94
// Scans exactly one token starting at the cursor.
//
// tokenize() calls this only after skipping whitespace/comments and checking
// that the cursor is not at end of input.
//
// Dispatch order:
//   1. letter  -> identifier()
//   2. digit   -> number()
//   3. '"'     -> string()
//   4. '`'     -> complexString()
//   5. two-character operators (++ -- == != -> <= >= && ||)
//   6. single-character symbols
//   7. anything else -> a one-character TokenType::Unknown token
//
// Symbol and Unknown tokens built here carry the line/column of their FIRST
// character, the same convention string() and complexString() use.
Token Lexer::scanToken()
{
    const char c = peek();

    // Cast through unsigned char: the <cctype> classifiers must not receive a negative value.
    if (std::isalpha(static_cast<unsigned char>(c)) != 0)
    {
        return identifier();
    }
    if (std::isdigit(static_cast<unsigned char>(c)) != 0)
    {
        return number();
    }
    if (c == '"')
    {
        return string();
    }
    if (c == '`')
    {
        return complexString();
    }

    // Capture the token start before consuming anything; advance() mutates line/column.
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    // Multi-character operators
    static const char *const twoCharOperators[] = {"++", "--", "==", "!=", "->", "<=", ">=", "&&", "||"};
    if (position + 1 < source.length())
    {
        const char next = source[position + 1];
        for (const char *op : twoCharOperators)
        {
            if (c == op[0] && next == op[1])
            {
                advance();
                advance();
                return {TokenType::Symbol, std::string(op), tokenLine, tokenColumn};
            }
        }
    }

    // Single-character symbols (parentheses, operators, punctuation, etc.)
    // Built once instead of on every call.
    static const std::string singleCharSymbols = "()+-*/%<>=!&|.{}:;,[]";
    const TokenType type =
        singleCharSymbols.find(c) != std::string::npos ? TokenType::Symbol : TokenType::Unknown;

    advance();
    return {type, std::string(1, c), tokenLine, tokenColumn};
}
180
181
// Scans an identifier or keyword starting at the cursor.
//
// Consumes a maximal run of alphanumeric characters and underscores. If the
// resulting text matches one of the reserved words below, the token type is
// Keyword; otherwise it is Identifier.
//
// The token carries the line/column of its FIRST character, the same
// convention string() and complexString() use.
Token Lexer::identifier()
{
    // Capture the token start before consuming anything; advance() mutates line/column.
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    const size_t start = position;
    // peek() yields '\0' at end of input, which is neither alphanumeric nor '_',
    // so the loop terminates there without an explicit isAtEnd() check.
    while ((std::isalnum(static_cast<unsigned char>(peek())) != 0) || peek() == '_')
    {
        advance();
    }
    std::string text = source.substr(start, position - start);

    static const std::vector<std::string> keywords = {"var",    "fn",       "if",     "else", "while",   "for",
                                                      "return", "true",     "false",  "null", "throw",   "print",
                                                      "break",  "continue", "switch", "case", "default", "include"};

    const bool isKeyword = std::find(keywords.begin(), keywords.end(), text) != keywords.end();
    return {isKeyword ? TokenType::Keyword : TokenType::Identifier, text, tokenLine, tokenColumn};
}
204
205
// Scans a numeric literal starting at the cursor.
//
// Accepts one or more decimal digits, optionally followed by a fractional
// part: a '.' that is immediately followed by at least one digit. A '.' not
// followed by a digit is left unconsumed for the next token.
//
// The token text is the literal exactly as written. The token carries the
// line/column of its FIRST character, the same convention string() and
// complexString() use.
Token Lexer::number()
{
    // Capture the token start before consuming anything; advance() mutates line/column.
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    const size_t start = position;

    // Integer part.
    while (std::isdigit(static_cast<unsigned char>(peek())) != 0)
    {
        advance();
    }

    // Optional fractional part: only taken when a digit follows the dot.
    const bool hasFraction = peek() == '.' && position + 1 < source.length() &&
                             (std::isdigit(static_cast<unsigned char>(source[position + 1])) != 0);
    if (hasFraction)
    {
        advance(); // the '.'
        while (std::isdigit(static_cast<unsigned char>(peek())) != 0)
        {
            advance();
        }
    }

    return {TokenType::Number, source.substr(start, position - start), tokenLine, tokenColumn};
}
223
224
// Converts one hexadecimal digit character to its numeric value.
// Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F'; returns -1 for any other character.
static int hexValue(char c)
{
    int value = -1;
    if ('0' <= c && c <= '9')
    {
        value = c - '0';
    }
    else if ('a' <= c && c <= 'f')
    {
        value = (c - 'a') + 10;
    }
    else if ('A' <= c && c <= 'F')
    {
        value = (c - 'A') + 10;
    }
    return value;
}
240
241
// Scans a double-quoted string literal; the cursor must be on the opening '"'.
//
// On success returns a String token whose text is the decoded contents
// (escape sequences resolved, quotes stripped). Returns an Unknown token with
// empty text when the literal is malformed:
//   - a raw newline appears before the closing quote,
//   - input ends before the closing quote or in the middle of an escape,
//   - a \x escape is not followed by two hexadecimal digits.
// In both cases the token carries the line/column of the opening quote.
//
// Recognised escapes: \n \t \r \\ \" \' \0 \b \f \v and \xHH. Any other
// escaped character is kept as-is.
Token Lexer::string()
{
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    // Every failure path produces the same token.
    const auto malformed = [&]() -> Token { return {TokenType::Unknown, std::string(), tokenLine, tokenColumn}; };

    // Maps the character after a backslash to the character it stands for.
    // \\, \" and \' map to themselves, as does any unrecognised escape.
    const auto unescape = [](char escaped) -> char {
        switch (escaped)
        {
        case 'n':
            return '\n';
        case 't':
            return '\t';
        case 'r':
            return '\r';
        case '0':
            return '\0';
        case 'b':
            return '\b';
        case 'f':
            return '\f';
        case 'v':
            return '\v';
        default:
            return escaped;
        }
    };

    std::string decoded;
    advance(); // Skip opening quote

    while (!isAtEnd())
    {
        const char ch = advance();

        if (ch == '"')
        {
            // Closing quote
            return {TokenType::String, decoded, tokenLine, tokenColumn};
        }
        if (ch == '\n')
        {
            // Raw newline inside a string is treated as unterminated/error.
            return malformed();
        }
        if (ch != '\\')
        {
            decoded += ch;
            continue;
        }

        // Escape sequence: there must be at least one more character.
        if (isAtEnd())
        {
            return malformed();
        }
        const char esc = advance();
        if (esc != 'x')
        {
            decoded += unescape(esc);
            continue;
        }

        // Hex escape sequence: \xHH
        if (isAtEnd())
        {
            return malformed();
        }
        const char high = advance();
        if (isAtEnd())
        {
            return malformed();
        }
        const char low = advance();

        const int highNibble = hexValue(high);
        const int lowNibble = hexValue(low);
        if (highNibble < 0 || lowNibble < 0)
        {
            return malformed();
        }
        decoded += static_cast<char>((highNibble << 4) | lowNibble);
    }

    // If we get here, string was unterminated
    return malformed();
}
341
342
// Scans a backtick-delimited string; the cursor must be on the opening '`'.
//
// The contents are taken verbatim: no escape processing, and newlines are
// allowed. ${} interpolation is not attempted. Returns a String token on
// success, or an Unknown token with empty text if input ends before the
// closing backtick. In both cases the token carries the line/column of the
// opening backtick.
Token Lexer::complexString()
{
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    std::string raw;
    advance(); // Skip opening backtick

    while (!isAtEnd())
    {
        const char ch = advance();
        if (ch != '`')
        {
            raw += ch;
            continue;
        }

        // Closing backtick
        return {TokenType::String, raw, tokenLine, tokenColumn};
    }

    // If we get here, string was unterminated
    return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
}
367
}
// namespace Phasor
Lexer.hpp
Phasor::Lexer::Lexer
Lexer(std::string source)
Definition
Lexer.cpp:11
Phasor::Lexer::number
Token number()
Definition
Lexer.cpp:205
Phasor::Lexer::isAtEnd
bool isAtEnd()
Definition
Lexer.cpp:65
Phasor::Lexer::skipShebang
void skipShebang()
Definition
Lexer.cpp:16
Phasor::Lexer::skipWhitespace
void skipWhitespace()
Definition
Lexer.cpp:70
Phasor::Lexer::complexString
Token complexString()
Definition
Lexer.cpp:342
Phasor::Lexer::source
std::string source
Definition
Lexer.hpp:30
Phasor::Lexer::identifier
Token identifier()
Definition
Lexer.cpp:181
Phasor::Lexer::position
size_t position
Definition
Lexer.hpp:31
Phasor::Lexer::column
size_t column
Definition
Lexer.hpp:33
Phasor::Lexer::advance
char advance()
Definition
Lexer.cpp:53
Phasor::Lexer::tokenize
std::vector< Token > tokenize()
Definition
Lexer.cpp:27
Phasor::Lexer::string
Token string()
Definition
Lexer.cpp:241
Phasor::Lexer::scanToken
Token scanToken()
Definition
Lexer.cpp:94
Phasor::Lexer::line
size_t line
Definition
Lexer.hpp:32
Phasor::Lexer::peek
char peek()
Definition
Lexer.cpp:44
Phasor
The Phasor Programming Language and Runtime.
Definition
AST.hpp:12
Phasor::hexValue
static int hexValue(char c)
Definition
Lexer.cpp:224
Phasor::TokenType::Symbol
@ Symbol
Definition
AST.hpp:20
Phasor::TokenType::Keyword
@ Keyword
Definition
AST.hpp:19
Phasor::TokenType::String
@ String
Definition
AST.hpp:18
Phasor::TokenType::Identifier
@ Identifier
Definition
AST.hpp:16
Phasor::TokenType::Unknown
@ Unknown
Definition
AST.hpp:22
Phasor::TokenType::EndOfFile
@ EndOfFile
Definition
AST.hpp:21
Phasor::TokenType::Number
@ Number
Definition
AST.hpp:17
Phasor::Token
Token structure.
Definition
AST.hpp:26
src
Language
Phasor
Lexer
Lexer.cpp
Generated by
1.16.1