Phasor
3.1.1
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1
#include "
Lexer.hpp
"
2
#include <cctype>
3
#include <sstream>
4
#include <algorithm>
5
#include <stdexcept>
6
7
namespace
Phasor
8
{
9
10
/// @brief Builds a lexer over a private copy of @p source.
///
/// Every carriage return ('\r') is removed from the copy, so the rest of the
/// lexer only ever sees '\n' as a line terminator.
///
/// @param source Program text to tokenize; the caller's string is not modified.
Lexer::Lexer(const std::string &source) : source(source)
{
    // The parameter shadows the member, so the member is reached through `this`.
    std::string &text = this->source;

    // Erase-remove: compact the kept characters, then drop the leftover tail.
    const auto keptEnd = std::remove(text.begin(), text.end(), '\r');
    text.erase(keptEnd, text.end());
}
14
15
void
Lexer::skipShebang
()
16
{
17
if
(
position
== 0 &&
peek
() ==
'#'
&&
position
+ 1 <
source
.length() &&
source
[
position
+ 1] ==
'!'
)
18
{
19
while
(!
isAtEnd
() &&
peek
() !=
'\n'
)
20
{
21
advance
();
22
}
23
}
24
}
25
26
std::vector<Token>
Lexer::tokenize
()
27
{
28
std::vector<Token> tokens;
29
skipShebang
();
30
while
(!
isAtEnd
())
31
{
32
skipWhitespace
();
33
if
(
isAtEnd
())
34
break
;
35
tokens.push_back(
scanToken
());
36
}
37
tokens.push_back({
TokenType::EndOfFile
,
""
,
line
,
column
});
38
return
tokens;
39
}
40
41
char
Lexer::peek
()
42
{
43
if
(
isAtEnd
())
44
return
'\0'
;
45
return
source
[
position
];
46
}
47
48
char
Lexer::advance
()
49
{
50
char
c =
source
[
position
++];
51
column
++;
52
if
(c ==
'\n'
)
53
{
54
line
++;
55
column
= 1;
56
}
57
return
c;
58
}
59
60
bool
Lexer::isAtEnd
()
61
{
62
return
position
>=
source
.length();
63
}
64
65
void
Lexer::skipWhitespace
()
66
{
67
while
(!
isAtEnd
())
68
{
69
char
c =
peek
();
70
if
(std::isspace(
static_cast<
unsigned
char
>
(c)))
71
{
72
advance
();
73
}
74
else
if
(c ==
'/'
&&
position
+ 1 <
source
.length() &&
source
[
position
+ 1] ==
'/'
)
75
{
76
// Skip single-line comment
77
while
(!
isAtEnd
() &&
peek
() !=
'\n'
)
78
{
79
advance
();
80
}
81
}
82
else
83
{
84
break
;
85
}
86
}
87
}
88
89
/// @brief Scans exactly one token starting at the current position.
///
/// Dispatch is decided by the current character:
///  - a letter         -> identifier() (identifier or keyword)
///  - a digit          -> number()
///  - '"'              -> string()
///  - '`'              -> complexString()
///  - otherwise a two-character operator (++ -- == != -> <= >= && ||), then a
///    single-character symbol, and finally a TokenType::Unknown token holding
///    the unrecognized character.
///
/// Symbol and Unknown tokens carry the line and column at which the token
/// starts, matching what string() and complexString() report.
///
/// The caller is expected to invoke this only when input remains (tokenize()
/// checks isAtEnd() first).
///
/// @return The scanned token.
Token Lexer::scanToken()
{
    // Capture the position before consuming anything so the token points at
    // its first character rather than at whatever follows it.
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    const char c = peek();
    // <cctype> classifiers must not be handed a negative char value.
    const unsigned char uc = static_cast<unsigned char>(c);

    // NOTE(review): a leading '_' is not routed to identifier() even though
    // identifier() accepts '_' after the first character — confirm intended.
    if (std::isalpha(uc))
        return identifier();
    if (std::isdigit(uc))
        return number();
    if (c == '"')
        return string();
    if (c == '`')
        return complexString();

    // Multi-character operators
    static const char *const kTwoCharOperators[] = {"++", "--", "==", "!=", "->", "<=", ">=", "&&", "||"};
    if (position + 1 < source.length())
    {
        const char next = source[position + 1];
        for (const char *op : kTwoCharOperators)
        {
            if (c == op[0] && next == op[1])
            {
                advance();
                advance();
                return {TokenType::Symbol, op, tokenLine, tokenColumn};
            }
        }
    }

    // Single-character symbols (parentheses, operators, punctuation, etc.)
    // Built once instead of constructing a temporary string on every call.
    static const std::string kSingleCharSymbols = "()+-*/%<>=!&|.{}:;,[]";
    const bool isSymbol = kSingleCharSymbols.find(c) != std::string::npos;

    advance();
    return {isSymbol ? TokenType::Symbol : TokenType::Unknown, std::string(1, c), tokenLine, tokenColumn};
}
167
168
/// @brief Scans an identifier or keyword starting at the current position.
///
/// Consumes the longest run of alphanumeric characters and underscores. The
/// resulting text is classified as TokenType::Keyword when it exactly matches
/// one of the reserved words listed below, otherwise as TokenType::Identifier.
///
/// @return The token, carrying the line and column at which the identifier
///         starts (matching what string() and complexString() report).
Token Lexer::identifier()
{
    // Capture the position before consuming so the token points at its first character.
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    const size_t start = position;
    // peek() yields '\0' at end of input, which is neither alphanumeric nor '_'.
    while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_')
        advance();
    const std::string text = source.substr(start, position - start);

    static const std::vector<std::string> keywords = {"var", "fn", "if", "else", "while", "for",
                                                      "return", "true", "false", "null", "throw", "print",
                                                      "break", "continue", "switch", "case", "default", "include"};

    const bool isKeyword = std::find(keywords.begin(), keywords.end(), text) != keywords.end();
    return {isKeyword ? TokenType::Keyword : TokenType::Identifier, text, tokenLine, tokenColumn};
}
189
190
/// @brief Scans a numeric literal starting at the current position.
///
/// Accepts a run of decimal digits, optionally followed by a '.' and a second
/// run of digits. The '.' is only consumed when a digit immediately follows
/// it, so in "1." the dot is left for the next token.
///
/// @return A TokenType::Number token whose text is the literal exactly as
///         written, carrying the line and column at which the literal starts
///         (matching what string() and complexString() report).
Token Lexer::number()
{
    // Capture the position before consuming so the token points at its first character.
    const size_t tokenLine = line;
    const size_t tokenColumn = column;

    const size_t start = position;

    // Integer part.
    while (std::isdigit(static_cast<unsigned char>(peek())))
        advance();

    // Optional fractional part: requires at least one digit after the '.'.
    const bool hasFraction = peek() == '.' && position + 1 < source.length() &&
                             std::isdigit(static_cast<unsigned char>(source[position + 1]));
    if (hasFraction)
    {
        advance(); // the '.'
        while (std::isdigit(static_cast<unsigned char>(peek())))
            advance();
    }

    return {TokenType::Number, source.substr(start, position - start), tokenLine, tokenColumn};
}
204
205
/// @brief Converts one hexadecimal digit character to its numeric value.
/// @param c Character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are accepted.
/// @return The value 0..15, or -1 when @p c is not a hexadecimal digit.
static int hexValue(char c)
{
    // Indices 0..15 hold the canonical digits; 16..21 repeat a..f in upper case.
    static const char kHexDigits[] = "0123456789abcdefABCDEF";
    const int kDigitCount = 22; // excludes the terminating NUL, so '\0' never matches

    for (int index = 0; index < kDigitCount; ++index)
    {
        if (kHexDigits[index] == c)
        {
            return index < 16 ? index : index - 6;
        }
    }
    return -1;
}
215
216
/// @brief Scans a double-quoted string literal starting at the opening quote.
///
/// Escape sequences are decoded into the token text: \n \t \r \0 \b \f \v map
/// to their control characters, \xHH yields the byte with that hexadecimal
/// value, and any other escaped character (including \\, \" and \') stands
/// for itself.
///
/// @return A TokenType::String token holding the decoded text, or a
///         TokenType::Unknown token with empty text when the literal contains
///         a raw newline, reaches the end of input before its closing quote,
///         or contains a malformed \x escape. Either way the token carries the
///         line and column at which the literal began.
Token Lexer::string()
{
    const size_t startLine = line;
    const size_t startColumn = column;

    // Every failure path reports the same thing: an empty Unknown token at the literal's start.
    const auto malformed = [&]() -> Token { return {TokenType::Unknown, std::string(), startLine, startColumn}; };

    std::string decoded;
    advance(); // Skip opening quote

    for (;;)
    {
        if (isAtEnd())
        {
            // Ran out of input before the closing quote.
            return malformed();
        }

        const char ch = advance();

        if (ch == '"')
        {
            // Closing quote
            return {TokenType::String, decoded, startLine, startColumn};
        }
        if (ch == '\n')
        {
            // Raw newline inside a string is treated as unterminated/error.
            return malformed();
        }
        if (ch != '\\')
        {
            decoded += ch;
            continue;
        }

        // From here on we are decoding an escape sequence.
        if (isAtEnd())
        {
            // Unterminated escape at end of file
            return malformed();
        }
        const char esc = advance();

        if (esc == 'x')
        {
            // Hex escape sequence: \xHH — both digits are required.
            if (isAtEnd())
                return malformed();
            const char highDigit = advance();
            if (isAtEnd())
                return malformed();
            const char lowDigit = advance();

            const int highNibble = hexValue(highDigit);
            const int lowNibble = hexValue(lowDigit);
            if (highNibble < 0 || lowNibble < 0)
                return malformed();

            decoded += static_cast<char>((highNibble << 4) | lowNibble);
            continue;
        }

        // Single-character escapes. Anything not listed (including \\, \" and \',
        // and unknown escapes) is appended as the escaped character itself.
        char replacement = esc;
        switch (esc)
        {
        case 'n': replacement = '\n'; break;
        case 't': replacement = '\t'; break;
        case 'r': replacement = '\r'; break;
        case '0': replacement = '\0'; break;
        case 'b': replacement = '\b'; break;
        case 'f': replacement = '\f'; break;
        case 'v': replacement = '\v'; break;
        default: break;
        }
        decoded += replacement;
    }
}
309
310
/// @brief Scans a backtick-delimited string starting at the opening backtick.
///
/// The contents are taken verbatim: no escape processing and no ${}
/// interpolation is performed, and newlines are allowed inside the literal.
///
/// @return A TokenType::String token holding the raw text between the
///         backticks, or a TokenType::Unknown token with empty text when the
///         end of input is reached before a closing backtick. Either way the
///         token carries the line and column at which the literal began.
Token Lexer::complexString()
{
    const size_t startLine = line;
    const size_t startColumn = column;

    advance(); // Skip opening backtick
    const size_t bodyStart = position;

    // Not even attempting ${} syntax for now. Just read as a raw string.
    // Characters still go through advance() so line/column tracking stays correct.
    while (!isAtEnd())
    {
        if (advance() == '`')
        {
            // Closing backtick: it sits at position - 1, so the body ends just before it.
            const size_t bodyLength = position - 1 - bodyStart;
            return {TokenType::String, source.substr(bodyStart, bodyLength), startLine, startColumn};
        }
    }

    // If we get here, string was unterminated
    return {TokenType::Unknown, std::string(), startLine, startColumn};
}
337
}
// namespace Phasor
Lexer.hpp
Phasor::Lexer::number
Token number()
Definition
Lexer.cpp:190
Phasor::Lexer::isAtEnd
bool isAtEnd()
Definition
Lexer.cpp:60
Phasor::Lexer::skipShebang
void skipShebang()
Definition
Lexer.cpp:15
Phasor::Lexer::skipWhitespace
void skipWhitespace()
Definition
Lexer.cpp:65
Phasor::Lexer::complexString
Token complexString()
Definition
Lexer.cpp:310
Phasor::Lexer::source
std::string source
Definition
Lexer.hpp:30
Phasor::Lexer::identifier
Token identifier()
Definition
Lexer.cpp:168
Phasor::Lexer::position
size_t position
Definition
Lexer.hpp:31
Phasor::Lexer::column
size_t column
Definition
Lexer.hpp:33
Phasor::Lexer::advance
char advance()
Definition
Lexer.cpp:48
Phasor::Lexer::tokenize
std::vector< Token > tokenize()
Definition
Lexer.cpp:26
Phasor::Lexer::Lexer
Lexer(const std::string &source)
Definition
Lexer.cpp:10
Phasor::Lexer::string
Token string()
Definition
Lexer.cpp:216
Phasor::Lexer::scanToken
Token scanToken()
Definition
Lexer.cpp:89
Phasor::Lexer::line
size_t line
Definition
Lexer.hpp:32
Phasor::Lexer::peek
char peek()
Definition
Lexer.cpp:41
Phasor
The Phasor Programming Language and Runtime.
Definition
AST.hpp:11
Phasor::hexValue
static int hexValue(char c)
Definition
Lexer.cpp:205
Phasor::TokenType::Symbol
@ Symbol
Definition
AST.hpp:19
Phasor::TokenType::Keyword
@ Keyword
Definition
AST.hpp:18
Phasor::TokenType::String
@ String
Definition
AST.hpp:17
Phasor::TokenType::Identifier
@ Identifier
Definition
AST.hpp:15
Phasor::TokenType::Unknown
@ Unknown
Definition
AST.hpp:21
Phasor::TokenType::EndOfFile
@ EndOfFile
Definition
AST.hpp:20
Phasor::TokenType::Number
@ Number
Definition
AST.hpp:16
Phasor::Token
Token structure.
Definition
AST.hpp:25
src
Language
Phasor
Lexer
Lexer.cpp
Generated by
1.16.1