Phasor 3.3.0
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.hpp"
2#include <cctype>
3#include <sstream>
4#include <algorithm>
5#include <stdexcept>
6#include <utility>
7
8namespace Phasor
9{
10
11Lexer::Lexer(std::string source) : source(std::move(source))
12{
13 std::erase(this->source, '\r');
14}
15
17{
18 if (position == 0 && peek() == '#' && position + 1 < source.length() && source[position + 1] == '!')
19 {
20 while (!isAtEnd() && peek() != '\n')
21 {
22 advance();
23 }
24 }
25}
26
// Scans the stored text and returns the resulting token list.
//
// While isAtEnd() is false, the result of scanToken() is appended to the list.
// A final EndOfFile token with empty text and the current line/column is always
// appended, so the returned vector is never empty.
//
// NOTE(review): listing lines 30 and 33 are absent from this extract. Presumably
// they are the calls to skipShebang() and skipWhitespace() (which would also
// explain the second isAtEnd() check inside the loop) - confirm against the
// real Lexer.cpp before relying on this listing.
27std::vector<Token> Lexer::tokenize()
28{
29 std::vector<Token> tokens;
31 while (!isAtEnd())
32 {
// Re-check for the end of the text before scanning a token.
34 if (isAtEnd())
35 {
36 break;
37 }
38 tokens.push_back(scanToken());
39 }
// Terminator token: empty text, stamped with the position where scanning stopped.
40 tokens.push_back({TokenType::EndOfFile, "", line, column});
41 return tokens;
42}
43
45{
46 if (isAtEnd())
47 {
48 return '\0';
49 }
50 return source[position];
51}
52
54{
55 char c = source[position++];
56 column++;
57 if (c == '\n')
58 {
59 line++;
60 column = 1;
61 }
62 return c;
63}
64
66{
67 return position >= source.length();
68}
69
71{
72 while (!isAtEnd())
73 {
74 char c = peek();
75 if (std::isspace(static_cast<unsigned char>(c)) != 0)
76 {
77 advance();
78 }
79 else if (c == '/' && position + 1 < source.length() && source[position + 1] == '/')
80 {
81 // Skip single-line comment
82 while (!isAtEnd() && peek() != '\n')
83 {
84 advance();
85 }
86 }
87 else
88 {
89 break;
90 }
91 }
92}
93
95{
96 char c = peek();
97 if (std::isalpha(static_cast<unsigned char>(c)) != 0)
98 {
99 return identifier();
100 }
101 if (std::isdigit(static_cast<unsigned char>(c)) != 0)
102 {
103 return number();
104 }
105 if (c == '"')
106 {
107 return string();
108 }
109 if (c == '`')
110 {
111 return complexString();
112 }
113
114 // Multi-character operators
115 if (c == '+' && position + 1 < source.length() && source[position + 1] == '+')
116 {
117 advance();
118 advance();
119 return {TokenType::Symbol, "++", line, column};
120 }
121 if (c == '-' && position + 1 < source.length() && source[position + 1] == '-')
122 {
123 advance();
124 advance();
125 return {TokenType::Symbol, "--", line, column};
126 }
127 if (c == '=' && position + 1 < source.length() && source[position + 1] == '=')
128 {
129 advance();
130 advance();
131 return {TokenType::Symbol, "==", line, column};
132 }
133 if (c == '!' && position + 1 < source.length() && source[position + 1] == '=')
134 {
135 advance();
136 advance();
137 return {TokenType::Symbol, "!=", line, column};
138 }
139 if (c == '-' && position + 1 < source.length() && source[position + 1] == '>')
140 {
141 advance();
142 advance();
143 return {TokenType::Symbol, "->", line, column};
144 }
145 if (c == '<' && position + 1 < source.length() && source[position + 1] == '=')
146 {
147 advance();
148 advance();
149 return {TokenType::Symbol, "<=", line, column};
150 }
151 if (c == '>' && position + 1 < source.length() && source[position + 1] == '=')
152 {
153 advance();
154 advance();
155 return {TokenType::Symbol, ">=", line, column};
156 }
157 if (c == '&' && position + 1 < source.length() && source[position + 1] == '&')
158 {
159 advance();
160 advance();
161 return {TokenType::Symbol, "&&", line, column};
162 }
163 if (c == '|' && position + 1 < source.length() && source[position + 1] == '|')
164 {
165 advance();
166 advance();
167 return {TokenType::Symbol, "||", line, column};
168 }
169
170 // Single-character symbols (parentheses, operators, punctuation, etc.)
171 if (std::string("()+-*/%<>=!&|.{}:;,[]").find(c) != std::string::npos)
172 {
173 advance();
174 return {TokenType::Symbol, std::string(1, c), line, column};
175 }
176
177 advance();
178 return {TokenType::Unknown, std::string(1, c), line, column};
179}
180
182{
183 size_t start = position;
184 while ((std::isalnum(static_cast<unsigned char>(peek())) != 0) || peek() == '_')
185 {
186 advance();
187 }
188 std::string text = source.substr(start, position - start);
189
190 static const std::vector<std::string> keywords = {"var", "fn", "if", "else", "while", "for",
191 "return", "true", "false", "null", "throw", "print",
192 "break", "continue", "switch", "case", "default", "include"};
193
194 for (const auto &kw : keywords)
195 {
196 if (text == kw)
197 {
198 return {TokenType::Keyword, text, line, column};
199 }
200 }
201
202 return {TokenType::Identifier, text, line, column};
203}
204
206{
207 size_t start = position;
208 while (std::isdigit(static_cast<unsigned char>(peek())) != 0)
209 {
210 advance();
211 }
212 if (peek() == '.' && position + 1 < source.length() &&
213 (std::isdigit(static_cast<unsigned char>(source[position + 1])) != 0))
214 {
215 advance();
216 while (std::isdigit(static_cast<unsigned char>(peek())) != 0)
217 {
218 advance();
219 }
220 }
221 return {TokenType::Number, source.substr(start, position - start), line, column};
222}
223
/// @brief Converts one hexadecimal digit to its numeric value.
/// @param c Character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are accepted.
/// @return The value 0..15, or -1 when @p c is not a hexadecimal digit.
static int hexValue(char c)
{
    if (c >= '0' && c <= '9')
    {
        return c - '0';
    }
    if (c >= 'a' && c <= 'f')
    {
        return 10 + (c - 'a');
    }
    if (c >= 'A' && c <= 'F')
    {
        return 10 + (c - 'A');
    }
    return -1;
}
240
242{
243 size_t tokenLine = line;
244 size_t tokenColumn = column;
245 std::ostringstream out;
246 advance(); // Skip opening quote
247
248 while (!isAtEnd())
249 {
250 char c = advance();
251
252 // Raw newline inside a string is treated as unterminated/error.
253 if (c == '\n')
254 {
255 // Unterminated string literal
256 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
257 }
258
259 if (c == '\\')
260 {
261 if (isAtEnd())
262 {
263 // Unterminated escape at end of file
264 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
265 }
266 char esc = advance();
267 switch (esc)
268 {
269 case 'n':
270 out << '\n';
271 break;
272 case 't':
273 out << '\t';
274 break;
275 case 'r':
276 out << '\r';
277 break;
278 case '\\':
279 out << '\\';
280 break;
281 case '"':
282 out << '"';
283 break;
284 case '\'':
285 out << '\'';
286 break;
287 case '0':
288 out << '\0';
289 break;
290 case 'b':
291 out << '\b';
292 break;
293 case 'f':
294 out << '\f';
295 break;
296 case 'v':
297 out << '\v';
298 break;
299 case 'x': {
300 // Hex escape sequence: \xHH
301 if (isAtEnd())
302 {
303 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
304 }
305 char h1 = advance();
306 if (isAtEnd())
307 {
308 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
309 }
310 char h2 = advance();
311 int v1 = hexValue(h1);
312 int v2 = hexValue(h2);
313 if (v1 < 0 || v2 < 0)
314 {
315 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
316 }
317 char value = static_cast<char>((v1 << 4) | v2);
318 out << value;
319 break;
320 }
321 default:
322 // Unknown escape: be permissive and append the escaped character as-is.
323 out << esc;
324 break;
325 }
326 }
327 else if (c == '"')
328 {
329 // Closing quote
330 return {TokenType::String, out.str(), tokenLine, tokenColumn};
331 }
332 else
333 {
334 out << c;
335 }
336 }
337
338 // If we get here, string was unterminated
339 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
340}
341
343{
344 size_t tokenLine = line;
345 size_t tokenColumn = column;
346 std::ostringstream out;
347 advance(); // Skip opening backtick
348
349 // Not even attempting ${} syntax for now. Just read as a raw string.
350
351 while (!isAtEnd())
352 {
353 char c = advance();
354
355 if (c == '`')
356 {
357 // Closing backtick
358 return {TokenType::String, out.str(), tokenLine, tokenColumn};
359 }
360
361 out << c;
362 }
363
364 // If we get here, string was unterminated
365 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
366}
367} // namespace Phasor
Lexer(std::string source)
Definition Lexer.cpp:11
Token number()
Definition Lexer.cpp:205
bool isAtEnd()
Definition Lexer.cpp:65
void skipShebang()
Definition Lexer.cpp:16
void skipWhitespace()
Definition Lexer.cpp:70
Token complexString()
Definition Lexer.cpp:342
std::string source
Definition Lexer.hpp:30
Token identifier()
Definition Lexer.cpp:181
size_t position
Definition Lexer.hpp:31
size_t column
Definition Lexer.hpp:33
char advance()
Definition Lexer.cpp:53
std::vector< Token > tokenize()
Definition Lexer.cpp:27
Token string()
Definition Lexer.cpp:241
Token scanToken()
Definition Lexer.cpp:94
size_t line
Definition Lexer.hpp:32
char peek()
Definition Lexer.cpp:44
The Phasor Programming Language and Runtime.
Definition AST.hpp:12
static int hexValue(char c)
Definition Lexer.cpp:224
Token structure.
Definition AST.hpp:26