Phasor 2.2.0
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.hpp"
2#include <cctype>
3#include <sstream>
4#include <stdexcept>
5
6namespace Phasor
7{
8
9Lexer::Lexer(const std::string &source) : source(source)
10{
11}
12
14{
15 if (position == 0 && peek() == '#' &&
16 position + 1 < source.length() && source[position + 1] == '!')
17 {
18 while (!isAtEnd() && peek() != '\n')
19 {
20 advance();
21 }
22 }
23}
24
25std::vector<Token> Lexer::tokenize()
26{
27 std::vector<Token> tokens;
29 while (!isAtEnd())
30 {
32 if (isAtEnd())
33 break;
34 tokens.push_back(scanToken());
35 }
36 tokens.push_back({TokenType::EndOfFile, "", line, column});
37 return tokens;
38}
39
41{
42 if (isAtEnd())
43 return '\0';
44 return source[position];
45}
46
48{
49 char c = source[position++];
50 column++;
51 if (c == '\n')
52 {
53 line++;
54 column = 1;
55 }
56 return c;
57}
58
60{
61 return position >= source.length();
62}
63
65{
66 while (!isAtEnd())
67 {
68 char c = peek();
69 if (std::isspace(static_cast<unsigned char>(c)))
70 {
71 advance();
72 }
73 else if (c == '/' && position + 1 < source.length() && source[position + 1] == '/')
74 {
75 // Skip single-line comment
76 while (!isAtEnd() && peek() != '\n')
77 {
78 advance();
79 }
80 }
81 else
82 {
83 break;
84 }
85 }
86}
87
89{
90 char c = peek();
91 if (std::isalpha(static_cast<unsigned char>(c)))
92 return identifier();
93 if (std::isdigit(static_cast<unsigned char>(c)))
94 return number();
95 if (c == '"')
96 return string();
97 if (c == '`')
98 return complexString();
99
100 // Multi-character operators
101 if (c == '+' && position + 1 < source.length() && source[position + 1] == '+')
102 {
103 advance();
104 advance();
105 return {TokenType::Symbol, "++", line, column};
106 }
107 if (c == '-' && position + 1 < source.length() && source[position + 1] == '-')
108 {
109 advance();
110 advance();
111 return {TokenType::Symbol, "--", line, column};
112 }
113 if (c == '=' && position + 1 < source.length() && source[position + 1] == '=')
114 {
115 advance();
116 advance();
117 return {TokenType::Symbol, "==", line, column};
118 }
119 if (c == '!' && position + 1 < source.length() && source[position + 1] == '=')
120 {
121 advance();
122 advance();
123 return {TokenType::Symbol, "!=", line, column};
124 }
125 if (c == '-' && position + 1 < source.length() && source[position + 1] == '>')
126 {
127 advance();
128 advance();
129 return {TokenType::Symbol, "->", line, column};
130 }
131 if (c == '<' && position + 1 < source.length() && source[position + 1] == '=')
132 {
133 advance();
134 advance();
135 return {TokenType::Symbol, "<=", line, column};
136 }
137 if (c == '>' && position + 1 < source.length() && source[position + 1] == '=')
138 {
139 advance();
140 advance();
141 return {TokenType::Symbol, ">=", line, column};
142 }
143 if (c == '&' && position + 1 < source.length() && source[position + 1] == '&')
144 {
145 advance();
146 advance();
147 return {TokenType::Symbol, "&&", line, column};
148 }
149 if (c == '|' && position + 1 < source.length() && source[position + 1] == '|')
150 {
151 advance();
152 advance();
153 return {TokenType::Symbol, "||", line, column};
154 }
155
156 // Single-character symbols (parentheses, operators, punctuation, etc.)
157 if (std::string("()+-*/%<>=!&|.{}:;,[]").find(c) != std::string::npos)
158 {
159 advance();
160 return {TokenType::Symbol, std::string(1, c), line, column};
161 }
162
163 advance();
164 return {TokenType::Unknown, std::string(1, c), line, column};
165}
166
168{
169 size_t start = position;
170 while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_')
171 advance();
172 std::string text = source.substr(start, position - start);
173
174 // Check for keywords
175 static const std::vector<std::string> keywords = {
176 "var", "const", "fn", "class", "if", "else", "while", "for",
177 "return", "true", "false", "null", "import", "export", "async", "await",
178 "throw", "try", "catch", "match", "enum", "template", "operator", "unsafe",
179 "spawn", "print", "struct", "break", "continue", "switch", "case", "default"};
180
181 for (const auto &kw : keywords)
182 {
183 if (text == kw)
184 {
185 return {TokenType::Keyword, text, line, column};
186 }
187 }
188
189 return {TokenType::Identifier, text, line, column};
190}
191
193{
194 size_t start = position;
195 while (std::isdigit(static_cast<unsigned char>(peek())))
196 advance();
197 if (peek() == '.' && position + 1 < source.length() &&
198 std::isdigit(static_cast<unsigned char>(source[position + 1])))
199 {
200 advance();
201 while (std::isdigit(static_cast<unsigned char>(peek())))
202 advance();
203 }
204 return {TokenType::Number, source.substr(start, position - start), line, column};
205}
206
/// Converts one hexadecimal digit character to its numeric value.
///
/// @param c Candidate digit: '0'-'9', 'a'-'f' or 'A'-'F'.
/// @return The value in the range 0..15, or -1 when `c` is not a hex digit.
static int hexValue(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return 10 + (c - 'a');
    if (c >= 'A' && c <= 'F')
        return 10 + (c - 'A');
    return -1;
}
217
219{
220 size_t tokenLine = line;
221 size_t tokenColumn = column;
222 std::ostringstream out;
223 advance(); // Skip opening quote
224
225 while (!isAtEnd())
226 {
227 char c = advance();
228
229 // Raw newline inside a string is treated as unterminated/error.
230 if (c == '\n')
231 {
232 // Unterminated string literal
233 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
234 }
235
236 if (c == '\\')
237 {
238 if (isAtEnd())
239 {
240 // Unterminated escape at end of file
241 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
242 }
243 char esc = advance();
244 switch (esc)
245 {
246 case 'n':
247 out << '\n';
248 break;
249 case 't':
250 out << '\t';
251 break;
252 case 'r':
253 out << '\r';
254 break;
255 case '\\':
256 out << '\\';
257 break;
258 case '"':
259 out << '"';
260 break;
261 case '\'':
262 out << '\'';
263 break;
264 case '0':
265 out << '\0';
266 break;
267 case 'b':
268 out << '\b';
269 break;
270 case 'f':
271 out << '\f';
272 break;
273 case 'v':
274 out << '\v';
275 break;
276 case 'x': {
277 // Hex escape sequence: \xHH
278 if (isAtEnd())
279 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
280 char h1 = advance();
281 if (isAtEnd())
282 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
283 char h2 = advance();
284 int v1 = hexValue(h1), v2 = hexValue(h2);
285 if (v1 < 0 || v2 < 0)
286 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
287 char value = static_cast<char>((v1 << 4) | v2);
288 out << value;
289 break;
290 }
291 default:
292 // Unknown escape: be permissive and append the escaped character as-is.
293 out << esc;
294 break;
295 }
296 }
297 else if (c == '"')
298 {
299 // Closing quote
300 return {TokenType::String, out.str(), tokenLine, tokenColumn};
301 }
302 else
303 {
304 out << c;
305 }
306 }
307
308 // If we get here, string was unterminated
309 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
310}
311
313{
314 size_t tokenLine = line;
315 size_t tokenColumn = column;
316 std::ostringstream out;
317 advance(); // Skip opening backtick
318
319 // Not even attempting ${} syntax for now. Just read as a raw string.
320
321 while (!isAtEnd())
322 {
323 char c = advance();
324
325 if (c == '`')
326 {
327 // Closing backtick
328 return {TokenType::String, out.str(), tokenLine, tokenColumn};
329 }
330 else
331 {
332 out << c;
333 }
334 }
335
336 // If we get here, string was unterminated
337 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
338}
339} // namespace Phasor
Token scanToken()
Definition Lexer.cpp:88
Lexer(const std::string &source)
Definition Lexer.cpp:9
Token complexString()
Definition Lexer.cpp:312
Token number()
Definition Lexer.cpp:192
Token string()
Definition Lexer.cpp:218
std::string source
Definition Lexer.hpp:36
std::vector< Token > tokenize()
Definition Lexer.cpp:25
size_t position
Definition Lexer.hpp:37
size_t column
Definition Lexer.hpp:39
void skipWhitespace()
Definition Lexer.cpp:64
char peek()
Definition Lexer.cpp:40
void skipShebang()
Definition Lexer.cpp:13
Token identifier()
Definition Lexer.cpp:167
bool isAtEnd()
Definition Lexer.cpp:59
char advance()
Definition Lexer.cpp:47
size_t line
Definition Lexer.hpp:38
The Phasor Programming Language and Runtime.
Definition AST.hpp:8
static int hexValue(char c)
Definition Lexer.cpp:207