Phasor 3.3.0
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.hpp"
2#include <cctype>
3#include <sstream>
4#include <stdexcept>
5#include <utility>
6
7namespace pulsar
8{
9
10Lexer::Lexer(std::string source) : source(std::move(source))
11{
12}
13
14void Lexer::skipShebang()
15{
16 if (position == 0 && peek() == '#' && position + 1 < source.length() && source[position + 1] == '!')
17 {
18 while (!isAtEnd() && peek() != '\n')
19 {
20 advance();
21 }
22 }
23}
24
25std::vector<Phasor::Token> Lexer::tokenize()
26{
27 std::vector<Phasor::Token> tokens;
28 skipShebang();
29 while (!isAtEnd())
30 {
31 skipWhitespace();
32 if (isAtEnd())
33 {
34 break;
35 }
36 tokens.push_back(scanToken());
37 }
38 tokens.push_back({Phasor::TokenType::EndOfFile, "", line, column});
39 return tokens;
40}
41
42char Lexer::peek()
43{
44 if (isAtEnd())
45 {
46 return '\0';
47 }
48 return source[position];
49}
50
51char Lexer::advance()
52{
53 char c = source[position++];
54 column++;
55 if (c == '\n')
56 {
57 line++;
58 column = 1;
59 }
60 return c;
61}
62
63bool Lexer::isAtEnd()
64{
65 return position >= source.length();
66}
67
68void Lexer::skipWhitespace()
69{
70 while (!isAtEnd())
71 {
72 char c = peek();
73 if (std::isspace(static_cast<unsigned char>(c)) != 0)
74 {
75 advance();
76 }
77 else if (c == '/' && position + 1 < source.length() && source[position + 1] == '/')
78 {
79 // Skip single-line comment
80 while (!isAtEnd() && peek() != '\n')
81 {
82 advance();
83 }
84 }
85 else
86 {
87 break;
88 }
89 }
90}
91
92Phasor::Token Lexer::scanToken()
93{
94 char c = peek();
95 if (std::isalpha(static_cast<unsigned char>(c)) != 0)
96 {
97 return identifier();
98 }
99 if (std::isdigit(static_cast<unsigned char>(c)) != 0)
100 {
101 return number();
102 }
103 if (c == '"')
104 {
105 return string();
106 }
107 if (c == '`')
108 {
109 return complexString();
110 }
111
112 // Multi-character operators
113 if (c == '+' && position + 1 < source.length() && source[position + 1] == '+')
114 {
115 advance();
116 advance();
117 return {Phasor::TokenType::Symbol, "++", line, column};
118 }
119 if (c == '-' && position + 1 < source.length() && source[position + 1] == '-')
120 {
121 advance();
122 advance();
123 return {Phasor::TokenType::Symbol, "--", line, column};
124 }
125 if (c == '=' && position + 1 < source.length() && source[position + 1] == '=')
126 {
127 advance();
128 advance();
129 return {Phasor::TokenType::Symbol, "==", line, column};
130 }
131 if (c == '!' && position + 1 < source.length() && source[position + 1] == '=')
132 {
133 advance();
134 advance();
135 return {Phasor::TokenType::Symbol, "!=", line, column};
136 }
137 if (c == '-' && position + 1 < source.length() && source[position + 1] == '>')
138 {
139 advance();
140 advance();
141 return {Phasor::TokenType::Symbol, "->", line, column};
142 }
143 if (c == '<' && position + 1 < source.length() && source[position + 1] == '=')
144 {
145 advance();
146 advance();
147 return {Phasor::TokenType::Symbol, "<=", line, column};
148 }
149 if (c == '>' && position + 1 < source.length() && source[position + 1] == '=')
150 {
151 advance();
152 advance();
153 return {Phasor::TokenType::Symbol, ">=", line, column};
154 }
155 if (c == '&' && position + 1 < source.length() && source[position + 1] == '&')
156 {
157 advance();
158 advance();
159 return {Phasor::TokenType::Symbol, "&&", line, column};
160 }
161 if (c == '|' && position + 1 < source.length() && source[position + 1] == '|')
162 {
163 advance();
164 advance();
165 return {Phasor::TokenType::Symbol, "||", line, column};
166 }
167
168 // Single-character symbols (parentheses, operators, punctuation, etc.)
169 if (std::string("()+-*/%<>=!&|.{}:;,[]").find(c) != std::string::npos)
170 {
171 advance();
172 return {Phasor::TokenType::Symbol, std::string(1, c), line, column};
173 }
174
175 advance();
176 return {Phasor::TokenType::Unknown, std::string(1, c), line, column};
177}
178
179Phasor::Token Lexer::identifier()
180{
181 size_t start = position;
182 while ((std::isalnum(static_cast<unsigned char>(peek())) != 0) || peek() == '_')
183 {
184 advance();
185 }
186 std::string text = source.substr(start, position - start);
187
188 static const std::vector<std::string> keywords = {"let", "func", "print", "if", "else", "while"};
189
190 for (const auto &kw : keywords)
191 {
192 if (text == kw)
193 {
194 return {Phasor::TokenType::Keyword, text, line, column};
195 }
196 }
197
198 return {Phasor::TokenType::Identifier, text, line, column};
199}
200
201Phasor::Token Lexer::number()
202{
203 size_t start = position;
204 while (std::isdigit(static_cast<unsigned char>(peek())) != 0)
205 {
206 advance();
207 }
208 if (peek() == '.' && position + 1 < source.length() &&
209 (std::isdigit(static_cast<unsigned char>(source[position + 1])) != 0))
210 {
211 advance();
212 while (std::isdigit(static_cast<unsigned char>(peek())) != 0)
213 {
214 advance();
215 }
216 }
217 return {Phasor::TokenType::Number, source.substr(start, position - start), line, column};
218}
219
/**
 * @brief Converts one hexadecimal digit character to its numeric value.
 * @param c Character to convert; `0-9`, `a-f` and `A-F` are accepted.
 * @return The value 0..15, or -1 when @p c is not a hexadecimal digit.
 */
static int hexValue(char c)
{
    const bool isDecimal = '0' <= c && c <= '9';
    const bool isLowerHex = 'a' <= c && c <= 'f';
    const bool isUpperHex = 'A' <= c && c <= 'F';

    return isDecimal    ? c - '0'
           : isLowerHex ? (c - 'a') + 10
           : isUpperHex ? (c - 'A') + 10
                        : -1;
}
236
237Phasor::Token Lexer::string()
238{
239 size_t tokenLine = line;
240 size_t tokenColumn = column;
241 std::ostringstream out;
242 advance(); // Skip opening quote
243
244 while (!isAtEnd())
245 {
246 char c = advance();
247
248 // Raw newline inside a string is treated as unterminated/error.
249 if (c == '\n')
250 {
251 // Unterminated string literal
252 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
253 }
254
255 if (c == '\\')
256 {
257 if (isAtEnd())
258 {
259 // Unterminated escape at end of file
260 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
261 }
262 char esc = advance();
263 switch (esc)
264 {
265 case 'n':
266 out << '\n';
267 break;
268 case 't':
269 out << '\t';
270 break;
271 case 'r':
272 out << '\r';
273 break;
274 case '\\':
275 out << '\\';
276 break;
277 case '"':
278 out << '"';
279 break;
280 case '\'':
281 out << '\'';
282 break;
283 case '0':
284 out << '\0';
285 break;
286 case 'b':
287 out << '\b';
288 break;
289 case 'f':
290 out << '\f';
291 break;
292 case 'v':
293 out << '\v';
294 break;
295 case 'x': {
296 // Hex escape sequence: \xHH
297 if (isAtEnd())
298 {
299 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
300 }
301 char h1 = advance();
302 if (isAtEnd())
303 {
304 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
305 }
306 char h2 = advance();
307 int v1 = hexValue(h1);
308 int v2 = hexValue(h2);
309 if (v1 < 0 || v2 < 0)
310 {
311 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
312 }
313 char value = static_cast<char>((v1 << 4) | v2);
314 out << value;
315 break;
316 }
317 default:
318 // Unknown escape: be permissive and append the escaped character as-is.
319 out << esc;
320 break;
321 }
322 }
323 else if (c == '"')
324 {
325 // Closing quote
326 return {Phasor::TokenType::String, out.str(), tokenLine, tokenColumn};
327 }
328 else
329 {
330 out << c;
331 }
332 }
333
334 // If we get here, string was unterminated
335 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
336}
337
338Phasor::Token Lexer::complexString()
339{
340 size_t tokenLine = line;
341 size_t tokenColumn = column;
342 std::ostringstream out;
343 advance(); // Skip opening backtick
344
345 // Not even attempting ${} syntax for now. Just read as a raw string.
346
347 while (!isAtEnd())
348 {
349 char c = advance();
350
351 if (c == '`')
352 {
353 // Closing backtick
354 return {Phasor::TokenType::String, out.str(), tokenLine, tokenColumn};
355 }
356
357 out << c;
358 }
359
360 // If we get here, string was unterminated
361 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
362}
363} // namespace pulsar
Lexer(std::string source)
static int hexValue(char c)
Definition Lexer.cpp:224
The Pulsar Scripting Language.
Definition Compiler.cpp:14
static int hexValue(char c)
Definition Lexer.cpp:220
Token structure.
Definition AST.hpp:26