Phasor 3.1.1
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.hpp"
2#include <cctype>
3#include <sstream>
4#include <stdexcept>
5
6namespace pulsar
7{
8
9Lexer::Lexer(const std::string &source) : source(source)
10{
11}
12
13void Lexer::skipShebang()
14{
15 if (position == 0 && peek() == '#' && position + 1 < source.length() && source[position + 1] == '!')
16 {
17 while (!isAtEnd() && peek() != '\n')
18 {
19 advance();
20 }
21 }
22}
23
24std::vector<Phasor::Token> Lexer::tokenize()
25{
26 std::vector<Phasor::Token> tokens;
27 skipShebang();
28 while (!isAtEnd())
29 {
30 skipWhitespace();
31 if (isAtEnd())
32 break;
33 tokens.push_back(scanToken());
34 }
35 tokens.push_back({Phasor::TokenType::EndOfFile, "", line, column});
36 return tokens;
37}
38
39char Lexer::peek()
40{
41 if (isAtEnd())
42 return '\0';
43 return source[position];
44}
45
46char Lexer::advance()
47{
48 char c = source[position++];
49 column++;
50 if (c == '\n')
51 {
52 line++;
53 column = 1;
54 }
55 return c;
56}
57
58bool Lexer::isAtEnd()
59{
60 return position >= source.length();
61}
62
63void Lexer::skipWhitespace()
64{
65 while (!isAtEnd())
66 {
67 char c = peek();
68 if (std::isspace(static_cast<unsigned char>(c)))
69 {
70 advance();
71 }
72 else if (c == '/' && position + 1 < source.length() && source[position + 1] == '/')
73 {
74 // Skip single-line comment
75 while (!isAtEnd() && peek() != '\n')
76 {
77 advance();
78 }
79 }
80 else
81 {
82 break;
83 }
84 }
85}
86
87Phasor::Token Lexer::scanToken()
88{
89 char c = peek();
90 if (std::isalpha(static_cast<unsigned char>(c)))
91 return identifier();
92 if (std::isdigit(static_cast<unsigned char>(c)))
93 return number();
94 if (c == '"')
95 return string();
96 if (c == '`')
97 return complexString();
98
99 // Multi-character operators
100 if (c == '+' && position + 1 < source.length() && source[position + 1] == '+')
101 {
102 advance();
103 advance();
104 return {Phasor::TokenType::Symbol, "++", line, column};
105 }
106 if (c == '-' && position + 1 < source.length() && source[position + 1] == '-')
107 {
108 advance();
109 advance();
110 return {Phasor::TokenType::Symbol, "--", line, column};
111 }
112 if (c == '=' && position + 1 < source.length() && source[position + 1] == '=')
113 {
114 advance();
115 advance();
116 return {Phasor::TokenType::Symbol, "==", line, column};
117 }
118 if (c == '!' && position + 1 < source.length() && source[position + 1] == '=')
119 {
120 advance();
121 advance();
122 return {Phasor::TokenType::Symbol, "!=", line, column};
123 }
124 if (c == '-' && position + 1 < source.length() && source[position + 1] == '>')
125 {
126 advance();
127 advance();
128 return {Phasor::TokenType::Symbol, "->", line, column};
129 }
130 if (c == '<' && position + 1 < source.length() && source[position + 1] == '=')
131 {
132 advance();
133 advance();
134 return {Phasor::TokenType::Symbol, "<=", line, column};
135 }
136 if (c == '>' && position + 1 < source.length() && source[position + 1] == '=')
137 {
138 advance();
139 advance();
140 return {Phasor::TokenType::Symbol, ">=", line, column};
141 }
142 if (c == '&' && position + 1 < source.length() && source[position + 1] == '&')
143 {
144 advance();
145 advance();
146 return {Phasor::TokenType::Symbol, "&&", line, column};
147 }
148 if (c == '|' && position + 1 < source.length() && source[position + 1] == '|')
149 {
150 advance();
151 advance();
152 return {Phasor::TokenType::Symbol, "||", line, column};
153 }
154
155 // Single-character symbols (parentheses, operators, punctuation, etc.)
156 if (std::string("()+-*/%<>=!&|.{}:;,[]").find(c) != std::string::npos)
157 {
158 advance();
159 return {Phasor::TokenType::Symbol, std::string(1, c), line, column};
160 }
161
162 advance();
163 return {Phasor::TokenType::Unknown, std::string(1, c), line, column};
164}
165
166Phasor::Token Lexer::identifier()
167{
168 size_t start = position;
169 while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_')
170 advance();
171 std::string text = source.substr(start, position - start);
172
173 static const std::vector<std::string> keywords = {"let", "func", "print", "if", "else", "while"};
174
175 for (const auto &kw : keywords)
176 {
177 if (text == kw)
178 {
179 return {Phasor::TokenType::Keyword, text, line, column};
180 }
181 }
182
183 return {Phasor::TokenType::Identifier, text, line, column};
184}
185
186Phasor::Token Lexer::number()
187{
188 size_t start = position;
189 while (std::isdigit(static_cast<unsigned char>(peek())))
190 advance();
191 if (peek() == '.' && position + 1 < source.length() &&
192 std::isdigit(static_cast<unsigned char>(source[position + 1])))
193 {
194 advance();
195 while (std::isdigit(static_cast<unsigned char>(peek())))
196 advance();
197 }
198 return {Phasor::TokenType::Number, source.substr(start, position - start), line, column};
199}
200
/// Converts a single hexadecimal digit character to its numeric value.
///
/// @param c Character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are accepted.
/// @return The value in the range 0..15, or -1 if @p c is not a hex digit.
static int hexValue(char c)
{
    const bool isDecimalDigit = '0' <= c && c <= '9';
    const bool isLowerHex = 'a' <= c && c <= 'f';
    const bool isUpperHex = 'A' <= c && c <= 'F';

    if (isDecimalDigit)
    {
        return c - '0';
    }
    if (isLowerHex)
    {
        return (c - 'a') + 10;
    }
    if (isUpperHex)
    {
        return (c - 'A') + 10;
    }
    return -1;
}
211
212Phasor::Token Lexer::string()
213{
214 size_t tokenLine = line;
215 size_t tokenColumn = column;
216 std::ostringstream out;
217 advance(); // Skip opening quote
218
219 while (!isAtEnd())
220 {
221 char c = advance();
222
223 // Raw newline inside a string is treated as unterminated/error.
224 if (c == '\n')
225 {
226 // Unterminated string literal
227 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
228 }
229
230 if (c == '\\')
231 {
232 if (isAtEnd())
233 {
234 // Unterminated escape at end of file
235 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
236 }
237 char esc = advance();
238 switch (esc)
239 {
240 case 'n':
241 out << '\n';
242 break;
243 case 't':
244 out << '\t';
245 break;
246 case 'r':
247 out << '\r';
248 break;
249 case '\\':
250 out << '\\';
251 break;
252 case '"':
253 out << '"';
254 break;
255 case '\'':
256 out << '\'';
257 break;
258 case '0':
259 out << '\0';
260 break;
261 case 'b':
262 out << '\b';
263 break;
264 case 'f':
265 out << '\f';
266 break;
267 case 'v':
268 out << '\v';
269 break;
270 case 'x': {
271 // Hex escape sequence: \xHH
272 if (isAtEnd())
273 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
274 char h1 = advance();
275 if (isAtEnd())
276 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
277 char h2 = advance();
278 int v1 = hexValue(h1), v2 = hexValue(h2);
279 if (v1 < 0 || v2 < 0)
280 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
281 char value = static_cast<char>((v1 << 4) | v2);
282 out << value;
283 break;
284 }
285 default:
286 // Unknown escape: be permissive and append the escaped character as-is.
287 out << esc;
288 break;
289 }
290 }
291 else if (c == '"')
292 {
293 // Closing quote
294 return {Phasor::TokenType::String, out.str(), tokenLine, tokenColumn};
295 }
296 else
297 {
298 out << c;
299 }
300 }
301
302 // If we get here, string was unterminated
303 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
304}
305
306Phasor::Token Lexer::complexString()
307{
308 size_t tokenLine = line;
309 size_t tokenColumn = column;
310 std::ostringstream out;
311 advance(); // Skip opening backtick
312
313 // Not even attempting ${} syntax for now. Just read as a raw string.
314
315 while (!isAtEnd())
316 {
317 char c = advance();
318
319 if (c == '`')
320 {
321 // Closing backtick
322 return {Phasor::TokenType::String, out.str(), tokenLine, tokenColumn};
323 }
324 else
325 {
326 out << c;
327 }
328 }
329
330 // If we get here, string was unterminated
331 return {Phasor::TokenType::Unknown, std::string(), tokenLine, tokenColumn};
332}
333} // namespace pulsar
Lexer(const std::string &source)
static int hexValue(char c)
Definition Lexer.cpp:205
The Pulsar Scripting Language.
Definition Compiler.cpp:13
static int hexValue(char c)
Definition Lexer.cpp:201
Token structure.
Definition AST.hpp:25