Phasor 3.1.1
Stack VM based Programming Language
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.hpp"
2#include <cctype>
3#include <sstream>
4#include <algorithm>
5#include <stdexcept>
6
7namespace Phasor
8{
9
10Lexer::Lexer(const std::string &source) : source(source)
11{
12 this->source.erase(std::remove(this->source.begin(), this->source.end(), '\r'), this->source.end());
13}
14
16{
17 if (position == 0 && peek() == '#' && position + 1 < source.length() && source[position + 1] == '!')
18 {
19 while (!isAtEnd() && peek() != '\n')
20 {
21 advance();
22 }
23 }
24}
25
26std::vector<Token> Lexer::tokenize()
27{
28 std::vector<Token> tokens;
30 while (!isAtEnd())
31 {
33 if (isAtEnd())
34 break;
35 tokens.push_back(scanToken());
36 }
37 tokens.push_back({TokenType::EndOfFile, "", line, column});
38 return tokens;
39}
40
42{
43 if (isAtEnd())
44 return '\0';
45 return source[position];
46}
47
49{
50 char c = source[position++];
51 column++;
52 if (c == '\n')
53 {
54 line++;
55 column = 1;
56 }
57 return c;
58}
59
61{
62 return position >= source.length();
63}
64
66{
67 while (!isAtEnd())
68 {
69 char c = peek();
70 if (std::isspace(static_cast<unsigned char>(c)))
71 {
72 advance();
73 }
74 else if (c == '/' && position + 1 < source.length() && source[position + 1] == '/')
75 {
76 // Skip single-line comment
77 while (!isAtEnd() && peek() != '\n')
78 {
79 advance();
80 }
81 }
82 else
83 {
84 break;
85 }
86 }
87}
88
90{
91 char c = peek();
92 if (std::isalpha(static_cast<unsigned char>(c)))
93 return identifier();
94 if (std::isdigit(static_cast<unsigned char>(c)))
95 return number();
96 if (c == '"')
97 return string();
98 if (c == '`')
99 return complexString();
100
101 // Multi-character operators
102 if (c == '+' && position + 1 < source.length() && source[position + 1] == '+')
103 {
104 advance();
105 advance();
106 return {TokenType::Symbol, "++", line, column};
107 }
108 if (c == '-' && position + 1 < source.length() && source[position + 1] == '-')
109 {
110 advance();
111 advance();
112 return {TokenType::Symbol, "--", line, column};
113 }
114 if (c == '=' && position + 1 < source.length() && source[position + 1] == '=')
115 {
116 advance();
117 advance();
118 return {TokenType::Symbol, "==", line, column};
119 }
120 if (c == '!' && position + 1 < source.length() && source[position + 1] == '=')
121 {
122 advance();
123 advance();
124 return {TokenType::Symbol, "!=", line, column};
125 }
126 if (c == '-' && position + 1 < source.length() && source[position + 1] == '>')
127 {
128 advance();
129 advance();
130 return {TokenType::Symbol, "->", line, column};
131 }
132 if (c == '<' && position + 1 < source.length() && source[position + 1] == '=')
133 {
134 advance();
135 advance();
136 return {TokenType::Symbol, "<=", line, column};
137 }
138 if (c == '>' && position + 1 < source.length() && source[position + 1] == '=')
139 {
140 advance();
141 advance();
142 return {TokenType::Symbol, ">=", line, column};
143 }
144 if (c == '&' && position + 1 < source.length() && source[position + 1] == '&')
145 {
146 advance();
147 advance();
148 return {TokenType::Symbol, "&&", line, column};
149 }
150 if (c == '|' && position + 1 < source.length() && source[position + 1] == '|')
151 {
152 advance();
153 advance();
154 return {TokenType::Symbol, "||", line, column};
155 }
156
157 // Single-character symbols (parentheses, operators, punctuation, etc.)
158 if (std::string("()+-*/%<>=!&|.{}:;,[]").find(c) != std::string::npos)
159 {
160 advance();
161 return {TokenType::Symbol, std::string(1, c), line, column};
162 }
163
164 advance();
165 return {TokenType::Unknown, std::string(1, c), line, column};
166}
167
169{
170 size_t start = position;
171 while (std::isalnum(static_cast<unsigned char>(peek())) || peek() == '_')
172 advance();
173 std::string text = source.substr(start, position - start);
174
175 static const std::vector<std::string> keywords = {"var", "fn", "if", "else", "while", "for",
176 "return", "true", "false", "null", "throw", "print",
177 "break", "continue", "switch", "case", "default", "include"};
178
179 for (const auto &kw : keywords)
180 {
181 if (text == kw)
182 {
183 return {TokenType::Keyword, text, line, column};
184 }
185 }
186
187 return {TokenType::Identifier, text, line, column};
188}
189
191{
192 size_t start = position;
193 while (std::isdigit(static_cast<unsigned char>(peek())))
194 advance();
195 if (peek() == '.' && position + 1 < source.length() &&
196 std::isdigit(static_cast<unsigned char>(source[position + 1])))
197 {
198 advance();
199 while (std::isdigit(static_cast<unsigned char>(peek())))
200 advance();
201 }
202 return {TokenType::Number, source.substr(start, position - start), line, column};
203}
204
/// Converts a single hexadecimal digit to its numeric value.
///
/// @param c Character to decode; both lowercase and uppercase a-f are accepted.
/// @return 0-15 for a valid hex digit, or -1 for any other character.
static int hexValue(char c)
{
    const auto within = [c](char low, char high) { return low <= c && c <= high; };

    if (within('0', '9'))
        return c - '0';
    if (within('a', 'f'))
        return (c - 'a') + 10;
    if (within('A', 'F'))
        return (c - 'A') + 10;
    return -1;
}
215
217{
218 size_t tokenLine = line;
219 size_t tokenColumn = column;
220 std::ostringstream out;
221 advance(); // Skip opening quote
222
223 while (!isAtEnd())
224 {
225 char c = advance();
226
227 // Raw newline inside a string is treated as unterminated/error.
228 if (c == '\n')
229 {
230 // Unterminated string literal
231 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
232 }
233
234 if (c == '\\')
235 {
236 if (isAtEnd())
237 {
238 // Unterminated escape at end of file
239 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
240 }
241 char esc = advance();
242 switch (esc)
243 {
244 case 'n':
245 out << '\n';
246 break;
247 case 't':
248 out << '\t';
249 break;
250 case 'r':
251 out << '\r';
252 break;
253 case '\\':
254 out << '\\';
255 break;
256 case '"':
257 out << '"';
258 break;
259 case '\'':
260 out << '\'';
261 break;
262 case '0':
263 out << '\0';
264 break;
265 case 'b':
266 out << '\b';
267 break;
268 case 'f':
269 out << '\f';
270 break;
271 case 'v':
272 out << '\v';
273 break;
274 case 'x': {
275 // Hex escape sequence: \xHH
276 if (isAtEnd())
277 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
278 char h1 = advance();
279 if (isAtEnd())
280 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
281 char h2 = advance();
282 int v1 = hexValue(h1), v2 = hexValue(h2);
283 if (v1 < 0 || v2 < 0)
284 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
285 char value = static_cast<char>((v1 << 4) | v2);
286 out << value;
287 break;
288 }
289 default:
290 // Unknown escape: be permissive and append the escaped character as-is.
291 out << esc;
292 break;
293 }
294 }
295 else if (c == '"')
296 {
297 // Closing quote
298 return {TokenType::String, out.str(), tokenLine, tokenColumn};
299 }
300 else
301 {
302 out << c;
303 }
304 }
305
306 // If we get here, string was unterminated
307 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
308}
309
311{
312 size_t tokenLine = line;
313 size_t tokenColumn = column;
314 std::ostringstream out;
315 advance(); // Skip opening backtick
316
317 // Not even attempting ${} syntax for now. Just read as a raw string.
318
319 while (!isAtEnd())
320 {
321 char c = advance();
322
323 if (c == '`')
324 {
325 // Closing backtick
326 return {TokenType::String, out.str(), tokenLine, tokenColumn};
327 }
328 else
329 {
330 out << c;
331 }
332 }
333
334 // If we get here, string was unterminated
335 return {TokenType::Unknown, std::string(), tokenLine, tokenColumn};
336}
337} // namespace Phasor
Token number()
Definition Lexer.cpp:190
bool isAtEnd()
Definition Lexer.cpp:60
void skipShebang()
Definition Lexer.cpp:15
void skipWhitespace()
Definition Lexer.cpp:65
Token complexString()
Definition Lexer.cpp:310
std::string source
Definition Lexer.hpp:30
Token identifier()
Definition Lexer.cpp:168
size_t position
Definition Lexer.hpp:31
size_t column
Definition Lexer.hpp:33
char advance()
Definition Lexer.cpp:48
std::vector< Token > tokenize()
Definition Lexer.cpp:26
Lexer(const std::string &source)
Definition Lexer.cpp:10
Token string()
Definition Lexer.cpp:216
Token scanToken()
Definition Lexer.cpp:89
size_t line
Definition Lexer.hpp:32
char peek()
Definition Lexer.cpp:41
The Phasor Programming Language and Runtime.
Definition AST.hpp:11
static int hexValue(char c)
Definition Lexer.cpp:205
Token structure.
Definition AST.hpp:25