// XPlor/libs/dsl/lexer.cpp

#include "lexer.h"
#include <stdexcept>
#include <QHash>
#include <QDebug>
/// Builds the table that maps each reserved word of the DSL to its token kind.
///
/// Lookups are exact-match on the spelling, so keywords are case-sensitive
/// ("LE", "BE" and "EOF" are upper-case; everything else is lower-case).
///
/// @return A hash from keyword spelling to the corresponding TokenKind.
static QHash<QString, TokenKind> buildKeywords() {
    return QHash<QString, TokenKind>{
        // structure / control keywords
        {"type",      TokenKind::KwType},
        {"byteorder", TokenKind::KwByteOrder},
        {"LE",        TokenKind::KwLE},
        {"BE",        TokenKind::KwBE},
        {"skip",      TokenKind::KwSkip},
        {"if",        TokenKind::KwIf},
        {"else",      TokenKind::KwElse},
        {"while",     TokenKind::KwWhile},
        {"align",     TokenKind::KwAlign},
        {"seek",      TokenKind::KwSeek},
        {"repeat",    TokenKind::KwRepeat},
        {"for",       TokenKind::KwFor},
        {"in",        TokenKind::KwIn},
        {"EOF",       TokenKind::KwEOF},
        {"criteria",  TokenKind::KwCriteria},
        {"require",   TokenKind::KwRequire},
        {"bool",      TokenKind::KwBool},
        {"break",     TokenKind::KwBreak},
        {"true",      TokenKind::KwTrue},
        {"false",     TokenKind::KwFalse},
        {"match",     TokenKind::KwMatch},
        {"inline",    TokenKind::KwInline},
        {"array",     TokenKind::KwArray},
        {"const",     TokenKind::KwConst},
        {"when",      TokenKind::KwWhen},
        {"default",   TokenKind::KwDefault},
        // scalar type names
        {"u8",        TokenKind::KwU8},
        {"u16",       TokenKind::KwU16},
        {"u32",       TokenKind::KwU32},
        {"u64",       TokenKind::KwU64},
        {"i8",        TokenKind::KwI8},
        {"i16",       TokenKind::KwI16},
        {"i32",       TokenKind::KwI32},
        {"i64",       TokenKind::KwI64},
        {"f32",       TokenKind::KwF32},
        {"f64",       TokenKind::KwF64},
    };
}
// File-local keyword table, built once via buildKeywords(). The lexer looks
// every scanned word up here to decide between a keyword token and a plain
// identifier.
static const QHash<QString, TokenKind> g_kw = buildKeywords();
/// Creates a lexer over `src` and immediately scans the first token.
///
/// @param src Source text to tokenize; taken by value and moved into the lexer.
///
/// Because the first token is scanned here, any exception lexOne() throws
/// propagates out of the constructor.
Lexer::Lexer(QString src) : m_src(std::move(src)) {
// Prime the one-token lookahead. This is done in the body (not the
// initializer list) so it runs after m_src has been initialized.
m_cur = lexOne();
}
/// Peeks at the character `off` positions away from the cursor without
/// consuming anything.
///
/// @param off Offset relative to the current position.
/// @return The character at that position, or a null QChar when the position
///         falls outside the source text.
QChar Lexer::ch(int off) const {
    const int pos = m_i + off;
    const bool inRange = (pos >= 0) && (pos < m_src.size());
    if (inRange) {
        return m_src[pos];
    }
    return QChar();
}
void Lexer::advance(int n) {
for (int k = 0; k < n; k++) {
if (m_i >= m_src.size()) return;
const QChar c = m_src[m_i++];
if (c == '\n') { m_line++; m_col = 1; }
else m_col++;
}
}
void Lexer::skipWsAndComments() {
while (true) {
// whitespace
while (!ch().isNull() && ch().isSpace()) advance();
// // comment
if (ch() == '/' && ch(1) == '/') {
while (!ch().isNull() && ch() != '\n') advance();
continue;
}
// /* comment */
if (ch() == '/' && ch(1) == '*') {
advance(2);
while (!ch().isNull()) {
if (ch() == '*' && ch(1) == '/') { advance(2); break; }
advance();
}
continue;
}
break;
}
}
/// Builds a token stamped with the lexer's current line and column.
///
/// @param k Kind of the token.
/// @param t Token text; moved into the result.
/// @return The populated token.
Token Lexer::make(TokenKind k, QString t) {
    Token result;
    result.line = m_line;
    result.col = m_col;
    result.kind = k;
    result.text = std::move(t);
    return result;
}
/// Scans one token starting at the current cursor position and returns it.
///
/// Leading whitespace and comments are skipped first. The returned token's
/// line/col are those of its first character. Forms are tried in this order:
/// identifier or keyword, integer literal (decimal, or hex with a 0x/0X
/// prefix), double-quoted string, two-character operator, single-character
/// punctuation.
///
/// @return The scanned token. TokenKind::End is returned at end of input and
///         also for a character that starts no known token.
/// @throws std::runtime_error if "0x" is not followed by at least one hex
///         digit, or if an integer literal cannot be converted to a signed
///         64-bit value (for example because it is out of range).
Token Lexer::lexOne() {
    skipWsAndComments();
    if (ch().isNull()) return make(TokenKind::End);

    // Remember where the token begins; the cursor moves while scanning.
    const int startLine = m_line;
    const int startCol = m_col;

    auto mk = [&](TokenKind k, QString t = QString()) {
        Token tok;
        tok.kind = k;
        tok.text = std::move(t);
        tok.line = startLine;
        tok.col = startCol;
        return tok;
    };

    // Converts `digits` in the given base and builds the Number token.
    // The conversion result is checked: previously a failed conversion
    // (e.g. overflow) silently produced the value 0.
    auto mkNum = [&](const QString& prefix, const QString& digits, int base) {
        bool ok = false;
        const qint64 value = digits.toLongLong(&ok, base);
        if (!ok) {
            throw std::runtime_error(
                QString("Invalid or out-of-range integer literal '%1' at %2:%3")
                    .arg(prefix + digits).arg(startLine).arg(startCol).toStdString());
        }
        Token tok = mk(TokenKind::Number, prefix + digits);
        tok.number = value;
        return tok;
    };

    auto isHexDigit = [](QChar c) {
        const QChar lower = c.toLower();
        return c.isDigit() || (lower >= QChar('a') && lower <= QChar('f'));
    };

    // identifiers / keywords
    if (ch().isLetter() || ch() == '_') {
        QString word;
        while (ch().isLetterOrNumber() || ch() == '_') {
            word += ch();
            advance();
        }
        const auto kw = g_kw.constFind(word);
        if (kw != g_kw.constEnd()) return mk(kw.value(), word);
        return mk(TokenKind::Identifier, word);
    }

    // number: decimal or hex
    if (ch().isDigit()) {
        // ---- HEX: 0x[0-9a-fA-F]+ (prefix may be 0x or 0X; text is normalized to "0x")
        if (ch() == '0' && ch(1).toLower() == QChar('x')) {
            advance(2); // consume "0x"
            QString hex;
            while (isHexDigit(ch())) {
                hex += ch();
                advance();
            }
            if (hex.isEmpty()) {
                throw std::runtime_error("Invalid hex literal: expected hex digits after 0x");
            }
            return mkNum("0x", hex, 16);
        }

        // ---- DECIMAL: [0-9]+
        QString dec;
        while (ch().isDigit()) {
            dec += ch();
            advance();
        }
        return mkNum(QString(), dec, 10);
    }

    // string "..."
    if (ch() == '"') {
        advance(); // opening quote
        QString s;
        while (!ch().isNull() && ch() != '"') {
            if (ch() == '\\') {
                advance();
                if (ch().isNull()) break; // backslash was the last character
                const QChar esc = ch();
                if (esc == 'n') s += '\n';
                else if (esc == 't') s += '\t';
                else s += esc; // any other escaped char stands for itself (covers \" and \\)
                advance();
            } else {
                s += ch();
                advance();
            }
        }
        // NOTE(review): a string with no closing quote is accepted as-is and
        // yields whatever was collected up to end of input.
        if (ch() == '"') advance();
        return mk(TokenKind::String, s);
    }

    // multi-char ops (".." must be tested before the single ".")
    const QChar c0 = ch();
    const QChar c1 = ch(1);
    if (c0 == '.' && c1 == '.') { advance(2); return mk(TokenKind::DotDot, ".."); }
    if (c0 == '.') { advance(); return mk(TokenKind::Dot, "."); }
    if (c0 == '=' && c1 == '>') { advance(2); return mk(TokenKind::Arrow, "=>"); }
    if (c0 == '=' && c1 == '=') { advance(2); return mk(TokenKind::EqEq, "=="); }
    if (c0 == '!' && c1 == '=') { advance(2); return mk(TokenKind::NotEq, "!="); }
    if (c0 == '<' && c1 == '=') { advance(2); return mk(TokenKind::Lte, "<="); }
    if (c0 == '>' && c1 == '=') { advance(2); return mk(TokenKind::Gte, ">="); }
    if (c0 == '<' && c1 == '<') { advance(2); return mk(TokenKind::LShift, "<<"); }
    if (c0 == '>' && c1 == '>') { advance(2); return mk(TokenKind::RShift, ">>"); }
    if (c0 == '&' && c1 == '&') { advance(2); return mk(TokenKind::AndAnd, "&&"); }
    if (c0 == '|' && c1 == '>') { advance(2); return mk(TokenKind::Pipe, "|>"); }
    if (c0 == '|' && c1 == '|') { advance(2); return mk(TokenKind::OrOr, "||"); }

    // single-char
    advance();
    switch (c0.unicode()) {
    case '{': return mk(TokenKind::LBrace, "{");
    case '}': return mk(TokenKind::RBrace, "}");
    case '[': return mk(TokenKind::LBracket, "[");
    case ']': return mk(TokenKind::RBracket, "]");
    case '(': return mk(TokenKind::LParen, "(");
    case ')': return mk(TokenKind::RParen, ")");
    case ';': return mk(TokenKind::Semicolon, ";");
    case ',': return mk(TokenKind::Comma, ",");
    case ':': return mk(TokenKind::Colon, ":");
    case '=': return mk(TokenKind::Assign, "=");
    case '|': return mk(TokenKind::Bar, "|");
    case '+': return mk(TokenKind::Plus, "+");
    case '-': return mk(TokenKind::Minus, "-");
    case '*': return mk(TokenKind::Star, "*");
    case '/': return mk(TokenKind::Slash, "/");
    case '%': return mk(TokenKind::Percent, "%");
    case '&': return mk(TokenKind::Amp, "&");
    case '^': return mk(TokenKind::Caret, "^");
    case '!': return mk(TokenKind::Bang, "!");
    case '<': return mk(TokenKind::Lt, "<");
    case '>': return mk(TokenKind::Gt, ">");
    default:
        // unknown -> treat as End to stop hard, but better to throw in Parser.
        // NOTE(review): the offending character has already been consumed and
        // the End token carries empty text, so a caller cannot distinguish
        // this from a real end of input.
        return mk(TokenKind::End);
    }
}
/// Consumes the current lookahead token and scans the one after it.
///
/// @return The token that was current before the call.
Token Lexer::next() {
    // Copy first: if lexOne() throws, m_cur is left untouched.
    Token consumed = m_cur;
    m_cur = lexOne();
    return consumed;
}
/// Consumes the current token only if it has kind `k`.
///
/// @param k Token kind to test for.
/// @return true if the token matched and was consumed; false if the lexer
///         was left where it was.
bool Lexer::match(TokenKind k) {
    if (m_cur.kind != k) {
        return false;
    }
    next();
    return true;
}
/// Consumes and returns the current token, requiring it to have kind `k`.
///
/// @param k   Required token kind.
/// @param msg Description placed at the front of the error message.
/// @return The consumed token.
/// @throws std::runtime_error if the current token has a different kind; the
///         message includes `msg`, the token's line and column, and its text.
Token Lexer::expect(TokenKind k, const char* msg) {
    if (m_cur.kind == k) {
        return next();
    }

    // Mismatch: report against the token that is actually there.
    const Token& found = m_cur;
    const QString report = QString("%1 at %2:%3 (got '%4')")
                               .arg(msg)
                               .arg(found.line)
                               .arg(found.col)
                               .arg(found.text);
    throw std::runtime_error(report.toStdString());
}