272 lines
8.2 KiB
C++
272 lines
8.2 KiB
C++
#include "lexer.h"
|
|
|
|
#include <stdexcept>
|
|
#include <QHash>
|
|
#include <QDebug>
|
|
|
|
/// Builds the table that maps each reserved word's spelling to its TokenKind.
///
/// Lookups are exact and case-sensitive ("LE", "BE" and "EOF" are upper-case
/// entries; everything else is lower-case).
static QHash<QString, TokenKind> buildKeywords() {
    return QHash<QString, TokenKind>{
        // first group of the original table
        {"type", TokenKind::KwType},
        {"byteorder", TokenKind::KwByteOrder},
        {"LE", TokenKind::KwLE},
        {"BE", TokenKind::KwBE},
        {"skip", TokenKind::KwSkip},
        {"if", TokenKind::KwIf},
        {"else", TokenKind::KwElse},
        {"while", TokenKind::KwWhile},
        {"align", TokenKind::KwAlign},
        {"seek", TokenKind::KwSeek},
        {"repeat", TokenKind::KwRepeat},
        {"for", TokenKind::KwFor},
        {"in", TokenKind::KwIn},
        {"EOF", TokenKind::KwEOF},
        {"criteria", TokenKind::KwCriteria},
        {"require", TokenKind::KwRequire},
        {"bool", TokenKind::KwBool},
        {"break", TokenKind::KwBreak},
        {"true", TokenKind::KwTrue},
        {"false", TokenKind::KwFalse},
        {"match", TokenKind::KwMatch},
        {"inline", TokenKind::KwInline},
        {"array", TokenKind::KwArray},
        {"const", TokenKind::KwConst},
        {"when", TokenKind::KwWhen},
        {"default", TokenKind::KwDefault},
        {"ui", TokenKind::KwUi},
        {"edit", TokenKind::KwEdit},

        // u*/i*/f* keywords
        {"u8", TokenKind::KwU8},
        {"u16", TokenKind::KwU16},
        {"u32", TokenKind::KwU32},
        {"u64", TokenKind::KwU64},
        {"i8", TokenKind::KwI8},
        {"i16", TokenKind::KwI16},
        {"i32", TokenKind::KwI32},
        {"i64", TokenKind::KwI64},
        {"f32", TokenKind::KwF32},
        {"f64", TokenKind::KwF64},
    };
}
|
|
|
|
// File-local keyword table, filled by buildKeywords() when this translation
// unit's statics are initialized. Lexer::lexOne() consults it to decide
// whether a scanned word is a keyword or a plain identifier.
static const QHash<QString, TokenKind> g_kw = buildKeywords();
|
|
|
|
/// Takes ownership of the source text and scans the first token right away,
/// so the current token (m_cur) is valid as soon as construction finishes.
///
/// @param src Full text to tokenize.
Lexer::Lexer(QString src)
    : m_src(std::move(src))
{
    // Done in the body rather than the initializer list: lexOne() reads
    // other members, which must already be initialized when it runs.
    m_cur = lexOne();
}
|
|
|
|
/// Peeks at the character `off` positions away from the read cursor without
/// consuming anything.
///
/// @param off Offset relative to the cursor.
/// @return The character at that position, or a null QChar when the position
///         lies outside the source text.
QChar Lexer::ch(int off) const {
    const int pos = m_i + off;
    const bool inRange = pos >= 0 && pos < m_src.size();
    return inRange ? m_src[pos] : QChar();
}
|
|
|
|
void Lexer::advance(int n) {
|
|
for (int k = 0; k < n; k++) {
|
|
if (m_i >= m_src.size()) return;
|
|
const QChar c = m_src[m_i++];
|
|
if (c == '\n') { m_line++; m_col = 1; }
|
|
else m_col++;
|
|
}
|
|
}
|
|
|
|
void Lexer::skipWsAndComments() {
|
|
while (true) {
|
|
// whitespace
|
|
while (!ch().isNull() && ch().isSpace()) advance();
|
|
|
|
// // comment
|
|
if (ch() == '/' && ch(1) == '/') {
|
|
while (!ch().isNull() && ch() != '\n') advance();
|
|
continue;
|
|
}
|
|
|
|
// /* comment */
|
|
if (ch() == '/' && ch(1) == '*') {
|
|
advance(2);
|
|
while (!ch().isNull()) {
|
|
if (ch() == '*' && ch(1) == '/') { advance(2); break; }
|
|
advance();
|
|
}
|
|
continue;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
/// Creates a token stamped with the lexer's *current* line and column.
///
/// @param k Kind of the token.
/// @param t Token text; moved into the result.
/// @return The populated token.
Token Lexer::make(TokenKind k, QString t) {
    Token result;
    result.line = m_line;
    result.col = m_col;
    result.kind = k;
    result.text = std::move(t);
    return result;
}
|
|
|
|
/// Scans exactly one token starting at the current read position.
///
/// Whitespace and comments are skipped first. The returned token's line/col
/// refer to the token's first character. At end of input a TokenKind::End
/// token is returned; a character that starts no known token is consumed and
/// is also reported as TokenKind::End.
///
/// @return The scanned token. Number tokens additionally carry the parsed
///         value in Token::number.
/// @throws std::runtime_error if "0x" is not followed by at least one hex
///         digit, or if a numeric literal cannot be converted by
///         QString::toLongLong (for example because it overflows).
Token Lexer::lexOne() {
    skipWsAndComments();

    if (ch().isNull()) return make(TokenKind::End);

    // Captured before consuming anything so the token reports where it
    // starts, not where scanning stopped.
    const int startLine = m_line;
    const int startCol = m_col;

    auto mk = [&](TokenKind k, QString t = QString()) {
        Token tok;
        tok.kind = k;
        tok.text = std::move(t);
        tok.line = startLine;
        tok.col = startCol;
        return tok;
    };

    // identifiers / keywords
    if (ch().isLetter() || ch() == '_') {
        const auto wordBegin = m_i;
        while (ch().isLetterOrNumber() || ch() == '_') advance();
        const QString word = m_src.mid(wordBegin, m_i - wordBegin);

        // Single hash lookup decides keyword vs. identifier.
        const auto kw = g_kw.constFind(word);
        if (kw != g_kw.constEnd()) return mk(kw.value(), word);
        return mk(TokenKind::Identifier, word);
    }

    // number: decimal or hex
    if (ch().isDigit()) {
        // Converts `digits` in the given base and builds the Number token.
        // A failed conversion (overflow, unsupported digit characters) is
        // reported instead of silently yielding 0.
        auto mkNum = [&](const QString& digits, int base, QString text) {
            bool ok = false;
            const qint64 value = digits.toLongLong(&ok, base);
            if (!ok) {
                throw std::runtime_error(
                    QString("Invalid or out-of-range number literal at %1:%2 (got '%3')")
                        .arg(startLine).arg(startCol).arg(text).toStdString());
            }
            Token tok = mk(TokenKind::Number, std::move(text));
            tok.number = value;
            return tok;
        };

        // ---- HEX: 0x[0-9a-fA-F]+ (the 'x' may be either case)
        if (ch() == '0' && ch(1).toLower() == QChar('x')) {
            advance(2);  // consume "0x"

            auto isHexDigit = [](QChar c) {
                const QChar lower = c.toLower();
                return c.isDigit() || (lower >= QChar('a') && lower <= QChar('f'));
            };

            const auto digitsBegin = m_i;
            while (isHexDigit(ch())) advance();
            const QString hex = m_src.mid(digitsBegin, m_i - digitsBegin);

            if (hex.isEmpty()) {
                throw std::runtime_error("Invalid hex literal: expected hex digits after 0x");
            }

            // The token text always uses a lower-case "0x" prefix.
            return mkNum(hex, 16, "0x" + hex);
        }

        // ---- DECIMAL: [0-9]+
        const auto digitsBegin = m_i;
        while (ch().isDigit()) advance();
        const QString dec = m_src.mid(digitsBegin, m_i - digitsBegin);
        return mkNum(dec, 10, dec);
    }

    // string "..."
    if (ch() == '"') {
        advance();  // opening quote
        QString s;
        while (!ch().isNull() && ch() != '"') {
            if (ch() == '\\') {
                advance();
                if (ch().isNull()) break;  // input ended right after the backslash
                const QChar esc = ch();
                if (esc == 'n') s += '\n';
                else if (esc == 't') s += '\t';
                else s += esc;  // any other escaped character stands for itself
                advance();
            } else {
                s += ch();
                advance();
            }
        }
        // NOTE(review): a string with no closing quote is accepted as-is and
        // simply runs to end of input; no error is raised here.
        if (ch() == '"') advance();
        return mk(TokenKind::String, s);
    }

    // multi-char ops (checked before the single-char fallbacks below)
    if (ch() == '.' && ch(1) == '.') { advance(2); return mk(TokenKind::DotDot, ".."); }
    if (ch() == '.') { advance(); return mk(TokenKind::Dot, "."); }
    if (ch() == '=' && ch(1) == '>') { advance(2); return mk(TokenKind::Arrow, "=>"); }
    if (ch() == '=' && ch(1) == '=') { advance(2); return mk(TokenKind::EqEq, "=="); }
    if (ch() == '!' && ch(1) == '=') { advance(2); return mk(TokenKind::NotEq, "!="); }
    if (ch() == '<' && ch(1) == '=') { advance(2); return mk(TokenKind::Lte, "<="); }
    if (ch() == '>' && ch(1) == '=') { advance(2); return mk(TokenKind::Gte, ">="); }
    if (ch() == '<' && ch(1) == '<') { advance(2); return mk(TokenKind::LShift, "<<"); }
    if (ch() == '>' && ch(1) == '>') { advance(2); return mk(TokenKind::RShift, ">>"); }
    if (ch() == '&' && ch(1) == '&') { advance(2); return mk(TokenKind::AndAnd, "&&"); }
    if (ch() == '|' && ch(1) == '>') { advance(2); return mk(TokenKind::Pipe, "|>"); }
    if (ch() == '|' && ch(1) == '|') { advance(2); return mk(TokenKind::OrOr, "||"); }

    // single-char
    const QChar c = ch();
    advance();

    switch (c.unicode()) {
    case '{': return mk(TokenKind::LBrace, "{");
    case '}': return mk(TokenKind::RBrace, "}");
    case '[': return mk(TokenKind::LBracket, "[");
    case ']': return mk(TokenKind::RBracket, "]");
    case '(': return mk(TokenKind::LParen, "(");
    case ')': return mk(TokenKind::RParen, ")");
    case ';': return mk(TokenKind::Semicolon, ";");
    case ',': return mk(TokenKind::Comma, ",");
    case ':': return mk(TokenKind::Colon, ":");
    case '=': return mk(TokenKind::Assign, "=");
    case '|': return mk(TokenKind::Bar, "|");
    case '+': return mk(TokenKind::Plus, "+");
    case '-': return mk(TokenKind::Minus, "-");
    case '*': return mk(TokenKind::Star, "*");
    case '/': return mk(TokenKind::Slash, "/");
    case '%': return mk(TokenKind::Percent, "%");
    case '&': return mk(TokenKind::Amp, "&");
    case '^': return mk(TokenKind::Caret, "^");
    case '!': return mk(TokenKind::Bang, "!");
    case '<': return mk(TokenKind::Lt, "<");
    case '>': return mk(TokenKind::Gt, ">");
    default:
        // unknown -> treat as End to stop hard, but better to throw in Parser
        return mk(TokenKind::End);
    }
}
|
|
|
|
/// Consumes the current token and scans the one after it.
///
/// @return The token that was current before the call.
Token Lexer::next() {
    // Scan first: if lexOne() throws, the current token is left untouched.
    Token upcoming = lexOne();
    Token consumed = std::move(m_cur);
    m_cur = std::move(upcoming);
    return consumed;
}
|
|
|
|
/// Consumes the current token only if it is of kind `k`.
///
/// @param k Token kind to test for.
/// @return true when the token matched and was consumed; false when the
///         current token is of another kind (nothing is consumed).
bool Lexer::match(TokenKind k) {
    if (m_cur.kind != k) return false;
    next();
    return true;
}
|
|
|
|
/// Consumes and returns the current token, which must be of kind `k`.
///
/// @param k   Required token kind.
/// @param msg Description placed at the front of the error message.
/// @return The consumed token.
/// @throws std::runtime_error formatted as "<msg> at <line>:<col> (got
///         '<text>')" when the current token has a different kind.
Token Lexer::expect(TokenKind k, const char* msg) {
    if (m_cur.kind == k) return next();

    // Mismatch: describe the offending token by position and text.
    const Token& bad = m_cur;
    const QString description = QString("%1 at %2:%3 (got '%4')")
        .arg(msg).arg(bad.line).arg(bad.col).arg(bad.text);
    throw std::runtime_error(description.toStdString());
}
|