XPlor/libs/dsl/lexer.cpp

#include "lexer.h"

#include <stdexcept>
#include <QHash>
#include <QDebug>

// Builds the map from reserved word spelling to its TokenKind.
// Lookups are case-sensitive: "LE", "BE" and "EOF" are upper-case keywords,
// everything else is lower-case.
static QHash<QString, TokenKind> buildKeywords() {
    struct Entry {
        const char* spelling;
        TokenKind   kind;
    };

    static const Entry kEntries[] = {
        // language keywords
        { "type",      TokenKind::KwType },
        { "byteorder", TokenKind::KwByteOrder },
        { "LE",        TokenKind::KwLE },
        { "BE",        TokenKind::KwBE },
        { "skip",      TokenKind::KwSkip },
        { "if",        TokenKind::KwIf },
        { "else",      TokenKind::KwElse },
        { "while",     TokenKind::KwWhile },
        { "align",     TokenKind::KwAlign },
        { "seek",      TokenKind::KwSeek },
        { "repeat",    TokenKind::KwRepeat },
        { "for",       TokenKind::KwFor },
        { "in",        TokenKind::KwIn },
        { "EOF",       TokenKind::KwEOF },
        { "criteria",  TokenKind::KwCriteria },
        { "require",   TokenKind::KwRequire },
        { "bool",      TokenKind::KwBool },
        { "break",     TokenKind::KwBreak },
        { "true",      TokenKind::KwTrue },
        { "false",     TokenKind::KwFalse },
        { "match",     TokenKind::KwMatch },
        { "inline",    TokenKind::KwInline },
        { "array",     TokenKind::KwArray },
        { "const",     TokenKind::KwConst },
        { "when",      TokenKind::KwWhen },
        { "default",   TokenKind::KwDefault },

        // primitive type names
        { "u8",  TokenKind::KwU8 },
        { "u16", TokenKind::KwU16 },
        { "u32", TokenKind::KwU32 },
        { "u64", TokenKind::KwU64 },
        { "i8",  TokenKind::KwI8 },
        { "i16", TokenKind::KwI16 },
        { "i32", TokenKind::KwI32 },
        { "i64", TokenKind::KwI64 },
        { "f32", TokenKind::KwF32 },
        { "f64", TokenKind::KwF64 },
    };

    QHash<QString, TokenKind> table;
    for (const Entry& entry : kEntries) {
        table.insert(QString::fromUtf8(entry.spelling), entry.kind);
    }
    return table;
}

static const QHash<QString, TokenKind> g_kw = buildKeywords();

// Takes ownership of the script text and immediately lexes the first token,
// so m_cur already holds a token once construction has finished.
// Any exception raised by lexOne() propagates out of the constructor.
Lexer::Lexer(QString src)
    : m_src(std::move(src))
{
    m_cur = lexOne();
}

// Peeks at the character `off` positions away from the cursor without
// consuming anything. Positions outside the source yield a null QChar,
// which callers use as the end-of-input marker.
QChar Lexer::ch(int off) const {
    const int pos = m_i + off;
    const bool inRange = (pos >= 0) && (pos < m_src.size());
    return inRange ? m_src[pos] : QChar();
}

void Lexer::advance(int n) {
    for (int k = 0; k < n; k++) {
        if (m_i >= m_src.size()) return;
        const QChar c = m_src[m_i++];
        if (c == '\n') { m_line++; m_col = 1; }
        else m_col++;
    }
}

void Lexer::skipWsAndComments() {
    while (true) {
        // whitespace
        while (!ch().isNull() && ch().isSpace()) advance();

        // // comment
        if (ch() == '/' && ch(1) == '/') {
            while (!ch().isNull() && ch() != '\n') advance();
            continue;
        }

        // /* comment */
        if (ch() == '/' && ch(1) == '*') {
            advance(2);
            while (!ch().isNull()) {
                if (ch() == '*' && ch(1) == '/') { advance(2); break; }
                advance();
            }
            continue;
        }

        break;
    }
}

// Creates a token of kind `k` with text `t`, stamped with the lexer's
// *current* line and column (i.e. the position at the time of the call).
Token Lexer::make(TokenKind k, QString t) {
    Token result;
    result.line = m_line;
    result.col  = m_col;
    result.kind = k;
    result.text = std::move(t);
    return result;
}

// Scans and returns the next token, starting at the current cursor.
//
// Leading whitespace and comments are skipped first. The returned token's
// line/col refer to the token's first character. Token categories, tried in
// this order:
//   - End           : end of input (or an unrecognised character, see below)
//   - keyword/ident : [letter or '_'] followed by letters, digits or '_'
//   - Number        : decimal digits, or "0x"/"0X" followed by hex digits;
//                     the parsed value is stored in Token::number
//   - String        : "..." with \n and \t escapes; any other escaped
//                     character stands for itself
//   - operators     : two-character operators are tried before
//                     single-character ones
//
// Throws std::runtime_error when "0x" is not followed by a hex digit, or
// when a numeric literal cannot be converted to a signed 64-bit integer
// (previously such literals silently became 0).
Token Lexer::lexOne() {
    skipWsAndComments();

    if (ch().isNull()) return make(TokenKind::End);

    // Remember where the token begins; the cursor moves while scanning.
    const int startLine = m_line;
    const int startCol  = m_col;

    auto mk = [&](TokenKind k, QString t = QString()) {
        Token tok;
        tok.kind = k;
        tok.text = std::move(t);
        tok.line = startLine;
        tok.col = startCol;
        return tok;
    };

    // identifiers / keywords
    if (ch().isLetter() || ch() == '_') {
        QString s;
        while (ch().isLetterOrNumber() || ch() == '_') {
            s += ch();
            advance();
        }
        // Single hash lookup: reserved words win over plain identifiers.
        const auto kw = g_kw.constFind(s);
        if (kw != g_kw.constEnd()) return mk(kw.value(), s);
        return mk(TokenKind::Identifier, s);
    }

    // number: decimal or hex
    if (ch().isDigit()) {
        // Converts `digits` in the given base and builds the Number token.
        // `text` is the spelling stored on the token (includes "0x" for hex).
        // The conversion result is checked so that out-of-range literals, or
        // digit characters the conversion does not accept, are reported
        // instead of being turned into 0.
        auto mkNum = [&](const QString& text, const QString& digits, int base) {
            bool ok = false;
            const qint64 value = digits.toLongLong(&ok, base);
            if (!ok) {
                throw std::runtime_error(
                    QString("Invalid number literal '%1' at %2:%3 (not a valid 64-bit signed integer)")
                        .arg(text).arg(startLine).arg(startCol).toStdString());
            }
            Token tok;
            tok.kind = TokenKind::Number;
            tok.text = text;
            tok.number = value;
            tok.line = startLine;
            tok.col = startCol;
            return tok;
        };

        // ---- HEX: 0x[0-9a-fA-F]+ (no ambiguity allowed)
        if (ch() == '0') {
            const QChar n1 = ch(1);
            if (!n1.isNull() && n1.toLower() == QChar('x')) {
                advance(2); // consume "0x"

                QString hex;
                while (!ch().isNull()) {
                    const QChar c = ch();
                    const bool isHex =
                        c.isDigit() ||
                        (c.toLower() >= QChar('a') && c.toLower() <= QChar('f'));
                    if (!isHex) break;
                    hex += c;
                    advance();
                }

                if (hex.isEmpty()) {
                    throw std::runtime_error("Invalid hex literal: expected hex digits after 0x");
                }

                // The stored text is normalised to a lower-case "0x" prefix.
                return mkNum("0x" + hex, hex, 16);
            }
        }

        // ---- DECIMAL: [0-9]+
        QString dec;
        while (ch().isDigit()) {
            dec += ch();
            advance();
        }
        return mkNum(dec, dec, 10);
    }

    // string "..."
    if (ch() == '"') {
        advance(); // opening quote
        QString s;
        while (!ch().isNull() && ch() != '"') {
            if (ch() == '\\') {
                advance();
                // Backslash as the very last character: stop, keep what we have.
                if (ch().isNull()) break;
                const QChar esc = ch();
                if (esc == 'n') s += '\n';
                else if (esc == 't') s += '\t';
                else s += esc; // \" \\ and any other escaped char map to themselves
                advance();
            } else {
                s += ch();
                advance();
            }
        }
        // The closing quote is optional: a string that reaches end of input
        // is returned as-is rather than reported as an error.
        if (ch() == '"') advance();
        return mk(TokenKind::String, s);
    }

    // multi-char ops (must be tested before their single-char prefixes)
    if (ch() == '.' && ch(1) == '.') { advance(2); return mk(TokenKind::DotDot, ".."); }
    if (ch() == '.') { advance(); return mk(TokenKind::Dot, "."); }
    if (ch() == '=' && ch(1) == '>') { advance(2); return mk(TokenKind::Arrow, "=>"); }
    if (ch() == '=' && ch(1) == '=') { advance(2); return mk(TokenKind::EqEq, "=="); }
    if (ch() == '!' && ch(1) == '=') { advance(2); return mk(TokenKind::NotEq, "!="); }
    if (ch() == '<' && ch(1) == '=') { advance(2); return mk(TokenKind::Lte, "<="); }
    if (ch() == '>' && ch(1) == '=') { advance(2); return mk(TokenKind::Gte, ">="); }
    if (ch() == '<' && ch(1) == '<') { advance(2); return mk(TokenKind::LShift, "<<"); }
    if (ch() == '>' && ch(1) == '>') { advance(2); return mk(TokenKind::RShift, ">>"); }
    if (ch() == '&' && ch(1) == '&') { advance(2); return mk(TokenKind::AndAnd, "&&"); }
    if (ch() == '|' && ch(1) == '>') { advance(2); return mk(TokenKind::Pipe, "|>"); }
    if (ch() == '|' && ch(1) == '|') { advance(2); return mk(TokenKind::OrOr, "||"); }

    // single-char
    const QChar c = ch();
    advance();

    switch (c.unicode()) {
    case '{': return mk(TokenKind::LBrace, "{");
    case '}': return mk(TokenKind::RBrace, "}");
    case '[': return mk(TokenKind::LBracket, "[");
    case ']': return mk(TokenKind::RBracket, "]");
    case '(': return mk(TokenKind::LParen, "(");
    case ')': return mk(TokenKind::RParen, ")");
    case ';': return mk(TokenKind::Semicolon, ";");
    case ',': return mk(TokenKind::Comma, ",");
    case ':': return mk(TokenKind::Colon, ":");
    case '=': return mk(TokenKind::Assign, "=");
    case '|': return mk(TokenKind::Bar, "|");
    case '+': return mk(TokenKind::Plus, "+");
    case '-': return mk(TokenKind::Minus, "-");
    case '*': return mk(TokenKind::Star, "*");
    case '/': return mk(TokenKind::Slash, "/");
    case '%': return mk(TokenKind::Percent, "%");
    case '&': return mk(TokenKind::Amp, "&");
    case '^': return mk(TokenKind::Caret, "^");
    case '!': return mk(TokenKind::Bang, "!");
    case '<': return mk(TokenKind::Lt, "<");
    case '>': return mk(TokenKind::Gt, ">");
    default:
        // unknown -> treat as End to stop hard, but better to throw in Parser
        // NOTE(review): the offending character has already been consumed and
        // the End token carries empty text, so a consumer cannot tell this
        // apart from a real end of input except by position.
        return mk(TokenKind::End);
    }
}

// Consumes the current lookahead token: returns it and replaces it with the
// next token from the source. If lexOne() throws, the lookahead is untouched.
Token Lexer::next() {
    Token tok = lexOne();
    // After the swap `tok` holds the previous lookahead and m_cur the new one.
    std::swap(m_cur, tok);
    return tok;
}

// Consumes the lookahead token if (and only if) it has kind `k`.
// Returns whether a token was consumed.
bool Lexer::match(TokenKind k) {
    const bool hit = (m_cur.kind == k);
    if (hit) {
        next();
    }
    return hit;
}

// Consumes and returns the lookahead token, which must have kind `k`.
// Otherwise throws std::runtime_error whose text is `msg` followed by the
// offending token's line:column and spelling; nothing is consumed then.
Token Lexer::expect(TokenKind k, const char* msg) {
    if (m_cur.kind == k) {
        return next();
    }

    const Token& offending = m_cur;
    const QString report = QString("%1 at %2:%3 (got '%4')")
                               .arg(msg)
                               .arg(offending.line)
                               .arg(offending.col)
                               .arg(offending.text);
    throw std::runtime_error(report.toStdString());
}
Add DSL library 2026-01-01 22:18:25 -05:00			`#include "lexer.h"`

			`#include <stdexcept>`
			`#include <QHash>`
			`#include <QDebug>`

			`static QHash<QString, TokenKind> buildKeywords() {`
			`QHash<QString, TokenKind> k;`
			`k["type"] = TokenKind::KwType;`
			`k["byteorder"] = TokenKind::KwByteOrder;`
			`k["LE"] = TokenKind::KwLE;`
			`k["BE"] = TokenKind::KwBE;`
			`k["skip"] = TokenKind::KwSkip;`
			`k["if"] = TokenKind::KwIf;`
			`k["else"] = TokenKind::KwElse;`
			`k["while"] = TokenKind::KwWhile;`
			`k["align"] = TokenKind::KwAlign;`
			`k["seek"] = TokenKind::KwSeek;`
			`k["repeat"] = TokenKind::KwRepeat;`
			`k["for"] = TokenKind::KwFor;`
			`k["in"] = TokenKind::KwIn;`
			`k["EOF"] = TokenKind::KwEOF;`
			`k["criteria"] = TokenKind::KwCriteria;`
			`k["require"] = TokenKind::KwRequire;`
			`k["bool"] = TokenKind::KwBool;`
Enhance DSL interpreter with new built-in functions Add support for: - basename() function for extracting filenames from paths - cstring() function for reading null-terminated strings - ascii() function for reading fixed-length ASCII strings - Enhanced type registry with additional primitive types - Improved parser with better error handling These additions enable more flexible XScript definitions for parsing binary file formats. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> 2026-01-07 16:35:35 -05:00			`k["break"] = TokenKind::KwBreak;`
Add XScript language features and DslKeys abstraction - Add inline, array, const, and match statement syntax - Add true/false keywords and deflate() decompression function - Introduce DslKeys enum for internal metadata key management - Improve parse_here delegation pattern with variable merging - Remove deprecated bracket attribute syntax (use ui() instead) - Enhance script type editor with additional functionality - Remove obsolete install.cmd Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> 2026-01-11 12:08:26 -05:00			`k["true"] = TokenKind::KwTrue;`
			`k["false"] = TokenKind::KwFalse;`
			`k["match"] = TokenKind::KwMatch;`
			`k["inline"] = TokenKind::KwInline;`
			`k["array"] = TokenKind::KwArray;`
			`k["const"] = TokenKind::KwConst;`
			`k["when"] = TokenKind::KwWhen;`
			`k["default"] = TokenKind::KwDefault;`
Add DSL library 2026-01-01 22:18:25 -05:00
			`k["u8"] = TokenKind::KwU8;`
			`k["u16"] = TokenKind::KwU16;`
			`k["u32"] = TokenKind::KwU32;`
			`k["u64"] = TokenKind::KwU64;`
			`k["i8"] = TokenKind::KwI8;`
			`k["i16"] = TokenKind::KwI16;`
			`k["i32"] = TokenKind::KwI32;`
			`k["i64"] = TokenKind::KwI64;`
Enhance DSL interpreter with new built-in functions Add support for: - basename() function for extracting filenames from paths - cstring() function for reading null-terminated strings - ascii() function for reading fixed-length ASCII strings - Enhanced type registry with additional primitive types - Improved parser with better error handling These additions enable more flexible XScript definitions for parsing binary file formats. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> 2026-01-07 16:35:35 -05:00			`k["f32"] = TokenKind::KwF32;`
			`k["f64"] = TokenKind::KwF64;`
Add DSL library 2026-01-01 22:18:25 -05:00
			`return k;`
			`}`

			`static const QHash<QString, TokenKind> g_kw = buildKeywords();`

			`Lexer::Lexer(QString src) : m_src(std::move(src)) {`
			`m_cur = lexOne();`
			`}`

			`QChar Lexer::ch(int off) const {`
			`int idx = m_i + off;`
			`if (idx < 0 \|\| idx >= m_src.size()) return QChar();`
			`return m_src[idx];`
			`}`

			`void Lexer::advance(int n) {`
			`for (int k = 0; k < n; k++) {`
			`if (m_i >= m_src.size()) return;`
			`const QChar c = m_src[m_i++];`
			`if (c == '\n') { m_line++; m_col = 1; }`
			`else m_col++;`
			`}`
			`}`

			`void Lexer::skipWsAndComments() {`
			`while (true) {`
			`// whitespace`
			`while (!ch().isNull() && ch().isSpace()) advance();`

			`// // comment`
			`if (ch() == '/' && ch(1) == '/') {`
			`while (!ch().isNull() && ch() != '\n') advance();`
			`continue;`
			`}`

			`// /* comment */`
			`if (ch() == '/' && ch(1) == '*') {`
			`advance(2);`
			`while (!ch().isNull()) {`
			`if (ch() == '*' && ch(1) == '/') { advance(2); break; }`
			`advance();`
			`}`
			`continue;`
			`}`

			`break;`
			`}`
			`}`

			`Token Lexer::make(TokenKind k, QString t) {`
			`Token tok;`
			`tok.kind = k;`
			`tok.text = std::move(t);`
			`tok.line = m_line;`
			`tok.col = m_col;`
			`return tok;`
			`}`

			`Token Lexer::lexOne() {`
			`skipWsAndComments();`

			`if (ch().isNull()) return make(TokenKind::End);`

			`const int startLine = m_line;`
			`const int startCol = m_col;`

			`auto mk = [&](TokenKind k, QString t = QString()) {`
			`Token tok;`
			`tok.kind = k;`
			`tok.text = std::move(t);`
			`tok.line = startLine;`
			`tok.col = startCol;`
			`return tok;`
			`};`

			`// identifiers / keywords`
			`if (ch().isLetter() \|\| ch() == '_') {`
			`QString s;`
			`while (ch().isLetterOrNumber() \|\| ch() == '_') {`
			`s += ch();`
			`advance();`
			`}`
			`if (g_kw.contains(s)) return mk(g_kw.value(s), s);`
			`return mk(TokenKind::Identifier, s);`
			`}`

			`// number: decimal or hex`
			`if (ch().isDigit()) {`
			`const int startLine = m_line;`
			`const int startCol = m_col;`

			`auto mkNum = [&](QString text, qint64 value) {`
			`Token tok;`
			`tok.kind = TokenKind::Number;`
			`tok.text = std::move(text);`
			`tok.number = value;`
			`tok.line = startLine;`
			`tok.col = startCol;`
			`return tok;`
			`};`

			`// ---- HEX: 0x[0-9a-fA-F]+ (no ambiguity allowed)`
			`if (ch() == '0') {`
			`const QChar n1 = ch(1);`
			`if (!n1.isNull() && n1.toLower() == QChar('x')) {`
			`advance(2); // consume "0x"`

			`QString hex;`
			`while (!ch().isNull()) {`
			`const QChar c = ch();`
			`const bool isHex =`
			`c.isDigit() \|\|`
			`(c.toLower() >= QChar('a') && c.toLower() <= QChar('f'));`
			`if (!isHex) break;`
			`hex += c;`
			`advance();`
			`}`

			`if (hex.isEmpty()) {`
			`throw std::runtime_error("Invalid hex literal: expected hex digits after 0x");`
			`}`

			`return mkNum("0x" + hex, hex.toLongLong(nullptr, 16));`
			`}`
			`}`

			`// ---- DECIMAL: [0-9]+`
			`QString dec;`
			`while (ch().isDigit()) {`
			`dec += ch();`
			`advance();`
			`}`
			`return mkNum(dec, dec.toLongLong(nullptr, 10));`
			`}`

			`// string "..."`
			`if (ch() == '"') {`
			`advance();`
			`QString s;`
			`while (!ch().isNull() && ch() != '"') {`
			`if (ch() == '\\') {`
			`advance();`
			`if (ch().isNull()) break;`
			`const QChar esc = ch();`
			`if (esc == 'n') s += '\n';`
			`else if (esc == 't') s += '\t';`
			`else s += esc;`
			`advance();`
			`} else {`
			`s += ch();`
			`advance();`
			`}`
			`}`
			`if (ch() == '"') advance();`
			`return mk(TokenKind::String, s);`
			`}`

			`// multi-char ops`
			`if (ch() == '.' && ch(1) == '.') { advance(2); return mk(TokenKind::DotDot, ".."); }`
Add XScript language features and DslKeys abstraction - Add inline, array, const, and match statement syntax - Add true/false keywords and deflate() decompression function - Introduce DslKeys enum for internal metadata key management - Improve parse_here delegation pattern with variable merging - Remove deprecated bracket attribute syntax (use ui() instead) - Enhance script type editor with additional functionality - Remove obsolete install.cmd Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> 2026-01-11 12:08:26 -05:00			`if (ch() == '.') { advance(); return mk(TokenKind::Dot, "."); }`
			`if (ch() == '=' && ch(1) == '>') { advance(2); return mk(TokenKind::Arrow, "=>"); }`
Add DSL library 2026-01-01 22:18:25 -05:00			`if (ch() == '=' && ch(1) == '=') { advance(2); return mk(TokenKind::EqEq, "=="); }`
			`if (ch() == '!' && ch(1) == '=') { advance(2); return mk(TokenKind::NotEq, "!="); }`
			`if (ch() == '<' && ch(1) == '=') { advance(2); return mk(TokenKind::Lte, "<="); }`
			`if (ch() == '>' && ch(1) == '=') { advance(2); return mk(TokenKind::Gte, ">="); }`
			`if (ch() == '<' && ch(1) == '<') { advance(2); return mk(TokenKind::LShift, "<<"); }`
			`if (ch() == '>' && ch(1) == '>') { advance(2); return mk(TokenKind::RShift, ">>"); }`
			`if (ch() == '&' && ch(1) == '&') { advance(2); return mk(TokenKind::AndAnd, "&&"); }`
			`if (ch() == '\|' && ch(1) == '>') { advance(2); return mk(TokenKind::Pipe, "\|>"); }`
			`if (ch() == '\|' && ch(1) == '\|') { advance(2); return mk(TokenKind::OrOr, "\|\|"); }`

			`// single-char`
			`const QChar c = ch();`
			`advance();`

			`switch (c.unicode()) {`
			`case '{': return mk(TokenKind::LBrace, "{");`
			`case '}': return mk(TokenKind::RBrace, "}");`
			`case '[': return mk(TokenKind::LBracket, "[");`
			`case ']': return mk(TokenKind::RBracket, "]");`
			`case '(': return mk(TokenKind::LParen, "(");`
			`case ')': return mk(TokenKind::RParen, ")");`
			`case ';': return mk(TokenKind::Semicolon, ";");`
			`case ',': return mk(TokenKind::Comma, ",");`
Add XScript language features and DslKeys abstraction - Add inline, array, const, and match statement syntax - Add true/false keywords and deflate() decompression function - Introduce DslKeys enum for internal metadata key management - Improve parse_here delegation pattern with variable merging - Remove deprecated bracket attribute syntax (use ui() instead) - Enhance script type editor with additional functionality - Remove obsolete install.cmd Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> 2026-01-11 12:08:26 -05:00			`case ':': return mk(TokenKind::Colon, ":");`
Add DSL library 2026-01-01 22:18:25 -05:00			`case '=': return mk(TokenKind::Assign, "=");`
			`case '\|': return mk(TokenKind::Bar, "\|");`
			`case '+': return mk(TokenKind::Plus, "+");`
			`case '-': return mk(TokenKind::Minus, "-");`
			`case '*': return mk(TokenKind::Star, "*");`
			`case '/': return mk(TokenKind::Slash, "/");`
			`case '%': return mk(TokenKind::Percent, "%");`
			`case '&': return mk(TokenKind::Amp, "&");`
			`case '^': return mk(TokenKind::Caret, "^");`
			`case '!': return mk(TokenKind::Bang, "!");`
			`case '<': return mk(TokenKind::Lt, "<");`
			`case '>': return mk(TokenKind::Gt, ">");`
			`default:`
			`// unknown -> treat as End to stop hard, but better to throw in Parser`
			`return mk(TokenKind::End);`
			`}`
			`}`

			`Token Lexer::next() {`
			`Token old = m_cur;`
			`m_cur = lexOne();`
			`return old;`
			`}`

			`bool Lexer::match(TokenKind k) {`
			`if (m_cur.kind == k) { next(); return true; }`
			`return false;`
			`}`

			`Token Lexer::expect(TokenKind k, const char* msg) {`
			`if (m_cur.kind != k) {`
			`// leave detailed error for parser throw`
			`Token bad = m_cur;`
			`throw std::runtime_error(QString("%1 at %2:%3 (got '%4')")`
			`.arg(msg).arg(bad.line).arg(bad.col).arg(bad.text).toStdString());`
			`}`
			`return next();`
			`}`