#include "lexer.h"

// NOTE(review): the names of the system headers were unreadable in the
// reviewed copy of this file; the list below is reconstructed from the names
// this translation unit actually uses. Confirm against the original.
#include <QHash>
#include <QString>
#include <stdexcept>
#include <utility>

// Builds the spelling -> token-kind table used to tell keywords apart from
// ordinary identifiers. Lookup is case-sensitive ("LE", "BE" and "EOF" are
// upper-case keywords).
static QHash<QString, TokenKind> buildKeywords() {
    QHash<QString, TokenKind> k;
    k["type"] = TokenKind::KwType;
    k["byteorder"] = TokenKind::KwByteOrder;
    k["LE"] = TokenKind::KwLE;
    k["BE"] = TokenKind::KwBE;
    k["skip"] = TokenKind::KwSkip;
    k["if"] = TokenKind::KwIf;
    k["else"] = TokenKind::KwElse;
    k["while"] = TokenKind::KwWhile;
    k["align"] = TokenKind::KwAlign;
    k["seek"] = TokenKind::KwSeek;
    k["repeat"] = TokenKind::KwRepeat;
    k["for"] = TokenKind::KwFor;
    k["in"] = TokenKind::KwIn;
    k["EOF"] = TokenKind::KwEOF;
    k["criteria"] = TokenKind::KwCriteria;
    k["require"] = TokenKind::KwRequire;
    k["bool"] = TokenKind::KwBool;
    k["u8"] = TokenKind::KwU8;
    k["u16"] = TokenKind::KwU16;
    k["u32"] = TokenKind::KwU32;
    k["u64"] = TokenKind::KwU64;
    k["i8"] = TokenKind::KwI8;
    k["i16"] = TokenKind::KwI16;
    k["i32"] = TokenKind::KwI32;
    k["i64"] = TokenKind::KwI64;
    return k;
}

// Keyword table, built once at static-initialisation time.
static const QHash<QString, TokenKind> g_kw = buildKeywords();

// Takes ownership of the source text and lexes the first token immediately,
// so m_cur is valid as soon as construction finishes. Because lexOne() can
// throw std::runtime_error, so can this constructor.
Lexer::Lexer(QString src) : m_src(std::move(src)) {
    m_cur = lexOne();
}

// Returns the character `off` positions away from the cursor, or a null
// QChar when that position lies outside the source text. The null QChar is
// what the rest of the lexer tests for to detect end of input.
QChar Lexer::ch(int off) const {
    const int idx = m_i + off;
    if (idx < 0 || idx >= m_src.size())
        return QChar();
    return m_src[idx];
}

// Consumes up to `n` characters, stopping early at end of input, and keeps
// the line/column counters in step: a '\n' starts a new line at column 1,
// any other character moves one column to the right.
void Lexer::advance(int n) {
    for (int k = 0; k < n; k++) {
        if (m_i >= m_src.size())
            return;
        // at() is a read-only access; operator[] on the non-const string
        // is not needed here because nothing is written through it.
        const QChar c = m_src.at(m_i++);
        if (c == '\n') {
            m_line++;
            m_col = 1;
        } else {
            m_col++;
        }
    }
}

// Skips any mix of whitespace, `// line` comments and `/* block */` comments
// in front of the next token. A block comment that is never closed simply
// runs to the end of the input; no error is raised for it.
void Lexer::skipWsAndComments() {
    while (true) {
        // whitespace
        while (!ch().isNull() && ch().isSpace())
            advance();

        // line comment: stop in front of the newline so the whitespace loop
        // above consumes it on the next iteration
        if (ch() == '/' && ch(1) == '/') {
            while (!ch().isNull() && ch() != '\n')
                advance();
            continue;
        }

        // block comment
        if (ch() == '/' && ch(1) == '*') {
            advance(2);
            while (!ch().isNull()) {
                if (ch() == '*' && ch(1) == '/') {
                    advance(2);
                    break;
                }
                advance();
            }
            continue;
        }

        break;
    }
}

// Creates a token of kind `k` with text `t`, stamped with the lexer's
// CURRENT line and column.
Token Lexer::make(TokenKind k, QString t) {
    Token tok;
    tok.kind = k;
    tok.text = std::move(t);
    tok.line = m_line;
    tok.col = m_col;
    return tok;
}

// Scans and returns the next token from the source text.
//
// Leading whitespace and comments are skipped. Every token except the
// end-of-input token carries the line/column of its first character.
//
// Throws std::runtime_error for a "0x" prefix that is not followed by hex
// digits, and for a numeric literal that cannot be converted to a 64-bit
// signed integer (for example because it is out of range).
Token Lexer::lexOne() {
    skipWsAndComments();
    if (ch().isNull())
        return make(TokenKind::End);

    // Remember where the token starts; by the time it is built the cursor
    // has already moved past it.
    const int startLine = m_line;
    const int startCol = m_col;

    auto mk = [&](TokenKind k, QString t = QString()) {
        Token tok;
        tok.kind = k;
        tok.text = std::move(t);
        tok.line = startLine;
        tok.col = startCol;
        return tok;
    };

    // identifiers / keywords
    if (ch().isLetter() || ch() == '_') {
        QString s;
        while (ch().isLetterOrNumber() || ch() == '_') {
            s += ch();
            advance();
        }
        // One hash lookup decides between keyword and identifier.
        const auto kw = g_kw.constFind(s);
        if (kw != g_kw.constEnd())
            return mk(kw.value(), s);
        return mk(TokenKind::Identifier, s);
    }

    // number: decimal or hex
    if (ch().isDigit()) {
        // Builds a Number token from `digits` interpreted in `base`; `text`
        // is the literal exactly as it should appear in the token. A failed
        // conversion is reported instead of silently producing 0.
        auto mkNum = [&](QString text, const QString& digits, int base) {
            bool ok = false;
            const qint64 value = digits.toLongLong(&ok, base);
            if (!ok) {
                throw std::runtime_error(
                    QString("Invalid or out-of-range numeric literal '%1' at %2:%3")
                        .arg(text).arg(startLine).arg(startCol).toStdString());
            }
            Token tok;
            tok.kind = TokenKind::Number;
            tok.text = std::move(text);
            tok.number = value;
            tok.line = startLine;
            tok.col = startCol;
            return tok;
        };

        // ---- HEX: 0x[0-9a-fA-F]+ (no ambiguity allowed)
        if (ch() == '0') {
            const QChar n1 = ch(1);
            if (!n1.isNull() && n1.toLower() == QChar('x')) {
                advance(2); // consume "0x"
                QString hex;
                while (!ch().isNull()) {
                    const QChar c = ch();
                    const bool isHex =
                        c.isDigit() ||
                        (c.toLower() >= QChar('a') && c.toLower() <= QChar('f'));
                    if (!isHex)
                        break;
                    hex += c;
                    advance();
                }
                if (hex.isEmpty()) {
                    throw std::runtime_error("Invalid hex literal: expected hex digits after 0x");
                }
                return mkNum("0x" + hex, hex, 16);
            }
        }

        // ---- DECIMAL: [0-9]+
        QString dec;
        while (ch().isDigit()) {
            dec += ch();
            advance();
        }
        return mkNum(dec, dec, 10);
    }

    // string "..."
    // Escapes: \n and \t are translated; any other escaped character stands
    // for itself (so \" and \\ work). A string that is never closed ends at
    // end of input without an error.
    if (ch() == '"') {
        advance();
        QString s;
        while (!ch().isNull() && ch() != '"') {
            if (ch() == '\\') {
                advance();
                if (ch().isNull())
                    break; // lone backslash at end of input
                const QChar esc = ch();
                if (esc == 'n')
                    s += '\n';
                else if (esc == 't')
                    s += '\t';
                else
                    s += esc;
                advance();
            } else {
                s += ch();
                advance();
            }
        }
        if (ch() == '"')
            advance();
        return mk(TokenKind::String, s);
    }

    // multi-char ops: must be tried before the single-character fallbacks
    // that share their first character
    if (ch() == '.' && ch(1) == '.') { advance(2); return mk(TokenKind::DotDot, ".."); }
    if (ch() == '=' && ch(1) == '=') { advance(2); return mk(TokenKind::EqEq, "=="); }
    if (ch() == '!' && ch(1) == '=') { advance(2); return mk(TokenKind::NotEq, "!="); }
    if (ch() == '<' && ch(1) == '=') { advance(2); return mk(TokenKind::Lte, "<="); }
    if (ch() == '>' && ch(1) == '=') { advance(2); return mk(TokenKind::Gte, ">="); }
    if (ch() == '<' && ch(1) == '<') { advance(2); return mk(TokenKind::LShift, "<<"); }
    if (ch() == '>' && ch(1) == '>') { advance(2); return mk(TokenKind::RShift, ">>"); }
    if (ch() == '&' && ch(1) == '&') { advance(2); return mk(TokenKind::AndAnd, "&&"); }
    if (ch() == '|' && ch(1) == '>') { advance(2); return mk(TokenKind::Pipe, "|>"); }
    if (ch() == '|' && ch(1) == '|') { advance(2); return mk(TokenKind::OrOr, "||"); }

    // single-char
    const QChar c = ch();
    advance();
    switch (c.unicode()) {
    case '{': return mk(TokenKind::LBrace, "{");
    case '}': return mk(TokenKind::RBrace, "}");
    case '[': return mk(TokenKind::LBracket, "[");
    case ']': return mk(TokenKind::RBracket, "]");
    case '(': return mk(TokenKind::LParen, "(");
    case ')': return mk(TokenKind::RParen, ")");
    case ';': return mk(TokenKind::Semicolon, ";");
    case ',': return mk(TokenKind::Comma, ",");
    case '=': return mk(TokenKind::Assign, "=");
    case '|': return mk(TokenKind::Bar, "|");
    case '+': return mk(TokenKind::Plus, "+");
    case '-': return mk(TokenKind::Minus, "-");
    case '*': return mk(TokenKind::Star, "*");
    case '/': return mk(TokenKind::Slash, "/");
    case '%': return mk(TokenKind::Percent, "%");
    case '&': return mk(TokenKind::Amp, "&");
    case '^': return mk(TokenKind::Caret, "^");
    case '!': return mk(TokenKind::Bang, "!");
    case '<': return mk(TokenKind::Lt, "<");
    case '>': return mk(TokenKind::Gt, ">");
    default:
        // unknown -> treat as End to stop hard, but better to throw in Parser.
        // The character has already been consumed; the End token carries the
        // position where it was found and an empty text.
        return mk(TokenKind::End);
    }
}

// Returns the current token and moves the lexer on to the following one.
Token Lexer::next() {
    Token old = m_cur;
    m_cur = lexOne();
    return old;
}

// Consumes the current token and returns true if it has kind `k`; otherwise
// leaves the lexer untouched and returns false.
bool Lexer::match(TokenKind k) {
    if (m_cur.kind == k) {
        next();
        return true;
    }
    return false;
}

// Consumes and returns the current token, which must have kind `k`.
// Otherwise throws std::runtime_error built from `msg`, the position of the
// offending token and its text.
Token Lexer::expect(TokenKind k, const char* msg) {
    if (m_cur.kind != k) {
        const Token bad = m_cur;
        throw std::runtime_error(QString("%1 at %2:%3 (got '%4')")
                                     .arg(msg).arg(bad.line).arg(bad.col).arg(bad.text).toStdString());
    }
    return next();
}