// SPDX-FileCopyrightText: 2022 Johannes Loher
//
// SPDX-License-Identifier: MIT

import type { Token } from "./grammar";

/**
 * Lazily splits an expression string into tokens.
 *
 * Iterating a `Lexer` yields one token per lexeme, skipping whitespace between
 * them. Iteration always ends with exactly one terminal token: `eof` when the
 * input is exhausted, or `invalid` at the first position that cannot be lexed.
 * Every token carries `pos`, the UTF-16 code unit index in the input at which
 * it starts; `number`, `iden` and `string` tokens additionally carry the raw
 * source text as `symbol`.
 */
export class Lexer {
  /**
   * Characters that can begin an operator or punctuation token. "." is
   * deliberately absent: it is dispatched separately because it may also begin
   * a number such as ".5".
   */
  private static readonly operatorStartChars: ReadonlySet<string | undefined> = new Set<
    string | undefined
  >([
    "+",
    "-",
    "*",
    "/",
    "%",
    "=",
    "!",
    ">",
    "<",
    "&",
    "|",
    "~",
    "^",
    "?",
    ":",
    ",",
    "(",
    ")",
    "[",
    "]",
    "{",
    "}",
  ]);

  private static readonly digitPattern = /\d/;
  private static readonly identifierStartPattern = /[$_\p{ID_Start}]/u;
  private static readonly identifierPartPattern = /[$\u200c\u200d\p{ID_Continue}]/u;
  private static readonly whiteSpacePattern = /\s/;

  /**
   * @param input - The source text to tokenize.
   */
  constructor(private readonly input: string) {}

  /**
   * Yields the tokens of the input in order.
   *
   * The generator finishes right after yielding an `eof` or `invalid` token.
   */
  *[Symbol.iterator](): Generator<Token, void> {
    let pos = 0;
    for (;;) {
      while (this.isWhiteSpace(this.input[pos])) {
        pos += 1;
      }
      const [token, nextPos] = this.getNextToken(pos);
      pos = nextPos;
      yield token;
      if (token.type === "eof" || token.type === "invalid") {
        return;
      }
    }
  }

  /**
   * Lexes the single token starting at `pos`.
   *
   * @param pos - Index of the first character of the token (not whitespace).
   * @returns The token and the index at which lexing should continue.
   */
  private getNextToken(pos: number): [Token, number] {
    const current = this.input[pos];
    if (current === undefined) {
      return [{ type: "eof", pos }, pos];
    }
    if (this.isOperatorStart(current)) {
      return this.getOperator(pos);
    }
    if (this.isDigit(current)) {
      return this.getNumber(pos);
    }
    if (current === "'" || current === '"' || current === "`") {
      return this.getString(pos);
    }
    if (current === ".") {
      // A dot followed by a digit starts a number (".5"); otherwise it is the "." operator.
      return this.isDigit(this.input[pos + 1]) ? this.getNumber(pos) : this.getOperator(pos);
    }
    // Look at the whole code point so identifier characters outside the BMP are recognized.
    if (this.isIdentifierStart(this.codePointCharAt(pos))) {
      return this.getIdentifier(pos);
    }
    return this.invalid(pos);
  }

  /** Builds the result for an unlexable position; the position is not advanced. */
  private invalid(pos: number): [Token, number] {
    return [{ type: "invalid", pos }, pos];
  }

  /**
   * Returns the full code point starting at `pos` as a string (one or two
   * UTF-16 code units), or `undefined` when `pos` is past the end of the input.
   */
  private codePointCharAt(pos: number): string | undefined {
    const codePoint = this.input.codePointAt(pos);
    return codePoint === undefined ? undefined : String.fromCodePoint(codePoint);
  }

  /** Returns the first code point of `text` as a string, or "" if there is none. */
  private static firstCodePoint(text: string | undefined): string {
    const codePoint = text?.codePointAt(0);
    return codePoint === undefined ? "" : String.fromCodePoint(codePoint);
  }

  /** Whether the first character of `char` can begin an operator or punctuation token. */
  private isOperatorStart(char: string) {
    return Lexer.operatorStartChars.has(char[0]);
  }

  /**
   * Lexes the operator or punctuation token starting at `pos`, preferring the
   * longest match (e.g. ">>>" over ">>" over ">").
   *
   * A lone "=" is not a token and yields `invalid`.
   */
  private getOperator(pos: number): [Token, number] {
    const current = this.input[pos];
    const next = this.input[pos + 1];
    const nextButOne = this.input[pos + 2];
    switch (current) {
      // Single-character tokens that never combine with a following character.
      case "+":
      case "-":
      case "/":
      case "%":
      case "~":
      case "^":
      case ".":
      case ":":
      case ",":
      case "(":
      case ")":
      case "[":
      case "]":
      case "{":
      case "}": {
        return [{ type: current, pos }, pos + 1];
      }
      case "*": {
        if (next === "*") {
          return [{ type: "**", pos }, pos + 2];
        }
        return [{ type: "*", pos }, pos + 1];
      }
      case "=": {
        if (next !== "=") {
          return this.invalid(pos);
        }
        if (nextButOne === "=") {
          return [{ type: "===", pos }, pos + 3];
        }
        return [{ type: "==", pos }, pos + 2];
      }
      case "!": {
        if (next !== "=") {
          return [{ type: "!", pos }, pos + 1];
        }
        if (nextButOne === "=") {
          return [{ type: "!==", pos }, pos + 3];
        }
        return [{ type: "!=", pos }, pos + 2];
      }
      case ">": {
        if (next === ">") {
          if (nextButOne === ">") {
            return [{ type: ">>>", pos }, pos + 3];
          }
          return [{ type: ">>", pos }, pos + 2];
        }
        if (next === "=") {
          return [{ type: ">=", pos }, pos + 2];
        }
        return [{ type: ">", pos }, pos + 1];
      }
      case "<": {
        if (next === "=") {
          return [{ type: "<=", pos }, pos + 2];
        }
        if (next === "<") {
          return [{ type: "<<", pos }, pos + 2];
        }
        return [{ type: "<", pos }, pos + 1];
      }
      case "&": {
        if (next === "&") {
          return [{ type: "&&", pos }, pos + 2];
        }
        return [{ type: "&", pos }, pos + 1];
      }
      case "|": {
        if (next === "|") {
          return [{ type: "||", pos }, pos + 2];
        }
        return [{ type: "|", pos }, pos + 1];
      }
      case "?": {
        if (next === ".") {
          return [{ type: "?.", pos }, pos + 2];
        }
        if (next === "?") {
          return [{ type: "??", pos }, pos + 2];
        }
        return [{ type: "?", pos }, pos + 1];
      }
    }
    return this.invalid(pos);
  }

  /** Whether the first character of `char` is a decimal digit (0-9). */
  private isDigit(char: string | undefined): char is `${number}` {
    return Lexer.digitPattern.test(char?.[0] ?? "");
  }

  /**
   * Lexes the decimal number literal starting at `pos`.
   *
   * Accepts digits, at most one ".", and "_" separators. The whole literal is
   * `invalid` when it contains a second ".", a "_" that is not directly between
   * two digits (leading, trailing, doubled, or adjacent to the "."), or any "_"
   * at all when the literal starts with "0".
   */
  private getNumber(pos: number): [Token, number] {
    const startsWithZero = this.input[pos] === "0";
    let end = pos;
    let seenDot = false;
    for (;;) {
      const ch = this.input[end];
      const prev = end > pos ? this.input[end - 1] : undefined;
      if (ch === ".") {
        // Only one decimal point, and a separator may not sit right before it.
        if (seenDot || prev === "_") {
          return this.invalid(pos);
        }
        seenDot = true;
      } else if (ch === "_") {
        if (end === pos) {
          // A separator cannot start a number; nothing is consumed.
          break;
        }
        if (prev === "_" || prev === "." || startsWithZero) {
          return this.invalid(pos);
        }
      } else if (!this.isDigit(ch)) {
        break;
      }
      end += 1;
    }
    // Nothing consumed, or the literal ends in a dangling separator.
    if (end === pos || this.input[end - 1] === "_") {
      return this.invalid(pos);
    }
    return [{ type: "number", symbol: this.input.slice(pos, end), pos }, end];
  }

  /** Whether the first code point of `char` may start an identifier ("$", "_" or Unicode ID_Start). */
  private isIdentifierStart(char: string | undefined) {
    return Lexer.identifierStartPattern.test(Lexer.firstCodePoint(char));
  }

  /**
   * Whether the first code point of `char` may appear inside an identifier
   * ("$", ZWNJ, ZWJ or Unicode ID_Continue).
   */
  private isIdentifier(char: string | undefined) {
    return Lexer.identifierPartPattern.test(Lexer.firstCodePoint(char));
  }

  /**
   * Lexes the identifier starting at `pos`, consuming code points for as long
   * as they are valid identifier characters.
   */
  private getIdentifier(pos: number): [Token, number] {
    let end = pos;
    let ch = this.codePointCharAt(end);
    while (ch !== undefined && this.isIdentifier(ch)) {
      // Advance by one or two code units depending on the code point.
      end += ch.length;
      ch = this.codePointCharAt(end);
    }
    if (end === pos) {
      return this.invalid(pos);
    }
    return [{ type: "iden", symbol: this.input.slice(pos, end), pos }, end];
  }

  /**
   * Lexes the string literal starting at `pos`. The character at `pos` is the
   * opening quote and the same character closes the literal.
   *
   * A backslash escapes the character that follows it, so an escaped quote does
   * not terminate the string while an escaped backslash directly before the
   * closing quote does not hide it. An unterminated literal is `invalid`. The
   * token's `symbol` includes both quotes and is not unescaped.
   */
  private getString(pos: number): [Token, number] {
    const quote = this.input[pos];
    let end = pos + 1;
    while (end < this.input.length && this.input[end] !== quote) {
      // Skip the escaped character together with its backslash.
      end += this.input[end] === "\\" ? 2 : 1;
    }
    if (end >= this.input.length) {
      return this.invalid(pos);
    }
    return [{ type: "string", symbol: this.input.slice(pos, end + 1), pos }, end + 1];
  }

  /** Whether the first character of `char` is whitespace. */
  private isWhiteSpace(char: string | undefined) {
    return Lexer.whiteSpacePattern.test(char?.[0] ?? "");
  }
}