Context
Some time ago I wrote a very simple SQL interpreter for Javascript for fun one weekend and have since started using it in production in a few projects for work, and as requirements change for things I'm working on, I implement new features in the library. For a year or so I've made due with a single, monstrous function that handled lexing, parsing and interpreting all at once, all while avoiding regex as much as possible as I've gotten the impression that regex is to be avoided as a parsing tool.
I'm now learning the hard way that this is not sustainable. I'm attempting to implement a proper lexer/parser before adding any new features.
Question
As someone with no experience writing a real lexer I started by reading a bunch of really dry tutorials in languages I don't really understand. The one thing they all seemed to have in common is that they check each character of the input string individually, which to me, seems tedious and unnecessary, so I decided to try it my own way, which turned out to be much, much simpler and shorter than implementations I found online.
If my implementation is so much better then why aren't other people doing something similar? There must be something wrong with this algorithm, what is it?
My Method
- Have an array of Regex patterns that matches each token. Patterns are ordered by priority, for example, quoted strings come before numbers because numbers that are quoted are strings, not numbers.
- Loop thru each pattern and produce an array of arrays which contain all matches for every token (in the same order of priority). Each match contains:
- The literal text that was matched
- Its position in the input string
- The type of token it matches (eg STRING or NUMBER)
- Loop thru each token match and string them together when one match begins exactly where another one ends, in order of priority.
- When a match is not found at the position where the last match ended check to see if length of all matched tokens === the length of the input string. If not, throw an error.
The Code
This demo will tokenize a string and show you the details for each token when you mouseover the word or symbol.
function jSQLLexer(input) { this.input = input; this.pos = 0; this.real_pos = 0; this.tokens = []; this.token_matches = []; } // Get all possible token matches to be sorted out jSQLLexer.prototype.getTokenMatches = function(){ if(this.token_matches.length) return this.token_matches; this.token_matches = [] var r; for(var i=0; i<jSQLLexer.token_types.length; i++){ this.token_matches[i] = []; while ((r = jSQLLexer.token_types[i].pattern.exec(this.input)) != null) { this.token_matches[i].push(r); } } return this.token_matches; }; // Get list of tokens for the input string jSQLLexer.prototype.getTokens = function(){ if(this.tokens.length) return this.tokens; this.pos = 0; var matches = this.getTokenMatches(), throwaway = ["COMMENT", "WHITESPACE"]; for(var type_id=0; type_id<matches.length; type_id++){ if(this.pos >= this.input.length) break; for(var match_index=0; match_index<matches[type_id].length; match_index++){ var r = matches[type_id][match_index]; if(r.index !== this.pos) continue; var token = new jSQLToken(this.pos, r[0], type_id); this.pos += token.length; if(throwaway.indexOf(token.type) === -1) this.tokens.push(token); type_id=-1; break; } } if(this.pos !== this.input.length){ var pos; if(this.tokens.length){ var lastToken = this.tokens[this.tokens.length-1]; pos = lastToken.input_pos + lastToken.length; }else pos = 0; throw new jSQL_Lexer_Error(pos, this.input); } return this.tokens; }; jSQLLexer.token_types = [ // STRINGs {pattern: /"(?:[^"\\]|\\.)*"/g, type: 'STRING', name: "DQ STRING"}, {pattern: /'(?:[^'\\]|\\.)*'/g, type: 'STRING', name: "SQ STRING"}, // COMMENTs {pattern: /--.*[\n\r$]/g, type: 'COMMENT', name: "SINGLE LINE COMMENT"}, {pattern: /\/\*([\s\S]*?)\*\//g, type: 'COMMENT', name: "MULTI LINE COMMENT"}, // WHITESPACE {pattern: /\r?\n|\r/g, type: 'WHITESPACE', name: "LINEBREAK"}, {pattern: /[ \t]/g, type: 'WHITESPACE', name: "WHITESPACE"}, // NUMBERs {pattern: /\d+/g, type: 'NUMBER', name: 'INTEGER'}, {pattern: /\d+.\.\d+/g, type: 'NUMBER', name: 'FLOAT'}, // QUALIFIERs {pattern: /if not exists/gi, type: 'QUALIFIER', name: "IF NOT EXISTS"}, // SYMBOLs {pattern: /!=/gi, type: 'SYMBOL', name: "NOT EQUAL"}, {pattern: /<>/gi, type: 'SYMBOL', name: "NOT EQUAL"}, {pattern: /\(/gi, type: 'SYMBOL', name: "LEFT PEREN"}, {pattern: /\)/gi, type: 'SYMBOL', name: "RIGHT PEREN"}, {pattern: /,/gi, type: 'SYMBOL', name: "COMMA"}, {pattern: /\?/gi, type: 'SYMBOL', name: "QUESTION MARK"}, {pattern: /,/gi, type: 'SYMBOL', name: "COMMA"}, {pattern: /\*/gi, type: 'SYMBOL', name: "ASTERISK"}, {pattern: /;/gi, type: 'SYMBOL', name: "SEMICOLON"}, {pattern: /=/gi, type: 'SYMBOL', name: "EQUALS"}, {pattern: />/gi, type: 'SYMBOL', name: "GREATER THAN"}, {pattern: /</gi, type: 'SYMBOL', name: "LESS THAN"}, // KEYWORDs {pattern: /primary key/gi, type: 'KEYWORD', name: "PRIMARY KEY"}, {pattern: /unique key/gi, type: 'KEYWORD', name: "UNIQUE KEY"}, {pattern: /values/gi, type: 'KEYWORD', name: "VALUES"}, {pattern: /from/gi, type: 'KEYWORD', name: "FROM"}, {pattern: /auto_increment/gi, type: 'KEYWORD', name: "AUTO_INCREMENT"}, {pattern: /ignore/gi, type: 'KEYWORD', name: "IGNORE"}, {pattern: /into/gi, type: 'KEYWORD', name: "INTO"}, {pattern: /all/gi, type: 'KEYWORD', name: "ALL"}, {pattern: /distinct/gi, type: 'KEYWORD', name: "DISTINCT"}, {pattern: /distinctrow/gi, type: 'KEYWORD', name: "DISTINCTROW"}, {pattern: /where/gi, type: 'KEYWORD', name: "WHERE"}, {pattern: /and/gi, type: 'KEYWORD', name: "AND"}, {pattern: /like/gi, type: 'KEYWORD', name: "LIKE"}, {pattern: /order by/gi, type: 'KEYWORD', name: "ORDER BY"}, {pattern: /or/gi, type: 'KEYWORD', name: "OR"}, {pattern: /limit/gi, type: 'KEYWORD', name: "LIMIT"}, {pattern: /offset/gi, type: 'KEYWORD', name: "OFFSET"}, {pattern: /asc/gi, type: 'KEYWORD', name: "ASC"}, {pattern: /desc/gi, type: 'KEYWORD', name: "DESC"}, {pattern: /set/gi, type: 'KEYWORD', name: "SET"}, // DIRECTIVEs {pattern: /create table/gi, type: 'DIRECTIVE', name: "CREATE TABLE"}, {pattern: /insert/gi, type: 'DIRECTIVE', name: "INSERT"}, {pattern: /delete from/gi, type: 'DIRECTIVE', name: "DELETE FROM"}, {pattern: /drop table/gi, type: 'DIRECTIVE', name: "DROP TABLE"}, {pattern: /update/gi, type: 'DIRECTIVE', name: "UPDATE"}, {pattern: /select/gi, type: 'DIRECTIVE', name: "SELECT"}, // IDENTIFIERs are developer specified so should be checked last {pattern: /`[0-9a-zA-Z$_]*[0-9a-zA-Z$_]`/gi, type: 'IDENTIFIER', name: "QTD IDENTIFIER"}, {pattern: /[0-9a-zA-Z$_]*[0-9a-zA-Z$_]/gi, type: 'IDENTIFIER', name: "UNQTD IDENTIFIER"} ]; // This is a polyfill required by jSQLToken var jSQL = { types: { exists: type=>-1<[ "VARCHAR", "INT", "CHAR", "BIGINT", "MEDIUMINT", "SMALLINT", "ENUM", "NUMBER", "JSON", "FUNCTION", "DATE" ].indexOf(type.toUpperCase()) } }; // The token constructor function jSQLToken(pos, literal, tok_index){ this.type_id = tok_index; this.input_pos = pos; this.literal = literal; this.value = literal; this.length = literal.length; this.type = jSQLLexer.token_types[tok_index].type; this.name = jSQLLexer.token_types[tok_index].name; if(this.type === "IDENTIFIER" && this.name === "UNQTD IDENTIFIER" && jSQL.types.exists(this.literal)) this.name = "DATA TYPE"; if(this.type === "IDENTIFIER" && this.name === "QTD IDENTIFIER") this.value = literal.replace(/`/g,''); if(this.type === "STRING" && this.name === "DQ STRING") this.value = literal.substr(1, literal.length - 2).replace(/\"/g, '"'); if(this.type === "STRING" && this.name === "SQ STRING") this.value = literal.substr(1, literal.length - 2).replace(/\'/g, "'"); if(this.type === "NUMBER") this.value = parseFloat(this.literal); } function jSQL_Lexer_Error(pos, context) { var max_ellipse_len = 25; var ellipse_len = context.length > pos + max_ellipse_len ? max_ellipse_len : context.length - pos; var preview = context.substr(pos, ellipse_len); this.error = "0070"; this.message = "Unknown token near char "+pos+" of "+context.length+" \""+preview+"\"."; this.stack = undefined; var e = new Error(); if(e.stack) this.stack = e.stack; this.toString = function () { return "jSQL Lexer Error #"+this.error+" - "+this.message; }; } const drawOutput = sql => { let tokens = new jSQLLexer(sql).getTokens(), output=[]; tokens.forEach((token,i)=>output.push("<span class='token' data-token-id='"+i+"'>"+token.literal+"</span>")); output.push("<div id=o></div>"); $("body").html(output.join(" ")); $(".token").mouseover(function(){ let o = []; for(let i in tokens[$(this).data("token-id")]) if(["string", "number"].indexOf(typeof tokens[$(this).data("token-id")][i]) > -1) o.push("<b>"+i+"</b> = "+tokens[$(this).data("token-id")][i]); $("#o").html('<ul><li>'+(o.join('</li><li>'))+'</li></ul>'); $(this).css({color: "red"}); }); $(".token").mouseout(function(){ $("#o").empty(); $(this).css({color: "black"}); }); }; drawOutput(`CREATE TABLE IF NOT EXISTS someTable ( name varchar(3), numba2 smallint(3), numba3 mediumint(3), numba4 bigint(3), type enum('type1', "type2", "type3") )`);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
exp()
: it undoubtedly teaches you a lot but does it give you any insight at all about programming problems which involve exponentiation?\$\endgroup\$