As I introduced yesterday, a SQL parser consists of a lexical scanner and a grammar rule parser.

I will implement a basic SQL first, which supports a simplified version of SQL syntax. How basic is it? I use the W3Schools SQL Tutorial as the standard, which means I will support most of the syntax in that tutorial. The more complex syntax beyond it will not be supported for now.

Today, I am studying how to implement the lexical scanner. There are many articles about how a lexical scanner works, so I will skip the introduction here and just mention a few good ones: “Gentle introduction into compilers. Part 1: Lexical analysis and Scanner” by @maxim_koretskyi, and “Lexical Analysis” by Faiçal Tchirou.

How MySQL Does It

Before we start, it’s good to see how other compilers implement their lexical scanners. Finite state machines (finite automata) are the standard model for lexical analysis. Many projects generate their scanners with tools like GNU Flex; MySQL, however, uses a handwritten scanner for better performance and flexibility.

MySQL keeps its keywords in sql/lex.h:

!FILENAME mysql_server/sql/lex.h

static const SYMBOL symbols[] = {
    /*
     Insert new SQL keywords after that commentary (by alphabetical order):
    */
    {SYM("&&", AND_AND_SYM)},
    {SYM("<", LT)},
    {SYM("<=", LE)},
    {SYM("<>", NE)},
    {SYM("!=", NE)},
    {SYM("=", EQ)},
    {SYM(">", GT_SYM)},
    {SYM(">=", GE)},
    {SYM("<<", SHIFT_LEFT)},
    {SYM(">>", SHIFT_RIGHT)},
    {SYM("<=>", EQUAL_SYM)},
    {SYM("ACCESSIBLE", ACCESSIBLE_SYM)},
    {SYM("ACCOUNT", ACCOUNT_SYM)},
    {SYM("ACTION", ACTION)},
    {SYM("ADD", ADD)},
    {SYM("ADMIN", ADMIN_SYM)},
    {SYM("AFTER", AFTER_SYM)},
    {SYM("AGAINST", AGAINST)},
    // ...
};
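
StellarSQL could take the same table-driven approach for keyword lookup. Here is a minimal Rust sketch; the Token variants and function name are my own assumptions, not StellarSQL’s actual API:

#[derive(Debug, Clone, Copy, PartialEq)]
enum Token {
    Select,
    From,
    Where,
    Insert,
    Identifier,
}

// Keyword table kept in alphabetical order, in the spirit of MySQL's symbols[].
static KEYWORDS: &[(&str, Token)] = &[
    ("FROM", Token::From),
    ("INSERT", Token::Insert),
    ("SELECT", Token::Select),
    ("WHERE", Token::Where),
];

// Look up a scanned word; anything not in the table is an identifier.
fn lookup_keyword(word: &str) -> Token {
    let upper = word.to_uppercase();
    KEYWORDS
        .iter()
        .find(|(kw, _)| *kw == upper.as_str())
        .map(|(_, tok)| *tok)
        .unwrap_or(Token::Identifier)
}

A linear scan is fine for a handful of keywords; a HashMap or a binary search over the sorted table would be the obvious next step.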

The entry point for MySQL’s lexical scanner is yylex() in sql/sql_lex.cc, where yy is just the conventional prefix used by Lex/Yacc-style tools.

!FILENAME mysql_server/sql/sql_lex.cc

static int lex_one_token(YYSTYPE *yylval, THD *thd) {
  uchar c = 0;
  bool comment_closed;
  int tokval, result_state;
  uint length;
  enum my_lex_states state;
  Lex_input_stream *lip = &thd->m_parser_state->m_lip;
  const CHARSET_INFO *cs = thd->charset();
  const my_lex_states *state_map = cs->state_maps->main_map;
  const uchar *ident_map = cs->ident_map;

  lip->yylval = yylval;  // The global state

  lip->start_token();
  state = lip->next_state;
  lip->next_state = MY_LEX_START;
  for (;;) {
    switch (state) {
      case MY_LEX_START:  // Start of token
        // Skip starting whitespace
        while (state_map[c = lip->yyPeek()] == MY_LEX_SKIP) {
          if (c == '\n') lip->yylineno++;

          lip->yySkip();
        }

        /* Start of real token */
        lip->restart_token();
        c = lip->yyGet();
        state = state_map[c];
        break;
      case MY_LEX_CHAR:  // Unknown or single char token
      // ...
    }
  }
}
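
The whole function is a loop over a state machine: the state_map lookup classifies each character, and the switch advances from state to state until one token is complete. A StellarSQL scanner could follow the same shape. Below is a simplified Rust sketch of that loop — my own approximation, not a translation of MySQL’s logic:

#[derive(Clone, Copy)]
enum LexState {
    Start,  // deciding the token class from the first character
    Word,   // inside a keyword or identifier
    Number, // inside an integer literal
    Done,
}

// Scan one token, returning (token_text, position_after_token),
// or None at end of input.
fn lex_one_token(input: &[u8], mut pos: usize) -> Option<(String, usize)> {
    // Skip leading whitespace, like MySQL's MY_LEX_SKIP handling.
    while pos < input.len() && input[pos].is_ascii_whitespace() {
        pos += 1;
    }
    if pos >= input.len() {
        return None;
    }

    let start = pos;
    let mut state = LexState::Start;
    loop {
        match state {
            LexState::Start => {
                // The first character decides the next state, playing
                // the role of MySQL's state_map[c] lookup.
                state = match input[pos] {
                    b'0'..=b'9' => LexState::Number,
                    c if c.is_ascii_alphabetic() || c == b'_' => LexState::Word,
                    _ => {
                        pos += 1; // single-character token (operator etc.)
                        LexState::Done
                    }
                };
            }
            LexState::Word => {
                while pos < input.len()
                    && (input[pos].is_ascii_alphanumeric() || input[pos] == b'_')
                {
                    pos += 1;
                }
                state = LexState::Done;
            }
            LexState::Number => {
                while pos < input.len() && input[pos].is_ascii_digit() {
                    pos += 1;
                }
                state = LexState::Done;
            }
            LexState::Done => {
                let text = String::from_utf8_lossy(&input[start..pos]).into_owned();
                return Some((text, pos));
            }
        }
    }
}

Calling this in a loop yields tokens one at a time, the same way MySQL’s parser calls yylex() repeatedly.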

How TypeScript Does It

It is also worth reading the scanner of the TypeScript compiler, which lives in TypeScript/src/compiler/scanner.ts:

!FILENAME TypeScript/src/compiler/scanner.ts

    // ...

    const textToToken = createMapFromTemplate<SyntaxKind>({
        ...textToKeywordObj,
        "{": SyntaxKind.OpenBraceToken,
        "}": SyntaxKind.CloseBraceToken,
        "(": SyntaxKind.OpenParenToken,
        ")": SyntaxKind.CloseParenToken,
        "[": SyntaxKind.OpenBracketToken,
        "]": SyntaxKind.CloseBracketToken,
        ".": SyntaxKind.DotToken,
        "...": SyntaxKind.DotDotDotToken
        // ...
        // ...
    });
    // ...

    // Creates a scanner over a (possibly unspecified) range of a piece of text.
    export function createScanner(languageVersion: ScriptTarget,
                                  skipTrivia: boolean,
                                  languageVariant = LanguageVariant.Standard,
                                  textInitial?: string,
                                  onError?: ErrorCallback,
                                  start?: number,
                                  length?: number): Scanner {
        let text = textInitial!;

        // Current position (end position of text of current token)
        let pos: number;

        // end of text
        let end: number;

        // Start position of whitespace before current token
        let startPos: number;

        // Start position of text of current token
        let tokenPos: number;

        let token: SyntaxKind;
        let tokenValue!: string;
        let tokenFlags: TokenFlags;

        let inJSDocType = 0;

        setText(text, start, length);

        return {
            getStartPos: () => startPos,
            getTextPos: () => pos,
            getToken: () => token,
            getTokenPos: () => tokenPos,
            getTokenText: () => text.substring(tokenPos, pos),
            getTokenValue: () => tokenValue,
            hasExtendedUnicodeEscape: () => (tokenFlags & TokenFlags.ExtendedUnicodeEscape) !== 0,
            hasPrecedingLineBreak: () => (tokenFlags & TokenFlags.PrecedingLineBreak) !== 0,
            isIdentifier: () => token === SyntaxKind.Identifier || token > SyntaxKind.LastReservedWord,
            isReservedWord: () => token >= SyntaxKind.FirstReservedWord && token <= SyntaxKind.LastReservedWord,
            isUnterminated: () => (tokenFlags & TokenFlags.Unterminated) !== 0,
            getTokenFlags: () => tokenFlags,
            reScanGreaterToken,
            reScanSlashToken,
            reScanTemplateToken,
            scanJsxIdentifier,
            scanJsxAttributeValue,
            reScanJsxToken,
            scanJsxToken,
            scanJSDocToken,
            scan,
            getText,
            setText,
            setScriptTarget,
            setLanguageVariant,
            setOnError,
            setTextPos,
            setInJSDocType,
            tryScan,
            lookAhead,
            scanRange,
        };
    // ...
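
Notice that createScanner returns a set of closures that all share the mutable scan state (pos, tokenPos, token, and so on). In Rust, where StellarSQL lives, the natural equivalent is a struct that owns that state and exposes accessor methods — a rough sketch with hypothetical names:

// A struct owning the scan state, standing in for the closed-over
// variables in createScanner. All names here are my own.
struct Scanner {
    text: String,
    pos: usize,       // end position of the current token (getTextPos)
    token_pos: usize, // start position of the current token (getTokenPos)
}

impl Scanner {
    fn new(text: String) -> Self {
        Scanner { text, pos: 0, token_pos: 0 }
    }

    // Mirrors getTokenText(): the slice between the token boundaries.
    fn token_text(&self) -> &str {
        &self.text[self.token_pos..self.pos]
    }
}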

Tomorrow I will start implementing the lexical scanner of StellarSQL.