Simple Parser Framework Library

Peter S. Housel

March 2003, January 2004


Copyright

This is the Monday simple-parser library, a framework for conveniently generating lexical scanners and parsers.

Copyright ©2003–2004 Peter S. Housel.

This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA


Introduction

Many tools are available for building lexical scanners and phrase-grammar parsers. Each of these tools has its own advantages and disadvantages, and imposes its own constraints on the user's program. A common constraint is that the parser specification is written using a different language that needs to be translated using a separate tool before it can be integrated into the rest of the program.

Recursive-descent parsers can be written by hand for languages in LL(1) without additional tools. The code for a recursive-descent parser is generally not as succinct as the input to a specialized parser generator, and rewriting grammars to conform to LL(1) can be inconvenient.

This library is designed to provide the following features:

The library makes use of the Monday regular, grammar, and source-location libraries.

<Module Dylan-user>=

define library simple-parser
  use common-dylan;
  use collections;
  use regular;
  use grammar;
  use source-location;
  <Exports for the simple-parser library>
end library;
          
<Module definitions for the simple-parser library>

Public Interface

The simple-parser module exports the symbols that constitute the primary public interface, containing the classes, macros and functions that most client programs will use.

<Exports for the simple-parser library>=

export simple-parser;
          

<Module definitions for the simple-parser library>=

define module simple-parser
  create
    <simple-lexical-definition>,
    \simple-lexical-definition,
    <simple-lexical-scanner>,
    scan-tokens,
    simple-grammar-productions,
    simple-parser-automaton,
    <simple-parser>,
    simple-parser-reset,
    simple-parser-consume-token,
    simple-parser-can-consume-token?,
    simple-parser-source-location;
end module;
          

Lexical Definitions

A lexical definition contains the names of all of the defined grammar terminals, their declared semantic types, and their definitions in regular expression notation. It also contains the Deterministic Finite Automaton (DFA) derived from the set of lexical definitions.

<Exports for the simple-parser library>+=

export simple-lexical-definition;
          

<Module definitions for the simple-parser library>+=

define module simple-lexical-definition
  use common-dylan;
  use bit-set;
  use regular-expression,
    rename: { regular-expression-dfa-state-transitions
                => lexical-state-transitions },
    export: { lexical-state-transitions };
  use simple-parser;
  export
    lexical-token-number,
    lexical-token-type,
    lexical-automaton,
    <simple-lexical-state>,
    lexical-state-accept-function;
  create
    token-accept-function;
end module;
          

<simple-lexical-definition>

[Class]


Class of descriptions of lexical tokens.

Superclasses:

<object>

Init-keywords:
clauses:
An instance of <sequence>
Description:

FIXME

Definition

<Module simple-lexical-definition>=

define class <simple-lexical-definition> (<object>)
  constant slot lexical-clauses :: <sequence>,
    required-init-keyword: clauses:;
  slot lexical-automaton :: <regular-expression-dfa-state>;
  <Additional Slots in <simple-lexical-definition>>
end class;
              

simple-lexical-definition

[Macro]


Macro for conveniently defining instances of <simple-lexical-definition>.

Description:

FIXME

Definition

<Module simple-lexical-definition>+=

define macro simple-lexical-definition
  { simple-lexical-definition ?clauses end }
    => { make(<simple-lexical-definition>, clauses: vector(?clauses)) }
clauses:
  { } => { }
  { ?clause; ... } => { ?clause, ... }

clause:
  { name ?:name = ?def:expression }
    => { <Shorthand name> }
  { token ?:name }
    => { <Expansion for a named token> }
  { token ?:name, #rest ?options:expression }
    => { <Expansion for a named token with options> }
  { token ?:name :: ?type:expression }
    => { <Expansion for a token with name and type> }
  { token ?:name :: ?type:expression, #rest ?options:expression }
    => { <Expansion for a token with name, type, and options> }
  { token ?:name = ?def:expression,
      #rest ?options:expression }
    => { <Expansion for a token with name and defining expression> }
  { token ?:name :: ?type:expression = ?def:expression,
      #rest ?options:expression }
    => { <Expansion for a token with name, type, and defining expression> }
  { inert ?def:expression }
    => { <Expansion for an Inert token with defining expression> }
end macro;
              

lexical-token-number

[Generic Function]


Returns the number assigned to a named token.

Signature:

lexical-token-number definition tokennumber

Arguments:
definition
An instance of <simple-lexical-definition>.
token
An instance of <symbol>.
Values:
number
An instance of <integer>.
Description:

FIXME

Definition:

<Module simple-lexical-definition>+=

define method lexical-token-number
    (definition :: <simple-lexical-definition>, token :: <symbol>)
 => (number :: <integer>);
  <Return the token number for token in definition>
end method;
              

lexical-token-type

[Generic Function]


Returns the type declared for a named token.

Signature:

lexical-token-type definition tokentype

Arguments:
definition
An instance of <simple-lexical-definition>.
token
An instance of <symbol>.
Values:
type
An instance of <type>, or #f.
Description:

FIXME

Definition:

<Module simple-lexical-definition>+=

define method lexical-token-type
    (definition :: <simple-lexical-definition>, token :: <symbol>)
 => (number :: false-or(<type>));
  <Return the declared token type for token in definition>  
end method;
              

Token Information

Clauses in a <simple-lexical-definition> record the name, declared type, and defining regular-expression for each listed token. This and additional information will be stored in <token-clause> objects.

<Module simple-lexical-definition>+=

define class <token-clause> (<object>)
  constant slot token-recognized? :: <boolean>,
    init-value: #t, init-keyword: recognized?:;
  constant slot token-name :: false-or(<symbol>),
    init-value: #f, init-keyword: name:;
  constant slot token-type :: false-or(<type>),
    init-value: #f, init-keyword: type:;
  constant slot token-priority :: <integer>,
    init-value: 0, init-keyword: priority:;
  constant slot token-regular-expression :: false-or(<byte-string>),
    init-value: #f, init-keyword: regular-expression:;
  constant slot token-semantic-value-function :: false-or(<function>),
    init-value: #f, init-keyword: semantic-value-function:;
  <Additional slots in <token-clause>>
end class;
            

<Shorthand name>=

make(<token-clause>,
     recognized?: #f, name: ?#"name", regular-expression: ?def)
            

<Expansion for a named token>=

make(<token-clause>, name: ?#"name")
            

<Expansion for a named token with options>=

make(<token-clause>, name: ?#"name", ?options)
            

<Expansion for a token with name and type>=

make(<token-clause>, name: ?#"name", type: ?type)
            

<Expansion for a token with name, type, and options>=

make(<token-clause>, name: ?#"name", type: ?type, ?options)
            

<Expansion for a token with name and defining expression>=

make(<token-clause>,
     name: ?#"name", regular-expression: ?def, ?options)
            

<Expansion for a token with name, type, and defining expression>=

make(<token-clause>,
     name: ?#"name", type: ?type, regular-expression: ?def, ?options)
            

<Expansion for an Inert token with defining expression>=

make(<token-clause>, regular-expression: ?def)
            

Parsing Regular Expressions

We'll use an ordinary recursive-descent parser to parse regular expressions. The parse-regular-expression function takes a string and returns a <regular-expression> that represents it.

<Module simple-lexical-definition>+=

define function parse-regular-expression
    (string :: <byte-string>,
     named-regular-expressions :: <object-table>)
 => (node :: <regular-expression>);
  local
    <Local methods for parse-regular-expression>;
  let (node :: <regular-expression>, pos :: <integer>)
      = parse-regexp0(string, 0);
  if (pos < string.size)
    error("regular expression \"%s\" ended prematurely at position %d",
          string, pos)
  end if;
  node;
end function;
            

The | operator has the lowest precedence and is left-associative. We parse for it accordingly using the traditional recursive-descent parser.

The only thing special about our recursive-descent parser is the fact that we're using multiple return values to return both a parse tree node and an index value for the next unparsed character in the regular expression. Callers pass in a start argument to indicate where parsing should start, and the regular expression parsing routines return the pos index.

<Local methods for parse-regular-expression>=

method parse-regexp0
    (string :: <byte-string>, start :: <integer>)
 => (node :: <regular-expression>, pos :: <integer>);
  let (node :: <regular-expression>, pos :: <integer>)
    = parse-regexp1(string, start);
  while (pos < string.size & string[pos] == '|')
    let (new-node :: <regular-expression>, new-pos :: <integer>)
      = parse-regexp1(string, pos + 1);
    node := make(<union-regular-expression>, union1: node, union2: new-node);
    pos := new-pos;
  end while;
  values(node, pos);
end,
            

The next higher precedence operator is the concatenation operator, represented by nothing. Since there isn't an operator token associated with this operator, we have to make sure we don't eat up any lower-precedence operators. Concatenation is also left-associative.

<Local methods for parse-regular-expression>+=

method parse-regexp1
    (string :: <byte-string>, start :: <integer>)
 => (node :: <regular-expression>, pos :: <integer>);
  let (node :: <regular-expression>, pos :: <integer>)
    = parse-regexp2(string, start);
  while (pos < string.size & string[pos] ~== '|' & string[pos] ~== ')')
    let (new-node :: <regular-expression>, new-pos :: <integer>)
      = parse-regexp2(string, pos);
    node := make(<concatenation-regular-expression>,
                 head: node, tail: new-node);
    pos := new-pos;
  end while;
  values(node, pos);
end,
            

The * (Kleene star), + (Kleene plus), and ? (optional occurrence) operators have the next higher precedence. They are unary postfix operators.

<Local methods for parse-regular-expression>+=

method parse-regexp2
    (string :: <byte-string>, start :: <integer>)
 => (node :: <regular-expression>, pos :: <integer>);
  let (node :: <regular-expression>, pos :: <integer>)
    = parse-regexp3(string, start);
  if (pos < string.size)
    if (string[pos] == '*')
      let new-node = make(<closure-regular-expression>, of: node);
      values(new-node, pos + 1);
    elseif (string[pos] == '+')
      let new-node = make(<concatenation-regular-expression>,
                          head: node,
                          tail: make(<closure-regular-expression>,
                                     of: copy-regular-expression(node)));
      values(new-node, pos + 1);
    elseif (string[pos] == '?')
      let new-node = make(<union-regular-expression>,
                          union1: make(<epsilon-regular-expression>),
                          union2: node);
      values(new-node, pos + 1);
    else
      values(node, pos);
    end if;
  else
    values(node, pos);
  end if;
end,
            

Now we're down to the leaves, which are individual characters (and parenthesized subexpressions). Backslash-escapes are recognized, though of course they need to be doubled in Dylan source code string literals.

<Local methods for parse-regular-expression>+=

method parse-regexp3
    (string :: <byte-string>, start :: <integer>)
 => (node :: <regular-expression>, pos :: <integer>);
  if (start >= string.size)
    error("regexp missing at end of '%s'", string);
  else
    if (string[start] == '(')
      <Parse parenthetical regular expression and return>
    elseif (string[start] == '\\' & start < string.size - 1)
      values(make(<symbol-regular-expression>,
                  symbol: as(<integer>, string[start + 1])),
             start + 2);
    elseif (string[start] == '.')
      <Return a regular expression corresponding to .>
    elseif (string[start] == '[')
      <Return a regular expression corresponding to a character class>
    elseif (string[start] == '{')
      <Parse a reference to a named regular expression and return>
    else
      values(make(<symbol-regular-expression>,
                  symbol: as(<integer>, string[start])),
             start + 1);
    end if;
  end if;
end
            

The dot character matches any character except newline.

<Return a regular expression corresponding to .>=

let dot-set = make(<bit-set>, upper-bound-hint: 256);
for (symbol :: <integer> from 0 below 256)
  if (symbol ~= as(<integer>, '\n'))
    set-add!(dot-set, symbol);
  end;
end for;
values(make(<symbol-set-regular-expression>, symbol-set: dot-set),
       start + 1);
            

Character classes also generate sets.

<Return a regular expression corresponding to a character class>=

let cclass-set = make(<bit-set>, upper-bound-hint: 256);
let pos = start + 1;
let complement? = if (pos < string.size & string[pos] = '^')
                    start := start + 1;
                    pos := pos + 1;
                    #t;
                  else #f end if;
while (pos < string.size & (string[pos] ~== ']' | pos = start + 1))
  <Add the character or range of characters to the set>
end while;
if (pos = string.size)
  error("closing ']' missing in regexp '%s'", string);
end if;
if (complement?)
  let complement-set = make(<bit-set>, upper-bound-hint: 256);
  for (symbol :: <integer> from 0 below 256)
    unless (member?(symbol, cclass-set)) add!(complement-set, symbol) end;
  end for;
  values(make(<symbol-set-regular-expression>, symbol-set: complement-set),
         pos + 1);
else
  values(make(<symbol-set-regular-expression>, symbol-set: cclass-set),
         pos + 1);
end;
            

<Add the character or range of characters to the set>=

if (pos + 2 < string.size & string[pos + 1] == '-')
  for (symbol :: <integer> from as(<integer>, string[pos])
                                 to as(<integer>, string[pos + 2]))
    set-add!(cclass-set, symbol);
  end for;
  pos := pos + 3;
else
  set-add!(cclass-set, as(<integer>, string[pos]));
  pos := pos + 1;
end;
            

As lex does, we allow references to named regular expressions, with the name enclosed in curly braces.

<Parse a reference to a named regular expression and return>=

let name-start = start + 1;
let name-end
  = for(i from name-start below string.size, until: string[i] == '}')
    finally i;
    end for;
let name
  = as(<symbol>, copy-sequence(string, start: name-start, end: name-end));
values(copy-regular-expression(named-regular-expressions[name]), name-end + 1);
            

To parse a parenthetical expression, we go all the way back to the beginning (where low-precedence things are parsed).

<Parse parenthetical regular expression and return>=

let (node :: <regular-expression>, pos :: <integer>)
    = parse-regexp0(string, start + 1);
if (pos >= string.size | string[pos] ~== ')')
  error("closing ')' missing in regular expression '%s'", string);
else
  values(node, pos + 1);
end if;
            

Computing Token Numbers, Token Accept Actions, and the Automaton

At initialization time, we can assign token numbers to each of the named tokens, compute a deterministic finite automaton that will recognize all of the described tokens, and represent the automaton as an array. We will also maintain a table mapping token names to their corresponding <token-clause< entries, and a vector mapping token numbers to same.

<Additional Slots in <simple-lexical-definition>>=

  constant slot lexical-clauses-by-token = make(<object-table>);
  constant slot lexical-clauses-by-token-number = make(<stretchy-vector>);
            

<Additional slots in <token-clause>>=

  slot token-number :: false-or(<integer>), init-value: #f;
            

<Return the token number for token in definition>=

definition.lexical-clauses-by-token[token].token-number;
            

<Return the declared token type for token in definition>=

let clause = element(definition.lexical-clauses-by-token, token, default: #f);
if (clause)
  clause.token-type
else
  error("%s is not defined as a token", token);
end if

            

<Module simple-lexical-definition>+=

  
define sealed method initialize
    (definition :: <simple-lexical-definition>, #key)
 => ();
  next-method();
  let named-regular-expressions = make(<object-table>);
  let recognizer-regular-expression = make(<epsilon-regular-expression>);
  for (clause :: <token-clause> in definition.lexical-clauses)
    if (clause.token-name)
      definition.lexical-clauses-by-token[clause.token-name] := clause;
      if (clause.token-recognized?)
        clause.token-number := definition.lexical-clauses-by-token-number.size;
        add!(definition.lexical-clauses-by-token-number, clause);
      end if;
    end if;
    if (clause.token-regular-expression)
      let regex
        = parse-regular-expression(clause.token-regular-expression,
                                   named-regular-expressions);
      if (clause.token-name)
        named-regular-expressions[clause.token-name] := regex;
      end if;
      if (clause.token-recognized?)
        <Incorporate regex into recognizer-regular-expression>
      end if;
    end if;
  end for;
  <Construct a DFA from recognizer-regular-expression>
end method;
            

We take each regular expression in the specification, concatenate it with a corresponding accept-action node, and “or” all of the specifications together. This regular expression, when compiled, results in a state machine that can recognize any of the given alternatives.

Note that we started off our loop with an epsilon-node as a sort of sentinel, since it makes the logic simpler and doesn't affect anything once the other nodes are or-ed in.

The accept-function we will associate with accepting states will be tailored to work with our actual lexical scanner implementation.

<Incorporate regex into recognizer-regular-expression>=

let accept-function
  = token-accept-function(clause.token-number,
                          clause.token-name,
                          clause.token-semantic-value-function);
let clause-regular-expression
  = make(<concatenation-regular-expression>,
         head: regex,
         tail: make(<simple-accept-regular-expression>,
                    clause: clause,
                    accept-function: accept-function));
recognizer-regular-expression
  := make(<union-regular-expression>,
          union1: recognizer-regular-expression,
          union2: clause-regular-expression);
            

The class we use for an accept node contains a reference to the accept action function, as does the class we use for representing states.

<Module simple-lexical-definition>+=

define class <simple-accept-regular-expression>
    (<accept-regular-expression>)
  constant slot simple-accept-regular-expression-clause :: <token-clause>,
    required-init-keyword: clause:;
  constant slot simple-accept-regular-expression-accept-function :: <function>,
    required-init-keyword: accept-function:;
end class;
            
define class <simple-lexical-state> (<regular-expression-dfa-state>)
  slot lexical-state-accept-clause :: false-or(<token-clause>),
    init-value: #f;
  slot lexical-state-accept-function :: false-or(<function>),
    init-value: #f;
end class;  
            
define sealed method do-regular-expression-dfa-state-position
    (state :: <regular-expression-dfa-state>,
     position :: <simple-accept-regular-expression>,
     #key deterministic? = #f)
 => ();
  if (~state.lexical-state-accept-clause
        | state.lexical-state-accept-clause.token-priority
            < position.simple-accept-regular-expression-clause.token-priority)
    state.lexical-state-accept-clause
      := position.simple-accept-regular-expression-clause;
    state.lexical-state-accept-function
      := position.simple-accept-regular-expression-accept-function;    
  elseif(state.lexical-state-accept-clause.token-priority
           = position.simple-accept-regular-expression-clause.token-priority)
    error("ambiguous token rules: '%s' vs. '%s'",
          state.lexical-state-accept-clause.token-regular-expression,
          position.simple-accept-regular-expression-clause
            .token-regular-expression);
          
  end if;
end method;
            

Once we've built up a regular expression that contains all of the token definitions, we convert it to a deterministic finite automaton, and construct a transition table from the automaton.

<Construct a DFA from recognizer-regular-expression>=

definition.lexical-automaton
  := regular-expression-dfa(recognizer-regular-expression,
                            transition-collection-class: 
                              <simple-object-vector>,
                            transition-collection-size: 256,
                            state-class: <simple-lexical-state>);
            

Lexical Scanners

A lexical scanner is a recognizer for the terminal symbols described using a lexical definition.

<Exports for the simple-parser library>+=

export simple-lexical-scanner;
          

<Module definitions for the simple-parser library>+=

define module simple-lexical-scanner
  use common-dylan;
  use byte-vector;
  use simple-parser,
    export: { <simple-lexical-scanner>, scan-tokens };
  use simple-lexical-definition;
  use source-location-rangemap;
  use source-location-conditions;

  export
    scanner-lexical-definition,
    scanner-lexical-definition-setter,
    scanner-source-position,
    scanner-source-position-setter;
end module;
          

<simple-lexical-scanner>

[Class]


Class of lexical scanners based on automata generated via <simple-lexical-definition>.

Superclasses:

<object>

Init-keywords:
definition:
An instance of <simple-lexical-definition>.
rangemap:
An instance of <source-location-rangemap>.
position:
An instance of <integer>. The default is 0.
Description:

FIXME

Definition

<Module simple-lexical-scanner>=

define class <simple-lexical-scanner> (<object>)
  slot scanner-lexical-definition :: <simple-lexical-definition>,
    required-init-keyword: definition:;
  slot scanner-source-position :: <integer>,
    init-value: 0, init-keyword: position:;
  constant slot scanner-rangemap :: <source-location-rangemap>,
    required-init-keyword: rangemap:;
  <Additional slots in <simple-lexical-scanner>>
end class;
              

scan-tokens

[Generic Function]


Scans for lexical tokens and passes recognized tokens to a consumer.

Signature:

scan-tokens scanner consumer-function consumer-data text #key start end partial?

Arguments:
scanner
An instance of <simple-lexical-scanner>.
consumer-function
An instance of <function>.
consumer-data
An instance of <object>.
text
An instance of <byte-string>.
start
An instance of <integer>. The default is 0.
end
An instance of <integer>. The default is the length of text.
partial?
A boolean value.
Values:

None.

Description:

FIXME

Definition:

<Module simple-lexical-scanner>+=

define method scan-tokens
    (scanner :: <simple-lexical-scanner>,
     consumer-function :: <function>,
     consumer-data :: <object>,
     text :: <byte-string>,
     #key start: text-start :: <integer> = 0,
          end: text-end :: <integer> = text.size,
          partial?)
 => ();
  <Initialize scan-tokensstate variables>
  iterate loop(index :: <integer> = text-start,
               state :: <simple-lexical-state> = scanner.scanner-state)
    if (index < text-end)
      let symbol = as(<integer>, text[index]);
      <Transition based on symbol>
    else
      if (partial?)
        <Save state in preparation for the next call>
      else
        <Accept tokens at the end of input if possible>
      end if;
    end if;
  end iterate;
end method;
              

FIXME This would be a good place for a high-level algorithm description.

The scanner begins in the automaton's initial state, with no saved text.

<Additional slots in <simple-lexical-scanner>>=

slot scanner-state :: <simple-lexical-state>;
slot scanner-saved-cr-seen? :: <boolean>, init-value: #f;
slot scanner-saved-text :: <byte-string>, init-value: "";
slot scanner-saved-accepting-state :: false-or(<simple-lexical-state>),
  init-value: #f;
slot scanner-saved-accepting-index :: <integer>, init-value: 0;
          

<Module simple-lexical-scanner>+=

define method initialize
    (scanner :: <simple-lexical-scanner>,
     #key definition :: <simple-lexical-definition>, #all-keys)
 => ();
  next-method();
  scanner.scanner-state := definition.lexical-automaton;
end method;
          

Here we initialize the state variables based on the previous pass. The invariants for scanner-source-position are as follows:

<Initialize scan-tokensstate variables>=

let state = scanner.scanner-state;
let cr-seen? = scanner.scanner-saved-cr-seen?;
let token-start :: <integer>
  = if (empty?(scanner.scanner-saved-text))
      text-start;
    else
      let saved-size = scanner.scanner-saved-text.size;
      scanner.scanner-source-position
        := scanner.scanner-source-position - saved-size;
      -(saved-size);
    end if;
let accepting-state :: false-or(<simple-lexical-state>)
  = scanner.scanner-saved-accepting-state;
let accepting-index :: <integer>
  = scanner.scanner-saved-accepting-index;
          

On each iteration we determine whether the current state is accepting, and whether or not we can transition on the current input character from this state. If not, then we accept the current token and contine.

<Transition based on symbol>=

if (state.lexical-state-accept-function)
  accepting-state := state;
  accepting-index := index;
end if;
let new-state = state.lexical-state-transitions[symbol];
if (new-state)
  <Check for a line boundary and update the source location rangemap accordingly>
  loop(index + 1, new-state);
elseif (accepting-state)
  <Accept the token>
  if (accepting-index < 0)
    //  FIXME 
    error("backtrack to %d isn't yet implemented", accepting-index);
  else
    token-start := accepting-index;
    accepting-state := #f;
    accepting-index := 0;
    loop(token-start, scanner.scanner-lexical-definition.lexical-automaton);
  end if;
else
  <Signal an error for an unrecognized token>
end if;
          

When we see a line boundary we add it to the scanner's source location rangemap. This is slightly complicated by the various different line-ending conventions (CR only, CRLF, and LF only). In particular, when we see LF we know we have seen a line boundary; if we see a CR we wait to see if it appears alone or paired with a LF.

<Check for a line boundary and update the source location rangemap accordingly>=

if (symbol = as(<integer>, '\n'))
  let source-position
    = scanner.scanner-source-position
    + if (token-start < 0)
        index - text-start - token-start
      else
        index - token-start
      end;
  rangemap-add-line(scanner.scanner-rangemap, source-position + 1, #f);
  cr-seen? := #f;
elseif (symbol = as(<integer>, '\r'))
  if (cr-seen?)
    let source-position
      = scanner.scanner-source-position
      + if (token-start < 0)
          index - text-start - token-start
        else
          index - token-start
        end;
    rangemap-add-line(scanner.scanner-rangemap, source-position, #f);
  end if;
  cr-seen? := #t;
elseif (cr-seen?)
  let source-position
    = scanner.scanner-source-position
    + if (token-start < 0)
        index - text-start - token-start
      else
        index - token-start
      end;
  rangemap-add-line(scanner.scanner-rangemap, source-position, #f);
  cr-seen? := #f;
end if;
          

The work of accepting a token and passing it on to the consumer is done by the state's accept function.

<Accept the token>=

accepting-state.lexical-state-accept-function
  (scanner, consumer-function, consumer-data, text, text-start,
   token-start, accepting-index);
          

Token definitions can be divided into three classes: “inert” tokens (such as whitespace) that don't need to be passed on to the consumer; punctuation-type tokens that don't have a semantic value; and token definitions that provide a function for computing the token's semantic value based on the token text. We use three different accept-functions to implement the accept actions in these three cases, because they require different amounts of work. (We want to avoid the cost of allocating a string to contain the token text in the boundary-crossing case when we don't actually need to use it.)

<Module simple-lexical-scanner>+=

define method token-accept-function
    (token-number :: false-or(<integer>),
     token-name :: false-or(<symbol>),
     token-semantic-value-function :: false-or(<function>))
 => (accept-function :: <function>);
  if (token-number)
    if (token-semantic-value-function)
      <Return a function to call when accepting a token with a semantic value function>
    else
      <Return a function to call when accepting a punctuation token>
    end;
  else
    <Return a function to call when accepting an inert token>
  end if;
end method;
          

When there is a semantic value function, the token text needs to be extracted from the input (possibly complicated by the text crossing the boundary between the saved and current input text segments) and passed to the semantic value function. The token number, token name, and the returned semantic value are then passed along to the consumer-function.

<Return a function to call when accepting a token with a semantic value function>=

method
    (scanner :: <simple-lexical-scanner>,
     consumer-function :: <function>,
     consumer-data :: <object>,
     text :: <byte-string>, text-start :: <integer>,
     token-start :: <integer>, accepting-index :: <integer>)
 => ();
let (semantic-value, accept-text-size)
  = if (token-start < 0)
      let saved-size = scanner.scanner-saved-text.size;
      if (accepting-index < 0)
        values(token-semantic-value-function(scanner.scanner-saved-text,
                                             saved-size + token-start,
                                             saved-size + accepting-index),
               accepting-index - token-start);
      else
        let accept-text-size = saved-size + (accepting-index - text-start);
        let accept-text :: <byte-string>
          = make(<byte-string>, size: accept-text-size);
        copy-bytes(scanner.scanner-saved-text, 0, accept-text, 0, saved-size);
        copy-bytes(text, text-start, accept-text, saved-size,
                   accept-text-size - saved-size);
        scanner.scanner-saved-text := "";
        values(token-semantic-value-function(accept-text, 0, accept-text-size),
               accept-text-size);
      end if;
    else
      values(token-semantic-value-function(text, token-start, accepting-index),
             accepting-index - token-start);
    end if;
  let start-position = scanner.scanner-source-position;
  let next-start-position = start-position + accept-text-size;
  scanner.scanner-source-position := next-start-position;
  consumer-function(consumer-data, token-number, token-name, semantic-value,
                    start-position, next-start-position - 1);
end;
          

When the token has no semantic value, we only need to pass the token number and token name to the consumer function.

<Return a function to call when accepting a punctuation token>=

method
    (scanner :: <simple-lexical-scanner>,
     consumer-function :: <function>,
     consumer-data :: <object>,
     text, text-start :: <integer>,
     token-start :: <integer>, accepting-index :: <integer>)
 => ();
  let accept-text-size
    = if (token-start < 0 & accepting-index >= 0)
        scanner.scanner-saved-text := "";
        accepting-index - text-start - token-start;
      else
        accepting-index - token-start;
      end if;
  let start-position = scanner.scanner-source-position;
  let next-start-position = start-position + accept-text-size;
  scanner.scanner-source-position := next-start-position;
  consumer-function(consumer-data, token-number, token-name, #f,
                    start-position, next-start-position - 1);
end;
          

For inert tokens, we don't need to do anything but clear the scanner-saved-text when the token text crosses the boundary between the saved and current text segments.

<Return a function to call when accepting an inert token>=

method
    (scanner :: <simple-lexical-scanner>,
     consumer-function :: <function>,
     consumer-data :: <object>,
     text, text-start :: <integer>,
     token-start :: <integer>, accepting-index :: <integer>)
 => ();
  if (token-start < 0 & accepting-index >= 0)
    scanner.scanner-saved-text := "";
    scanner.scanner-source-position
      := scanner.scanner-source-position
      + accepting-index - text-start - token-start;
  else
    scanner.scanner-source-position
      := scanner.scanner-source-position
      + accepting-index - token-start
  end if;
end;
          

When we've finished scanning input text, if we are not at the end of the input stream we need to save state for the next call to scan-tokens. First we record the automaton state, the carriage-return state, and the source-location position.

<Save state in preparation for the next call>=

scanner.scanner-state := state;
scanner.scanner-saved-cr-seen? := cr-seen?;
          

If at the end of the input text we are in the middle of recognizing a token, we need to save the as-yet unaccepted portion.

<Save state in preparation for the next call>+=

if (token-start < 0)
  let saved-size = scanner.scanner-saved-text.size;
  let new-saved-size = saved-size + (text-end - text-start);
  let new-saved-text :: <byte-string>
    = make(<byte-string>, size: new-saved-size);
  copy-bytes(scanner.scanner-saved-text, saved-size + token-start,
             new-saved-text, 0, saved-size);
  copy-bytes(text, text-start,
             new-saved-text, saved-size, text-end - text-start);
  scanner.scanner-saved-text := new-saved-text;
  scanner.scanner-source-position
    := scanner.scanner-source-position + new-saved-size;
elseif (token-start ~= text-end)
  let new-saved-size = text-end - token-start;
  let new-saved-text :: <byte-string>
    = make(<byte-string>, size: new-saved-size);
  copy-bytes(text, token-start, new-saved-text, 0, text-end - token-start);
  scanner.scanner-saved-text := new-saved-text;
  scanner.scanner-source-position
    := scanner.scanner-source-position + new-saved-size;
end if;
          

We also need to save the last position within the saved text that could be an accepting state.

<Save state in preparation for the next call>+=

if (accepting-state)
  scanner.scanner-saved-accepting-state := accepting-state;
  if (accepting-index < 0)
    scanner.scanner-saved-accepting-index
      := accepting-index - (text-end - text-start);
  else
    scanner.scanner-saved-accepting-index := accepting-index - text-end;
  end if;
else
  scanner.scanner-saved-accepting-state := #f;
  scanner.scanner-saved-accepting-index := 0;
end if;
          

If we reach the end of the text and there will be no more input, we ensure that the unprocessed input at the end is a valid token.

<Accept tokens at the end of input if possible>=

if (state.lexical-state-accept-function)
  state.lexical-state-accept-function
    (scanner, consumer-function, consumer-data, text, text-start,
     token-start, index);
elseif (accepting-state)
  accepting-state.lexical-state-accept-function
    (scanner, consumer-function, consumer-data, text, text-start,
     token-start, accepting-index);
  error("unrecognized character at %d-%d\n", accepting-index, index);
elseif (token-start ~= index)
  error("unrecognized character at %d-%d\n", token-start, index);
end if;
          

<Signal an error for an unrecognized token>=

let source-position
  = scanner.scanner-source-position
  + if (token-start < 0)
      index - text-start - token-start
    else
      index - token-start
    end;
source-error(range-source-location(scanner.scanner-rangemap,
                                   source-position,
                                   source-position),
             "unrecognized character '%c'", text[index]);
          

Grammar Production Definitions and Parser Automata

Now that we can define the lexical syntax of a language and generate a scanner, we can turn to the definition of phrase grammars, and the automatic generation of parser semantic actions for building abstract syntax trees. This involves a bit of simple type inference as well.

<Exports for the simple-parser library>+=

export simple-parser-automaton;
          

<Module definitions for the simple-parser library>+=

define module simple-parser-automaton
  use common-dylan;
  use set;
  use simple-parser;
  use simple-lexical-definition;
  use grammar;
  use parser-automaton;
  use source-location;
  export
    <simple-production>,
    production-reduce-action;
  create
    \production-user-reduce-action-function,
    production-auto-reduce-action-function;
end module;
          

simple-grammar-productions

[Macro]


Macro for conveniently defining collections of grammar productions.

Description:

The simple-grammar-production macro returns a <vector> of <production> objects, from which a parser automaton can be produced using the simple-parser-automaton function.

Definition

<Module simple-parser-automaton>=

define macro simple-grammar-productions
  { simple-grammar-productions ?clauses end }
    => { vector(?clauses) }

clauses:
  { } => { }

  { production ?:name => [?symbols] (?variables:*)
      ?:body ... }
    => { <Expansion constructing a production given ?name, right-hand-side ?symbols, action ?variables, and a user-provided ?body>, ... }
  { production ?:name :: ?type:expression => [?symbols] (?variables:*)
      ?:body ... }
    => { <Expansion constructing a production given ?name, a declared ?type, right-hand-side ?symbols, action ?variables, and a user-provided ?body>, ... }

  { production ?:name => [?symbols], #rest ?options:expression; ... }
    => { <Expansion constructing a production given ?name and right-hand-side ?symbols, and ?options>, ... }
  { production ?:name :: ?type:expression => [?symbols],
      #rest ?options:expression; ... }
    => { <Expansion constructing a production given ?name, a declared ?type right-hand-side ?symbols, and ?options>, ... }

  { make-production ?:name :: ?type:expression => [?symbols],
      #rest ?initializers:expression; ... }
    => { <Expansion constructing a production given ?name, a declared ?type right-hand-side ?symbols, and a constructor using ?initializers>, ... }

symbols:
  { } => { }
  { ?symbol:name ... } => { ?symbol ... }
end macro;
              

simple-parser-automaton

[Function]


Constructs a parser automaton from a set of grammar productions defined using simple-grammar-productions.

Signature:

simple-parser-automaton lexical-definition productions start-symbols #key end-symbolautomaton

Arguments:
lexical-definition
An instance of <simple-lexical-definition>.
productions
An instance of <sequence>.
start-symbols
An instance of <sequence>.
end-symbol
An instance of <symbol>. The default value is #"EOF".
Values:
automaton
An instance of <lr-parser-automaton>.
Description:

FIXME The automaton we produce will contain semantic actions that will build an abstract syntax tree (AST).

Definition:

<Module simple-parser-automaton>+=

define method simple-parser-automaton
    (lexical-definition :: <simple-lexical-definition>,
     productions :: <sequence>,
     start-symbols :: <sequence>,
     #key end-symbol :: <symbol> = #"EOF",
          class :: <symbol> = #"LALR-1")
 => (automaton :: <lr-parser-automaton>);
  let grammar = make(<grammar>, productions: productions);
  <Classify productions in grammar>
  <Generate reduce actions for productions>
  make(<parser-automaton>,
       grammar: grammar,
       start-symbols: start-symbols,
       end-symbol: end-symbol,
       class: class);
end method;
              

Implementing Grammar Productions

We need to provide our own <production> class in order to include the declared type (if any) of the nonterminal on the left-hand side, along with additional information needed for generating semantic actions.

<Module simple-parser-automaton>+=

define class <simple-production> (<production>)
  constant slot production-nonterminal-type :: false-or(<type>),
    init-value: #f, init-keyword: nonterminal-type:;
  slot production-reduce-action :: <function>,
    init-keyword: action:;
end class;
            

Clauses of the simple-grammar-productions macro will construct instances of this class. We'll cover the simple cases first, namely those without a specified action.

<Expansion constructing a production given ?name and right-hand-side ?symbols, and ?options>=

make(<simple-production>,
     nonterminal: ?#"name",
     derives: simple-production-derives(?symbols),
     ?options)
            

<Expansion constructing a production given ?name, a declared ?type right-hand-side ?symbols, and ?options>=

make(<simple-production>,
     nonterminal: ?#"name",
     nonterminal-type: ?type,
     derives: simple-production-derives(?symbols),
     ?options)
            

The simple-production-derives auxiliary macro turns a sequence of names into a corresponding vector of symbols.

<Module simple-parser-automaton>+=

define macro simple-production-derives
  { simple-production-derives(?symbols) } => { #[?symbols] }
symbols:
  { } => { }
  { ?symbol:name ... } => { ?#"symbol", ... }
end macro;
            

For cases with a user-sepcified action, we rely on the production-user-reduce-action-function macro (to be defined later) to wrap the action in an appropriate method.

<Expansion constructing a production given ?name, right-hand-side ?symbols, action ?variables, and a user-provided ?body>=

make(<simple-production>,
     nonterminal: ?#"name",
     derives: simple-production-derives(?symbols),
     action: production-user-reduce-action-function([]; [?symbols];
                                                    [?variables]; ?body))
            

<Expansion constructing a production given ?name, a declared ?type, right-hand-side ?symbols, action ?variables, and a user-provided ?body>=

make(<simple-production>,
     nonterminal: ?#"name",
     nonterminal-type: ?type,
     derives: simple-production-derives(?symbols),
     action: production-user-reduce-action-function([result :: ?type];
                                                    [?symbols];
                                                    [?variables]; ?body))
            

<Expansion constructing a production given ?name, a declared ?type right-hand-side ?symbols, and a constructor using ?initializers>=

make(<simple-production>,
     nonterminal: ?#"name",
     nonterminal-type: ?type,
     derives: simple-production-derives(?symbols),
     action:
       if (subtype?(?type, <source-location-mixin>))
         production-user-reduce-action-function([result :: ?type]; [?symbols];
                                                [data, srcloc];
                                                make(?type,
                                                     source-location: srcloc,
                                                     ?initializers))
       else
         production-user-reduce-action-function([result :: ?type]; [?symbols];
                                                []; make(?type, ?initializers))
       end)
                                                           
            

Classifying Grammar Productions

Before we can generate code for the parser semantic actions, it is helpful to first classify the nonterminals and grammar productions into various categories. Nonterminals can be classified by whether or not they have any productions that are directly left-recursive. With a bottom-up (LR) parser, sequences of similar items are normally recognized using left-recursive grammar productions.

Grammar productions can be divided into the following categories:

Sequence-building productions
Productions for nonterminals that have one or more right-recursive productions will construct or add to sequences.
Pass-through productions
Productions without an explicitly declared type and whose right-hand sides have a single terminal or nonterminal with a semantic value pass that semantic value through.
Inert productions
Productions without an explicitly declared type and no symbols with a semantic value on their right-hand sides, have no semantic value themselves.

Since the classification of productions may determine whether or not a symbol is considered to have a value, we'll need to use a worklist-based method for finding the minimum fixed point.

<Classify productions in grammar>=

let worklist = as(<deque>, productions);
let in-worklist-set = make(<object-set>);
<Local variables for the production classificaion phase>
local
  <Local methods for the production classificaion phase>;
until (empty?(worklist))
  let production :: <simple-production> = pop(worklist);
  remove!(in-worklist-set, production);
  let nonterminal = production.production-nonterminal;
  <Note if production has an explicitly-declared value>
  unless (slot-initialized?(production, production-reduce-action))
    <Attempt to classify production>
  end unless;
end until;
            

If production has an explicitly-declared value type, then we need to take note that the associated nonterminal can have a value. If a nonterminal changes from not having a value to having one, all affected productions need to be re-evaluated.

<Local variables for the production classificaion phase>=

let symbol-value-kind = make(<object-table>);
let nonterminal-dependent-productions = make(<object-table>);
            

<Local methods for the production classificaion phase>=

method requeue-production (production)
  unless (member?(production, in-worklist-set))
    push-last(worklist, production);
    add!(in-worklist-set, production);
  end unless;
end method,
            

In particular, whenever a production's classification depends on a particular nonterminal not having a value, and the classifier discovers that it can have one, we need to place the production back in the worklist. Furthermore, if the classifier discovers that a nonterminal has sequence-building productions, then we need to re-queue all of the other productions for the nonterminal.

<Local methods for the production classificaion phase>+=

method note-nonterminal-kind (production, nonterminal, kind)
  let previous-kind = element(symbol-value-kind, nonterminal, default: $unfound);
  if (previous-kind == #f)
    do(requeue-production,
       element(nonterminal-dependent-productions, nonterminal, default: #()));
    remove-key!(nonterminal-dependent-productions, nonterminal);
  end if;
  if (kind == #"sequence" & kind ~== previous-kind)
    do(method(other-production)
         unless(production == other-production)
           requeue-production(other-production)
         end unless;
       end,
       grammar-symbol-productions(grammar, nonterminal));
  end if;
  symbol-value-kind[nonterminal] := kind;
end method
            

<Note if production has an explicitly-declared value>=

if (production.production-nonterminal-type
    & element(symbol-value-kind, nonterminal, default: #f) == #f)
  note-nonterminal-kind(production, nonterminal, #"simple");
elseif (slot-initialized?(production, production-reduce-action)
        & unfound?(element(symbol-value-kind, nonterminal, default: $unfound)))
  symbol-value-kind[nonterminal] := #f;
end if;
            

Our goal is to assign a classification label to each production that doesn't already have a reduction action function.

<Local variables for the production classificaion phase>+=

let production-classification = make(<object-table>);
            

<Attempt to classify production>=

block (next)
  local
    method check-value (symbol)
      let kind = element(symbol-value-kind, symbol, default: $unfound);
      if (found?(kind))
        kind
      elseif (grammar-symbol-nonterminal?(grammar, symbol))
        requeue-production (production);
        next();
      elseif(lexical-token-type(lexical-definition, symbol))
        symbol-value-kind[symbol] := #"simple"
      else
        symbol-value-kind[symbol] := #f;
      end if
    end method;
  for (symbol in production.production-derives,
       count = 0 then if (symbol ~== nonterminal & check-value(symbol))
                        count + 1
                      else
                        count
                      end)
  finally
    <Finalize the classification assignment>
  end for;
end block;
            

FIXME

<Finalize the classification assignment>=

select (count by \=)
  0 =>
    <Handle the no-value case>
  1 =>
    <Handle the one-value case>
  otherwise =>
    <Handle the one-value case>
end select;
            

FIXME

<Handle the no-value case>=

select (element(symbol-value-kind, nonterminal, default: $unfound))
  $unfound =>
    symbol-value-kind[nonterminal] := #f;
    production-classification[production] := #"inert";
  #f, #"simple" =>
    production-classification[production] := #"inert";
  #"sequence" =>
    production-classification[production] := #"sequence-empty";
end select;

for (symbol in production.production-derives)
  if (grammar-symbol-nonterminal?(grammar, symbol))
    nonterminal-dependent-productions[symbol]
      := add(element(nonterminal-dependent-productions, symbol, default: #()),
             production);
  end if;
end for;
            

FIXME

<Handle the one-value case>=

if (production.production-derives[0] == nonterminal)
  note-nonterminal-kind(production, nonterminal, #"sequence");
  production-classification[production] := #"sequence-add";
else
  select (element(symbol-value-kind, nonterminal, default: #f))
    #f =>
      note-nonterminal-kind(production, nonterminal, #"simple");
      production-classification[production] := #"pass";
    #"simple" =>
      production-classification[production] := #"pass";
    #"sequence" =>
      production-classification[production] := #"sequence-start";
  end select;
end if;
            

FIXME

<Handle the one-value case>=

unless (element(symbol-value-kind, nonterminal, default: #f))
  note-nonterminal-kind(production, nonterminal, #"simple");
end;
production-classification[production] := #"too-many";
            

Generating Production Reduce Actions

The action to be performed on reduction of a <simple-production> is stored as a <function> in the production-reduce-action slot. If a function is not explicitly provided, then one will be computed for it.

<Generate reduce actions for productions>=

for (classification keyed-by production :: <simple-production>
       in production-classification)
  production.production-reduce-action
    := production-auto-reduce-action-function(production, classification,
                                              symbol-value-kind);
end for;
            

The Parser

Once we have created a parser automaton for a grammar, we can instantiate a corresponding <simple-parser> to parse input “sentences”.

<Module definitions for the simple-parser library>+=

define module simple-parser-implementation
  use common-dylan;
  use grammar;
  use parser-automaton;
  use simple-parser;
  use simple-parser-automaton;
  use source-location;
  use source-location-rangemap;
  use source-location-conditions;
end module;
          

<simple-parser>

[Class]


The class of parsers for used with the simple-parser framework.

Superclasses:

<object>

Init-keywords:
automaton:
A parser automaton instance returned by the simple-parser-automaton function.
start-symbol:
An instance of <symbol>.
rangemap:
An instance of <source-location-rangemap>.
Description

Definition:

<Module simple-parser-implementation>=

define class <simple-parser> (<sequence>)
  constant slot parser-automaton :: <lr-parser-automaton>,
    required-init-keyword: automaton:;
  constant slot parser-rangemap :: <source-location-rangemap>,
    required-init-keyword: rangemap:;
  constant slot parser-consumer-data :: <object>,
    required-init-keyword: consumer-data:;
  <Slots in <simple-parser>>
end class;
              

simple-parser-reset

[Function]


Resets a <simple-parser> to its initial state.

Signature:

simple-parser-reset simple-parser

Arguments:
simple-parser
An instance of <simple-parser>.
Values:

None.

Description:

Resets parser to its initial state.

Definition:

<Module simple-parser-implementation>+=

define function simple-parser-reset
    (parser :: <simple-parser>, #key start-symbol)
 => ();
  <Reset parser>
end function;
              

simple-parser-consume-token

[Function]


Provides input to a <simple-parser>.

Signature:

simple-parser-consume-token simple-parser token-number token-name token-value start-position end-positionviable-prefix?

Arguments:
simple-parser
An instance of <simple-parser>.
token-number
An instance of <integer>.
Values:
done?
An instance of <boolean>.
Description:

Provides the given token as input to parser. Returns true if the input so far is a viable prefix of the language generated from the grammar.

Definition:

<Module simple-parser-implementation>+=

define function simple-parser-consume-token
    (parser :: <simple-parser>,
     token-number, token-name, token-value,
     start-position :: <integer>, end-position :: <integer>)
 => (viable-prefix?);
  <Consume the input token>
end function;
              

simple-parser-can-consume-token?

[Function]


Tests whether input to a <simple-parser> is valid.

Signature:

simple-parser-can-consume-token? simple-parser token-number token-nameviable-prefix?

Arguments:
simple-parser
An instance of <simple-parser>.
token-number
An instance of <integer>.
token-name
An instance of <object>.
Values:
done?
An instance of <boolean>.
Description:

Tests whether providing the given token as input to parser yields a viable prefix of the language generated from the grammar.

Definition:

<Module simple-parser-implementation>+=

define function simple-parser-can-consume-token?
    (parser :: <simple-parser>, token-number, token-name)
 => (viable-prefix?);
  <Test whether or not consuming the input token is valid>
end function;
              

simple-parser-source-location

[Function]


Returns the source-location of a grammar production <simple-parser>.

Signature:

simple-parser-source-location simple-parser start-position end-positionsource-location

Arguments:
simple-parser
An instance of <simple-parser>.
start-position
An instance of <integer>.
end-position
An instance of <integer>.
Values:
source-location
An instance of <file-source-location>.
Description:

Returns a

Definition:

<Module simple-parser-implementation>+=

define function simple-parser-source-location
    (parser :: <simple-parser>,
     start-position :: <integer>, end-position :: <integer>)
 => (source-location :: <file-source-location>);
  range-source-location(parser.parser-rangemap, start-position, end-position)
end function;
              

Implementing the Parser

As we described in the grammar library, the parser is a pushdown automaton. It maintains a stack containing the current state (on the top of the stack) as well as previous states that can be returned to after productions are recognized.

<Module simple-parser-implementation>+=

define constant $initial-stack-size = 200;
            

<Slots in <simple-parser>>=

  slot parser-stack-top :: <integer>, init-value: 0;
  slot parser-state-stack :: <simple-object-vector>
    = make(<simple-object-vector>, size: $initial-stack-size);
          

Initially the state stack contains the initial state of the automaton.

<Module simple-parser-implementation>+=

define sealed method initialize
    (parser :: <simple-parser>,
     #key automaton :: <lr-parser-automaton>, start-symbol,
     #all-keys)
 => ();
  next-method();
  parser.parser-state-stack[0]
    := lr-parser-automaton-initial-state(automaton, start-symbol);
end method;
          

We also maintain another, parallel stack that contains the semantic value of each shifted terminal or reduced nonterminal.

<Slots in <simple-parser>>+=

  slot parser-semantic-value-stack :: <simple-object-vector>
    = make(<simple-object-vector>, size: $initial-stack-size);
          

When we reset a parser, we re-initialize the stack top pointer, place the initial automaton state back on top of the state stack, and clear out any old values from the semantic value stack.

<Reset parser>=

fill!(parser.parser-semantic-value-stack, #f,
      start: 0, end: parser.parser-stack-top + 1);
parser.parser-stack-top := 0;
parser.parser-state-stack[0]
  := lr-parser-automaton-initial-state(parser.parser-automaton, start-symbol);
            

A third stack is used to maintain source-position information. The stack holds the start-position of each terminal shifted by the parser automaton. When a production is reduced, the appropriate start-position from the stack can be used, along with the the end-position of the last token shifted, to obtain a <source-position> from the parser-rangemap.

<Slots in <simple-parser>>+=

  slot parser-source-position-stack :: limited(<vector>, of: <integer>)
    = make(limited(<vector>, of: <integer>), size: $initial-stack-size);
  slot parser-end-position :: <integer>, init-value: 0;
          

The parser-push-stacks function pushes new values onto the stacks.

<Module simple-parser-implementation>+=

define function parser-push-stacks
    (parser :: <simple-parser>, new-stack-top :: <integer>,
     new-state, token-value, start-position :: <integer>)
 => ();
  if (new-stack-top >= parser.parser-state-stack.size)
    <Grow the stacks>
  end if;
  parser.parser-state-stack[new-stack-top] := new-state;
  parser.parser-semantic-value-stack[new-stack-top] := token-value;
  parser.parser-source-position-stack[new-stack-top] := start-position;
end function;
          

<Grow the stacks>=

let new-size = truncate/(new-stack-top * 3, 2);
let old-state-stack = parser.parser-state-stack;
parser.parser-state-stack
  := make(<simple-object-vector>, size: new-size);
let old-semantic-value-stack = parser.parser-semantic-value-stack;
parser.parser-semantic-value-stack
  := make(<simple-object-vector>, size: new-size);
let old-source-position-stack = parser.parser-source-position-stack;
parser.parser-source-position-stack
  := make(limited(<vector>, of: <integer>), size: new-size);
for (i from 0 below new-stack-top)
  parser.parser-state-stack[i] := old-state-stack[i];
  parser.parser-semantic-value-stack[i] := old-semantic-value-stack[i];
  parser.parser-source-position-stack[i] := old-source-position-stack[i];
end for;
            

The work of parsing takes place in the simple-parser-consume-token function. Given a token as input, we use the parser automaton to determine what the next action should be in the current state.

<Consume the input token>=

local
  method dispatch (stack-top :: <integer>, state) => (viable-prefix?);
    let(action, data)
      = lr-parser-automaton-terminal-transition(parser.parser-automaton,
                                                state, token-name);
    select(action)
      <Perform the appropriate parsing action>
    end select;
  end method;
let stack-top = parser.parser-stack-top;
dispatch(stack-top, parser.parser-state-stack[stack-top])
            

If the automaton tells us to shift to a new state, we push that state onto the top of the state stack, and push the token-value and start-position onto the appropriate parallel stacks.

<Perform the appropriate parsing action>=

#"shift" =>
  parser-push-stacks(parser, stack-top + 1, data, token-value, start-position);
  parser.parser-end-position := end-position;
  <If the semantic stack shrunk, clear any unused entries>
  parser.parser-stack-top := stack-top + 1;
  #t;
            

Note that if, due to previous reductions, the stack shrunk between when we started the loop and the return after shifting the token, we clear the entries on the semantic stack to prevent values there from becoming uncollectable garbage.

<If the semantic stack shrunk, clear any unused entries>=

if (parser.parser-stack-top > stack-top + 1)
  fill!(parser.parser-semantic-value-stack, #f,
        start: stack-top + 2, end: parser.parser-stack-top + 1);
end if;
            

If it tells us to reduce a production, we pop the intermediate states off the stack, call the semantic action function, push the state indicated by the nonterminal transition onto the stack, and handle the input terminal from the new state.

<Perform the appropriate parsing action>+=

#"reduce" =>
  let rhs-size = data.production-derives.size;
  let reduce-stack-top = stack-top - rhs-size;
  let reduce-state = parser.parser-state-stack[reduce-stack-top];
  
  let new-state
    = lr-parser-automaton-nonterminal-transition
        (parser.parser-automaton, reduce-state, data.production-nonterminal);

  <Prepare parser for the calling of the parse action function>

  if (rhs-size.zero?)
    let reduce-semantic-value
      = data.production-reduce-action(parser, parser.parser-consumer-data,
                                      start-position, start-position);
    parser-push-stacks(parser, reduce-stack-top + 1,
                       new-state, reduce-semantic-value, start-position);
  else
    let reduce-semantic-value
      = data.production-reduce-action(parser, parser.parser-consumer-data,
                                      parser.parser-source-position-stack[reduce-stack-top + 1],
                                      parser.parser-end-position);
    parser.parser-state-stack[reduce-stack-top + 1] := new-state;
    parser.parser-semantic-value-stack[reduce-stack-top + 1]
      := reduce-semantic-value;
  end if;

  dispatch(reduce-stack-top + 1, new-state);
            

FIXME What to do about syntax errors?

<Perform the appropriate parsing action>+=

#"error" =>
  let srcloc
    = simple-parser-source-location(parser, start-position, end-position);
  source-error(srcloc, "syntax error at %=, expected one of %=",
               token-name,
               lr-parser-automaton-transition-terminals(parser.parser-automaton,
                                                        state));
            

If the automaton tells us to accept on this input, we return a boolean true value, indicating a successful parse.

<Perform the appropriate parsing action>+=

#"accept" =>
  #t;
            

For parser-can-consume-token? we examine the parse tables in the same way, but don't actually affect the state of the parser. We will transition the automaton until it shifts the token (or accepts).

<Test whether or not consuming the input token is valid>=

local
  method dispatch
      (stack-top :: <integer>, stack :: false-or(<stretchy-object-vector>),
       state)
   => (viable-prefix?);
    let(action, data)
      = lr-parser-automaton-terminal-transition(parser.parser-automaton,
                                                state, token-name);
    select(action)
      #"shift" =>
        #t;
      #"reduce" =>
        <Perform a trial reduction>
      #"error" =>
        #f;
      #"accept" =>
        #t;
    end select;
  end method;
let stack-top = parser.parser-stack-top;
dispatch(stack-top, #f, parser.parser-state-stack[stack-top])
            

<Perform a trial reduction>=

  let stack = stack
    | begin
        let stack = make(<stretchy-object-vector>, size: stack-top + 1);
        for (i from 0 to stack-top)
          stack[i] := parser.parser-state-stack[i];
        end;
        stack;
      end;
  let rhs-size = data.production-derives.size;
  let reduce-stack-top = stack-top - rhs-size;
  let reduce-state = stack[reduce-stack-top];
  
  let new-state
    = lr-parser-automaton-nonterminal-transition
        (parser.parser-automaton, reduce-state, data.production-nonterminal);

  stack[reduce-stack-top + 1] := new-state;
  dispatch(reduce-stack-top + 1, stack, new-state);
            

The Parse Action Execution Environment

Within the scope of a parse action function, the <simple-parser> object passed as the first argument serves as a <sequence> from which the right-hand side's semantic values can be retrieved. This requires a bit of additional state.

<Slots in <simple-parser>>+=

  slot parser-rhs-size :: <integer>, init-value: 0;
            

<Prepare parser for the calling of the parse action function>=

parser.parser-rhs-size := rhs-size;
parser.parser-stack-top := reduce-stack-top + 1;
            

Given this state information, we can write all of the necessary <sequence> methods.

<Module simple-parser-implementation>+=

define sealed method size (parser :: <simple-parser>) => (size :: <integer>);
  parser.parser-rhs-size
end method;
            
define sealed method element
    (parser :: <simple-parser>, index :: <integer>, #key default = $unsupplied)
 => (value :: <object>);
  if (0 <= index & index < parser.parser-rhs-size)
    parser.parser-semantic-value-stack[parser.parser-stack-top + index]
  elseif (supplied?(default))
    default
  else
    error("index %d is out of range", index);
  end if
end method;
            
define sealed inline method type-for-copy
    (parser :: <simple-parser>)
 => (type :: <type>);
  <simple-object-vector>
end method;
            
define sealed inline method forward-iteration-protocol
    (parser :: <simple-parser>)
 => (initial-state :: <integer>, limit :: <integer>, next-state :: <function>,
     finished-state? :: <function>, current-key :: <function>,
     current-element :: <function>, current-element-setter :: <function>,
     copy-state :: <function>);
  values(// initial-state
         parser.parser-stack-top,
         // limit
         parser.parser-stack-top + parser.parser-rhs-size,
         // next-state
         method (parser :: <simple-parser>, state :: <integer>)
          => (new-state :: <integer>);
           state + 1
         end,
         // finished-state?
         method (parser :: <simple-parser>, state :: <integer>,
                 limit :: <integer>)
          => (finished? :: <boolean>);
           state >= limit
         end method,
         // current-key
         method (parser :: <simple-parser>, state :: <integer>)
          => (key :: <integer>)
           state
         end method,
         // current-element
         method (parser :: <simple-parser>, state :: <integer>)
           parser.parser-semantic-value-stack[state]
         end method,
         // current-element-setter,
         method (parser :: <simple-parser>, state :: <integer>, value)
           error("The <simple-parser> semantic stack may not be changed");
         end method,
         // copy-state
         method (parser :: <simple-parser>, state :: <integer>)
          => (state :: <integer>);
           state
         end method)
end method;
            

Implementing User-Specified Parse Actions

The production-user-reduce-action-function wraps the user-supplied ?body inside a binding of the right-hand-side ?symbols to their corresponding sematic values. If a single auxiliary variable is provided, it is bound to the parser's parser-consumer-data; if an additional variable is supplied, it is bound to the source location of the reduced production.

<Module simple-parser-implementation>+=

define macro production-user-reduce-action-function
  { production-user-reduce-action-function([?result:*]; [];
                                           []; ?:body) }
    => { method (p :: <simple-parser>, data, s, e) => (?result);
           ?body
         end }
  { production-user-reduce-action-function([?result:*]; [?symbols];
                                           []; ?:body) }
    => { method (p :: <simple-parser>, data, s, e) => (?result);
           let (?symbols) = apply(values, p);
           ?body
         end }

  { production-user-reduce-action-function([?result:*]; [];
                                           [?data:variable]; ?:body) }
    => { method (p :: <simple-parser>, ?data, s, e) => (?result);
           ?body
         end }
  { production-user-reduce-action-function([?result:*]; [?symbols];
                                           [?data:variable]; ?:body) }
    => { method (p :: <simple-parser>, ?data, s, e) => (?result);
           let (?symbols) = apply(values, p);
           ?body
         end }

  { production-user-reduce-action-function([?result:*]; [];
                                           [?data:variable, ?srcloc:variable];
                                           ?:body) }
    => { method
             (p :: <simple-parser>, ?data, s :: <integer>, e :: <integer>)
          => (?result);
           let ?srcloc = simple-parser-source-location(p, s, e);
           ?body
         end }
  { production-user-reduce-action-function([?result:*]; [?symbols];
                                           [?data:variable, ?srcloc:variable];
                                           ?:body) }
    => { method
             (p :: <simple-parser>, ?data, s :: <integer>, e :: <integer>)
          => (?result);
           let (?symbols) = apply(values, p);
           let ?srcloc = simple-parser-source-location(p, s, e);
           ?body
         end }

symbols:
  { } => { }
  { ?symbol:name ... } => { ?symbol, ... }
end macro;
            

Implementing Generated Parse Actions

The reduce function for #"inert" productions does nothing but return #f.

<Module simple-parser-implementation>+=

define method production-auto-reduce-action-function
    (production :: <simple-production>, kind == #"inert",
     symbol-value-kind :: <object-table>)
 => (function :: <function>);
  method (parser, data, s, e)
    #f
  end
end method;
            

If a production is classified as #"pass", we figure out which right-hand side item contains a value and construct a function that returns that value.

<Module simple-parser-implementation>+=

define method production-auto-reduce-action-function
    (production :: <simple-production>, kind == #"pass",
     symbol-value-kind :: <object-table>)
 => (function :: <function>);
  block (return)
    for (symbol in production.production-derives, index from 0)
      if (element(symbol-value-kind, symbol, default: #f))
        return(method(p :: <simple-parser>, data, s, e)
                 p[index]
               end);
      end if
    end for;
    error("Couldn't find a value symbol in %=", production);
  end block;
end method;
            

<Module simple-parser-implementation>+=

define method production-auto-reduce-action-function
    (production :: <simple-production>, kind == #"sequence-empty",
     symbol-value-kind :: <object-table>)
 => (function :: <function>);
  method (parser, data, s, e)
    make(<stretchy-object-vector>)
  end
end method;
            

<Module simple-parser-implementation>+=

define method production-auto-reduce-action-function
    (production :: <simple-production>, kind == #"sequence-start",
     symbol-value-kind :: <object-table>)
 => (function :: <function>);
  block (return)
    for (symbol in production.production-derives, index from 0)
      if (element(symbol-value-kind, symbol, default: #f))
        return(method(p :: <simple-parser>, data, s, e)
                 let sequence = make(<stretchy-object-vector>);
                 add!(sequence, p[index]);
                 sequence
               end);
      end if
    end for;
    error("Couldn't find a value symbol in %=", production);
  end block;
end method;
            

<Module simple-parser-implementation>+=

define method production-auto-reduce-action-function
    (production :: <simple-production>, kind == #"sequence-add",
     symbol-value-kind :: <object-table>)
 => (function :: <function>);
  block (return)
    for (symbol in production.production-derives, index from 0)
      if (index > 0 & element(symbol-value-kind, symbol, default: #f))
        return(method(p :: <simple-parser>, data, s, e)
                 add!(p[0], p[index]);
                 p[0]
               end);
      end if
    end for;
    error("Couldn't find a value symbol in %=", production);
  end block;
end method;
            

<Module simple-parser-implementation>+=

define method production-auto-reduce-action-function
    (production :: <simple-production>, kind == #"too-many",
     symbol-value-kind :: <object-table>)
 => (function :: <function>);
  //signal("Production %= has too many right-hand side values", production);
  method (parser, data, s, e)
    #f
  end
end method;
            

Syntax Error Reporting