Initial check in docu
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,36 @@
|
||||
\documentclass{zlbook}
|
||||
\usepackage{minitoc}
|
||||
%\usepackage[toc,page]{appendix}
|
||||
%\usepackage{mtcoff}
|
||||
\title{Delphi Parser Generator \\ user's guide}
|
||||
\begin{document}
|
||||
\dominitoc
|
||||
\dominilof
|
||||
\dominilot
|
||||
|
||||
\pagestyle{empty}
|
||||
\renewcommand{\thepage}{\roman{page}}
|
||||
\maketitle
|
||||
|
||||
\tableofcontents
|
||||
%\listoftables
|
||||
\renewcommand{\thepage}{\thechapter\ - \arabic{page}}
|
||||
\clearpage
|
||||
\pagestyle{fancy}
|
||||
|
||||
\input{src/intro/intro}
|
||||
\input{src/start/start}
|
||||
\input{src/lang/lang}
|
||||
\input{src/gram/gram}
|
||||
\input{src/tokens/tokens}
|
||||
\input{src/rt/rt}
|
||||
|
||||
|
||||
\appendix
|
||||
\renewcommand{\thepage}{\Alph{chapter} - \arabic{page}}
|
||||
|
||||
%\begin{appendices}
|
||||
\input{src/app/app-grammar}
|
||||
%\end{appendices}
|
||||
|
||||
\end{document}
|
||||
@@ -0,0 +1,625 @@
|
||||
\chapter{Grammar of Delphi Parser Generator}
|
||||
|
||||
\clearpage \section{Lexical analyzer}
|
||||
\begin{verbatim}
|
||||
unit dpgDpgLexer;
|
||||
|
||||
lexer TdpgDpgLexer;
|
||||
options
|
||||
{
|
||||
testLiterals = false;
|
||||
k = 2;
|
||||
}
|
||||
|
||||
tokens
|
||||
{
|
||||
"unit";
|
||||
"uses";
|
||||
"const";
|
||||
"type";
|
||||
|
||||
"lexer";
|
||||
"parser";
|
||||
|
||||
"options";
|
||||
"tokens";
|
||||
"memberdecl";
|
||||
"memberdef";
|
||||
|
||||
"private";
|
||||
"protected";
|
||||
"public";
|
||||
|
||||
"returns";
|
||||
"local";
|
||||
|
||||
"except";
|
||||
"finally";
|
||||
|
||||
SEMPRED;
|
||||
|
||||
USES;
|
||||
OPTIONS;
|
||||
TOKENS;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Simple tokens
|
||||
// --------------------------------------------------------
|
||||
LPAREN: '(';
|
||||
RPAREN: ')';
|
||||
RCURLY: '}';
|
||||
COLON: ':';
|
||||
SEMI: ';';
|
||||
COMMA: ',';
|
||||
ASSIGN: '=';
|
||||
IMPLIES: "=>";
|
||||
QUEST: '?';
|
||||
PLUS: '+';
|
||||
STAR: '*';
|
||||
NOT: '~';
|
||||
OR: '|';
|
||||
BANG: '!';
|
||||
WILDCARD: '.';
|
||||
RANGE: "..";
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Character literal
|
||||
// --------------------------------------------------------
|
||||
CHARLIT
|
||||
: '\''! (ESC | ~'\'') '\''! ;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// String literal
|
||||
// --------------------------------------------------------
|
||||
STRINGLIT
|
||||
: '"' (ESC | ~'"')* '"' ;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Integer
|
||||
// --------------------------------------------------------
|
||||
INTEGER local
|
||||
{
|
||||
i: integer;
|
||||
v: integer;
|
||||
}
|
||||
: DNUMBER
|
||||
{
|
||||
v := 0;
|
||||
for i:=1 to Length( TokenText) do
|
||||
begin
|
||||
v := v * 10 + ord( TokenText[i]) - ord('0');
|
||||
end;
|
||||
|
||||
TokenText := IntToStr( v);
|
||||
}
|
||||
;
|
||||
|
||||
|
||||
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Argument action
|
||||
// --------------------------------------------------------
|
||||
ARGACTION
|
||||
:
|
||||
'['!
|
||||
(
|
||||
'\r' '\n' { newLine; }
|
||||
| '\r' { newLine; }
|
||||
| '\n' { newLine; }
|
||||
| ~']'
|
||||
)*
|
||||
']'!
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Action
|
||||
// --------------------------------------------------------
|
||||
ACTION
|
||||
:
|
||||
'{'
|
||||
(
|
||||
'\r' '\n' { newLine; }
|
||||
| '\r' { newLine; }
|
||||
| '\n' { newLine; }
|
||||
| ~'}'
|
||||
)*
|
||||
'}'
|
||||
( '?'! { _ttype := TT_SEMPRED; } )?
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Token ref
|
||||
// --------------------------------------------------------
|
||||
TOKENREF
|
||||
options
|
||||
{
|
||||
testLiterals = true;
|
||||
}
|
||||
: 'A'..'Z' ('a'..'z' | 'A'..'Z' | '_' | '0'..'9')* ;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Rule ref
|
||||
// --------------------------------------------------------
|
||||
RULEREF
|
||||
local
|
||||
{
|
||||
t: integer;
|
||||
}
|
||||
:
|
||||
t = INT_RULEREF { _ttype := t; }
|
||||
(
|
||||
{t = LT_uses}? WS_LOOP ('{' { _ttype := TT_USES; } )?
|
||||
| {t = LT_options}? WS_LOOP ('{' { _ttype := TT_OPTIONS; } )?
|
||||
| {t = LT_tokens}? WS_LOOP ('{' { _ttype := TT_TOKENS; } )?
|
||||
)?
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Internal rule ref
|
||||
// --------------------------------------------------------
|
||||
protected INT_RULEREF returns [integer]
|
||||
{
|
||||
_ttype := TT_RULEREF;
|
||||
}
|
||||
: 'a'..'z' ('a'..'z' | 'A'..'Z' | '_' | '0'..'9')*
|
||||
{
|
||||
result := TestLiteral( _ttype);
|
||||
}
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// COMMENT
|
||||
// --------------------------------------------------------
|
||||
COMMENT
|
||||
: SLCOMMENT { _ttype := TT_SKIP; }
|
||||
| MLCOMMENT { _ttype := TT_SKIP; }
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// SLCOMMENT
|
||||
// --------------------------------------------------------
|
||||
protected SLCOMMENT
|
||||
:
|
||||
"//"
|
||||
( ~( '\r' | '\n') )*
|
||||
(
|
||||
'\r' '\n' { newLine; }
|
||||
| '\r' { newLine; }
|
||||
| '\n' { newLine; }
|
||||
)
|
||||
;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Multi line comment version
|
||||
// Nested comments aren't allowed!
|
||||
// --------------------------------------------------------
|
||||
protected MLCOMMENT
|
||||
:
|
||||
"(*"
|
||||
(
|
||||
options
|
||||
{
|
||||
greedy = false;
|
||||
}
|
||||
: '\r' '\n' { newLine; }
|
||||
| '\r' { newLine; }
|
||||
| '\n' { newLine; }
|
||||
| .
|
||||
)*
|
||||
"*)"
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Numbers
|
||||
// --------------------------------------------------------
|
||||
protected DNUMBER: '0'..'9' (DDIGIT)*;
|
||||
protected DDIGIT: '0'..'9';
|
||||
|
||||
// --------------------------------------------------------
|
||||
// WS
|
||||
// --------------------------------------------------------
|
||||
WS
|
||||
:
|
||||
(
|
||||
' '
|
||||
| '\t' { tab; }
|
||||
| '\r' '\n' { newLine; }
|
||||
| '\r' { newLine; }
|
||||
| '\n' { newLine; }
|
||||
)
|
||||
{
|
||||
_ttype := TT_SKIP;
|
||||
}
|
||||
;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --------------------------------------------------------
|
||||
// WS_LOOP
|
||||
// --------------------------------------------------------
|
||||
protected
|
||||
WS_LOOP
|
||||
:
|
||||
(
|
||||
options
|
||||
{
|
||||
greedy = true;
|
||||
}
|
||||
: WS
|
||||
| COMMENT
|
||||
)*
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Esc
|
||||
// --------------------------------------------------------
|
||||
protected
|
||||
ESC
|
||||
: '\\'! ( 'r' | 'n' | 't' | '\'' | '"' )
|
||||
;
|
||||
|
||||
\end{verbatim}
|
||||
|
||||
|
||||
\clearpage \section{Parser}
|
||||
\begin{verbatim}
|
||||
unit dpgDpgParser;
|
||||
|
||||
parser TdpgDpgParser;
|
||||
options
|
||||
{
|
||||
defaultErrorHandler = false;
|
||||
importVocab = dpgDpgLexer;
|
||||
k = 2;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------
|
||||
// grammar
|
||||
// --------------------------------------------------------
|
||||
grammar
|
||||
: "unit" id SEMI
|
||||
(usesDecl)?
|
||||
(constDecl)?
|
||||
(typeDecl)?
|
||||
classDecl
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// usesDecl
|
||||
// --------------------------------------------------------
|
||||
usesDecl
|
||||
: USES
|
||||
(
|
||||
TOKENREF SEMI
|
||||
| RULEREF SEMI
|
||||
)*
|
||||
|
||||
RCURLY
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// constDecl
|
||||
// --------------------------------------------------------
|
||||
constDecl
|
||||
: "const" ACTION
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// typeDecl
|
||||
// --------------------------------------------------------
|
||||
typeDecl
|
||||
: "type" ACTION
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// classDecl
|
||||
// --------------------------------------------------------
|
||||
classDecl
|
||||
local
|
||||
{
|
||||
grType: integer;
|
||||
}
|
||||
:
|
||||
// --------------------------------------------------
|
||||
// Determine parser type
|
||||
// --------------------------------------------------
|
||||
( "lexer" { grType := 0; }
|
||||
| "parser" { grType := 1; }
|
||||
)
|
||||
|
||||
// --------------------------------------------------
|
||||
// get class name
|
||||
// --------------------------------------------------
|
||||
id
|
||||
SEMI
|
||||
|
||||
// --------------------------------------------------
|
||||
// Process optional class "options {...}" clause
|
||||
// --------------------------------------------------
|
||||
(classOptions)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Process optional class "tokens {...}" clause
|
||||
// But only for lexers.
|
||||
// --------------------------------------------------
|
||||
( {grType=0}? classTokens)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Process optional class "memberDecl {...}" clause
|
||||
// --------------------------------------------------
|
||||
(classMemberDecl)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Well, the rules
|
||||
// --------------------------------------------------
|
||||
rules
|
||||
|
||||
// --------------------------------------------------
|
||||
// Process optional class "memberDef {...}" clause
|
||||
// --------------------------------------------------
|
||||
(classMemberDef)?
|
||||
;
|
||||
// --------------------------------------------------------
|
||||
// classOptions
|
||||
// --------------------------------------------------------
|
||||
classOptions
|
||||
: OPTIONS ( id ASSIGN optionValue SEMI )* RCURLY
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// classTokens
|
||||
// --------------------------------------------------------
|
||||
classTokens
|
||||
:
|
||||
TOKENS
|
||||
(
|
||||
TOKENREF SEMI
|
||||
| STRINGLIT SEMI
|
||||
)*
|
||||
|
||||
RCURLY
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// classMemberDecl
|
||||
// --------------------------------------------------------
|
||||
classMemberDecl
|
||||
: "memberDecl" ACTION
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// classMemberDef
|
||||
// --------------------------------------------------------
|
||||
classMemberDef
|
||||
: "memberDef" ACTION
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// rules
|
||||
// --------------------------------------------------------
|
||||
rules
|
||||
: (rule)*
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// ruleExceptionBlock
|
||||
// --------------------------------------------------------
|
||||
ruleExceptionBlock
|
||||
: "except" ACTION
|
||||
| "finally" ACTION
|
||||
;
|
||||
// --------------------------------------------------------
|
||||
// altExceptionBlock
|
||||
// --------------------------------------------------------
|
||||
altExceptionBlock
|
||||
: "except" ACTION
|
||||
| "finally" ACTION
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// rule
|
||||
// --------------------------------------------------------
|
||||
rule
|
||||
:
|
||||
// --------------------------------------------------
|
||||
// Parse rule scope
|
||||
// --------------------------------------------------
|
||||
( "public"
|
||||
| "protected"
|
||||
| "private"
|
||||
)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Parse rule name
|
||||
// --------------------------------------------------
|
||||
id
|
||||
|
||||
// --------------------------------------------------
|
||||
// Optional arguments
|
||||
// --------------------------------------------------
|
||||
(ARGACTION)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Optional return type
|
||||
// --------------------------------------------------
|
||||
("returns" ARGACTION)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Optional rule options
|
||||
// --------------------------------------------------
|
||||
(ruleOptions)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Optional rule local variable declarations
|
||||
// --------------------------------------------------
|
||||
("local" ACTION)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Optional rule init action
|
||||
// --------------------------------------------------
|
||||
(ACTION)?
|
||||
|
||||
// --------------------------------------------------
|
||||
// Rule block
|
||||
// --------------------------------------------------
|
||||
COLON
|
||||
block
|
||||
SEMI
|
||||
|
||||
// --------------------------------------------------
|
||||
// Optional exception handler
|
||||
// --------------------------------------------------
|
||||
(ruleExceptionBlock)?
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// block
|
||||
// --------------------------------------------------------
|
||||
block
|
||||
: alternative (OR alternative)*
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// alternative
|
||||
// --------------------------------------------------------
|
||||
alternative
|
||||
: (elem)*
|
||||
(altExceptionBlock)?
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// elem
|
||||
// --------------------------------------------------------
|
||||
elem
|
||||
: element
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// element
|
||||
// --------------------------------------------------------
|
||||
element
|
||||
local
|
||||
{
|
||||
assignLabel : IdpgToken;
|
||||
}
|
||||
{
|
||||
assignLabel := nil;
|
||||
}
|
||||
:
|
||||
(
|
||||
id ASSIGN
|
||||
(id COLON)?
|
||||
(
|
||||
RULEREF (ARGACTION)? (BANG)?
|
||||
| TOKENREF (ARGACTION)?
|
||||
)
|
||||
)
|
||||
|
|
||||
(assignLabel=id COLON)?
|
||||
(
|
||||
RULEREF (ARGACTION)? (BANG)?
|
||||
| range[assignLabel]
|
||||
| terminal[assignLabel]
|
||||
| NOT (notTerminal[assignLabel] | ebnf[ assignLabel, true])
|
||||
| ebnf[ assignLabel, false]
|
||||
)
|
||||
| ACTION
|
||||
| SEMPRED
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// range
|
||||
// --------------------------------------------------------
|
||||
range [pTokenLabel: IdpgToken]
|
||||
local
|
||||
:
|
||||
CHARLIT RANGE CHARLIT
|
||||
| (TOKENREF | STRINGLIT) RANGE (TOKENREF | STRINGLIT)
|
||||
;
|
||||
// --------------------------------------------------------
|
||||
// terminal
|
||||
// --------------------------------------------------------
|
||||
terminal [pTokenLabel: IdpgToken]
|
||||
:
|
||||
CHARLIT (BANG)?
|
||||
| TOKENREF (BANG)? (ARGACTION)?
|
||||
| STRINGLIT (BANG)?
|
||||
| WILDCARD (BANG)?
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// notTerminal
|
||||
// --------------------------------------------------------
|
||||
notTerminal [pTokenLabel: IdpgToken]
|
||||
: CHARLIT (BANG)?
|
||||
| TOKENREF (BANG)?
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// ebnf
|
||||
// --------------------------------------------------------
|
||||
ebnf [pTokenLabel: IdpgToken; pTokenNot: boolean]
|
||||
: LPAREN
|
||||
(
|
||||
subRuleOptions (ACTION)? COLON
|
||||
| ACTION COLON
|
||||
)?
|
||||
|
||||
block
|
||||
RPAREN
|
||||
( QUEST
|
||||
| STAR
|
||||
| PLUS
|
||||
| IMPLIES
|
||||
)?
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// subruleOptions
|
||||
// --------------------------------------------------------
|
||||
subruleOptions
|
||||
: OPTIONS (id ASSIGN optionValue)* SEMI RCURLY
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// ruleOptions
|
||||
// --------------------------------------------------------
|
||||
ruleOptions
|
||||
: OPTIONS (id ASSIGN optionValue)* SEMI RCURLY
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// optionValue
|
||||
// --------------------------------------------------------
|
||||
optionValue returns [IdpgToken]
|
||||
: result=qualifiedId
|
||||
| result:STRINGLIT
|
||||
| result:CHARLIT
|
||||
| result:INTEGER
|
||||
;
|
||||
|
||||
// --------------------------------------------------------
|
||||
// qualifiedId
|
||||
// --------------------------------------------------------
|
||||
qualifiedId returns [IdpgToken]
|
||||
: id (WILDCARD id)*
|
||||
;
|
||||
// --------------------------------------------------------
|
||||
// id
|
||||
// --------------------------------------------------------
|
||||
id returns [IdpgToken]
|
||||
: result:TOKENREF
|
||||
| result:RULEREF
|
||||
;
|
||||
\end{verbatim}
|
||||
@@ -0,0 +1,77 @@
|
||||
\section{Error handling}
|
||||
|
||||
All syntactic and semantic errors cause parser exceptions to be thrown. In
|
||||
particular, the methods used to match tokens in the parser base class (match et
|
||||
al) throw §EdpgMismatchedToken§. The methods in the lexer base class used to
|
||||
match characters (match et al) throw analogous exceptions.
|
||||
|
||||
\subsection{DPG exception hierarchy}
|
||||
|
||||
DPG-generated parsers throw exceptions to signal recognition errors or other
|
||||
stream problems. All exceptions derive from EdpgException. The hierarchy is the
|
||||
following:
|
||||
|
||||
\begin{verbatim}
|
||||
EdpgException
|
||||
EdpgMismatchedChar
|
||||
EdpgMismatchedToken
|
||||
EdpgSemantic
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{EdpgException} The EdpgException exception class is the base of
|
||||
all DPG generated exceptions. User defined exceptions must derive from this
|
||||
class.
|
||||
|
||||
\subsubsection{EdpgMismatchedChar} This exception is thrown by the lexer when it
|
||||
is looking for a character, but finds a different one on the input stream.
|
||||
|
||||
\subsubsection{EdpgMismatchedToken} This exception is thrown by the parser when
|
||||
it is looking for a token, but finds a different one on the input token stream.
|
||||
|
||||
\subsubsection{EdpgSemantic} This exception is thrown by a validating semantic
|
||||
predicate.
|
||||
|
||||
\subsection{Specifying exception handlers}
|
||||
|
||||
DPG allows you to specify a dedicated exception handler for a given rule or
|
||||
alternative. The general form of an exception handler specification is:
|
||||
|
||||
\begin{verbatim}
|
||||
... except { code to handle exception }
|
||||
... finally { code to handle exception }
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{Exception handler for a rule}
|
||||
|
||||
The exception handler for a rule must be placed after the terminating
|
||||
semicolon. The handler can be either an §except§ block or a §finally§ block.
|
||||
The implementation of the rule will be surrounded by a try block.
|
||||
|
||||
\begin{verbatim}
|
||||
r : ...
|
||||
;
|
||||
except { handler code }
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{Exception handler for an alternative}
|
||||
|
||||
The exception handler of an alternative must be the last element of the
|
||||
alternative. Both exception handler blocks can be used. Every alternative that
|
||||
has an exception block specified will be surrounded by a §try...except/finally§
|
||||
block.
|
||||
|
||||
\begin{verbatim}
|
||||
r : alternative_1 ... except { handler code }
|
||||
| alternative_2 ... finally { handler code }
|
||||
...
|
||||
| alternative_n
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
\paragraph{Note:} It is not necessary to define an exception handler for each alternative.
|
||||
|
||||
\subsubsection{Default error handler in lexer}
|
||||
|
||||
To skip every character that isn't recognized by any public lexer rule, specify
|
||||
the §filter=true§ option for the lexer. That way, the parser doesn't have to deal
|
||||
with lexical errors and ask for another token.
|
||||
@@ -0,0 +1,53 @@
|
||||
\chapter{Grammars}
|
||||
\minitoc \clearpage
|
||||
|
||||
\section{Structure of a grammar}
|
||||
|
||||
The generic structure of a DPG grammar is the following:
|
||||
\begin{itemize}
|
||||
\item \emph{unit declaration}
|
||||
\item \emph{unit sections}
|
||||
\item \emph{grammar class definition}
|
||||
\item \emph{grammar class sections}
|
||||
\end{itemize}
|
||||
\paragraph{Note:} the order of blocks cannot be changed!
|
||||
|
||||
\subsection{Unit declaration}
|
||||
The \emph{unit declaration} is always the first block in any DPG grammar. It
|
||||
specifies the name of the target Pascal unit generated by DPG from the
|
||||
grammar. The syntax is identical to that of Delphi.
|
||||
\begin{alltt}
|
||||
\textbf{unit} \emph{UnitName} ;
|
||||
\end{alltt}
|
||||
|
||||
\subsection{Unit sections}
|
||||
The \emph{unit sections} block must follow the \emph{unit declaration}
|
||||
block if it exists. The members of this block are optional, but
|
||||
they must appear in the following order:
|
||||
\begin{itemize}
|
||||
\item \emph{uses section}
|
||||
\item \emph{const section}
|
||||
\item \emph{type section}
|
||||
\end{itemize}
|
||||
|
||||
\subsection{Grammar class definition}
|
||||
This block defines the type of the grammar class. The possible types are
|
||||
§lexer§ and §parser§.
|
||||
\begin{alltt}
|
||||
\textbf{lexer} \emph{myLexer} ; // define lexer
|
||||
\end{alltt}
|
||||
or
|
||||
\begin{alltt}
|
||||
\textbf{parser} \emph{myParser} ; // define parser
|
||||
\end{alltt}
|
||||
|
||||
\subsection{Grammar class sections}
|
||||
This block may contain the following sections in the order
|
||||
specified:
|
||||
\begin{itemize}
|
||||
\item \emph{options section}
|
||||
\item \emph{tokens section} (only for lexers)
|
||||
\item \emph{memberdecl section}
|
||||
\item \emph{rule definitions}
|
||||
\item \emph{memberdef section}
|
||||
\end{itemize}
|
||||
@@ -0,0 +1,63 @@
|
||||
\chapter{Introduction}
|
||||
\minitoc \clearpage
|
||||
|
||||
\section{Overview}
|
||||
The Delphi Parser Generator is a language tool which automatically
|
||||
generates $LL(k)$ parsers in Object Pascal Language based on an
|
||||
intuitive grammar, similar to §EBNF§. The generated code mimics a
|
||||
hand-written parser, so that it is easier to debug and leads to
|
||||
shortened development time compared to state-machine based $LR$ or
|
||||
DFA/NFA parsers. To compensate for the theoretical limitations of $LL(k)$
|
||||
parsers, DPG features several powerful extensions enhancing its
|
||||
functionality far beyond that of standard $LL(k)$ parsers. The
|
||||
method of syntactic and semantic predicates makes the writing of
|
||||
meta-parsers simple and routine. The philosophy of DPG is to allow
|
||||
the programmer maximum control over the parsing process while
|
||||
eliminating all the routine work.
|
||||
|
||||
\section{Features}
|
||||
\begin{itemize}
|
||||
\item[-] Delphi code generator for $LL(k)$ lexers and parsers.
|
||||
\item[-] Intuitive and consistent EBNF-like syntax for both the lexer and the parser generator
|
||||
resulting in a shallow learning curve.
|
||||
\item[-] Extremely easy-to-read generated code indistinguishable from hand-written
|
||||
parsers. The inlined statements are properly indented relative to the surrounding
|
||||
program code.
|
||||
\item[-] Syntactic predicates allow for conditional parsing based on
|
||||
formal syntactic conditions, enhancing the functionality of the $LL(k)$ parsers
|
||||
considerably.
|
||||
\item[-] Semantic predicates allow for conditional parsing based on
|
||||
essentially arbitrary conditions. For example, a DOM-based XML parser is easily
|
||||
written by semantic predicates using an internal hash-table representation of
|
||||
the DOM. Using traditional state-machine based parsers (like §YACC§), programmers
|
||||
often need to delegate parsing tasks to the hand written part of the code. This
|
||||
burdens them with laborious and error-prone routine work. Semantic predicates
|
||||
prevent this, since the parser is allowed to use run-time information for the
|
||||
parsing process dynamically.
|
||||
\item[-] Actions can be inserted in the rules at every possible place. These actions can be
|
||||
used for controlling the parsing process with high granularity.
|
||||
\item[-] All rules may have return values and arguments. Rule arguments add a powerful
|
||||
metaparsing capability completing the predicate and action mechanism optimally.
|
||||
\item[-] All rules may have a code initialization section. This special feature is tuned
|
||||
for Pascal to allow the programmer to declare and initialize local variables for each rule.
|
||||
\item[-] Many convenient extensions to the plain §BNF§ syntax, such as §(...)§, §(...)?§,
|
||||
§(...)+§, §(...)*§, which simplify the task of writing grammars and make it less
|
||||
error-prone.
|
||||
\item[-] Element complements allow for matching a text not matching a given rule.
|
||||
\item[-] Element labels are used to directly map rule information
|
||||
to Pascal variables. They provide a seamless interaction between the
|
||||
generated and user-written code.
|
||||
\item[-] Intuitive Graphical User Interface with syntax highlighting, and
|
||||
project management capabilities.
|
||||
\end{itemize}
|
||||
|
||||
\section{Installation}
|
||||
The first step in using DPG is to install it in Delphi. However, before using
|
||||
DPG be sure to read over the License Agreement.
|
||||
\begin{itemize}
|
||||
\item[-] run setup.exe and follow the instructions
|
||||
\item[-] run Delphi and add your DPG run-time library directory to Delphi's
|
||||
library path. For example, to do this for Delphi 6 select \emph{Tools} §|§ \emph{Environment Options}
|
||||
on the menu bar. Go to the \emph{Library} tab and add the full path of your DPG run-time directory
|
||||
to the \emph{Library Path} if you have not already done so.
|
||||
\end{itemize}
|
||||
@@ -0,0 +1,40 @@
|
||||
\section{Atomic production elements}
|
||||
\subsection{Character literal}
|
||||
Single characters enclosed in quotes are character literals. A
|
||||
character literal can only be referred to within a lexer rule. For
|
||||
example, §'{'§ need not be escaped as you are specifying the
|
||||
literal character which is to be matched. Meta symbols are used
|
||||
outside of characters and string literals to specify lexical
|
||||
structure. Special characters can be specified in a similar way to
|
||||
§C§ escape sequences. DPG accepts the following escape sequences:
|
||||
§\n§, §\r§, §\t§, §\'§, §\"§, §\\§. The §#xx§ form is not accepted
|
||||
by DPG.
|
||||
|
||||
\subsection{String literal}
|
||||
String literals are sequences of characters enclosed in double quotes. The same
|
||||
escape sequences can be used in string literals as in character literals.
|
||||
In parser rules, strings represent tokens, and each unique string is assigned
|
||||
to a token type. Referring to a string within a lexer rule matches the
|
||||
indicated sequence of characters and is a shorthand notation. For example,
|
||||
consider the following equivalent lexer rule definitions:
|
||||
\begin{verbatim}
|
||||
BEGIN : "begin";
|
||||
BEGIN : 'b' 'e' 'g' 'i' 'n';
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{Wildcard}
|
||||
The wildcard §.§ within a parser rule matches any single token;
|
||||
within a lexer rule it matches any single character.
|
||||
|
||||
\subsection{Token reference}
|
||||
Identifiers beginning with an uppercase letter are treated as
|
||||
token references. The subsequent characters may be a mixture of
|
||||
letters, digits or underscores. Referencing a token in a parser
|
||||
rule implies that you want to recognize a token with the specified
|
||||
token type. This does not actually call the associated lexer rule
|
||||
-- the lexical analysis phase delivers a stream of tokens to the
|
||||
parser. A token reference within a lexer rule implies a method
|
||||
call to that rule, and carries the same analysis semantics as a
|
||||
rule reference within a parser. So, you may specify rule arguments
|
||||
and return values for non-public tokens and for every parser rule.
|
||||
See the next section on rule references.
|
||||
@@ -0,0 +1,77 @@
|
||||
\section{Error handling}
|
||||
|
||||
All syntactic and semantic errors cause parser exceptions to be thrown. In
|
||||
particular, the methods used to match tokens in the parser base class (match et
|
||||
al) throw §EdpgMismatchedToken§. The methods in the lexer base class used to
|
||||
match characters (match et al) throw analogous exceptions.
|
||||
|
||||
\subsection{DPG exception hierarchy}
|
||||
|
||||
DPG-generated parsers throw exceptions to signal recognition errors or other
|
||||
stream problems. All exceptions derive from EdpgException. The hierarchy is the
|
||||
following:
|
||||
|
||||
\begin{verbatim}
|
||||
EdpgException
|
||||
EdpgMismatchedChar
|
||||
EdpgMismatchedToken
|
||||
EdpgSemantic
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{EdpgException} The EdpgException exception class is the base of
|
||||
all DPG generated exceptions. User defined exceptions must derive from this
|
||||
class.
|
||||
|
||||
\subsubsection{EdpgMismatchedChar} This exception is thrown by the lexer when it
|
||||
is looking for a character, but finds a different one on the input stream.
|
||||
|
||||
\subsubsection{EdpgMismatchedToken} This exception is thrown by the parser when
|
||||
it is looking for a token, but finds a different one on the input token stream.
|
||||
|
||||
\subsubsection{EdpgSemantic} This exception is thrown by a validating semantic
|
||||
predicate.
|
||||
|
||||
\subsection{Specifying exception handlers}
|
||||
|
||||
DPG allows you to specify a dedicated exception handler for a given rule or
|
||||
alternative. The general form of an exception handler specification is:
|
||||
|
||||
\begin{verbatim}
|
||||
... except { code to handle exception }
|
||||
... finally { code to handle exception }
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{Exception handler for a rule}
|
||||
|
||||
The exception handler for a rule must be placed after the terminating
|
||||
semicolon. The handler can be either an §except§ block or a §finally§ block.
|
||||
The implementation of the rule will be surrounded by a try block.
|
||||
|
||||
\begin{verbatim}
|
||||
r : ...
|
||||
;
|
||||
except { handler code }
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{Exception handler for an alternative}
|
||||
|
||||
The exception handler of an alternative must be the last element of the
|
||||
alternative. Both exception handler blocks can be used. Every alternative that
|
||||
has an exception block specified will be surrounded by a §try...except/finally§
|
||||
block.
|
||||
|
||||
\begin{verbatim}
|
||||
r : alternative_1 ... except { handler code }
|
||||
| alternative_2 ... finally { handler code }
|
||||
...
|
||||
| alternative_n
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
\paragraph{Note:} It is not necessary to define an exception handler for each alternative.
|
||||
|
||||
\subsubsection{Default error handler in lexer}
|
||||
|
||||
To skip every character that isn't recognized by any public lexer rule, specify
|
||||
the §filter=true§ option for the lexer. That way, the parser doesn't have to deal
|
||||
with lexical errors and ask for another token.
|
||||
@@ -0,0 +1,265 @@
|
||||
\section{Options}
|
||||
|
||||
The §options{...}§ section is used to specify options for grammar
|
||||
elements, i.e. the lexer/parser classes, rules and
|
||||
subrules. This section is preceded by the options keyword and
|
||||
contains a series of option/value assignments surrounded by curly
|
||||
braces.
|
||||
|
||||
\subsection{k}
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{rl}
|
||||
\emph{synopsis:} & set lookahead depth \\
|
||||
\emph{context:} & parser/lexer class declaration \\
|
||||
\emph{type:} & integer \\
|
||||
\emph{default:} & 1
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
For any grammar, the lookahead depth can be specified by using the $k$ option.
|
||||
|
||||
\begin{verbatim}
|
||||
lexer myLexer;
|
||||
options
|
||||
{
|
||||
k = 2;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
Setting the lookahead depth changes the maximum number of tokens that will be
|
||||
examined to select alternative productions, and test for exit conditions of the
|
||||
§EBNF§ constructs §(...)?§, §(...)+§, and §(...)*§. The lookahead analysis is
|
||||
linear approximate (as opposed to full $LL(k)$ ). Consider this example with
|
||||
$k=2$:
|
||||
\begin{verbatim}
|
||||
r : ( A B | B A )
|
||||
| A A
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
Full $LL(k)$ analysis would resolve the ambiguity and produce a
|
||||
lookahead test for the first alternative like:
|
||||
\begin{verbatim}
|
||||
if (LA(1)=A and LA(2)=B) or (LA(1)=B and LA(2)=A)
|
||||
\end{verbatim}
|
||||
|
||||
Linear approximate analysis would logically OR the lookahead sets at each
|
||||
depth, resulting in a test like:
|
||||
|
||||
\begin{verbatim}
|
||||
if (LA(1)=A or LA(1)=B) and (LA(2)=A or LA(2)=B)
|
||||
\end{verbatim}
|
||||
|
||||
Which is ambiguous with the second alternative for §{A,A}§.
|
||||
Therefore, setting the lookahead depth very high tends to yield
|
||||
diminishing returns in most cases, because the lookahead sets at
|
||||
large depths will include almost everything. This problem can be
|
||||
solved using a syntactic predicate.
|
||||
|
||||
|
||||
\subsection{importVocab}
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{rl}
|
||||
\emph{synopsis:} & set initial grammar vocabulary \\
|
||||
\emph{context:} & parser/lexer class declaration \\
|
||||
\emph{type:} & ID \\
|
||||
\emph{default:} & none
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The import vocabulary for a grammar class can be specified using the
|
||||
§importVocab§ option.
|
||||
|
||||
\begin{verbatim}
|
||||
lexer myLexer;
|
||||
options
|
||||
{
|
||||
importVocab = XML;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
DPG will look for the token exchange file named §XMLTokens.txt§,
|
||||
and import all the token definitions from it. A parser grammar must
|
||||
use this option, because without that, it cannot communicate with
|
||||
the lexer. A lexer grammar can use this option too. It is useful
|
||||
when a parser class uses multiple lexers to get tokens from the
|
||||
input stream. The vocabulary file has an identifier on the first
|
||||
line that names the token vocabulary. All subsequent lines are of
|
||||
the form §ID=value§ or §ID="literal"=value§. For example:
|
||||
|
||||
\begin{verbatim}
|
||||
ThocLexer
|
||||
TT_EOF = 1
|
||||
TT_LPAREN = 4
|
||||
TT_RPAREN = 5
|
||||
LT_const = "const" = 6
|
||||
\end{verbatim}
|
||||
|
||||
The token exchange file is automatically generated by DPG for each grammar.
|
||||
\paragraph{Note:} you must take care of the order of grammars in a DPG project.
|
||||
Vocabulary-generating grammars must appear before vocabulary-consuming
|
||||
grammars.
|
||||
|
||||
\subsection{exportVocab}
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{rl}
|
||||
\emph{synopsis:} & set export grammar vocabulary \\
|
||||
\emph{context:} & parser/lexer class declaration \\
|
||||
\emph{type:} & ID \\
|
||||
\emph{default:} & grammar class name
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The vocabulary of a grammar is the union of the set of tokens provided by an
|
||||
§importVocab§ option and the set of tokens and literals defined in the grammar.
|
||||
|
||||
\begin{verbatim}
|
||||
lexer myParser;
|
||||
options
|
||||
{
|
||||
exportVocab = XML1;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
If the §exportVocab§ option isn't specified, then DPG will use the
|
||||
grammar class name to export the vocabulary. DPG generates the
|
||||
following files for the examp\-le above: §XML1Tokens.txt§ for
|
||||
token exchange, and §XML1Tokens.pas§ for the grammar class.
|
||||
|
||||
\subsection{testLiterals}
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{rl}
|
||||
\emph{context:} & lexer class declaration, lexer rule \\
|
||||
\emph{type:} & boolean \\
|
||||
\emph{default:} & false
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
By default, DPG doesn't generate code to check the literals table
|
||||
(the table generated for literal strings), because checking the
|
||||
literals table after each token recognition is expensive. Instead,
|
||||
it checks string literals in a lexer rule that can recognize
|
||||
them. The string literals table contains the strings defined in
|
||||
the §tokens{...}§ section of a lexer grammar.
|
||||
\begin{verbatim}
|
||||
lexer myLexer;
|
||||
options
|
||||
{
|
||||
testLiterals = false;
|
||||
}
|
||||
tokens
|
||||
{
|
||||
"function";
|
||||
"procedure";
|
||||
...
|
||||
}
|
||||
|
||||
ID
|
||||
options
|
||||
{
|
||||
testLiterals = true;
|
||||
}
|
||||
: ('A'..'Z' | 'a'..'z')('A'..'Z' | 'a'..'z' | '0'..'9')*
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
In the example above, if the input is matched by the rule §ID§
|
||||
then the implementation of the rule will check the literals table
|
||||
for the matched token. If it exists, then the returned token type
|
||||
will be set to the token type assigned to the string literal in
|
||||
the literals table. Otherwise the returned token type will remain
|
||||
unchanged.
|
||||
|
||||
It is possible to check the literals table explicitly within an
|
||||
action using the Test\-Li\-te\-ral method:
|
||||
|
||||
\begin{verbatim}
|
||||
{
|
||||
...
|
||||
_ttype := TestLiteral;
|
||||
_ttype := TestLiteral( _ttype);
|
||||
...
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{caseSensitive}
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{rl}
|
||||
\emph{context:} & lexer class declaration \\
|
||||
\emph{type:} & boolean \\
|
||||
\emph{default:} & false
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\begin{verbatim}
|
||||
lexer myLexer;
|
||||
options
|
||||
{
|
||||
caseSensitive = true;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
When §false§, case is ignored when comparing against character and string literals in the
|
||||
lexer. The case of the input stream is maintained when stored in the token
|
||||
objects.
|
||||
|
||||
\subsection{filter}
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{rl}
|
||||
\emph{context:} & lexer class declaration \\
|
||||
\emph{type:} & boolean / ID \\
|
||||
\emph{default:} & false
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\begin{verbatim}
|
||||
lexer myLexer;
|
||||
options
|
||||
{
|
||||
filter = true;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
When §true§, the lexer ignores any input not exactly matching one of the public
|
||||
lexer rules.
|
||||
|
||||
Notice that the filter rule must track new-lines in the general
|
||||
case where the lexer might emit error messages.
|
||||
|
||||
When set to a rule name, the filter rule is invoked either when the lookahead
|
||||
(in nextToken) predicts none of the public lexical rules or when one of those
|
||||
rules fails. In the latter case, the input is rolled back before attempting
|
||||
the filter rule. Option §filter=true§ is like having a filter rule such as:
|
||||
|
||||
\begin{verbatim}
|
||||
IGNORE : . ;
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{ignore}
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{rl}
|
||||
\emph{context:} & lexer rule \\
|
||||
\emph{type:} & ID \\
|
||||
\emph{default:} & none
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\begin{verbatim}
|
||||
lexer myLexer;
|
||||
options
|
||||
{
|
||||
ignore = MyIgnoreRule;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
Specify a lexer rule to use as white space between lexical rule
|
||||
atomic elements (chars, strings, and rule references). The grammar
|
||||
analysis, and hence the look\-ahead sets, are aware of the
|
||||
whitespace references.
|
||||
@@ -0,0 +1,41 @@
|
||||
\section{Production element operators}
|
||||
|
||||
\subsection{Element complement}
|
||||
The unary not operator $\sim$ may be applied to an atomic element
|
||||
such as a token identifier. For some token atom §T§, $\sim$§T§
|
||||
matches any token other than §T§ except end-of-file. Within lexer
|
||||
rules, $\sim$§'a'§ matches any character other than character
|
||||
§'a'§. The sequence $\sim$§.§ (``not anything'') is meaningless
|
||||
and not allowed. Example:
|
||||
\begin{verbatim}
|
||||
SL_COMMENT : "//" (~'\n')* '\n';
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{Set complement}
|
||||
The unary not operator $\sim$ can also be used to construct a
|
||||
token set or character set by complementing another set. This is
|
||||
most useful when you want to match tokens or characters until a
|
||||
certain delimiter set is encountered. Rather than invent a special
|
||||
syntax for such sets, DPG allows the placement of $\sim$ in front
|
||||
of a subrule containing only simple elements and no actions. The
|
||||
simple elements may be token references, token ranges, character
|
||||
literals, or character ranges. For example:
|
||||
\begin{verbatim}
|
||||
SL_COMMENT : "//" (~('\r'|'\n'))* ('\r'|'\n');
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{Range operator}
|
||||
The binary range operator §..§ is used to define a range of atoms
|
||||
which may be matched. The expression §c1..c2§ in a lexer matches
|
||||
characters included in that range. The expression §T..U§ in a
|
||||
parser matches any token whose token type is inclusively in that
|
||||
range, which is of dubious value if the token types are generated
|
||||
externally.
|
||||
|
||||
\subsection{Ignore operator}
|
||||
In lexer grammars, the ignore operator §!§ can be applied to any
|
||||
atomic production element. It means that the element followed by
|
||||
the §!§ operator should not appear in the result token. Example:
|
||||
\begin{verbatim}
|
||||
STRING : '"'! (~'"')* '"'! ;
|
||||
\end{verbatim}
|
||||
@@ -0,0 +1,82 @@
|
||||
\section{Element labels}
|
||||
|
||||
Any atomic production element can be labeled by an identifier (case is insignificant).
|
||||
For a labelled atomic element, the identifier is used within a semantic action to access
|
||||
the associated Token object or character. For example,
|
||||
|
||||
\begin{verbatim}
|
||||
assign
|
||||
: v:ID EQUALS expr SEMI
|
||||
{
|
||||
writeln('Assign to ' + v.TokenText);
|
||||
}
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
\section{EBNF rule elements}
|
||||
|
||||
DPG supports the following extended BNF notations:
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{tabular}{ll}
|
||||
% \hline
|
||||
§(...) § & -- exactly one occurrence of a subrule \\
|
||||
§(...)?§ & -- zero or one occurrence of a subrule \\
|
||||
§(...)+§ & -- one or more occurrences of a subrule \\
|
||||
§(...)*§ & -- zero or more occurrences of a subrule
|
||||
% \hline
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\section{Rule arguments}
|
||||
Character sequences in square brackets are arguments or return type specifiers.
|
||||
Square brackets within string and character literals are not argument
|
||||
delimiters. The arguments within §[]§ must follow the Object Pascal syntax.
|
||||
|
||||
\section{Exception handlers}
|
||||
|
||||
DPG allows the specification of exception handlers specific to a
|
||||
given rule or alternative. The general form of an exception
|
||||
handler specification is:
|
||||
|
||||
\begin{verbatim}
|
||||
... except { code to handle exception }
|
||||
... finally { code to handle exception }
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{Exception handler for a rule}
|
||||
|
||||
The exception handler for a rule must be placed after the
|
||||
terminating semicolon. The handler can be either an §except§ block
|
||||
or a §finally§ block. The implementation of a rule will be
|
||||
surrounded by a try block.
|
||||
|
||||
\begin{verbatim}
|
||||
r : ...
|
||||
;
|
||||
except { handler code }
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{Exception handler for an alternative}
|
||||
|
||||
The exception handler of an alternative must be the last element
|
||||
of the alternative. Both exception handler blocks can be used.
|
||||
Every alternative that has an exception block will be surrounded
|
||||
by a §try...except/finally§ block.
|
||||
|
||||
\begin{verbatim}
|
||||
r : alternative_1 ... except { handler code }
|
||||
| alternative_2 ... finally { handler code }
|
||||
...
|
||||
| alternative_n
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
\paragraph{Note:} It is not necessary to define an exception handler for each alternative.
|
||||
|
||||
\subsection{Default error handler in lexer}
|
||||
|
||||
To skip every character that isn't recognized by any public lexer
|
||||
rule, specify the option §filter=true§ for a lexer. That way, the
|
||||
parser doesn't have to deal with lexical errors and ask for
|
||||
another token.
|
||||
@@ -0,0 +1,249 @@
|
||||
\section{Sections}
|
||||
|
||||
\subsection{unit}
|
||||
The unit section specifies the unit name of the generated source file.
|
||||
The syntax is identical to Object Pascal.
|
||||
|
||||
\subsection{uses}
|
||||
The §uses{...}§ section is used to specify the units which must be
|
||||
included in the interface's uses clause of the generated pascal
|
||||
unit. Every unit name must be terminated by a semicolon. Repeated
|
||||
units are included only once.
|
||||
\begin{verbatim}
|
||||
uses
|
||||
{
|
||||
Classes;
|
||||
Windows;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
|
||||
\subsection{const}
|
||||
The §const{...}§ section is used to specify items that appear in
|
||||
the interface's const clause of the generated pascal unit. The
|
||||
content of this section is copied verbatim into the unit.
|
||||
\begin{verbatim}
|
||||
const
|
||||
{
|
||||
const1 = 12;
|
||||
const2 = 'FOO';
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{type}
|
||||
The §type{...}§ section is used to specify items that appear in
|
||||
the interface's type clause of the generated pascal unit. The
|
||||
content of this section is copied verbatim into the unit.
|
||||
\begin{verbatim}
|
||||
type
|
||||
{
|
||||
TmyType1 = integer;
|
||||
TmyType2 = array [0..16] of TmyType1;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{options}
|
||||
The §options{...}§ section contains options for a given grammar
|
||||
element. Options can be defined for lexer/parser classes, rules
|
||||
and subrules.
|
||||
|
||||
\subsection{tokens}
|
||||
If you need to define an ``imaginary'' token (i.e. one that has no
|
||||
corresponding real input symbol) use the §tokens{...}§ section to
|
||||
define it. You can also define literals in this section.
|
||||
|
||||
\begin{verbatim}
|
||||
tokens
|
||||
{
|
||||
"procedure";
|
||||
"function";
|
||||
INTEGER;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
Strings defined in this way are treated just as if you had referenced them in
|
||||
the parser. The formal syntax is:
|
||||
|
||||
\begin{verbatim}
|
||||
tokenSpecification
|
||||
: "tokens"
|
||||
LCURLY
|
||||
(tokenItem SEMI)*
|
||||
RCURLY
|
||||
;
|
||||
|
||||
tokenItem
|
||||
: TOKEN
|
||||
| STRING
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
The §tokens{...}§ section is only valid in lexer grammars.
|
||||
|
||||
\subsection{memberdecl}
|
||||
The §memberdecl{...}§ section contains additional member
|
||||
declarations for the grammar class. It allows the expansion of the
|
||||
grammar class with user defined members, so it is not necessary to
|
||||
derive new classes from the generated class to implement
|
||||
additional functionality. The content of this section is copied
|
||||
verbatim into the class declaration of the generated grammar
|
||||
class.
|
||||
\begin{verbatim}
|
||||
memberdecl
|
||||
{
|
||||
procedure proc1;
|
||||
procedure proc2;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{memberdef}
|
||||
The §memberdef{...}§ section contains the implementation of the
|
||||
classes' additional functionality. The content of this section is
|
||||
copied verbatim into the implementation part of the generated
|
||||
unit. This section may also contain the initialization and
|
||||
finalization clauses.
|
||||
|
||||
\begin{verbatim}
|
||||
memberdef
|
||||
{
|
||||
procedure TmyClass.proc1;
|
||||
begin
|
||||
...
|
||||
end;
|
||||
|
||||
procedure TmyClass.proc2;
|
||||
begin
|
||||
...
|
||||
end;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{parser}
|
||||
Parser rules must be associated with a parser class. Each parser
|
||||
class specification precedes the options, and rule definitions of
|
||||
the parser. Grammar files §.g§ can hold only one class definition.
|
||||
A parser specification in a grammar file looks like:
|
||||
\begin{verbatim}
|
||||
unit myParser;
|
||||
uses... // optional uses {...} section
|
||||
const... // optional const {...} section
|
||||
type... // optional type {...} section
|
||||
|
||||
parser TmyParser;
|
||||
|
||||
options... // optional options {...} section
|
||||
memberdecl... // optional memberdecl {...} section
|
||||
parser rules...
|
||||
memberdef... // optional memberdef {...} section
|
||||
\end{verbatim}
|
||||
|
||||
In the generated code, the parser class results in an Object
|
||||
Pascal class, and the rules become member methods of the class.
|
||||
|
||||
Note that the content of the §memberdecl{...}§ section is copied
|
||||
verbatim into the class declaration part of the generated parser
|
||||
class while the content of the §memberdef{...}§ section is copied
|
||||
after the implementation of the member rules, so the
|
||||
initialization and finalization clauses of a pascal unit can be
|
||||
placed in the §memberdef{...}§ section.
|
||||
|
||||
\subsection{lexer}
|
||||
To perform lexical analysis, you need to specify a lexer class that describes
|
||||
how to break up the input character stream into a stream of tokens. The syntax
|
||||
is similar to that of a parser class:
|
||||
\begin{verbatim}
|
||||
unit myLexer;
|
||||
uses... // optional uses {...} section
|
||||
const... // optional const {...} section
|
||||
type... // optional type {...} section
|
||||
|
||||
lexer TmyLexer;
|
||||
|
||||
options... // optional options {...} section
|
||||
tokens... // optional tokens {...} section
|
||||
memberdecl... // optional memberdecl {...} section
|
||||
lexer rules...
|
||||
memberdef... // optional memberdef {...} section
|
||||
\end{verbatim}
|
||||
|
||||
Lexical rules contained within a lexer class become member methods in the
|
||||
generated class. A lexer grammar may have a §tokens{...}§ section to specify
|
||||
imaginary tokens and string literals.
|
||||
|
||||
\subsection{rule definitions}
|
||||
The structure of an input stream of atoms is specified by a set of
|
||||
mutually-referenced rules. Each rule has a name and any of the
|
||||
following optional attributes: a scope specifier; a set of
|
||||
arguments; an init-action; a return value; local variable
|
||||
definitions; an exception handler and an alternative or
|
||||
alternatives. Each alternative contains a series of elements that
|
||||
specify what to match and where. Scope can be specified by
|
||||
private, protected, or public keywords. A rule has public scope by
|
||||
default. The basic form of a rule is:
|
||||
\begin{verbatim}
|
||||
(scope) rulename
|
||||
: alternative_1
|
||||
| alternative_2
|
||||
...
|
||||
| alternative_n
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
Parameters for a rule can be specified in the following form:
|
||||
\begin{verbatim}
|
||||
rulename [formal parameters] : ... ;
|
||||
\end{verbatim}
|
||||
|
||||
If the rule returns a value, its type can be defined with the
|
||||
returns keyword:
|
||||
\begin{verbatim}
|
||||
rulename returns [typename] : ... ;
|
||||
\end{verbatim}
|
||||
|
||||
where §typename§ is a valid Object Pascal type specifier.
|
||||
|
||||
Local variables for a rule can be defined in the §local{...}§ section:
|
||||
|
||||
\begin{verbatim}
|
||||
rule
|
||||
local
|
||||
{
|
||||
foo: integer;
|
||||
bar: string;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
Init-actions are specified before the colon. Init-actions differ from normal
|
||||
actions because they are always executed regardless of guess mode.
|
||||
|
||||
\begin{verbatim}
|
||||
rule
|
||||
{
|
||||
init-action
|
||||
}
|
||||
: ... ;
|
||||
\end{verbatim}
|
||||
|
||||
|
||||
\paragraph{Parser rules} apply structure to a stream of tokens, whereas
|
||||
lexer rules apply structure to a stream of characters. Parser
|
||||
rules, therefore, must not reference cha\-rac\-ter literals.
|
||||
Double-quoted strings in parser rules are considered to be token
|
||||
references. Note: all parser rules must begin with a lowercase
|
||||
letter.
|
||||
|
||||
\paragraph{Lexer rules} defined within a lexer grammar must have a name beginning
|
||||
with an uppercase letter. These rules implicitly match
|
||||
cha\-rac\-ters on the input stream instead of tokens on the token
|
||||
stream. Referenced grammar elements include token references
|
||||
(implicit lexer rule references), cha\-rac\-ters and strings.
|
||||
Lexer rules are processed in the same manner as parser rules, and
|
||||
may also specify arguments and return values. A scope specifier
|
||||
for a lexer rule has special meaning in lexer grammars. In the
|
||||
generated Object Pascal unit, the lexer class has a §nextToken§
|
||||
function which is the interface between the lexer and the parser.
|
||||
This function is synthesized from the public lexer rules. It means
|
||||
that non-public lexer rules don't modify the prediction logic of
|
||||
the lexer. They are usually helper rules. If the lexer grammar has
|
||||
no public rule at all, the §nextToken§ function returns EOF to the
|
||||
parser.
|
||||
@@ -0,0 +1,79 @@
|
||||
\section{Simple production elements}
|
||||
\subsection{Rule reference}
|
||||
Identifiers beginning with a lowercase letter are treated as parser
|
||||
rule references. The subsequent characters may be any letter,
|
||||
digit, or underscore. Lexical rules may not reference
|
||||
parser rules. Referencing a rule implies a method call to that
|
||||
rule at that point in the parse. You may pass parameters and
|
||||
obtain return values. For example, formal and actual parameters
|
||||
are specified within square brackets:
|
||||
\begin{verbatim}
|
||||
function
|
||||
: type ID LPAREN args RPAREN block [1]
|
||||
;
|
||||
|
||||
block [scope: integer]
|
||||
: LCURLY
|
||||
...
|
||||
{ (* use arg 'scope' *) }
|
||||
...
|
||||
RCURLY
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
Return values that are stored in variables use a simple assignment
|
||||
notation:
|
||||
\begin{verbatim}
|
||||
set
|
||||
local
|
||||
{
|
||||
ids : TStringList;
|
||||
}
|
||||
{
|
||||
ids := nil;
|
||||
}
|
||||
: LPAREN ids=idList RPAREN
|
||||
;
|
||||
|
||||
idList returns [TStringList]
|
||||
{
|
||||
result := TStringList.Create;
|
||||
}
|
||||
: id:ID { result.Add( id.TokenText); }
|
||||
(
|
||||
COMMA id:ID
|
||||
{
|
||||
result.Add( id.TokenText);
|
||||
}
|
||||
)*
|
||||
;
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{Semantic action}
|
||||
Actions are blocks of Object Pascal source code enclosed in curly braces. The
|
||||
code is executed after the preceding production element has been recognized and
|
||||
before the recognition of the following element. Actions are typically used to
|
||||
generate out\-put, construct trees, or modify a symbol table. An action's
|
||||
position dictates when it is recognized relative to the surrounding grammar
|
||||
elements.
|
||||
|
||||
If the action is the first element of a production, it is executed
|
||||
before any other e\-le\-ment in that production, but only if that
|
||||
production is predicted by the lookahead.
|
||||
|
||||
The first action of an §EBNF§ subrule may be followed by §:§.
|
||||
Doing so de\-sig\-na\-tes the action as an init-action and
|
||||
associates it with the subrule as a whole, instead of any
|
||||
production. It is executed immediately upon entering the subrule,
|
||||
and is executed even while guessing (testing syntactic
|
||||
predicates). For example:
|
||||
|
||||
\begin{verbatim}
|
||||
( { init-action} :
|
||||
{ action of 1st production} production1
|
||||
| { action of 2nd production} production2
|
||||
)?
|
||||
\end{verbatim}
|
||||
|
||||
The init-action would be executed regardless of what (if anything)
|
||||
matched in the optional subrule.
|
||||
@@ -0,0 +1,49 @@
|
||||
Delphi Parser Generator (DPG) uses the ASCII character set,
|
||||
including the letters \emph{A} through \emph{Z} and \emph{a}
|
||||
through \emph{z}, the digits \emph{0} through \emph{9}, and other
|
||||
standard characters. It is case sensitive. The space character
|
||||
(ASCII 32), the tab character (ASCII 9), and the new-line
|
||||
characters (ASCII 13,10) are called \emph{white-space} characters.
|
||||
|
||||
\section{General}
|
||||
\subsection{Comments}
|
||||
DPG accepts single and multi-line comments. Single-line comments begin with
|
||||
§//§ while multi-line (block) comments are enclosed by §(*§~and~§*)§.
|
||||
|
||||
\subsection{White Space}
|
||||
Spaces, tabs, and new-lines (including most used §CR-LF§, §CR§,
|
||||
§LF§ constructions) are separators in that they separate DPG
|
||||
symbols, such as identifiers. White spaces have no additional
|
||||
significance, i.e. the code layout does not play any semantic
|
||||
role. However, the layout of the embedded Delphi code is preserved
|
||||
in the ge\-ne\-ra\-ted source files.
|
||||
|
||||
\subsection{Symbols}
|
||||
DPG uses the following punctuation and keywords:
|
||||
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{center}
|
||||
\begin{tabular}{|ll|ll|}
|
||||
\hline
|
||||
§(...)§ & subrule & §unit§ & unit name \\
|
||||
§(...)*§ & closure subrule & §uses§ & uses section \\
|
||||
§(...)+§ & positive closure & §const§ & const section \\
|
||||
§(...)?§ & optional subrule & §type§ & type section \\
|
||||
§[...]§ & rule arguments & §lexer§ & lexer class \\
|
||||
§{...}§ & semantic action & §parser§ & parser class \\
|
||||
§{...}?§ & semantic predicate & §options§ & options section \\
|
||||
§(...)=>§ & syntactic predicate & §tokens§ & tokens section \\
|
||||
§ |§ & alternative operator & §returns§ & rule return value \\
|
||||
§ ..§ & range operator & §except§ & exception handler \\
|
||||
§ ~§ & not operator & §finally§ & exception handler \\
|
||||
§ !§ & ignore operator & §memberdecl§ & member declaration \\
|
||||
§ .§ & wildcard & §memberdef§ & member definition \\
|
||||
§ =§ & assignment operator & §local§ & local rule variables \\
|
||||
§ :§ & label, start rule & & \\
|
||||
§ ;§ & end rule & & \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\caption{DPG symbols}
|
||||
\end{table}
|
||||
@@ -0,0 +1,10 @@
|
||||
\chapter{Syntactic elements}
|
||||
\minitoc
|
||||
\clearpage
|
||||
\include{src/lang/lang-syntactic}
|
||||
\include{src/lang/lang-atomprod}
|
||||
\include{src/lang/lang-simpprod}
|
||||
\include{src/lang/lang-prodoper}
|
||||
\include{src/lang/lang-sect}
|
||||
\include{src/lang/lang-opt}
|
||||
\include{src/lang/lang-rest}
|
||||
@@ -0,0 +1,94 @@
|
||||
\section{Error handling}
|
||||
|
||||
All syntactic and semantic errors throw exceptions. In particular,
|
||||
the methods used to match tokens in the parser base class (match
|
||||
etc) throw §EdpgMismatchedToken§. The methods in the lexer base
|
||||
class used to match characters (match etc) throw exceptions
|
||||
similarly.
|
||||
|
||||
\subsection{DPG exception hierarchy}
|
||||
|
||||
DPG-generated parsers throw exceptions to signal recognition
|
||||
errors or other stream problems. All exceptions derive from
|
||||
EdpgException. The hierarchy is as follows:
|
||||
|
||||
\begin{verbatim}
|
||||
EdpgException
|
||||
EdpgMismatchedChar
|
||||
EdpgMismatchedToken
|
||||
EdpgSemantic
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{EdpgException}
|
||||
The §EdpgException§ is the base class for all DPG exceptions. It
|
||||
defines the following read-only properties:
|
||||
\begin{alltt}
|
||||
FileName : string;
|
||||
Line : integer;
|
||||
Column : integer;
|
||||
\end{alltt}
|
||||
These properties contain information about the location where the exception
|
||||
occurred.
|
||||
|
||||
\subsection{EdpgMismatchedChar}
|
||||
The §EdpgMismatchedChar§ exception is thrown by the lexer when it
|
||||
is looking for a character, but finds a different one on the input
|
||||
stream than expected. It defines the following properties in
|
||||
addition to those of §EdpgException§.
|
||||
\begin{alltt}
|
||||
FoundChar : char;
|
||||
FoundString : string;
|
||||
CharSet : TdpgCharSet;
|
||||
Str : string;
|
||||
Inverted : boolean;
|
||||
\end{alltt}
|
||||
The §FoundChar§ and §FoundString§ properties contain the character
|
||||
or string that was found on the input stream. The §CharSet§ and
|
||||
§Str§ properties contain the values which the lexer expected to
|
||||
find. The §Inverted§ property is set only if the exception came
|
||||
from a §MatchNot(...)§ operation. In this case, the §CharSet§
|
||||
property contains the values that the lexer must §not§ match. The
|
||||
validity of pro\-per\-ti\-es is shown in the next table,
|
||||
depending on the kind of exception.
|
||||
|
||||
\begin{table}[H]
|
||||
\small
|
||||
\begin{center}
|
||||
\begin{tabular}{lcc}
|
||||
& Mismatched char & Mismatched string \\
|
||||
\hline
|
||||
FoundChar & valid & - \\
|
||||
FoundString & - & valid \\
|
||||
CharSet & valid & - \\
|
||||
Str & - & valid \\
|
||||
Inverted & valid & - \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{table}
|
||||
|
||||
\subsection{EdpgMismatchedToken}
|
||||
The §EdpgMismatchedToken§ exception is thrown by the parser when
|
||||
it is looking for a token, but finds a different one on the input
|
||||
token stream than expected. It defines the following properties in
|
||||
addition to those of §EdpgException§.
|
||||
\begin{alltt}
|
||||
FoundToken : IdpgToken;
|
||||
TokenSet : TdpgByteSet;
|
||||
Inverted : boolean;
|
||||
\end{alltt}
|
||||
The §FoundToken§ property contains the token the parser received from the
|
||||
lexer. The §TokenSet§ property contains the values the parser expected to
|
||||
get. The §Inverted§ property is set only if the exception came from a
|
||||
§MatchNot(...)§ operation. In this case, the §TokenSet§ property contains the
|
||||
values the parser must §not§ get.
|
||||
|
||||
\subsection{EdpgSemantic}
|
||||
This exception is thrown by a validating semantic predicate. It
|
||||
defines the following property in addition to those of
|
||||
§EdpgException§.
|
||||
\begin{alltt}
|
||||
Assert : string;
|
||||
\end{alltt}
|
||||
The §Assert§ property contains the validating expression that caused the
|
||||
exception.
|
||||
@@ -0,0 +1,3 @@
|
||||
\chapter{Run-time}
|
||||
\minitoc \clearpage
|
||||
\include{src/rt/rt-err}
|
||||
@@ -0,0 +1,200 @@
|
||||
\chapter{Getting started}
|
||||
\minitoc \clearpage
|
||||
|
||||
In this chapter, we develop a simple calculator. It accepts integers, the four
|
||||
arithmetic operators (§+§,§-§,§/§,§*§), and parentheses on its input.
|
||||
Spaces, tabs and newline characters are treated as white spaces and used for
|
||||
separating tokens. Complete expressions must be terminated by semicolons.
|
||||
|
||||
\section{Lexical analyzer}
|
||||
|
||||
Let us define the calculator's lexer.
|
||||
|
||||
\begin{verbatim}
|
||||
1 unit myLexer;
|
||||
2
|
||||
3 lexer TmyLexer;
|
||||
4 options
|
||||
5 {
|
||||
6 exportVocab = myLexer;
|
||||
7 }
|
||||
\end{verbatim}
|
||||
In line §1§ we define the unit name of the generated Pascal source
|
||||
file for the lexer. In line §3§ we give a name to the lexer class.
|
||||
If there is an §options§ block for a grammar class, it must follow
|
||||
the class declaration. Here, we define one option for the lexer:
|
||||
§exportVocab§. This option tells the DPG that all the token
|
||||
definitions must be exported to §myLexerTokens.txt§ and
|
||||
§myLexerTokens.pas§. Grammars can import the generated token names
|
||||
using the exported §.txt§ files.
|
||||
|
||||
\paragraph{Note:} it is not necessary to define the §exportVocab§ option for a
|
||||
grammar. The file names for the token exchange files are automatically created
|
||||
using the specified unit name.
|
||||
|
||||
Now we define the lexer tokens.
|
||||
|
||||
\begin{verbatim}
|
||||
8 LPAREN: '(';
|
||||
9 RPAREN: ')';
|
||||
10 PLUS: '+';
|
||||
11 MINUS: '-';
|
||||
12 STAR: '*';
|
||||
13 SLASH: '/';
|
||||
14 SEMI: ';';
|
||||
\end{verbatim}
|
||||
In lines from §8§ to §14§, there are simple token definitions. Each of them
|
||||
recognizes one character from the input stream.
|
||||
|
||||
\begin{verbatim}
|
||||
15 INT: ('0'..'9')+ ;
|
||||
\end{verbatim}
|
||||
In line §15§, we define a rule to recognize integer numbers. This tells us that
|
||||
the INT consists of one or more numeric characters.
|
||||
|
||||
Now, define a rule to handle white space characters.
|
||||
\begin{verbatim}
|
||||
16 WS
|
||||
17 : '\r' '\n' { _ttype := TT_SKIP; }
|
||||
18 | '\t' { _ttype := TT_SKIP; }
|
||||
19 | ' ' { _ttype := TT_SKIP; }
|
||||
20 ;
|
||||
\end{verbatim}
|
||||
Characters surrounded by curly braces are actions. The content of
|
||||
an action block will be copied verbatim into the generated Pascal
|
||||
source file. In this example the expression §_ttype := TT_SKIP;§
|
||||
prevents the §WS§ rule from generating a token, because we don't need
|
||||
it.
|
||||
|
||||
Now the lexer definition is finished. This simple lexer recognizes relevant
|
||||
characters and integers, and skips all white space on its input.
|
||||
|
||||
\section{Parser}
|
||||
|
||||
Now we define the parser.
|
||||
|
||||
\begin{verbatim}
|
||||
1 unit myParser;
|
||||
2
|
||||
3 parser TmyParser;
|
||||
4 options
|
||||
5 {
|
||||
6 importVocab = myLexer;
|
||||
7 }
|
||||
\end{verbatim}
|
||||
This part is analogous to the lexer definition with one exception. In
|
||||
line §6§, we import the tokens from a file specified by the
|
||||
§exportVocab§ option in the lexer grammar. Now, the parser knows
|
||||
which tokens are to be expected from the lexer.
|
||||
|
||||
\begin{verbatim}
|
||||
8 memberdecl
|
||||
9 {
|
||||
10 value: integer;
|
||||
11 }
|
||||
\end{verbatim}
|
||||
In lines from §8§ to §11§, we specify the §memberdecl§ section. This section is
|
||||
used to define members for the generated parser class. In this example, the §TmyParser§
|
||||
class will have a member called §value§. We use this member to store the result
|
||||
of the calculation for the current expression.
|
||||
|
||||
Now we define the rules.
|
||||
\begin{verbatim}
|
||||
12 calc
|
||||
13 : (expression SEMI { writeln( value); } )*
|
||||
14 ;
|
||||
15
|
||||
16 expression
|
||||
17 local
|
||||
18 {
|
||||
19 temp : integer;
|
||||
20 }
|
||||
21 : term { temp := value; }
|
||||
22 (
|
||||
23 PLUS term { temp := temp + value; }
|
||||
24 | MINUS term { temp := temp - value; }
|
||||
25 )* { value := temp; }
|
||||
26 ;
|
||||
\end{verbatim}
|
||||
In lines §17..20§, we define a local variable for the rule
|
||||
§expression§. The following rules are defined in a similar way to
|
||||
the rule §expression§.
|
||||
|
||||
\begin{verbatim}
|
||||
27 term
|
||||
28 local
|
||||
29 {
|
||||
30 temp : integer;
|
||||
31 }
|
||||
32 : factor { temp := value; }
|
||||
33 (
|
||||
34 STAR factor { temp := temp * value; }
|
||||
35 | SLASH factor { temp := temp div value; }
|
||||
36 )* { value := temp; }
|
||||
37 ;
|
||||
38
|
||||
39 factor
|
||||
40 local
|
||||
41 {
|
||||
42 temp : integer;
|
||||
43 }
|
||||
44 : uInt
|
||||
45 | LPAREN expression RPAREN
|
||||
46 ;
|
||||
47
|
||||
48 uInt
|
||||
49 : x:INT { value := StrToInt( x.TokenText); }
|
||||
50 ;
|
||||
\end{verbatim}
|
||||
In line §49§, we specified that the rule must have a variable
|
||||
called 'x' which will contain the INT token. For the moment, it is
|
||||
enough to know that it has a property §TokenText§ which contains
|
||||
the text of the recognized token. This property is a string
|
||||
property, so we have to convert it to an integer, and store it in
|
||||
the §value§ member variable.
|
||||
|
||||
\section{The project}
|
||||
The following simple project demonstrates how the defined lexer
|
||||
and parser classes are used.
|
||||
|
||||
\begin{verbatim}
|
||||
1 program calc;
|
||||
2 {$APPTYPE CONSOLE}
|
||||
3 uses
|
||||
4 Classes,
|
||||
5 SysUtils,
|
||||
6 myLexer in 'myLexer.pas',
|
||||
7 myParser in 'myParser.pas';
|
||||
8
|
||||
9 var
|
||||
10 stm: TFileStream;
|
||||
11 lex: TmyLexer;
|
||||
12 par: TmyParser;
|
||||
13
|
||||
14 begin
|
||||
15 if ParamCount <> 1 then
|
||||
16 begin
|
||||
17 writeln('usage: calc <filename>');
|
||||
18 exit;
|
||||
19 end
|
||||
20 else
|
||||
21 begin
|
||||
22 try
|
||||
23 stm := TFileStream.Create( ParamStr(1),
|
||||
24 fmOpenRead);
|
||||
24 lex := TmyLexer.Create(stm);
|
||||
25 par := TmyParser.Create(lex);
|
||||
26
|
||||
27 par.calc;
|
||||
28 except
|
||||
29 on EdpgMismatchedToken do
|
||||
22 writeln('Syntax error');
|
||||
30 on EdpgMismatchedChar do
|
||||
33 writeln('Syntax error');
|
||||
29 end;
|
||||
30 end;
|
||||
31
|
||||
32 stm.Free;
|
||||
33 par.Free;
|
||||
34 end.
|
||||
\end{verbatim}
|
||||
@@ -0,0 +1,281 @@
|
||||
\chapter{Tokens}
|
||||
\minitoc \clearpage
|
||||
\section{Overview}
|
||||
Tokens are the basic building blocks of any parser or compiler.
|
||||
The task of a lexer (lexical analyzer, scanner) is to break up the
|
||||
input character stream into a stream of tokens. The §nextToken§
|
||||
method of a lexer passes the next token to the parser, or throws
|
||||
an exception if the next character on the input stream cannot be
|
||||
matched by any of the public lexer rules. The §nextToken§ method
|
||||
is always synthesized from the public lexer rules.
|
||||
|
||||
§Tokens§ in DPG are interface pointers. The interface type is §IdpgToken§,
|
||||
which defines the following properties:
|
||||
\begin{verbatim}
|
||||
IdpgToken = interface
|
||||
...
|
||||
property TokenText : string;
|
||||
property TokenType : integer;
|
||||
property TokenLine : integer;
|
||||
property TokenColumn : integer;
|
||||
...
|
||||
end;
|
||||
\end{verbatim}
|
||||
|
||||
where §TokenText§ is the text matched by the lexer; §TokenType§ is
|
||||
the type of token assigned to the token by DPG; §TokenLine§ is the
|
||||
line number where the token starts in the input stream;
|
||||
§TokenColumn§ is the column number.
|
||||
|
||||
Within parser rules, the input token can be accessed via this interface. To
|
||||
obtain the interface to the recognized token, the reference to the token must
|
||||
be prefixed by a label. For example,
|
||||
\begin{verbatim}
|
||||
...
|
||||
x:NUMBER
|
||||
{
|
||||
...
|
||||
LogMsg( 'Token: ' + x.TokenText );
|
||||
LogMsg( 'Type: ' + IntToStr(x.TokenType));
|
||||
...
|
||||
}
|
||||
...
|
||||
\end{verbatim}
|
||||
|
||||
Note: Variables for labels are always generated by DPG, so you should not define
|
||||
them in the §local{...}§ section of the rule.
|
||||
|
||||
\section{Defining tokens}
|
||||
In DPG, tokens can be defined in the lexer grammars. DPG always
|
||||
generates a token exchange file that describes all the token types
|
||||
matched by the lexer. This file can be imported in a parser
|
||||
grammar, so the lexer and parser have the same token types. Tokens
|
||||
can be defined either
|
||||
\begin{itemize}
|
||||
\item[-] via lexer rules, or
|
||||
\item[-] in the lexer's §tokens{...}§ section
|
||||
\end{itemize}
|
||||
|
||||
\subsubsection{Defining a token using a lexer rule}
|
||||
The commonest method of defining a token is using a lexer rule. In
|
||||
lexer grammars, every rule is associated with a §TokenType§ which
|
||||
is determined by DPG at compile time. This value is assigned to
|
||||
the result token by default, but it can be modified in the given
|
||||
rule if needed. This is used mostly in rules that need runtime
|
||||
information to set the type of the result token, but is otherwise
|
||||
uncommon.
|
||||
|
||||
There is one exception: when a rule must not generate a token at all.
|
||||
This is useful for defining comments or white-spaces for a grammar.
|
||||
Every lexer rule has a local variable called §_ttype§. If
|
||||
§_ttype§ has a value of §TT_SKIP§, then the rule won't generate any token. For
|
||||
example,
|
||||
\begin{verbatim}
|
||||
SLCOMMENT : "//" ( ~'\n')* '\n' { _ttype := TT_SKIP; } ;
|
||||
\end{verbatim}
|
||||
|
||||
The following examples are normal lexer rules, and they are typical in lexers:
|
||||
\begin{verbatim}
|
||||
LPAREN: '(';
|
||||
RPAREN: ')';
|
||||
DIGIT: '0'..'9';
|
||||
NUMBER: DIGIT (DIGIT)*;
|
||||
LETTER: 'a'..'z' | 'A'..'Z';
|
||||
ID: LETTER (LETTER | DIGIT | '_')*;
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{Defining a token in the tokens\{...\} section}
|
||||
|
||||
Lexer grammars may have a §tokens{...}§ section in the class
|
||||
declaration. Within this section you can define ``imaginary''
|
||||
tokens and string literals. These tokens are not ``real'' tokens
|
||||
and cannot be referenced in lexer rules. ``Imaginary'' tokens are
|
||||
helpful when a rule can recognize more than one type of token and
|
||||
defining rules for these tokens would be ambiguous. For example,
|
||||
|
||||
\begin{verbatim}
|
||||
tokens
|
||||
{
|
||||
STRING;
|
||||
CHAR;
|
||||
}
|
||||
// ========================================================
|
||||
// String or char
|
||||
// ========================================================
|
||||
STRING_OR_CHAR
|
||||
: '\'' (~'\'' | '\'' '\'')* '\''
|
||||
{
|
||||
if TokenText = '''''' then _ttype := TT_STRING
|
||||
else if TokenText = '''''''''' then _ttype := TT_CHAR
|
||||
else if Length( TokenText) > 3 then _ttype := TT_STRING
|
||||
else _ttype := TT_CHAR;
|
||||
}
|
||||
;
|
||||
\end{verbatim}
|
||||
The rule §STRING_OR_CHAR§ recognizes a Pascal character literal,
|
||||
and a Pascal string literal. The code in the action block decides
|
||||
which type of token must be created by the rule. Note: These
|
||||
tokens are ``imaginary'' tokens. Referencing them in lexer
|
||||
grammars is not possible, because they have no implementation.
|
||||
Within parser rules, the tokens §STRING§ and §CHAR§ can be
|
||||
referenced. But §STRING_OR_CHAR§ can't be referenced, because this
|
||||
rule creates a §STRING§ or a §CHAR§ token.
|
||||
|
||||
\paragraph{String literals} in the §tokens{...}§ section are useful when the language
|
||||
defines keywords. In this case you can list your language's keywords in this
|
||||
section. They will be put into the lexer's literals table. The lexer will consult
|
||||
this table in the following cases:
|
||||
\begin{itemize}
|
||||
\item[-] if the §testLiterals§ option for the lexer class is true, the lexer checks the
|
||||
literals table after each recognized token,
|
||||
\item[-] if the §testLiterals§ option for the lexer class is false, the
|
||||
check will be executed in rules that have this option set.
|
||||
\end{itemize}
|
||||
|
||||
If neither lexer rules nor lexer class have this option set, the
|
||||
lexer's literals table can be explicitly checked via the
|
||||
§TestLiterals§ method. The advantage of using string literals is
|
||||
that you can reference them in the parser as they are defined in
|
||||
the §tokens{...}§ section. For example,
|
||||
|
||||
\begin{verbatim}
|
||||
...
|
||||
lexer TmyLexer;
|
||||
options
|
||||
{
|
||||
testLiterals = true;
|
||||
}
|
||||
tokens
|
||||
{
|
||||
...
|
||||
"function";
|
||||
"procedure";
|
||||
...
|
||||
}
|
||||
...
|
||||
|
||||
parser TmyParser;
|
||||
rule1 : "function" ID SEMI;
|
||||
rule2 : "procedure" ID LPAREN args RPAREN SEMI;
|
||||
...
|
||||
\end{verbatim}
|
||||
In the above example we set the §testLiterals§ option to true for the lexer
|
||||
class. This is not recommended, because the lexer will check the literals table
|
||||
even if it found a non-string token. Instead, you have to check the table in a
|
||||
rule that can recognize these literals. For example:
|
||||
|
||||
\begin{verbatim}
|
||||
...
|
||||
lexer TmyLexer;
|
||||
...
|
||||
|
||||
ID
|
||||
options
|
||||
{
|
||||
testLiterals=true;
|
||||
}
|
||||
: ('a'..'z' | 'A'..'Z') ('a'..'z' | 'A'..'Z' | '0'..'9')*
|
||||
;
|
||||
\end{verbatim}
|
||||
Here the literals table will only be consulted in the rule §ID§.
|
||||
This will improve the lexer's speed. Of course you can set the
|
||||
§testLiterals§ option to true for as many rules as you want. All
|
||||
of them will check the literals table.
|
||||
|
||||
\paragraph{Note:} The §testLiterals§ option has no effect for lexer rules.
|
||||
|
||||
|
||||
\section{User defined token classes}
|
||||
|
||||
By default, DPG uses the §TdpgToken§ class to represent tokens.
|
||||
This class is derived from §TInterfacedObject§, and implements the
|
||||
§IdpgToken§ interface. This interface is used across the generated
|
||||
code. To define a new token class you must derive your new class
|
||||
from §TdpgToken§, implement your interface to access and
|
||||
manipulate your object, and finally tell the lexer that it must
|
||||
create your type of token object instead of the default
|
||||
§TdpgToken§. After that, within the rules you must obtain the
|
||||
interface of your class and use it. Let us have a more detailed
|
||||
look at this:
|
||||
|
||||
1. Create a token class:
|
||||
\begin{verbatim}
|
||||
ImyToken = interface( IdpgToken)
|
||||
[a guid definition]
|
||||
|
||||
function Get_MyString : string;
|
||||
procedure Set_MyString( AString: string);
|
||||
|
||||
property MyString : string read Get_MyString
|
||||
write Set_MyString;
|
||||
end;
|
||||
|
||||
TmyToken = class( TdpgToken,
|
||||
IdpgToken,
|
||||
ImyToken)
|
||||
protected
|
||||
fMyString : string;
|
||||
|
||||
function Get_MyString : string;
|
||||
procedure Set_MyString( AString: string);
|
||||
|
||||
public
|
||||
constructor Create( pType: integer;
|
||||
pText: string); override;
|
||||
|
||||
end;
|
||||
|
||||
constructor TmyToken.Create( pType: integer;
|
||||
pText: string);
|
||||
begin
|
||||
inherited;
|
||||
...
|
||||
your code here
|
||||
...
|
||||
end;
|
||||
|
||||
function TmyToken.Get_MyString: string;
|
||||
begin
|
||||
result := fMyString;
|
||||
end;
|
||||
|
||||
procedure TmyToken.Set_MyString( pString: string);
|
||||
begin
|
||||
fMyString := pString;
|
||||
end;
|
||||
\end{verbatim}
|
||||
|
||||
2. Tell the lexer that it must use our token class.
|
||||
\begin{verbatim}
|
||||
uses myToken;
|
||||
...
|
||||
myLexer.TokenClass := TmyToken;
|
||||
\end{verbatim}
|
||||
|
||||
3. Use it in a rule.
|
||||
\begin{verbatim}
|
||||
...
|
||||
parser TmyParser;
|
||||
|
||||
rule1
|
||||
:
|
||||
"procedure" x:ID (LPAREN params RPAREN)?
|
||||
{
|
||||
(x as ImyToken).MyString := 'procid';
|
||||
}
|
||||
;
|
||||
|
||||
\end{verbatim}
|
||||
|
||||
\paragraph{Note:} You must cast the returned interface to your token interface,
|
||||
because the §makeToken§ method of the lexer always returns an §IdpgToken§
|
||||
interface and the labels specified to obtain a reference to a token are always
|
||||
§IdpgToken§ references.
|
||||
|
||||
\paragraph{Note:} If you have to do special actions to initialize your token
|
||||
class, you must have the same constructor as defined in the
|
||||
example. The §makeToken§ method of the lexer always creates tokens
|
||||
with this constructor. If you have another kind of constructor for
|
||||
your token class, it won't be used by the lexer.
|
||||
|
||||
\clearpage
|
||||
@@ -0,0 +1,88 @@
|
||||
\NeedsTeXFormat{LaTeX2e}
|
||||
\ProvidesClass{zlbook}
|
||||
\LoadClass[a4paper,twoside,11pt]{book}
|
||||
\usepackage{times}
|
||||
\usepackage{chappg}
|
||||
\usepackage{here}
|
||||
\usepackage{alltt}
|
||||
\usepackage[bookman]{quotchap}
|
||||
\RequirePackage{shortvrb}
|
||||
\MakeShortVerb{\§}
|
||||
|
||||
\let\o@verbatim\verbatim
|
||||
|
||||
\def\verbatim{%
|
||||
\ifhmode\unskip\par\fi
|
||||
% \nopagebreak % Overridden by list penalty
|
||||
\ifx\@currsize\normalsize
|
||||
\small
|
||||
\fi
|
||||
\o@verbatim
|
||||
}
|
||||
|
||||
% No paragraph indentation, space between paragraphs
|
||||
\setlength{\parindent}{0pt}
|
||||
\setlength{\parskip}{\medskipamount}
|
||||
|
||||
\renewcommand{\thepage}{\thechapter\ - \arabic{page}}
|
||||
|
||||
\usepackage{fancyhdr}
|
||||
\pagestyle{fancy}
|
||||
%\addtolength{\headwidth}{0.5in}
|
||||
%\addtolength{\headwidth}{\marginparsep}
|
||||
%\addtolength{\headwidth}{\marginparwidth}
|
||||
\renewcommand{\chaptermark}[1]{\markboth{#1}{}}
|
||||
\renewcommand{\sectionmark}[1]{\markright{\thesection\ #1}}
|
||||
\fancyhf{}
|
||||
%\fancyhead[LE,RO]{\bfseries\thepage}
|
||||
%\fancyhead[RO]{\bfseries\rightmark}
|
||||
%\fancyhead[LE]{\bfseries\leftmark}
|
||||
\fancyfoot[RO]{\bfseries\thepage}
|
||||
\fancyfoot[LE]{\bfseries\thepage}
|
||||
|
||||
\fancyhead[RO]{\rightmark}
|
||||
\fancyhead[LE]{\leftmark}
|
||||
\fancyfoot[RO]{\thepage}
|
||||
\fancyfoot[LE]{\thepage}
|
||||
|
||||
\fancypagestyle{plain}{%
|
||||
\fancyhf{}
|
||||
\renewcommand{\headrulewidth}{0pt}
|
||||
\renewcommand{\footrulewidth}{0pt}
|
||||
}
|
||||
|
||||
\renewcommand{\headrulewidth}{0.4pt}
|
||||
\renewcommand{\footrulewidth}{0.4pt}
|
||||
%\renewcommand{\normalsize}{\fontsize{10pt}{12pt}\selectfont}
|
||||
|
||||
|
||||
\def\cleardoublepage{\clearpage\if@twoside \ifodd\c@page\else
|
||||
\hbox{}
|
||||
\vspace*{\fill}
|
||||
\begin{center}
|
||||
% This page intentionally left blank.
|
||||
\end{center}
|
||||
\vspace{\fill}
|
||||
\thispagestyle{empty}
|
||||
\newpage
|
||||
\if@twocolumn\hbox{}\newpage\fi\fi\fi}
|
||||
|
||||
%\addtolength{\textwidth}{1cm}
|
||||
|
||||
|
||||
\newenvironment{decl}[1][]%
|
||||
{\par\small\addvspace{4.5ex plus 1ex}%
|
||||
\vskip -\parskip
|
||||
\ifx\relax#1\relax
|
||||
\def\@decl@date{}%
|
||||
\else
|
||||
\def\@decl@date{\NEWfeature{#1}}%
|
||||
\fi
|
||||
\noindent\hspace{-\leftmargini}%
|
||||
\begin{tabular}{|l|}\hline\ignorespaces}%
|
||||
{\\\hline\end{tabular}\nobreak\@decl@date\par\nobreak
|
||||
\vspace{2.3ex}\vskip -\parskip}
|
||||
|
||||
\newcommand{\NEWfeature}[1]{%
|
||||
\hskip 1sp \marginpar{\small\sffamily\raggedright
|
||||
New feature\\#1}}
|
||||
Reference in New Issue
Block a user