CSci 450/503: Programming Languages

Expression Language Parsing



H. Conrad Cunningham

29 November 2017

Copyright (C) 2017, H. Conrad Cunningham

Acknowledgements: These slides accompany Chapter 11 “Expression Language Parsing” of the in-work textbook “Introduction to Functional Programming Using Haskell”.

Advisory: The HTML version of this document may require use of a browser that supports the display of MathML. A good choice as of November 2017 is a recent version of Firefox from Mozilla.

Lecture Goals

Expression Language Modules

Expression Language (Ch. 10)

Expression Language (Ch. 11)

Parsing

Lexical Analysis

Informally called lexer, scanner, or tokenizer

Phases (often combined)

  1. Scanner – breaks character stream into lexemes (words)

  2. Evaluator – determines syntactic category of lexeme (not Evaluator module)

Tokenization

Infix Grammar (Ch. 10)

    -- Context-free grammar (CFG)
    <expression> ::= <term> { <addop> <term> }
    <term>       ::= <factor> { <mulop> <factor> }
    <factor>     ::= <var> | <val>
                   | '(' <expression> ')'
    <val>        ::= [ '-' ] <unsigned>
    <var>        ::= <id>
    <addop>      ::=  '+'  |  '-' 
    <mulop>      ::=  '*'  |  '/' 
    -- Regular grammar (same as prefix grammar)
    <id>         ::=  <firstid>  |  <firstid> <idseq>
    <idseq>      ::=  <restid>   |  <restid> <idseq>
    <firstid>    ::=  <alpha>    | '_'
    <restid>     ::=  <alpha>    | '_'  | <digit>
    <unsigned>   ::=  <digit>    |  <digit> <unsigned> 
    <digit>      ::=  any numeric character
    <alpha>      ::=  any alphabetic character 

Prefix Grammar (Ch. 10)

    -- Context-free Grammar (CFG)
    <expression> ::=  <var> | <val> | <operexpr>
    <var>        ::=  <id>
    <val>        ::=  [ "-" ] <unsigned> 
    <operexpr>   ::=  '(' <operator> <operandseq> ')'
    <operandseq> ::=  { <expression> }
    <operator>   ::=  '+'  |  '*'  |  '-'  |  '/' | ...
    -- Reglar grammar (same as infix grammar)
    <id>         ::=  <firstid>  |  <firstid> <idseq>
    <idseq>      ::=  <restid>   |  <restid> <idseq>
    <firstid>    ::=  <alpha>    | '_'
    <restid>     ::=  <alpha>    | '_'  | <digit>
    <unsigned>   ::=  <digit>    |  <digit> <unsigned> 
    <digit>      ::=  any numeric character
    <alpha>      ::=  any alphabetic character 

Lexical Analyzer

Data Type Token

import Values ( NumType, Name, toNumType )
-- e.g., NumType = Int , Name = String
data Token = TokLeft -- left parenthesis
| TokRight -- right parenthesis
| TokNum NumType -- unsigned integer literal
| TokId Name -- names of variables, etc.
| TokOp Name -- names of primitive functions
| TokKey Name -- keywords (no use currently)
| TokOther String -- other characters
deriving (Show, Eq)

Function lexx

import Data.Char ( isSpace, isDigit, isAlpha, isAlphaNum )
lexx :: String -> [Token] -- scan, most of lexeme eval
lexx [] = []
lexx xs@(x:xs')
| isSpace x = lexx xs'
| x == ';' = lexx (dropWhile (/='\n') xs')
| x == '(' = TokLeft : lexx xs'
| x == ')' = TokRight : lexx xs'
| isDigit x = let (num,rest) = span isDigit xs
in (TokNum (convertNumType num)) : lexx rest
| isFirstId x = let (id,rest) = span isRestId xs
in (TokId id) : lexx rest
| isOpChar x = let (op,rest) = span isOpChar xs
in (TokOp op) : lexx rest
| otherwise = (TokOther [x]) : lexx xs'
where
isFirstId c = isAlpha c || c == '_'
isRestId c = isAlphaNum c || c == '_'
isOpChar c = elem c opchars
opchars = "+-*/~<=>!&|@#$%^?:" -- not " ' ` ( ) [ ] { } , . ;

Function lexer

lexer :: String -> [Token] -- rest of lexeme eval
lexer xs = markSpecials (lexx xs)
markSpecials :: [Token] -> [Token]
markSpecials ts = map xformTok ts
xformTok :: Token -> Token
xformTok t@(TokId id) -- is keyword?
| elem id keywords = TokOp id
| otherwise = t
xformTok t@(TokOp op) -- is valid operator?
| elem op primitives = t
| otherwise = TokOther op
xformTok t = t
keywords = [] -- none defined currently
primitives = ["+","-","*","/"]

Recursive Descent Parsing

Example CFG

    S ::= A | B  -- (1) alternatives
    A ::= C D    -- (2) sequencing
    B ::= { E }  -- (3) zero or more occurrences of E 
    C ::= [ F ]  -- (4) zero or one occurrence of F 
    D ::= '1' | '@' S 
    E ::= '3'
    F ::= '2'    -- (5) base case

Recursive Descent Recognizer

(1) Alternatives: S ::= A | B

parseS :: String -> (Bool,String) -- A | B
parseS xs =
case parseA xs of -- try A
(True, ys) -> (True, ys) -- A succeeds
(False, _ ) ->
case parseB xs of -- else try B
(True, ys) -> (True, ys) -- B succeeds
(False, _) -> (False, xs) -- both A & B fail
-- nest more alternatives?

(2) Sequencing: A ::= C D

parseA :: String -> (Bool,String) -- C D
parseA xs =
case parseC xs of -- try C
(True, ys) ->
case parseD ys of -- then try D
(True, zs) -> (True, zs) -- C D succeeds
(False, _) -> (False, xs) -- D fails
-- nest more sequentially?
(False, _) -> (False,xs) -- C fails (ERR in handout)

(3) Repetition: B ::= { E }

parseB :: String -> (Bool,String) -- { E }
parseB xs =
case parseE xs of -- try E
(True, ys) -> parseB ys -- one E, try again
(False, _) -> (True,xs) -- stop, succeeds

(4) Optional Element: C ::= [ F ]

parseC :: String -> (Bool,String) -- [ F ]
parseC xs =
case parseF xs of -- try F
(True, ys) -> (True,ys)
(False, _ ) -> (True,xs)

(5) Base Case: E ::= '3'

parseE :: String -> (Bool,String)
parseE (x:xs') = (x == '3', xs')
parseE xs = (False, xs )

Refactor: D ::= '1' | '@' S

parseD :: String -> (Bool,String) -- '1' | '@' S
parseD ('1':xs) = (True, xs) -- try '1' (shortcut)
parseD ('@':xs) = -- try '@' on DS (shortcut)
case parseS xs of -- try S
(True, ys) -> (True, ys)
(False, _ ) -> (False,xs) --
parseD xs = (False, xs)

Outer Level Recognizer

parse :: String -> Bool
parse xs =
case parseS xs of
(True, []) -> True
(_, _ ) -> False

Expression Language Prefix CFG

    <expression> ::=  <var> | <val> | <operexpr>
    <var>        ::=  <id>
    <val>        ::=  [ "-" ] <unsigned> 
    <operexpr>   ::=  '(' <operator> <operandseq> ')'
    <operandseq> ::=  { <expression> }
    <operator>   ::=  '+'  |  '*'  |  '-'  |  '/' | ...

Parse <expression>

type ParErr = String
-- <expression> ::= <var> | <val> | <operexpr>
parseExpression :: [Token] -> (Either ParErr Expr, [Token])
parseExpression xs = -- use template (1)
case parseVar xs of
r@(Right _, _) -> r -- <var>
_ ->
case parseVal xs of
r@(Right _, _) -> r -- <val>
_ ->
case parseOperExpr xs of
r@(Right _, _) -> r -- <operexpr>
(Left m, ts) -> (missingExpr m ts, ts)
missingExpr m ts =
Left ("Missing expression at " ++ (showTokens (pref ts))
++ "..\n..Nested error { " ++ m ++ " }")

Outer-Level Parser

parse :: String -> Either ParErr Expr
parse xs =
case lexer xs of -- do lexical analysis
[] -> incompleteExpr xs
ts ->
case parseExpression ts of
(ex@(Right _), []) -> ex
(ex@(Left _), _ ) -> ex
(ex, ss) -> extraAtEnd ex ss
incompleteExpr xs = Left ("Incomplete expression: " ++ xs)
extraAtEnd ex xs =
Left ("Nonspace token(s) \"" ++ (showTokens xs) ++
"\" at end of the expression \"" ++ (show ex) ++ "\"")

Parse <var>

-- <var> ::= <id> -- use template (5), constructs Var node
parseVar :: [Token] -> (Either ParErr Expr, [Token])
parseVar ((TokId id):ts) = (Right (Var id),ts) -- Var node
parseVar ts = (missingVar ts, ts)
missingVar ts =
Left ("Missing variable at " ++ (showTokens (pref ts)))

Parse <val>

-- <val> ::= [ '-' ] <unsigned>
-- REFACTOR AS
-- <val> ::= <optminus> <unsigned> -- template (2)
-- <optminus> ::= [ '-' ] -- template (4)
-- Implement as special base case, constructs Val
parseVal :: [Token] -> (Either ParErr Expr, [Token]) -- Val node
parseVal ((TokNum i):ts) = (Right (Val i), ts)
parseVal ((TokOp "-"):(TokNum i):ts) = (Right (Val (-i)), ts)
parseVal ts = (missingVal ts, ts)
missingVal ts =
Left ("Missing value at " ++ (showTokens (pref ts)))

Parse <operexpr>

-- <operexpr> ::= "(" <operator> <operandseq> ")"
-- use modified version of template (2)
parseOperExpr :: [Token] -> (Either ErrMsg Expr, [Token])
parseOperExpr xs@(TokLeft:(TokOp op):ys) = -- ( <operator>
case parseOperandSeq ys of -- <operandseq>
(args, zs) -> case zs of -- )
(TokRight:zs') -> (makeExpr op args, zs') -- **
zs' -> (missingRParen zs, xs)
-- ill-formed <operexpr>s
parseOperExpr (TokLeft:ts) = (missingOp ts, ts)
parseOperExpr (TokRight:ts) = (invalidOpExpr ")", ts)
parseOperExpr ((TokOther s):ts) = (invalidOpExpr s, ts)
parseOperExpr ((TokOp op):ts) = (invalidOpExpr op, ts)
parseOperExpr ((TokId s):ts) = (invalidOpExpr s, ts)
parseOperExpr ((TokNum i):ts) = (invalidOpExpr (show i), ts)
parseOperExpr [] = (incompleteExpr, [])
missingRParen ts = Left ("Missing `)` at " ++ (show (take 3 ts)))
missingOp ts = Left ("Missing operator at " ++ (show (take 3 ts)))
invalidOpExpr s = Left ("Invalid operation expression beginning with " ++ s)
incompleteExpr = Left "Incomplete expression"

Parse <operandseq>

-- <operandseq> ::= { <expression> }
-- uses template (3), has special type signature
parseOperandSeq :: [Token] -> ([Expr],[Token])
parseOperandSeq xs =
case parseExpression xs of
(Left _, _ ) -> ([],xs)
(Right ex, ys) ->
let (exs,zs) = parseOperandSeq ys
in (ex:exs,zs)

AST Construction (makeExpr)

import Data.Maybe
makeExpr :: String -> [Expr] -> Either ErrMsg Expr
makeExpr op exs =
case arity op of
0 -> opCons0 op exs -- not implemented
1 -> opCons1 op exs
2 -> opCons2 op exs
3 -> opCons3 op exs
4 -> opCons4 op exs -- not implemented
5 -> opCons5 op exs -- not implemented
_ -> opConsX op exs -- not implemented
arityMap = [ ("+",2), ("-",2), ("*",2), ("/",2) ]
-- add (operator,arity) pairs as needed
arity :: String -> Int
arity op = fromMaybe (-1) (lookup op arityMap)

AST Construction (opCons2)

assocOpCons2 = [ ("+",Add), ("-",Sub), ("*",Mul), ("/",Div) ]
-- add new pairs as needed
opCons2 :: String -> [Expr] -> Either ParErr Expr
opCons2 op exs =
case length exs of
2 -> case lookup op assocOpCons2 of -- construct op
Just c -> Right (c (exs!!0) (exs!!1))
Nothing -> invalidOp op
n -> arityErr op n
invalidOp op =` Left ("Invalid operator '" ++ op ++ "'")
arityErr op n =
Left ("Operator '" ++ op ++ "' incorrectly called with "
++ (show n) ++ " operand(s)")

Expression Language Infix CFG

    <expression> ::= <term> { <addop> <term> }
    <term>       ::= <factor> { <mulop> <factor> }
    <factor>     ::= <var> | <val>
                   | '(' <expression> ')'
    <val>        ::= [ '-' ] <unsigned>
    <var>        ::= <id>
    <addop>      ::=  '+'  |  '-' 
    <mulop>      ::=  '*'  |  '/' 

Other Parsing Approaches

Key Ideas