Saturday, April 15, 2006

Packrat parsers

Recently I've been reading up on the packrat parsing algorithm. I think I'm going to attempt to use it to create a parser-generator in Ruby. I've already written a very retarded recursive-descent-oh-dear-god-don't-make-me-backtrack-I-can't-do-it parser-generator in ruby. I'll stick the source code at the end of this post. I think it's time to up the ante, as it were. I figure it's a hell of a lot easier to do a packrat parser generator than an LALR(1) or similiar. Also, I think I will allow it to either use parser combinators or a yacc-esque .y file. Of course it will use PEGs instead of (E)BNF, and I'm going to try and make it self-hosting. We'll see.


% cat parser.rb
module Parser
NoMatch = Object.new
BadMatch = Object.new
def Lit(*args)
Parser::Lit(*args)
end
def self.Lit(val)
LiteralMatcher.new(val)
end
def self.included(mod)
s = self
(class << mod; self; end).module_eval do
define_method(:Lit) do |*args|
s.Lit(*args)
end
end
end


mod = self
(class << BadMatch; self; end).class_eval do
define_method(:to_s) { "#{mod}::BadMatch" }
end
(class << NoMatch; self; end).class_eval do
define_method(:to_s) { "#{mod}::NoMatch" }
end


module ParserHandlers
attr_accessor :parse_succeeded_proc, :parse_failed_proc
def on_parse(&block)
self.parse_succeeded_proc = block
end

def on_error(&block)
self.parse_failed_proc = block
end
def |(other_parser)
Or.new(self, other_parser)
end
def >>(other_parser)
Sequence.new(self, other_parser)
end
def _?
ZeroOrOne.new(self)
end
private
def post_parse(parse_results, current_token)
case parse_reslts
when NoMatch, BadMatch
return parse_failed_proc.call(current_token) if parse_failed_proc
parse_results
else
return parse_succeeded_proc.call(parse_results) if parse_succeeded_proc
parse_results
end
end


end
class EndOfStream
include ParserHandlers
def initialize()
end

def parse(token_stream)
if token_stream.end_of_stream? or token_stream.current == EOS
post_parse(true, nil)
else
post_parse(NoMatch, token_stream.current)
end
end
end
class LiteralMatcher
include ParserHandlers
def initialize(const)
@const = const
end

def parse(token_stream)
token = token_stream.current
if @const === token
token_stream.advance
post_parse(token, token)
else
post_parse(NoMatch, token)
end
end
end


class Or
include ParserHandlers
def initialize(first_choice, second_choice, *remaining_choices)
@parser_choices = [first_choice, second_choice, *remaining_choices]
end

def parse(token_stream)
result = NoMatch
@parser_choices.each do |parser|
choice_result = parser.parse(token_stream)
if choice_result == BadMatch or choice_result != NoMatch
result = choice_result
break
end
end
post_parse(result, token_stream.current)
end
def []=(index, value)
@parser_choices[index] = value
end
def |(additional_choice)
@parser_choices << additional_choice
self
end
end

class Sequence
include ParserHandlers
def initialize(parser_first, parser_second, *rest)
@parsers = [parser_first, parser_second, *rest]
end

def parse(token_stream)
first_parser, *remaining_parsers = @parsers
results = []
first_result = first_parser.parse(token_stream)
if first_result != NoMatch
results << first_result
else
return post_parse(NoMatch, token_stream.current)
end

remaining_parsers.each do |parser|
result = parser.parse(token_stream)
if result == NoMatch or result == BadMatch
return post_parse(BadMatch, token_stream.current)
end
results << result
end
post_parse(results, token_stream.current)
end
def []=(index, value)
@parsers[index] = value
end
def >>(other_parser)
@parsers << other_parser
self
end
end

class ZeroOrOne
include ParserHandlers
def initialize(parser)
@parser = parser
end
def parse(token_stream)
result = @parser.parse(token_stream)
if result == NoMatch
post_parse(true, token_stream.current)
else
post_parse(result, token_stream.current)
end
end
end

class ZeroOrMany
include ParserHandlers
def initialize(parser)
@parser = parser
end

def parse(token_stream)
results = []
loop do
result = @parser.parse(token_stream)
return post_parse(BadMatch, token_stream.current) if result == BadMatch
break if result == NoMatch
results << result
end
post_parse(results, nil)
end
end


class ArrayTokenStream
attr_accessor :array
def initialize(source_array)
@array = source_array.dup
end

def current
return EOS if end_of_stream?
array.first
end

def advance
fail "Can't advance past end of stream" if end_of_stream?
array.shift
end

def end_of_stream?
array.empty?
end
end
EOS = Object.new # End of stream constant
def EOS.to_s
"Parser::EOS"
end
EOSMatcher = EndOfStream.new # Matches the end of a stream
end

The above expects an argument to #parse that responds to #advance and #current and #end_of_stream? as demonstrated by the ArrayTokenStream class.