# ChartParser.py, version 1.0
# Released to the public domain 10 August 1999 by Oliver Steele.

"""Module ChartParser -- a simple chart parser

	>>> parse("John loves Mary")
	[S[NP[PN(John)], VP[VBAR[V(loves), NP[PN(Mary)]]]]]
	>>> parse("He saw the man with the binoculars")[0]
	S[NP[PRON(He)], VP[VBAR[VBAR[V(saw), NP[DET(the), NBAR[N(man)]]], PP[PREP(with), NP[DET(the), NBAR[N(binoculars)]]]]]]
	>>> parse("He saw the man with the binoculars")[1]
	S[NP[PRON(He)], VP[VBAR[V(saw), NP[DET(the), NBAR[NBAR[N(man)], PP[PREP(with), NP[DET(the), NBAR[N(binoculars)]]]]]]]]

__author__  = 'Oliver Steele', 'steele@cs.brandeis.edu'
__version__ = '1.0'

import string

# Simple tokenizer: split at whitespace, and separate out leading quotes and
# trailing quotes and punctuation.
def tokenize(str):
	tokens = string.split(str, ' ')
	i = 0
	while i < len(tokens):
		token = tokens[i]
		if len(token) > 1 and token[0] in '\'"':
			tokens[i:i+1] = [token[0], token[1:]]
		elif len(token) > 1 and token[-1] in '.?,!;:\'"':
			tokens[i:i+1] = [token[:-1], token[-1]]
			i = i + 1
	return tokens

# Define some basic syntactic categories, as variables bound to their own names.
for name in string.split('V VP VBAR N NBAR NP PRON PN S DET PREP PP'):
	globals()[name] = name
del name

# Just enough of a tag dictionary to parse the test sentence.
TAGS = {'John': PN, 'loves': V, 'Mary': PN, 'he': PRON, 'saw': V, 'the': DET, 'man': N,
			'with': PREP, 'binoculars': N}

# Simple tagger.
def tags(token):
	return [TAGS.get(token) or TAGS.get(string.lower(token))]

class Rule:
	"""A rule represents a phrase-structure production rule of the form:
		A => B C
	where A expands to B followed by C (and therefore, B followed by
	C can be composed into an A).  In this example, the left-hand-side
	(lhs) is A, and the right-hand-side (rhs) is [B, C]."""
	def __init__(self, spec):
		"""spec is of the form [A, B, C], where A is the lhs and the remaining
		items are the rhs.  In other words, [A, B, C] represents A => B C"""
		self.lhs = spec[0]
		self.rhs = spec[1:]
	def __repr__(self):
		return '%s => %s' % (string.join(self.lhs, ' '), self.rhs)
	def matches(self, category):
		return self.rhs[0] == category

# A toy grammar:
RULES = map(Rule, [
		[VP, VBAR],
		[VBAR, V, NP],
		[NBAR, N],
		[NP, DET, NBAR],
		[NP, PN],
		[NP, PRON],
		[PP, PREP, NP],
		[S, NP, VP]])

class Constituent:
	def __init__(self, type, children, left, right):
		self.type = type
		self.children = children
		self.left = left
		self.right = right
	def __repr__(self):
		return '%s%s' % (self.type, `self.children`)
	def tree(self):
		return [self.type] + self.children
	def terminals(self):
		return(reduce(lambda a,b:a+b, map(lambda c:c.terminals(), self.children)))

class PreTerminal(Constituent):
	def __init__(self, tag, token, left):
		Constituent.__init__(self, tag, None, left, left+1)
		self.token = token
	def __repr__(self):
		return '%s(%s)' % (self.type, self.token)
	def tree(self):
		return self
	def terminals(self):
		return [self]

class Edge:
	def __init__(self, rule, left, right=None, index=0, children=None):
		self.rule = rule
		self.left = left
		self.right = right or left
		self.index = index
		self.children = children or []
	def __repr__(self):
		str = []
		for i in range(len(self.rule.rhs)):
			if i == self.index:
			str.append(self.rule.rhs[i] + ' ')
		return '<%s => %s at %s:%s>' % (self.rule.lhs, string.join(str, '')[:-1], self.left, self.right)
	def advanceOver(self, chart, constituent):
		rule = self.rule
		if self.right == constituent.left and rule.rhs[self.index] == constituent.type:
			if chart.TRACE:
				print 'advancing', self, 'over', constituent
			chart.addEdge(Edge(rule, self.left, constituent.right, self.index + 1, self.children + [constituent]))
	def active(self):
		return self.index < len(self.rule.rhs)

class Chart:
	TRACE = 0
	def __init__(self, rules=RULES):
		self.rules = rules
	def init(self, n):
		self.edges = map(lambda n:[], range(n))
		self.constituents = map(lambda n:[], range(n))
	def parse(self, string):
		tokens = tokenize(string)
		for i in range(len(tokens)):
				self.addToken(tokens[i], i)
		return self.spans()
	def spans(self):
		return filter(lambda c, len=len(self.edges):c.right == len, self.constituents[0])
	def addToken(self, token, position):
		for tag in tags(token):
			self.addConstituent(PreTerminal(tag, token, position))
	def addConstituent(self, constituent):
		if self.TRACE:
			print 'adding', constituent,
			print 'at', constituent.left, '[]', constituent.left
		for edge in self.edges[constituent.left]:
			edge.advanceOver(self, constituent)
		for rule in self.rules:
			if rule.matches(constituent.type):
				Edge(rule, constituent.left).advanceOver(self, constituent)
	def addEdge(self, edge):
		if self.TRACE:
			print 'adding', edge
		if edge.active():
			if edge.right < len(self.edges):
				for constituent in self.constituents[edge.right]:
					edge.advanceOver(self, constituent)
			self.addConstituent(Constituent(edge.rule.lhs, edge.children, edge.left, edge.right))

def parse(string):
	return Chart().parse(string)

def _test():
	import doctest, ChartParser
	return doctest.testmod(ChartParser)