# -*- Mode: Python; tab-width: 4 -*-

# A simple table-based lexer framework.

import string

class LexicalError (Exception):
	"""Raised when the machine reads a character no transition handles."""
	pass

#
# possible improvements:
#
#  verify completeness, i.e., verify that any character
#    delivered from any state is handled.
#
#  use tables for character classes
#

class lexer:
	"""A simple table-driven lexical analyzer.

	Recognized tokens are delivered to `token_consumer', a callable of
	two arguments: <token_class> (a string such as 'identifier' or
	'integer') and <token> (the token's text, or its converted value
	for numeric tokens).
	"""

	def __init__ (self, token_consumer, initial_state):
		# `token_consumer' is a method or function of two arguments,
		# <token_class> and <token>
		self.token_consumer = token_consumer
		self.clear_buffer()
		self.translate_machine (self.build_machine())
		self.initial_state = self.set_state (initial_state)

	def set_state (self, state_key):
		"Set the machine's state.  Return the index for that state"
		self.state = self.state_keys.index (state_key)
		return self.state

	# The machine created by 'build_machine()' is in a format
	# convenient for humans: it is a dictionary, using strings to name
	# states.  This method converts the machine to one based on
	# tables. [i.e., states are numbers, which are in turn indices
	# into the state machine table].

	def translate_machine (self, machine):
		"Build a table version of the human-readable machine."
		# sorted() copies the keys, so this also works with Python 3
		# dict views (which cannot be sorted in place).
		states = sorted (machine)
		skmap = {}
		for i in range(len(states)):
			skmap[states[i]] = i

		table = []
		for state in states:
			entry = []
			for char_class, action, next_state in machine[state]:

				if next_state not in skmap:
					raise ValueError ("undefined state: %s" % next_state)

				entry.append ((char_class, action, skmap[next_state]))
			table.append (entry)

		self.table_machine = table
		self.state_keys = states

	def clear_buffer (self):
		"Discard any partially-accumulated token text."
		self.buffer = []

	def next_char (self, ch):
		"Feed a single character <ch> through the machine."
		for char_class, action, next_state in self.table_machine[self.state]:
			if (char_class is None) or (ch in char_class):
				if isinstance (action, str):
					# A string action emits itself as a 'punctuation'
					# token and resets the machine.
					self.token_consumer ('punctuation', action)
					self.state = self.initial_state
					self.clear_buffer()
					return
				else:
					action (ch)

				self.state = next_state
				if (char_class is None) and (next_state == self.initial_state):
					# If we transition to the start state via wildcard,
					# then assume that we want to process the offending
					# character.
					self.next_char (ch)
				return

		raise LexicalError ("unhandled character: %s" % ch)

	def action_drop (self, ch):
		"Ignore <ch>."
		pass

	def action_shift (self, ch):
		"Append <ch> to the token being accumulated."
		self.buffer.append (ch)

	def action_emit (self, ch):
		"Default emitter.  Names the token class with the state key"
		self.token_consumer (
			self.state_keys[self.state],
			''.join (self.buffer)
		)
		self.clear_buffer()

	def action_error (self, ch):
		"Raise LexicalError, tagged with the current state key and <ch>."
		# [fixed] previously referenced an undefined local `state';
		# the current state index lives in self.state.
		raise LexicalError ((self.state_keys[self.state], ch))

	# ===========================================================================
	# Everything below this point is lexer-specific.  In other words,
	# derive from this class and override this stuff (or simply
	# rewrite it).
	#
	# The lexer below is a simple recognizer of identifiers, strings, and numbers.
	# ===========================================================================	

	def emit_integer (self, ch):
		"Emit the buffered digits as an 'integer' token (converted to int)."
		self.token_consumer (
			'integer',
			int (''.join (self.buffer))
			)
		self.clear_buffer()

	def emit_float (self, ch):
		"Emit the buffered characters as a 'float' token (converted to float)."
		self.token_consumer (
			'float',
			float (''.join (self.buffer))
			)
		self.clear_buffer()

	def build_machine (self):
		"Return the human-readable machine recognizing identifiers, strings, and numbers."

		# Character classes
		LETTER		= string.ascii_letters
		DIGIT		= string.digits
		WHITESPACE	= string.whitespace
		PUNCTUATION	= "()[]:=,"
		QUOTE		= "'\""


		# actions
		DROP  = self.action_drop
		SHIFT = self.action_shift
		ERROR = self.action_error
		EMIT  = self.action_emit

		# Machine format is a dictionary.  the keys are the state names
		# (though they don't _have_ to be strings).  Each state is associated
		# with a list of possible transitions.  Each transition is a tuple of
		# <characters>, <action>, <next_state>.
		#
		# If <characters> is 'None', it is a wildcard.
		#
		# <action> is usually a procedure of two arguments.  It is often a
		#   'bound method' (i.e., a method associated with a particular instance),
		#   though it doesn't have to be.
		#
		# As a convenience, if <action> is a string, then the string is emitted
		#   as a token of class 'punctuation'.  This simplifies the implementation
		#   of multi-character punctuations like '!=', cutting down on the number
		#   of states
		#
		# <next_state> defines the state to transition to.
		#

		machine = {
			'start' : (
				# characters,		action,					next state
				(WHITESPACE,		DROP,					'start'			),
				(LETTER,			SHIFT,					'identifier'	),
				(DIGIT,				SHIFT,					'integer'		),
				(QUOTE,				DROP,					'string'		),
				(PUNCTUATION,		SHIFT,					'punctuation'	),
				(None,				ERROR,					'start'			)
				),

			'identifier' : (
				(LETTER,			SHIFT,					'identifier'	),
				(DIGIT,				SHIFT,					'identifier'	),
				('_',				SHIFT,					'identifier'	),
				(None,				EMIT,					'start'			),
				),

			'integer' : (
				(DIGIT,				SHIFT,					'integer'		),
				('.',				SHIFT,					'float'			),
				('Ee',				SHIFT,					'float2'		),
				(None,				self.emit_integer,		'start'			),
				),

			'string' : (
				('\\',				DROP,					'string2'		),
				(QUOTE,				EMIT,					'start'			),
				(None,				SHIFT,					'string'		),
				),

			'string2' : (
				(None,				SHIFT,					'string'		),
				),

			'float' : (
				(DIGIT,				SHIFT,					'float'			),
				('Ee',				SHIFT,					'float2'		),
				(None,				self.emit_float,		'start'			),
				),
			'float2' : (
				(DIGIT,				SHIFT,					'float2'		),
				(None,				self.emit_float,		'start'			),
				),
			'punctuation' : (
				(None,				EMIT,					'start'			),
				)
		}
		return machine

def test ():
	"""Interactive smoke test: lex one line read from stdin.

	Prints each recognized token, and returns the lexer so the caller
	can inspect its final state.
	"""
	import sys

	def print_token (tclass, token):
		# sys.stdout.write is used instead of the Python-2-only
		# `print' statement, so this runs under Python 2 and 3 with
		# byte-identical output.
		sys.stdout.write ("class: %20s  token: %s\n" % (repr(tclass), repr(token)))

	l = lexer (print_token, 'start')

	sys.stdout.write ('input string: ')
	sys.stdout.flush()
	try:
		line = raw_input()
	except NameError:
		# Python 3 renamed raw_input to input
		line = input()
	for ch in line:
		l.next_char (ch)
	return l

