# -*- Mode: Python; tab-width: 4 -*-

# state-machine lexer for python

# TODO
#   different types of numbers:
#     octal, hex, long, imaginary...
#   string joining [parser?]

# Notes:
# no <newlines> generated while in brackets.
# blank lines are ignored. [thus they do not affect the indent/dedent stack]
# also, they do not generate a newline token.
#
# tabs are not a fixed size: they represent from 1 to 8 characters.

import string

import lexer

# tab stops: a tab advances the column to the next multiple of TAB_WIDTH
TAB_WIDTH = 8

class indent_stack:
	# Bookkeeping for INDENT/DEDENT generation: each push records the
	# width (in columns) of one indentation step; pops must unwind a
	# whole number of those steps exactly.

	def __init__ (self):
		self.stack = []
		self.column = 0

	def push (self, num):
		# record an indent of <num> additional columns
		self.stack.append (num)
		self.column = self.column + num

	def pop (self, num):
		# Unwind <num> columns of indentation.  <num> must consume one
		# or more stack entries exactly; return how many were consumed,
		# or -1 when the dedent does not line up with any indent level.
		self.column = self.column - num
		count = 0
		while num:
			width = self.stack[-1]
			if width > num:
				# partial dedent: inconsistent with the indent history
				return -1
			self.stack.pop ()
			count = count + 1
			num = num - width
		return count

	def depth (self):
		# number of indentation levels currently open
		return len(self.stack)

	def current_column (self):
		# total indentation, in columns
		return self.column

class python_lexer (lexer.lexer):
	"""State-machine lexer for python source.

	Feeds (token_class, token_data) pairs to <token_consumer>.
	Tracks bracket nesting so indentation inside (), [] and {} is
	ignored, and uses an indent_stack to emit indent/dedent tokens.
	"""

	def __init__ (self, token_consumer, initial_state):
		lexer.lexer.__init__ (self, token_consumer, initial_state)
		# depth of currently-open (, [ and { brackets
		self.bracket = 0
		self.indent_stack = indent_stack()
		# escape-sequence accumulators.  these must be per-instance:
		# as class attributes they would be shared (and mutated) by
		# every lexer instance.
		self.current_oct_esc = []
		self.current_hex_esc = []

	def emit_integer (self, ch):
		# emit the collected digits as an integer token
		self.token_consumer (
			'integer',
			string.atoi (string.join (self.buffer, ''))
			)
		self.clear_buffer()

	def emit_long (self, ch):
		# emit the collected digits as a long token.
		# <ch> is the 'L'/'l' suffix, which is not in the buffer.
		self.token_consumer (
			'long',
			string.atol (string.join (self.buffer, ''))
			)
		self.clear_buffer()

	def emit_float (self, ch):
		# emit the collected characters as a float token
		self.token_consumer (
			'float',
			string.atof (string.join (self.buffer, ''))
			)
		self.clear_buffer()

	def emit_punctuation (self, ch):
		# Note: <ch> is _not_ the punctuation character, but rather
		# the character taking us out of the punctuation state.
		punch = self.buffer[-1]

		lexer.lexer.action_emit (self, ch)

		# keep track of bracketing for indent/dedent purposes;
		# indentation is ignored when inside multi-line bracketed
		# expressions.
		if punch in "([{":
			self.bracket = self.bracket + 1
		elif punch in ")]}":
			self.bracket = self.bracket - 1

	# Note the trailing space inside each fragment: adjacent string
	# literals are concatenated with no separator, so without them
	# "exec" and "finally" would fuse into the bogus keyword
	# "execfinally" (and "pass"/"print" into "passprint").
	keywords = string.split (
		"and break class continue def del elif else except exec "
		"finally for from global if import in is lambda not or pass "
		"print raise return try while"
		)

	def emit_name (self, ch):
		# emit an identifier, reclassified as a keyword when it
		# matches the keyword list
		ident = string.join (self.buffer, '')
		if ident in self.keywords:
			char_class = 'keyword'
		else:
			char_class = 'name'
		self.token_consumer (char_class, ident)
		self.clear_buffer()

	def emit_string (self, ch):
		# emit a completed single- or double-quoted string
		self.token_consumer ('string', string.join (self.buffer, ''))
		self.clear_buffer()

	def emit_triple_quote_string (self, ch):
		# toss the last three characters, they are quote
		# marks collected while in triple-quote mode.
		self.token_consumer ('string', string.join (self.buffer[:-3], ''))
		self.clear_buffer()

	def action_unrecog (self, ch):
		# We have seen an unrecognized escape sequence
		# in a string:  keep the backslash and the strange
		# character.
		self.buffer = self.buffer + ['\\',ch]

	def action_front (self, ch):
		# whitespace at the beginning of a line is 'frontspace'.
		# use it to calculate the emission of indent/dedent tokens.

		if ch == '#':
			# ignore frontspace on comment lines
			self.clear_buffer()
			return

		if self.bracket:
			# if we're in the middle of a bracketed expression,
			# ignore the frontspace.
			self.clear_buffer()
			return

		# is the line blank?
		if not self.buffer and ch == '\n':
			# ignore it; blank lines do not affect the indent stack
			return
		else:
			# generate a newline token
			self.token_consumer ('punctuation', '<newline>')

		# calculate the indent column, expanding tabs to the next
		# multiple of TAB_WIDTH
		column = 0
		for space in self.buffer:
			if space == ' ':
				column = column + 1
			elif space == '\t':
				column = (column/TAB_WIDTH + 1) * TAB_WIDTH
			else:
				raise lexer.LexicalError ("unknown whitespace character")

		ci = self.indent_stack.current_column()
		if column > ci:
			self.token_consumer ('indent', None)
			self.indent_stack.push (column - ci)
		elif column < ci:
			# check for consistent dedent
			dedents = self.indent_stack.pop (ci - column)
			if dedents == -1:
				raise lexer.LexicalError ("inconsistent dedent")
			else:
				for i in range(dedents):
					self.token_consumer ('dedent', None)

		self.clear_buffer()


	# ======================================================================
	# actions for octal and hex escape sequences in strings
	# [the accumulators themselves are created per-instance in __init__]

	def action_oct_esc (self, ch):
		# collect one octal digit of a \ooo escape
		self.current_oct_esc.append (ch)
		if len(self.current_oct_esc) > 3:
			raise lexer.LexicalError ('octal escape sequence too long (must be 3 chars or less)')

	def emit_oct_esc (self, ch):
		# convert the collected digits to a character.  mask to the
		# lower 8 bits so that e.g. \777 (== 511) wraps to '\xff'
		# instead of crashing chr().
		ascii = chr (
			string.atoi (
				string.join (self.current_oct_esc, ''),
				8
				) & 0xff
			)
		self.buffer.append (ascii)
		self.buffer.append (ch)
		self.current_oct_esc = []

	def action_hex_esc (self, ch):
		# collect one hex digit of a \xhh escape.  [the leading 'x'
		# is dropped by the machine and never enters this buffer]
		self.current_hex_esc.append (ch)
		if len(self.current_hex_esc) > 3:
			raise lexer.LexicalError ('hex escape sequence too long (must be 3 chars or less)')

	def emit_hex_esc (self, ch):
		# pay attention only to the lower 8 bits
		if not self.current_hex_esc:
			# "\x" followed by no hex digits at all
			raise lexer.LexicalError ('hex escape sequence needs at least one digit')
		ascii = chr (
			string.atoi (
				string.join (self.current_hex_esc[-2:], ''),
				16
				)
			)
		self.buffer.append (ascii)
		self.buffer.append (ch)
		self.current_hex_esc = []

	def action_ascii_esc (self, ch):
		# translate a single-letter ANSI C escape to its character
		self.buffer.append (
			{'a':'\a', 'b':'\b',
			 'f':'\f', 'n':'\n',
			 'r':'\r', 't':'\t',
			 'v':'\v'}[ch]
			)

	def build_machine (self):
		# Build and return the transition table: a dict mapping state
		# name to a tuple of (characters, action, next-state) rows.
		# A <characters> of None is the default (catch-all) row.

		# Character classes
		LETTER		= string.letters
		DIGIT		= string.digits
		WHITESPACE	= ' \t'
		NEWLINE		= '\n\r'
		# characters that start a two-char punctuation
		TWOPUNCT	= '<>!*='
		# what's left
		ONEPUNCT	= '.:,(){}[]+-/%^&'
		HEXDIGIT	= string.hexdigits
		OCTDIGIT	= string.octdigits

		# actions
		DROP 	= self.action_drop		# drop a character
		SHIFT	= self.action_shift		# collect a character
		ERROR	= self.action_error
		EMIT 	= self.action_emit
		FRONT	= self.action_front
		UNRECOG = self.action_unrecog
		# special emitters
		PUNCT	= self.emit_punctuation
		IDENT	= self.emit_name
		FLOAT	= self.emit_float
		INT		= self.emit_integer
		LONG	= self.emit_long
		STR		= self.emit_string
		TQSTR	= self.emit_triple_quote_string
		HEX_ESC = self.action_hex_esc
		OCT_ESC = self.action_oct_esc
		EMIT_HEX_ESC = self.emit_hex_esc
		EMIT_OCT_ESC = self.emit_oct_esc
		ASCII   = self.action_ascii_esc

		machine = {
			'start' : (
				# characters,		action,			next state
				(WHITESPACE,		DROP,			'whitespace'	),
				('\\',				DROP,			'linejoin'		),
				(NEWLINE,			DROP,			'frontspace'	),
				(LETTER,			SHIFT,			'name'			),
				("_",				SHIFT,			'name'			),
				(DIGIT,				SHIFT,			'integer'		),
				('"',				DROP,			'dstring'		),
				("'",				DROP,			'sstring'		),
				("<",				SHIFT,			'lessthan'		),
				(">",				SHIFT,			'greaterthan'	),
				("!",				SHIFT,			'bang'			),
				("*",				SHIFT,			'splat'			),
				("=",				SHIFT,			'equal'			),
				(ONEPUNCT,			SHIFT,			'punctuation'	),
				("#",				DROP,			'comment'		),
				(None,				ERROR,			'start'			)
				),

			# whitespace at the front of a line
			'frontspace' : (
				(WHITESPACE,		SHIFT,			'frontspace'	),
				(None,				FRONT,			'start'			),
				),

			# whitespace elsewhere
			'whitespace' : (
				(None,				DROP,			'start'			),
				),

			'linejoin'	: (
				(NEWLINE,			DROP,			'start'			),
				(None,				ERROR,			'start'			),
				),

			'comment' : (
				(NEWLINE,			DROP,			'frontspace'	),
				(None,				DROP,			'comment'		),
				),

			# also recognizes keywords
			'name' : (
				(LETTER,			SHIFT,			'name'			),
				(DIGIT,				SHIFT,			'name'			),
				('_',				SHIFT,			'name'			),
				(None,				IDENT,			'start'			),
				),

			# numeric literals

			'integer' : (
				(DIGIT,				SHIFT,			'integer'		),
				('Ll',				LONG,			'start'			),
				('.',				SHIFT,			'float'			),
				('Ee',				SHIFT,			'float2'		),
				(None,				INT,			'start'			),
				),

			'float' : (
				(DIGIT,				SHIFT,			'float'			),
				('Ee',				SHIFT,			'float2'		),
				(None,				FLOAT,			'start'			),
				),

			# just seen the exponent marker: an optional sign may
			# appear exactly once, before the first exponent digit
			'float2' : (
				('+-',				SHIFT,			'float3'		),
				(DIGIT,				SHIFT,			'float3'		),
				(None,				FLOAT,			'start'			),
				),

			'float3' : (
				(DIGIT,				SHIFT,			'float3'		),
				(None,				FLOAT,			'start'			),
				),

			# Python uses a sophisticated string recognizer;
			# There are two types of strings: single or double-quote delimited.
			# Each of these can also be involved in a triple-quote drama.
			# ANSI C escape sequences are also supported.
			#
			# The machine for each string type is duplicated.

			# double-quote strings

			'dstring' : (
				('\\',				DROP,			'descape'		),
				('"',				DROP,			'dstring2'		),
				(None,				SHIFT,			'dstring'		),
				),

			'dstring2' : (
				('"',				DROP,			'dstring3'		),
				(None,				STR,			'start'			),
				),

			'dstring3' : (
				('"',				SHIFT,			'dstring4'		),
				(None,				SHIFT,			'dstring3'		),
				),

			'dstring4' : (
				('"',				SHIFT,			'dstring5'		),
				(None,				SHIFT,			'dstring3'		),
				),

			'dstring5' : (
				('"',				TQSTR,			'start'			),
				(None,				SHIFT,			'dstring3'		),
				),

			# identical to the 'dstring' states, but with single quotes

			'sstring' : (
				('\\',				DROP,			'sescape'		),
				("'",				DROP,			'sstring2'		),
				(None,				SHIFT,			'sstring'		),
				),

			'sstring2' : (
				("'",				DROP,			'sstring3'		),
				(None,				STR,			'start'			),
				),

			'sstring3' : (
				("'",				SHIFT,			'sstring4'		),
				(None,				SHIFT,			'sstring3'		),
				),

			'sstring4' : (
				("'",				SHIFT,			'sstring5'		),
				(None,				SHIFT,			'sstring3'		),
				),

			'sstring5' : (
				("'",				TQSTR,			'start'			),
				(None,				SHIFT,			'sstring3'		),
				),

			# ANSI C escape sequences

			'descape' : (
				(NEWLINE,			DROP,			'dstring'		),
				('\\\'\"',			SHIFT,			'dstring'		),
				('abfnrtv',			ASCII,			'dstring'		),
				(OCTDIGIT,			OCT_ESC,		'doct_esc'		),
				('x',				DROP,			'dhex_esc'		),
				(None,				UNRECOG,		'dstring'		),
				),


			'doct_esc' : (
				(OCTDIGIT,			OCT_ESC,		'doct_esc'		),
				(None,				EMIT_OCT_ESC,	'dstring'		),
				),

			'dhex_esc' : (
				(HEXDIGIT,			HEX_ESC,		'dhex_esc'		),
				(None,				EMIT_HEX_ESC,	'dstring'		),
				),

			# duplicated for single-quote strings

			'sescape' : (
				(NEWLINE,			DROP,			'sstring'		),
				('\\\'\"',			SHIFT,			'sstring'		),
				('abfnrtv',			ASCII,			'sstring'		),
				(OCTDIGIT,			OCT_ESC,		'soct_esc'		),
				('x',				DROP,			'shex_esc'		),
				(None,				UNRECOG,		'sstring'		),
				),


			'soct_esc' : (
				(OCTDIGIT,			OCT_ESC,		'soct_esc'		),
				(None,				EMIT_OCT_ESC,	'sstring'		),
				),

			'shex_esc' : (
				(HEXDIGIT,			HEX_ESC,		'shex_esc'		),
				(None,				EMIT_HEX_ESC,	'sstring'		),
				),


			# Punctuation.  Some are made of two characters.

			'punctuation' : (
				(None,				PUNCT,			'start'			),
				),

			'lessthan' : (
				('>',				'!=',			'start'			),
				('<',				'<<',			'start'			),
				('=',				'<=',			'start'			),
				(None,				PUNCT,			'start'			),
				),

			'greaterthan' : (
				('>',				'>>',			'start'			),
				('=',				'>=',			'start'			),
				(None,				PUNCT,			'start'			),
				),

			'bang' : (
				('=',				'!=',			'start'			),
				(None,				PUNCT,			'start'			),
				),

			'splat' : (
				('*',				'**',			'start'			),
				(None,				PUNCT,			'start'			),
				),

			'equal' : (
				('=',				'==',			'start'			),
				(None,				PUNCT,			'start'			),
				),

		}
		return machine


# default input for test(): a small piece of python source
sample = """
# snide

class middle:
	def madness (did,did_not,did_so,shut_up):
		print "nevermore"
		return 1<<2+did

class lower:
	def madness ():
		print "quoth"
		return a != b

thing = None

"""

# worked example of the indent/dedent bookkeeping (for a different
# snippet): left column is the indent stack after each line, middle
# column marks the indent (i) / dedent (d) tokens emitted.
#
# 0         |class snout:
# 2      i  |  def method (stuff):
# 2 2    i  |    if test:
# 2 2 2  i  |      do_thing()
# 2 2    d  |    else:
# 2 2 2  i  |      do_other_thing()
#           |
# 2      dd |  def other_method (other_stuff):
# 2 2    i  |    yet_more_stuff()
#           |
# 0      dd |thing = None

def print_token (c,t):
	# Default token consumer: print one "class/token" line per token.
	# The argument to print is parenthesized so this line is valid
	# (with identical output) under both the statement and function
	# forms of print.
	print ("class: %20s  token: %s" % (repr(c), repr(t)))

class token_collector:
	# Token consumer that records every (token_class, token_data)
	# pair, in order, in self.tokens.

	def __init__ (self):
		self.tokens = []

	def collect (self, token_class, token_data):
		# store the pair as a tuple: list.append takes exactly one
		# argument, so the old two-argument call was a TypeError.
		self.tokens.append ((token_class, token_data))

# scan a string
# scan a string
def test (s=sample, consumer=print_token):
	# Drive a fresh python_lexer over every character of <s>,
	# delivering tokens to <consumer>; return the lexer.
	lex = python_lexer (consumer, 'start')
	for character in s:
		lex.next_char (character)
	return lex

# scan a file
# scan a file
def test2 (filename='python_lexer.py', consumer=print_token):
	# Lex the contents of <filename>, delivering tokens to
	# <consumer>; return the lexer.  The file handle is closed
	# explicitly instead of being leaked to the garbage collector.
	l = python_lexer (consumer, 'start')

	f = open (filename, 'rb')
	try:
		data = f.read()
	finally:
		f.close()
	for ch in data:
		l.next_char (ch)
	return l

# collect the tokens from a file
# collect the tokens from a file
def test3 (filename='python_lexer.py'):
	# Lex <filename> and return the recorded token list.
	collector = token_collector()
	test2 (filename, collector.collect)
	return collector.tokens


def generate_gml (file, lexer):
	# Dump <lexer>'s compiled state machine to <file> as a directed
	# graph in GML format: one node per state, one edge per
	# transition.
	file.write ('graph [\n')
	file.write ('  directed 1\n')

	table = lexer.table_machine
	labels = lexer.state_keys

	# node section: state index and its human-readable label
	for index in range(len(table)):
		file.write ('  node [ id %d label "%s" ]\n' % (index, labels[index]))

	# edge section: the character class and action are not part of
	# the graph, only the source/target states
	for index in range(len(table)):
		for chars, action, target in table[index]:
			file.write ('  edge [ source %d target %d ]\n' % (index, target))

	file.write (']\n')
	
# when run as a script, lex this very file and print its tokens
if __name__ == '__main__':
	test2()
