Development environment
- OS X El Capitan - Apple (OS)
- Emacs (text editor)
- Java (runtime environment)
- Python 3.5 (programming language)
Working through Chapter 10 (Compiler I: Syntax Analysis), 10.5 (Project), 10.5.3 (Stage 2: Parser) of コンピュータシステムの理論と実装 (The Elements of Computing Systems, by Noam Nisan and Shimon Schocken, translated by 斎藤 康毅, O'Reilly Japan).
10.5 (Project), 10.5.3 (Stage 2: Parser)
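Before the code, a quick illustration of what the parser produces: given a minimal Jack class such as the hypothetical Main.jack below, the analyzer emits the parse tree as XML, one tag per line. This sample output was traced by hand from the implementation that follows, so treat it as a sketch rather than reference output.
Main.jack
class Main {
    function void main() {
        return;
    }
}
Emitted XML
<class>
<keyword> class </keyword>
<identifier> Main </identifier>
<symbol> { </symbol>
<subroutineDec>
<keyword> function </keyword>
<keyword> void </keyword>
<identifier> main </identifier>
<symbol> ( </symbol>
<parameterList>
</parameterList>
<symbol> ) </symbol>
<subroutineBody>
<symbol> { </symbol>
<statements>
<returnStatement>
<keyword> return </keyword>
<symbol>;</symbol>
</returnStatement>
</statements>
<symbol> } </symbol>
</subroutineBody>
</subroutineDec>
<symbol> } </symbol>
</class>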
Code (Emacs)
JackAnalyzer.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import glob
import sys
import re
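# JackTokenizer: reads the .jack source stream character by character and
# hands out one token at a time, classified as KEYWORD, SYMBOL, IDENTIFIER,
# INT_CONST or STRING_CONST; one token of lookahead (next_token) backs
# has_more_tokens().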
class JackTokenizer:
def __init__(self, file):
self.file = file
self.next_ch = ''
self.cur_token_type = ''
self.cur_token = ''
self.next_token_type = ''
self.next_token = self.get_next_token()
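    # scan and return the next raw token, skipping whitespace and both
    # comment styles; next_ch holds a single character pushed back by the
    # previous scan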
def get_next_token(self):
token = ''
if self.next_ch != '':
c = self.next_ch
self.next_ch = ''
else:
c = self.file.read(1)
        while re.match(r'\s', c):
c = self.file.read(1)
if c == '':
return ''
while True:
            if re.match(r'\s', c):
return self.get_next_token()
if c == '/':
token += c
c = self.file.read(1)
if c == '/':
self.file.readline()
return self.get_next_token()
                if c == '*':
                    # skip a '/* ... */' block comment; remember the
                    # previous character so a terminator like '**/' is
                    # caught, and stop at EOF instead of looping forever
                    prev = ''
                    while True:
                        c = self.file.read(1)
                        if c == '' or (prev == '*' and c == '/'):
                            break
                        prev = c
                    return self.get_next_token()
self.next_ch = c
self.next_token_type = 'SYMBOL'
return token
if re.match(r'[-{}()\[\].,;+*/&|<>=~]', c):
token = c
self.next_token_type = 'SYMBOL'
return token
if re.match(r'\d', c):
token = c
while True:
c = self.file.read(1)
if re.match(r'\d', c):
token += c
else:
self.next_ch = c
break
self.next_token_type = 'INT_CONST'
return token
if c == '"':
while True:
c = self.file.read(1)
if c == '"':
break
else:
token += c
self.next_token_type = 'STRING_CONST'
return token
token = c
while True:
c = self.file.read(1)
if re.match(r'[a-zA-Z0-9_]', c):
token += c
else:
self.next_ch = c
break
if token in ['class', 'constructor', 'function', 'method', 'field',
'static', 'var', 'int', 'char', 'boolean', 'void',
'true', 'false', 'null', 'this', 'let', 'do', 'if',
'else', 'while', 'return']:
self.next_token_type = 'KEYWORD'
else:
self.next_token_type = 'IDENTIFIER'
return token
def has_more_tokens(self):
return self.next_token != ''
def advance(self):
self.cur_token = self.next_token
self.cur_token_type = self.next_token_type
self.next_token = self.get_next_token()
def token_type(self):
return self.cur_token_type
def keyword(self):
return self.cur_token
def symbol(self):
        # escape the XML special characters in the emitted symbol
        return self.cur_token. \
            replace('&', '&amp;'). \
            replace('<', '&lt;'). \
            replace('>', '&gt;')
def identifier(self):
return self.cur_token
def int_val(self):
return int(self.cur_token)
def string_val(self):
return self.cur_token
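# CompilationEngine: recursive-descent parser for the Jack grammar of
# chapter 10; each compile_* method consumes the tokens of one grammar
# rule and prints the matching XML tags, one per line.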
class CompilationEngine:
def __init__(self, inf, outf):
self.tokenizer = JackTokenizer(inf)
self.outf = outf
def compile_class(self):
self.tokenizer.advance()
        print('<class>', file=self.outf)
        print('<keyword> {0} </keyword>'.format(self.tokenizer.keyword()),
              file=self.outf)
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=outf)
self.tokenizer.advance()
print('<symbol> {0} </symbol>'.format(self.tokenizer.symbol()),
file=self.outf)
self.tokenizer.advance()
# classVarDec*
while self.tokenizer.token_type() == 'KEYWORD' and \
self.tokenizer.keyword() in ['static', 'field']:
print('<classVarDec>', file=self.outf)
self.compile_class_var_dec()
print('</classVarDec>', file=self.outf)
        # subroutineDec*
        while self.tokenizer.token_type() == 'KEYWORD' and \
                self.tokenizer.keyword() in ['constructor', 'function', 'method']:
            print('<subroutineDec>', file=self.outf)
            self.compile_subroutine()
            print('</subroutineDec>', file=self.outf)
print('<symbol> } </symbol>', file=self.outf)
print('</class>', file=self.outf)
def compile_class_var_dec(self):
        print('<keyword> {0} </keyword>'.format(self.tokenizer.keyword()),
              file=self.outf)
self.tokenizer.advance()
t = self.tokenizer.token_type()
# type
if t == 'KEYWORD':
print('<keyword> {0} </keyword>'.format(
self.tokenizer.keyword()), file=self.outf)
elif t == 'IDENTIFIER':
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
else:
raise Exception('compile class: {0}'.format(t))
# varName
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
# (',' varName)*
self.tokenizer.advance()
sym = self.tokenizer.symbol()
while sym == ',':
print('<symbol> {0} </symbol>'.format(sym), file=self.outf)
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
sym = self.tokenizer.symbol()
if sym == ';':
print('<symbol> {0} </symbol>'.format(sym), file=self.outf)
else:
raise Exception('compile error: {0}'.format(sym))
self.tokenizer.advance()
    def compile_subroutine(self):
print('<keyword> {0} </keyword>'.format(self.tokenizer.keyword()),
file=self.outf)
self.tokenizer.advance()
# ('void' | type)
if self.tokenizer.token_type() == 'KEYWORD':
print('<keyword> {0} </keyword>'.format(self.tokenizer.keyword()),
file=self.outf)
else:
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
# subroutineName
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
print('<symbol> ( </symbol>', file=self.outf)
self.tokenizer.advance()
print('<parameterList>', file=self.outf)
if self.tokenizer.token_type() != 'SYMBOL':
self.compile_parameter_list()
print('</parameterList>', file=self.outf)
print('<symbol> ) </symbol>', file=self.outf)
# subroutineBody
print('<subroutineBody>', file=self.outf)
self.tokenizer.advance()
print('<symbol> { </symbol>', file=self.outf)
# varDec*
self.tokenizer.advance()
while self.tokenizer.token_type() == 'KEYWORD' and \
self.tokenizer.keyword() == 'var':
self.compile_var_dec()
# statements
print('<statements>', file=self.outf)
self.compile_statements()
print('</statements>', file=self.outf)
print('<symbol> } </symbol>', file=self.outf)
self.tokenizer.advance()
print('</subroutineBody>', file=self.outf)
def compile_parameter_list(self):
if self.tokenizer.token_type() == 'SYMBOL' and \
self.tokenizer.symbol() == ')':
return
if self.tokenizer.token_type() == 'KEYWORD':
print('<keyword> {0} </keyword>'.format(self.tokenizer.keyword()),
file=self.outf)
else:
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
while self.tokenizer.symbol() != ')':
print('<symbol> , </symbol>', file=self.outf)
self.tokenizer.advance()
if self.tokenizer.token_type() == 'KEYWORD':
print('<keyword> {0} </keyword>'.format(
self.tokenizer.keyword()), file=self.outf)
else:
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
def compile_var_dec(self):
print('<varDec>', file=self.outf)
print('<keyword> var </keyword>', file=self.outf)
self.tokenizer.advance()
if self.tokenizer.token_type() == 'KEYWORD':
print('<keyword> {0} </keyword>'.format(
self.tokenizer.keyword()), file=self.outf)
else:
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
while self.tokenizer.symbol() != ';':
print('<symbol> , </symbol>', file=self.outf)
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
print('<symbol>;</symbol>', file=self.outf)
self.tokenizer.advance()
print('</varDec>', file=self.outf)
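    # statements: dispatch on the current keyword until a token that
    # begins no statement (normally the closing '}') is reached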
def compile_statements(self):
while True:
k = self.tokenizer.keyword()
if k == 'let':
print('<letStatement>', file=self.outf)
self.compile_let()
print('</letStatement>', file=self.outf)
elif k == 'if':
print('<ifStatement>', file=self.outf)
self.compile_if()
print('</ifStatement>', file=self.outf)
elif k == 'while':
print('<whileStatement>', file=self.outf)
self.compile_while()
print('</whileStatement>', file=self.outf)
elif k == 'do':
print('<doStatement>', file=self.outf)
self.compile_do()
print('</doStatement>', file=self.outf)
elif k == 'return':
print('<returnStatement>', file=self.outf)
self.compile_return()
print('</returnStatement>', file=self.outf)
else:
break
def compile_do(self):
print('<keyword>do</keyword>', file=self.outf)
# subroutineCall
self.tokenizer.advance()
print('<identifier>{0}</identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
sym = self.tokenizer.symbol()
print('<symbol>{0}</symbol>'.format(sym), file=self.outf)
self.tokenizer.advance()
if sym == '.':
print('<identifier>{0}</identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
print('<symbol>(</symbol>', file=self.outf)
self.tokenizer.advance()
print('<expressionList>', file=self.outf)
self.compile_expression_list()
print('</expressionList>', file=self.outf)
print('<symbol>)</symbol>', file=self.outf)
self.tokenizer.advance()
print('<symbol>;</symbol>', file=self.outf)
self.tokenizer.advance()
def compile_let(self):
print('<keyword> let </keyword>', file=self.outf)
self.tokenizer.advance()
print('<identifier> {0} </identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
if self.tokenizer.symbol() == '[':
print('<symbol> [ </symbol>', file=self.outf)
self.tokenizer.advance()
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
print('<symbol> ] </symbol>', file=self.outf)
self.tokenizer.advance()
print('<symbol> = </symbol>', file=self.outf)
self.tokenizer.advance()
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
print('<symbol>;</symbol>', file=self.outf)
self.tokenizer.advance()
def compile_while(self):
print('<keyword> while </keyword>', file=self.outf)
self.tokenizer.advance()
print('<symbol>(</symbol>', file=self.outf)
self.tokenizer.advance()
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
print('<symbol>)</symbol>', file=self.outf)
self.tokenizer.advance()
print('<symbol>{</symbol>', file=self.outf)
self.tokenizer.advance()
print('<statements>', file=self.outf)
self.compile_statements()
print('</statements>', file=self.outf)
print('<symbol>}</symbol>', file=self.outf)
self.tokenizer.advance()
def compile_return(self):
print('<keyword> return </keyword>', file=self.outf)
self.tokenizer.advance()
        # compile an expression unless this is a bare 'return;'
        # (the original 'and' of two inequalities skipped expressions
        # that start with a symbol, such as 'return -x;')
        if not (self.tokenizer.token_type() == 'SYMBOL' and
                self.tokenizer.symbol() == ';'):
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
print('<symbol>;</symbol>', file=self.outf)
self.tokenizer.advance()
def compile_if(self):
print('<keyword>if</keyword>', file=self.outf)
self.tokenizer.advance()
print('<symbol>(</symbol>', file=self.outf)
self.tokenizer.advance()
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
print('<symbol>)</symbol>', file=self.outf)
self.tokenizer.advance()
print('<symbol>{</symbol>', file=self.outf)
self.tokenizer.advance()
print('<statements>', file=self.outf)
self.compile_statements()
print('</statements>', file=self.outf)
print('<symbol>}</symbol>', file=self.outf)
self.tokenizer.advance()
if self.tokenizer.token_type() == 'KEYWORD' and \
self.tokenizer.keyword() == 'else':
print('<keyword>else</keyword>', file=self.outf)
self.tokenizer.advance()
print('<symbol>{</symbol>', file=self.outf)
self.tokenizer.advance()
print('<statements>', file=self.outf)
self.compile_statements()
            print('</statements>', file=self.outf)
print('<symbol>}</symbol>', file=self.outf)
self.tokenizer.advance()
def compile_expression(self):
print('<term>', file=self.outf)
self.compile_term()
print('</term>', file=self.outf)
while True:
if self.tokenizer.token_type() == 'SYMBOL':
s = self.tokenizer.symbol()
if s in ['+', '-', '*', '/', '&', '|', '<', '>', '=']:
print('<symbol>{0}</symbol>'.format(s), file=self.outf)
self.tokenizer.advance()
print('<term>', file=self.outf)
self.compile_term()
print('</term>', file=self.outf)
else:
break
else:
break
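    # term: integerConstant | stringConstant | keywordConstant | varName
    #       | varName '[' expression ']' | subroutineCall
    #       | '(' expression ')' | unaryOp term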
def compile_term(self):
t = self.tokenizer.token_type()
if t == 'INT_CONST':
print('<integerConstant>{0}</integerConstant>'.format(
self.tokenizer.int_val()), file=self.outf)
self.tokenizer.advance()
elif t == 'STRING_CONST':
print('<stringConstant>{0}</stringConstant>'.format(
self.tokenizer.string_val()), file=self.outf)
self.tokenizer.advance()
elif t == 'KEYWORD':
print('<keyword>{0}</keyword>'.format(self.tokenizer.keyword()),
file=self.outf)
self.tokenizer.advance()
elif t == 'IDENTIFIER':
# varName | varName '[' expression ']' | subroutineCall
print('<identifier>{0}</identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
if self.tokenizer.token_type() == 'SYMBOL':
s = self.tokenizer.symbol()
# varName '[' expression ']'
if s == '[':
print('<symbol>[</symbol>', file=self.outf)
self.tokenizer.advance()
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
print('<symbol>]</symbol>', file=self.outf)
self.tokenizer.advance()
# subroutineCall
elif s in ['(', '.']:
print('<symbol>{0}</symbol>'.format(s), file=self.outf)
self.tokenizer.advance()
if s == '.':
print('<identifier>{0}</identifier>'.format(
self.tokenizer.identifier()), file=self.outf)
self.tokenizer.advance()
print('<symbol>(</symbol>', file=self.outf)
self.tokenizer.advance()
print('<expressionList>', file=self.outf)
self.compile_expression_list()
print('</expressionList>', file=self.outf)
print('<symbol>)</symbol>', file=self.outf)
self.tokenizer.advance()
elif t == 'SYMBOL':
sym = self.tokenizer.symbol()
print('<symbol>{0}</symbol>'.format(sym), file=self.outf)
self.tokenizer.advance()
            if sym == '(':
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
print('<symbol>)</symbol>', file=self.outf)
self.tokenizer.advance()
elif sym in ['-', '~']:
print('<term>', file=self.outf)
self.compile_term()
print('</term>', file=self.outf)
else:
raise Exception('compile term: {0}'.format(sym))
else:
raise Exception('compile term: {0}'.format(t))
def compile_expression_list(self):
if self.tokenizer.token_type() == 'SYMBOL' and \
self.tokenizer.symbol() == ')':
return
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
while self.tokenizer.token_type() == 'SYMBOL' and \
self.tokenizer.symbol() == ',':
print('<symbol>,</symbol>', file=self.outf)
self.tokenizer.advance()
print('<expression>', file=self.outf)
self.compile_expression()
print('</expression>', file=self.outf)
if __name__ == '__main__':
source = sys.argv[1]
filenames = []
if os.path.isfile(source):
filenames.append(source)
elif os.path.isdir(source):
        filenames = glob.glob(os.path.join(source, '*.jack'))
for filename in filenames:
with open(filename) as inf, \
open(filename.replace('.jack', '.xml'), 'w') as outf:
compilation_engine = CompilationEngine(inf, outf)
compilation_engine.compile_class()
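The classes can also be driven one file at a time, e.g. from an IPython session, instead of through the command-line entry point. A minimal sketch, assuming the script above is importable as the module JackAnalyzer and that Square/Main.jack exists:
from JackAnalyzer import CompilationEngine

# parse a single source file and write the XML parse tree beside it;
# the __main__ guard above keeps the import from running the CLI code
with open('Square/Main.jack') as inf, \
        open('Square/Main.xml', 'w') as outf:
    CompilationEngine(inf, outf).compile_class()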
Input/output results (Terminal, IPython)
$ make
rm -f ExpressionlessSquare/*.xml
./JackAnalyzer.py ExpressionlessSquare
./TextComparer ExpressionlessSquare/Main.xml \
    test_xml/ExpressionlessSquare/Main.xml
Comparison ended successfully
./TextComparer ExpressionlessSquare/Square.xml \
    test_xml/ExpressionlessSquare/Square.xml
Comparison ended successfully
./TextComparer ExpressionlessSquare/SquareGame.xml \
    test_xml/ExpressionlessSquare/SquareGame.xml
Comparison ended successfully
rm -f Square/*.xml
./JackAnalyzer.py Square
./TextComparer Square/Main.xml test_xml/Square/Main.xml
Comparison ended successfully
./TextComparer Square/Square.xml test_xml/Square/Square.xml
Comparison ended successfully
./TextComparer Square/SquareGame.xml test_xml/Square/SquareGame.xml
Comparison ended successfully
rm -f ArrayTest/*.xml
./JackAnalyzer.py ArrayTest
./TextComparer ArrayTest/Main.xml test_xml/ArrayTest/Main.xml
Comparison ended successfully
$