(* * LANGUAGE : ANS Forth + extensions * PROJECT : Forth Environments * DESCRIPTION : A little language compiler * CATEGORY : TINY language compiler vsn 0.1 * AUTHOR : Jack W. Crenshaw * LAST CHANGE : Sunday, December 18, 2005, 17:23 PM, Marcel Hendrix *) NEEDS -miscutil REVISION -tiny12 "ÄÄÄ TINY compiler Version 1.12 ÄÄÄ" DOC (* The TINY compiler from Jack Crenshaw's excellent non-technical introduction to compiler construction. The text file was downloaded from http://compilers.iecc.com/crenshaw/ . The code is translated from Pascal to Forth, and the generated code is for the Intel x86 family. *) ENDDOC -- Variable declarations -------------------------------------------------------------------------------------------------- -1 =: unknown 0 VALUE Lcount -- label counter 0 VALUE Look -- lookahead character 0 VALUE token -- encoded token #100 =: maxentry CREATE token$ 0 C, #256 CHARS ALLOT -- unencoded token CREATE prog$ 0 C, #256 CHARS ALLOT -- program name : char+! ( c addr -- ) DUP >S COUNT + C! 1 S> C+! ; : char-place ( c addr -- ) 1 OVER C! CHAR+ C! ; : token> ( -- c-addr u ) token$ COUNT ; : =token ( char -- tf ) token = ; : <>token ( char -- tf ) token <> ; -- Symbol tables and labels ---------------------------------------------------------------------------------------------------------- -- Helper to build symbol tables : $, ( c-addr u size -- ) >S S MIN >R R@ C, R@ 0 ?DO C@+ C, LOOP DROP S> R> - CHARS ALLOT ; -- Type declarations : SYMTAB ( size -- ) CREATE DUP 0< IF 0 , ABS DUP , 1+ maxentry * ALLOT ELSE HERE >R 0 , ( #items ) DUP >S , ( itemsize ) BEGIN BL DUP WHILE 2DUP S" \" COMPARE 0= IF 2DROP REFILL DROP ELSE S $, 1 R@ +! ENDIF REPEAT 2DROP -R -S ENDIF DOES> CELL+ @+ 1+ ROT * + ( ix -- addr ) ; -- Definition of keywords and token types 8 CHARS =: /symbol /symbol SYMTAB KWlist IF ELSE ENDIF \ WHILE ENDWHILE \ DO ENDDO \ LOOP ENDLOOP \ REPEAT UNTIL \ FOR TO ENDFOR \ BREAK \ READ WRITE \ VAR END : KW->token ( kw_index -- ) 2+ C" xilewedeLerufteBRWve" + C@ TO token ; /symbol NEGATE SYMTAB ST =CELL 1- NEGATE SYMTAB SType 0 ST 2 CELLS - =: [cnt]ST 0 SType 2 CELLS - =: [cnt]SType : lookup ( c-addr u 'table -- n2|unknown ) 0 0 LOCALS| /symbol n table sz addr | table 2 CELLS - @+ TO n @ TO /symbol n 0<= IF unknown EXIT ENDIF 0 n DO /symbol 1+ I * table + COUNT addr sz COMPARE 0= IF I UNLOOP EXIT ENDIF -1 +LOOP unknown ; -- Locate symbol in table : locate ( c-addr u -- ix ) 0 ST lookup ; -- Returns -1 | the index of the entry. : intable ( c-addr u -- tf ) 0 ST lookup 0>= ; -- Look for symbol in table -- Dump symbol table : .symbols ( -- ) [cnt]ST @ 0= IF CR ." No symbols defined." EXIT ENDIF CR ." -- type --.--- name ---" [cnt]ST @ 0 ?DO CR 5 HTAB I 1+ SType C@ EMIT #16 HTAB I 1+ ST .$ LOOP ; -- Tools ------------------------------------------------------------------------------------------------------------------ : getchar ( -- ) EKEY TO Look ; -- read new character from input stream : error ( c-addr u -- ) CR ^G EMIT ." Error: " TYPE ." ." ; -- report an error : aborts ( c-addr u -- ) error ABORT ; -- report error and halt : expected ( c-addr u -- ) S" expected" $+ aborts ; -- report what was expected : alpha? ( char -- tf ) >UPC 'A' 'Z' 1+ WITHIN ; -- recognize an alpha character : digit? ( char -- tf ) '0' '9' 1+ WITHIN ; -- recognize a decimal digit : alnum? ( char -- tf ) DUP alpha? SWAP digit? OR ; -- recognize alphanumeric : orop? ( char -- tf ) DUP '|' = SWAP '~' = OR ; -- recognize an OR operand : addop? ( char -- tf ) DUP '+' = SWAP '-' = OR ; -- test for AddOp : mulop? ( char -- tf ) DUP '*' = SWAP '/' = OR ; -- test for MulOp : emits ( c-addr u -- ) Tab EMIT TYPE ; -- output a string with tab : emitdln ( c-addr u -- ) CR TYPE ; -- output a string without tab, +crlf : emitln ( c-addr u -- ) CR emits ; -- output a string with tab and crlf -- Recognize white space : white? ( char -- tf ) DUP BL = OVER Tab = OR OVER ^M = OR OVER ^J = OR SWAP '{' = OR ; -- Skip a comment field : skipcomment RECURSIVE ( -- ) BEGIN Look '}' <> WHILE getchar Look '{' = IF skipcomment ENDIF REPEAT getchar ; -- Skip white space : skipwhite ( -- ) BEGIN Look white? WHILE Look '{' = IF skipcomment ELSE getchar ENDIF REPEAT ; -- Abort, reporting an offending item : name.aborts ( c-addr1 u1 c-addr2 u2 ) &' CHAR-APPEND S" `" 2SWAP $+ $+ aborts ; : .undefined ( c-addr u -- ) S" Undefined identifier" 2SWAP name.aborts ; : .duplicate ( c-addr u -- ) S" Duplicate identifier" 2SWAP name.aborts ; : checktable ( c-addr u -- ) 2DUP intable IF 2DROP EXIT ENDIF .undefined ; -- Is identifier in the symbol table? : checkdup ( c-addr u -- ) 2DUP intable 0= IF 2DROP EXIT ENDIF .duplicate ; -- Is identifier already in symbol table? : checkident ( -- ) 'x' <>token IF S" Identifier" expected ENDIF ; -- Is current token an identifier? -- Generate a unique label / post it to output : newlabel ( -- c-addr u ) S" @" Lcount U>D <# # # # #> $+ 1 +TO Lcount ; : postlabel ( c-addr u -- ) CR TYPE ':' EMIT ; -- Recognize a relop : relop? ( char -- tf ) DUP '=' = OVER '#' = OR OVER '<' = OR SWAP '>' = OR ; -- Match a specific input character : match ( char -- ) DUP Look = IF DROP getchar skipwhite ELSE S" `" ROT CHAR-APPEND &' CHAR-APPEND expected ENDIF ; -- Get an identifier : getname ( -- ) skipwhite Look alpha? 0= IF S" Identifier" expected ENDIF 'x' TO token token$ C0! BEGIN Look >UPC token$ char+! getchar Look alnum? 0= UNTIL ; -- Get a number : getnum ( -- ) skipwhite Look digit? 0= IF S" Number" expected ENDIF '#' TO token token$ C0! BEGIN Look token$ char+! getchar Look digit? 0= UNTIL ; -- Get an operator : getop ( -- ) skipwhite Look TO token Look token$ char-place getchar ; -- Get next token of any type : next ( -- ) skipwhite Look alpha? IF getname EXIT ENDIF Look digit? IF getnum EXIT ENDIF getop ; -- Get an identifier and scan it for keywords : scan ( -- ) 'x' =token IF token> 0 KWlist lookup KW->token ENDIF ; -- Match a specific input string : matchstring ( c-addr u -- ) token> 2OVER COMPARE IF &` CHAR-PREPEND &' CHAR-APPEND expected ELSE 2DROP next ENDIF ; -- Match a semicolon : semi ( -- ) ';' =token IF next ENDIF ; -- Add a new entry to symbol table : addentry ( c-addr u type -- ) [cnt]ST LOCALS| #entries T sz addr | addr sz checkdup #entries @ maxentry = IF S" Symbol table full" aborts ENDIF 1 #entries +! addr sz #entries @ ST PACK DROP T #entries @ SType C! ; -- Code generation ------------------------------------------------------------------------------------------------------- -- Load primary register with a constant : loadconstant ( c-addr u -- ) S" d# -> eax mov," $+ emitln ; -- Load primary register from variable : loadvariable ( c-addr u -- ) 2DUP intable 0= IF .undefined ENDIF S" dword-ptr -> eax mov," $+ emitln ; -- Store primary register in variable : storevariable ( c-addr u -- ) 2DUP intable 0= IF .undefined ENDIF S" eax -> " 2SWAP $+ S" dword-ptr mov," $+ emitln ; -- Allocate storage for a static variable : allocatestorage ( $value $name -- ) S" CREATE " 2SWAP $+ BL CHAR-APPEND 2SWAP $+ S" , " $+ emitdln ; : _callw ( c-addr u -- ) S" offset NEAR call," $+ emitln ; -- call a word : _clear ( -- ) S" eax -> eax xor," emitln ; -- clear primary : _negate ( -- ) S" eax neg," emitln ; -- negate primary : _not ( -- ) S" eax not," emitln ; -- not primary : _push ( -- ) S" eax push," emitln ; -- push primary to stack : _pop ( -- ) S" eax pop," emitln ; -- pop primary from stack : _decr ( -- ) S" 1 b# -> eax sub," emitln ; -- decrement primary : _incr ( -- ) S" 1 b# -> eax add," emitln ; -- increment primary : _decSP ( -- ) S" [esp -4 +] -> esp lea," emitln ; -- drop 1 stack item : _incSP ( -- ) S" [esp 4 +] -> esp lea," emitln ; -- restore stack item : _popadd ( -- ) S" [esp] -> eax add, [esp 4 +] -> esp lea," emitln ; -- add TOS to primary : _popsub ( -- ) S" [esp] -> eax sub, [esp 4 +] -> esp lea," emitln _negate ; -- subtract TOS from primary : _popor ( -- ) S" [esp] -> eax or, [esp 4 +] -> esp lea," emitln ; -- or TOS to primary : _popxor ( -- ) S" [esp] -> eax xor, [esp 4 +] -> esp lea," emitln ; -- xor TOS to primary : _popand ( -- ) S" [esp] -> eax and, [esp 4 +] -> esp lea," emitln ; -- xor TOS to primary : _popcmp ( -- ) S" [esp] -> eax cmp, [esp 4 +] -> esp lea," emitln ; -- compare TOS to primary : _sete ( -- ) S" al sete, al -> eax movsx," emitln ; -- set primary if equal : _setne ( -- ) S" al setne, al -> eax movsx," emitln ; -- set primary if NOT equal : _setg ( -- ) S" al setl, al -> eax movsx," emitln ; -- set primary if greater : _setl ( -- ) S" al setg, al -> eax movsx," emitln ; -- set primary if less : _setge ( -- ) S" al setle, al -> eax movsx," emitln ; -- set primary if greater or equal : _setle ( -- ) S" al setge, al -> eax movsx," emitln ; -- set primary if less or equal -- A collection of jumps : _branch ( c-addr u -- ) S" offset NEAR jmp," $+ emitln ; : _pcmp+b0> ( c-addr u -- ) _popcmp S" offset NEAR jg," $+ emitln ; : _branch0 ( c-addr u -- ) S" eax -> eax or, " 2SWAP $+ S" offset NEAR je," $+ emitln ; : _branch<>0 ( c-addr u -- ) S" eax -> eax or, " 2SWAP $+ S" offset NEAR jnz," $+ emitln ; -- Multiply TOS and primary; divide primary by TOS : _popmul ( -- ) S" [esp] dword mul, [esp 4 +] -> esp lea," emitln ; : _popdiv ( -- ) S" ecx pop, ecx -> eax xchg, eax -> edx mov, #31 b# -> edx sar, ebx idiv," emitln ; -- Read to primary register and store in variable : readit ( c-addr u -- ) S" READ" _callw storevariable ; -- Write from primary register : writeit ( -- ) S" WRITE" _callw ; -- Write assembler header info : header ( -- ) S" -- DATA section --------" emitdln S" " emitdln ; -- Write assembler prologue : prolog ( -- ) S" " emitln S" -- CODE section --------" emitdln S" " emitdln S" CODE " prog$ COUNT $+ emitdln S" rpush," emitln newlabel postlabel ; -- Write assembler epilogue : epilog ( -- ) S" rpop, ebx jmp," emitln S" END-CODE" emitdln ; -- Expressions ----------------------------------------------------------------------------------------------------------- DEFER boolexpression -- Parse and translate a math factor : factor ( -- ) '(' =token IF next boolexpression S" )" matchstring EXIT ENDIF 'x' =token IF token> loadvariable next EXIT ENDIF '#' =token IF token> loadconstant next EXIT ENDIF S" Math factor" expected ; -- Recognize and translate multiply / divide : multiply ( -- ) next factor _popmul ; : divide ( -- ) next factor _popdiv ; -- Parse and translate a math term : term ( -- ) factor BEGIN token mulop? WHILE _push CASE token '*' OF multiply ENDOF '/' OF divide ENDOF ENDCASE REPEAT ; -- Recognize and translate add / subtract : add ( -- ) next term _popadd ; : subtract ( -- ) next term _popsub ; -- Parse and translate an expression : expression ( -- ) token addop? IF _clear ELSE term ENDIF BEGIN token addop? WHILE _push CASE token '+' OF add ENDOF '-' OF subtract ENDOF ENDCASE REPEAT ; -- Get another / next expression and compare : comparex ( -- ) expression _popcmp ; : nextx ( -- ) next comparex ; -- Recognize and translate a relational "equals" / "less or equal" / "not equals" : equals ( -- ) nextx _sete ; : lessorequal ( -- ) nextx _setle ; : notequals ( -- ) nextx _setne ; -- Recognize and translate a relational "less than" : less ( -- ) next CASE token '=' OF lessorequal ENDOF '>' OF notequals ENDOF comparex _setl ENDCASE ; -- Recognize and translate a relational "greater than" : greater ( -- ) next '=' =token IF nextx _setge EXIT ENDIF comparex _setg ; -- Parse and translate a relation : relation ( -- ) expression token relop? 0= ?EXIT _push CASE token '=' OF equals ENDOF '<' OF less ENDOF '>' OF greater ENDOF ENDCASE ; -- Parse and translate a boolean factor with NOT : notfactor ( -- ) '!' <>token IF relation EXIT ENDIF next relation _not ; -- Parse and translate a boolean term : boolterm ( -- ) notfactor BEGIN '&' =token WHILE _push next notfactor _popand REPEAT ; -- Recognize and translate a boolean OR / XOR : boolOR ( -- ) next boolterm _popor ; : boolXOR ( -- ) next boolterm _popxor ; -- Parse and translate a boolean expression :NONAME ( -- ) boolterm BEGIN token orop? WHILE _push CASE token '|' OF boolOR ENDOF '~' OF boolXOR ENDOF ENDCASE REPEAT ; IS boolexpression -- Parse and translate an assignment statement : assignment ( -- ) token> DUP 1+ ALLOCATE ?ALLOCATE DUP LOCAL name PACK DROP next S" =" matchstring boolexpression name COUNT storevariable name FREE ?ALLOCATE ; -- Block statements ------------------------------------------------------------------------------------------------------- DEFER block -- Recognize and translate an IF construct : doIF ( label -- ) 0 0 LOCALS| L2 L1 L | next boolexpression newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L1 PACK DROP L1 COUNT DUP 1+ ALLOCATE ?ALLOCATE DUP TO L2 PACK DROP L1 COUNT _branch0 L block 'l' =token IF L2 FREE ?ALLOCATE next newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L2 PACK DROP L2 COUNT _branch L1 COUNT postlabel L block ENDIF L2 COUNT postlabel S" ENDIF" matchstring L1 FREE ?ALLOCATE L2 FREE ?ALLOCATE ; -- Recognize and translate a WHILE construct : doWHILE ( -- ) 0 0 LOCALS| L2 L1 | newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L1 PACK DROP newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L2 PACK DROP next L1 COUNT postlabel boolexpression L2 COUNT _branch0 L2 block S" ENDWHILE" matchstring L1 COUNT _branch L2 COUNT postlabel L1 FREE ?ALLOCATE L2 FREE ?ALLOCATE ; -- Parse and translate a LOOP statement : doLOOP ( -- ) 0 0 LOCALS| L2 L1 | newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L1 PACK DROP newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L2 PACK DROP next L1 COUNT postlabel L2 block L1 COUNT _branch L2 COUNT postlabel S" ENDLOOP" matchstring L1 FREE ?ALLOCATE L2 FREE ?ALLOCATE ; -- Parse and translate a REPEAT statement : doREPEAT ( -- ) 0 0 LOCALS| L2 L1 | newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L1 PACK DROP newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L2 PACK DROP next L1 COUNT postlabel L2 block S" UNTIL" matchstring boolexpression L1 COUNT _branch0 L2 COUNT postlabel L1 FREE ?ALLOCATE L2 FREE ?ALLOCATE ; -- Parse and translate a FOR statement : doFOR ( -- ) 0 0 0 LOCALS| name L2 L1 | next checkident token> checktable token> DUP 1+ ALLOCATE ?ALLOCATE DUP TO name PACK DROP newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L1 PACK DROP newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L2 PACK DROP next S" =" matchstring expression _decr name COUNT storevariable S" TO" matchstring expression _push L1 COUNT postlabel name COUNT loadvariable _incr name COUNT storevariable L2 COUNT _pcmp+b0> L2 block L1 COUNT _branch L2 COUNT postlabel S" ENDFOR" matchstring _incSP name FREE ?ALLOCATE L1 FREE ?ALLOCATE L2 FREE ?ALLOCATE ; -- Parse and Translate a DO Statement : doDO ( -- ) 0 0 LOCALS| L2 L1 | newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L1 PACK DROP newlabel DUP 1+ ALLOCATE ?ALLOCATE DUP TO L2 PACK DROP next expression L1 COUNT postlabel _push L2 block _pop _decr L1 COUNT _branch<>0 _decSP L2 COUNT postlabel S" ENDDO" matchstring _incSP L1 FREE ?ALLOCATE L2 FREE ?ALLOCATE ; -- Recognize and translate a BREAK : doBREAK ( label -- ) DUP C@ 0= IF S" No loop to break from" aborts ENDIF COUNT _branch next ; -- Read a single variable : readvar ( -- ) checkident token> checktable token> readit next ; -- Process a read statement : doread ( -- ) next S" (" matchstring readvar BEGIN ',' =token WHILE next readvar REPEAT S" )" matchstring ; -- Process a write statement : dowrite ( -- ) next S" (" matchstring expression writeit BEGIN ',' =token WHILE next expression writeit REPEAT S" )" matchstring ; -- Recognize and translate a statement block :NONAME ( label -- ) LOCAL L scan BEGIN 'e' <>token 'l' <>token AND 'u' <>token AND WHILE CASE token 'i' OF L doIF ENDOF 'w' OF doWHILE ENDOF 'd' OF doDO ENDOF 'L' OF doLOOP ENDOF 'r' OF doREPEAT ENDOF 'f' OF doFOR ENDOF 'B' OF L doBREAK ENDOF 'R' OF doread ENDOF 'W' OF dowrite ENDOF 'x' OF assignment ENDOF ENDCASE semi scan REPEAT ; IS block -- Declarations ----------------------------------------------------------------------------------------------------------- -- Allocate storage for a variable : alloc ( -- ) 0 LOCAL aname next token> DUP 1+ ALLOCATE ?ALLOCATE DUP TO aname PACK DROP 'x' <>token IF S" Variable name" expected ENDIF aname COUNT checkdup aname COUNT 'v' addentry next '=' =token IF S" =" matchstring '-' =token IF S" -" matchstring S" -" ELSE S" " ENDIF token> $+ next ELSE S" 0" ENDIF aname COUNT allocatestorage aname FREE ?ALLOCATE ; -- Parse and translate the global declarations : topdecls ( -- ) scan BEGIN 'v' =token WHILE alloc BEGIN ',' =token WHILE alloc REPEAT semi REPEAT ; -- Parse and translate a program ------------------------------------------------------------------------------------------ -- Initialize everything : init ( -- ) CLEAR Lcount [cnt]ST 0! [cnt]SType 0! CR getchar next ; : TINY12 ( -- ) init S" PROGRAM" matchstring token> prog$ PACK DROP next semi header topdecls S" BEGIN" matchstring prolog C" " block S" END" matchstring epilog ; :ABOUT CR ." Try: tiny12 -- compile text" CR ." Also: .SYMBOLS -- dump symbol table" CR ." This release supports semicolons and nested comments." CR ." See the documentation for a BNF." CR CR ." Example input:" CR ." program test; { a test {obviously} }" CR ." var x=22,y=-2;" CR ." begin" CR ." if (!x) = (!y)" CR ." write(x); read(y)" CR ." endif" CR ." end." ; DEPRIVE .ABOUT -tiny12 CR (* End of Source *)