(* * LANGUAGE : ANS Forth * PROJECT : Forth Environments * DESCRIPTION : generate and use a directed acyclic word graph (DAWG) * CATEGORY : Tool * AUTHOR : Ian Osgood * LAST CHANGE : Sunday, May 11, 2003 10:00 AM, Marcel Hendrix *) NEEDS -miscutil REVISION -dawg "ÄÄÄ DAWG Version 1.01 ÄÄÄ" PRIVATES DOC (* From: iano@quirkster.com (Ian Osgood) Newsgroups: comp.lang.forth Subject: DAWG.F 0/2 Date: 10 May 2003 18:51:42 -0700 The following is code to generate and use a directed acyclic word graph (DAWG). A DAWG is a data structure for compressing a list of words into a form amenable to quick searching. Processing a 90000 word Scrabble dictionary obtains a 240KB DAWG file in a handful of seconds. The DAWG is suitable for use in a spell-checker, anagram finder, or word game. Its use for instantly enumerating every legal play in a game of Scrabble is described in "The World's Fastest Scrabble Program", Journal of the ACM. The DAWG is a cell array of 32-bit "nodes". The first cell contains the index of the root "block", which happens to be the final block in the array. A node contains a 5-bit letter, an end-of-word flag, an end-of-block flag, and the 24-bit array index of a block. A node represents an edge of the word graph labeled with a letter. It also represents a subgraph, a set of suffixes beginning with that letter. A block is a contiguous sequence of nodes representing all the possible next letters, or suffixes, of a prefix. Part 1/2 contains some base words and the DAWG generator. It uses a hash table. Part 2/2 contains sample code for using the DAWG, including a Boggle game solver. It uses a sorted string array with a binary search. Comments and suggestions are welcome. Ian Osgood ------------------------ BUGS ---------------------------------- These engwords.txt words have more than 16 characters, so the dawg fails to build, unless max-word-len is 24 or so. arteriolosclerosis contradistinction contradistinguish diethylstilbestrol ... ultraconservative *) ENDDOC \ make-dawg - converts the file "ospd.txt" of lowercase words, \ sorted alphabetically one per line, into "dawg.out" \ TRIE/DAWG node structure definition 1 #30 LSHIFT =: EOB_MASK PRIVATE 1 #29 LSHIFT =: EOW_MASK PRIVATE 1 #24 LSHIFT 1- =: INDEX_MASK PRIVATE :INLINE Let ( node -- 1-26) #24 RSHIFT #31 AND ; PRIVATE :INLINE EOW ( node -- nz ) EOW_MASK AND ; PRIVATE :INLINE EOB ( node -- nz ) EOB_MASK AND ; PRIVATE :INLINE Ind ( node -- index ) INDEX_MASK AND ; PRIVATE :INLINE InitLet ( 1-26 -- node ) #24 LSHIFT ; PRIVATE :INLINE let>c ( 1-26 -- a-z ) [CHAR] a + 1- ; PRIVATE :INLINE c>let ( a-z -- 1-26 ) [CHAR] a - 1+ ; PRIVATE :INLINE ?c>let ( c -- 0-26 ) DUP [CHAR] a [CHAR] z 1+ WITHIN IF c>let ELSE DROP 0 ENDIF ; PRIVATE \ utility :INLINE CELL/ 2 RSHIFT ; PRIVATE \ common #24 CHARS =: max-word-len VARIABLE word-len PRIVATE CREATE next-word PRIVATE max-word-len ALLOT VARIABLE prefix-len PRIVATE CREATE prefix PRIVATE max-word-len ALLOT :INLINE prefix-len+ ( addr1 -- addr2 ) prefix-len @ CHARS + ; PRIVATE :INLINE next-word-has-prefix? ( -- nz ) next-word prefix-len @ prefix OVER COMPARE 0= word-len @ AND ; PRIVATE \ DAWG builder VARIABLE words-file PRIVATE VARIABLE dawg-file PRIVATE VARIABLE cur-dawg-index PRIVATE \ [a-z] only : legal-pad? ( -- tf ) word-len @ 0= IF TRUE EXIT THEN \ EOF max-word-len word-len @ < IF FALSE EXIT THEN PAD word-len @ BOUNDS DO I C@ ?c>let 0= IF FALSE UNLOOP EXIT THEN LOOP TRUE ; : get-next-word BEGIN PAD #80 words-file @ READ-LINE 2DROP word-len ! legal-pad? UNTIL PAD next-word word-len @ CMOVE ; : write-to-dawg ( block size -- ) dawg-file @ WRITE-FILE ABORT" Can't write to dawg" ;P \ Hash Table for blocks #2311 =: hash-size PRIVATE VARIABLE htab PRIVATE : create-hash-table hash-size CELLS ALLOCATE ABORT" Hash table too big" DUP hash-size CELLS ERASE htab ! ;P : htab@i ( hash-index -- head-entry-addr ) CELLS htab @ + ; : ->next ;P IMMEDIATE :INLINE ->index CELL+ ; PRIVATE :INLINE ->block CELL+ CELL+ ; PRIVATE : destroy-hash-table htab @ hash-size 0 DO DUP @ BEGIN ?DUP WHILE DUP ->next @ SWAP FREE DROP REPEAT CELL+ LOOP DROP htab @ FREE DROP ;P \ 0 for a trie, >5 for a dawg (measured no dups above size 5) 6 CELLS =: Block-size-hash-threshold PRIVATE : hash-block ( block -- hash ) 0 >R CELL- BEGIN CELL+ DUP @ DUP R> XOR >R EOB UNTIL DROP R> U>D hash-size UM/MOD DROP ;P : blocks-equivalent? ( block1 block2 -- TF ) BEGIN OVER @ OVER @ <> IF 2DROP FALSE EXIT THEN DUP @ EOB 0= WHILE CELL+ SWAP CELL+ REPEAT 2DROP TRUE ;P : find-hash-block ( block -- index | 0 ) DUP hash-block htab@i ( block hash-block-addr ) BEGIN @ DUP WHILE 2DUP ->block blocks-equivalent? IF ->index @ NIP EXIT THEN ->next REPEAT NIP ( 0 ) ;P : add-hash-block ( size block -- ) OVER ->block ALLOCATE ABORT" Can't allocate hash entry" OVER hash-block htab@i ( size block h head-addr ) 2DUP @ OVER ->next ! SWAP ! \ replace the head <- h cur-dawg-index @ OVER ->index ! ->block ROT MOVE ;P \ Core DAWG building algorithm : index-for-block ( size block -- index ) OVER Block-size-hash-threshold < IF DUP find-hash-block ?DUP IF NIP NIP EXIT THEN 2DUP add-hash-block THEN OVER write-to-dawg ( size ) CELL/ cur-dawg-index @ TUCK + cur-dawg-index ! ;P : append-next-letter-to-prefix ( -- a-z ) next-word prefix-len+ C@ prefix prefix-len+ 2DUP C@ <= IF 2DROP ABORT" Words out of order" THEN 2DUP C! 0 SWAP CHAR+ C! 1 prefix-len +! ;P : init-node-with-letter ( node a-z -- node ) c>let InitLet OVER ! word-len @ prefix-len @ = IF EOW_MASK OVER +! get-next-word THEN ;P : remove-letter-from-prefix -1 prefix-len +! ;P : finish-block ( prefix-node last-node -- prefix-node ) 2DUP = IF DROP EXIT THEN EOB_MASK OVER +! OVER - OVER CELL+ ( prefix size block ) index-for-block OVER +! ;P : suffixes ( prefix-node-addr -- prefix-node-addr ) DUP ( prefix current ) BEGIN next-word-has-prefix? WHILE CELL+ \ allocate a new node append-next-letter-to-prefix init-node-with-letter RECURSE \ process all suffixes from this prefix remove-letter-from-prefix REPEAT ( prefix last ) finish-block ;P \ Using ftp://puzzlers.org/pub/wordlists/engwords.txt \ or ftp://puzzlers.org/pub/wordlists/ospd.txt : make-dawg ( -- ) S" ospd.txt" R/O OPEN-FILE ABORT" No input file" words-file ! S" dawg.out" R/W CREATE-FILE ABORT" No output file" dawg-file ! create-hash-table #100 CELLS ALLOCATE ABORT" Can't allocate block stack" ( blocks ) \ max 87 for "outstunting" 0 OVER ! DUP 4 write-to-dawg \ skip root pointer 1 cur-dawg-index ! get-next-word 0 prefix-len ! 0 prefix C! suffixes ( blocks[0] filled with root node index ) 0. dawg-file @ REPOSITION-FILE ABORT" Can't rewind" DUP 4 write-to-dawg \ backpatch root pointer FREE DROP destroy-hash-table dawg-file @ CLOSE-FILE DROP words-file @ CLOSE-FILE DROP ; \ DAWG usage utilities (start session with "load-dawg") \ Top level commands \ load-dawg - load a trie/dawg into memory \ unload-dawg \ tdtrav - interactively traverse a TRIE/DAWG \ word? - lookup a word in the dawg VARIABLE dawg PRIVATE : read-trie ( fname count -- trie^ code ) R/O OPEN-FILE ?DUP IF EXIT THEN ( file ) DUP FILE-SIZE ?DUP IF EXIT THEN ( file udsize) D>S DUP ALLOCATE ?DUP IF EXIT THEN ( file size mem^ ) DUP 2OVER SWAP READ-FILE ?DUP IF EXIT THEN ( file size mem read ) ROT <> ?DUP IF EXIT THEN ( file mem ) SWAP CLOSE-FILE ;P : load-dawg S" ../dawg/dawg.out" read-trie ABORT" Can't load dawg" dawg ! ; : unload-dawg dawg @ FREE DROP ; :INLINE dawg-root ( -- root-block ) dawg @ DUP @ CELLS + ; PRIVATE :INLINE dawg@i ( index -- block ) CELLS dawg @ + ; PRIVATE : .prefix ." '" prefix prefix-len @ TYPE ." '" ;P : letter-in-block ( letter block-addr -- node-addr | 0 ) SWAP LOCAL letter CELL- BEGIN CELL+ DUP @ Let letter - DUP 0= IF DROP EXIT ENDIF 0>= IF DROP 0 EXIT ENDIF DUP @ EOB UNTIL DROP 0 ;P \ \ TRIE/DAWG checker \ : .block ( block -- ) CELL- BEGIN CELL+ DUP @ DUP EOW IF [CHAR] A ELSE [CHAR] a THEN OVER Let 1- + EMIT EOB UNTIL DROP ;P : trav ( index -- command[0^-.] ) DUP 0= IF .prefix ." End of line." CR EXIT THEN CELLS dawg @ + 0 ( block^ command ) BEGIN DROP .prefix ." [" DUP .block ." ^-.] " KEY CR DUP [CHAR] a [CHAR] z 1+ WITHIN IF DUP prefix prefix-len+ C! c>let OVER letter-in-block DUP IF 1 prefix-len +! @ Ind RECURSE -1 prefix-len +! DUP [CHAR] - = OVER 8 = OR OVER 127 = OR IF DROP 0 THEN THEN THEN DUP [CHAR] ^ = OVER [CHAR] - = OR OVER 8 = OR OVER 127 = OR OVER [CHAR] . = OR UNTIL NIP ;P : tdtrav ( -- ) 0 prefix-len ! BEGIN dawg @ @ trav [CHAR] . = UNTIL ; \ \ spell check \ : is-word? ( addr len -- TF ) BOUNDS dawg @ ( end cur node-addr ) BEGIN @ Ind DUP 0= IF 3DROP FALSE EXIT ENDIF \ word too long dawg@i OVER C@ c>Let SWAP letter-in-block DUP 0= IF 3DROP FALSE EXIT ENDIF \ word not found >R CHAR+ 2DUP = R> SWAP UNTIL 2NIPS @ EOW 0<> ; \ word maybe too short : word? BL PARSE is-word? IF ." Yes" ELSE ." No" THEN ; -- A string with spaces : $QUALIFY? ( c-addr u -- bool ) 2DUP BL (lex) 0= IF IS-WORD? EXIT ENDIF ( c-addr u c-addr2 u2 c-addr1 u1 del ) DROP IS-WORD? 0= IF 4DROP FALSE EXIT ENDIF 2SWAP 2DROP RECURSE ; [DEFINED] testing [IF] \ FORTH> mini-bench \ 50000 searches in 0.552 seconds elapsed. \ 40000 positive results. ok \ FORTH> mini-bench \ 50000 searches in 0.264 seconds elapsed. \ 40000 positive results. ok : MINI-BENCH ( -- ) 0 LOCAL res #10000 LOCAL #times LOAD-DAWG CR 5 #times * DEC. ." searches in " TIMER-RESET #times 0 DO S" arteriolosclerosis" is-word? 1 AND +TO res S" zygote" is-word? 1 AND +TO res S" 123" is-word? 1 AND +TO res S" conceit" is-word? 1 AND +TO res S" forth" is-word? 1 AND +TO res LOOP .ELAPSED CR res DEC. ." positive results." ; \ Boggle sample program \ random-board - fill the board with random letters \ fill-board - set the board to a particular state \ .board - show the board \ solve-board - use the DAWG to find all the words \ of length min-len or greater 4 VALUE min-len 6 5 * 1+ CHARS =: board-size PRIVATE CREATE board PRIVATE board-size ALLOT \ 0 , 0 , 0 , 0 , 0 , \ 0 , 1 , 2 , 3 , 4 , \ 0 , 5 , 6 , 7 , 8 , \ 0 , 9 , 10 , 11 , 12 , \ 0 , 13 , 14 , 15 , 16 , \ 0 , 0 , 0 , 0 , 0 , 0 , \ UI : .line CHARS board + 4 TYPE CR ;P ( index -- ) : .board CR 6 .line #11 .line #16 .line #21 .line ; ( -- ) : fill-line ( index "abcd" -- ) BL PARSE 4 MIN ROT CHARS board + SWAP CMOVE ;P : fill-board ( -- ) board board-size ERASE 6 fill-line #11 fill-line #16 fill-line #21 fill-line .board ; : rand-letter ( -- a-z ) #26 CHOOSE 1+ let>c ;P : rlc!+ ( sq -- sq+1 ) rand-letter OVER C! CHAR+ ;P : rand-line ( index -- ) CHARS board + rlc!+ rlc!+ rlc!+ rlc!+ DROP ;P : random-board ( -- ) board board-size ERASE 6 rand-line #11 rand-line #16 rand-line #21 rand-line .board ; \ results (sorted list, unique words) 0 VALUE found-words PRIVATE 0 VALUE size-words PRIVATE \ allocated size 0 VALUE num-words PRIVATE : grow-words ( -- ) size-words 0= IF #16 DUP CELLS ALLOCATE ?ALLOCATE ( -- 16 addr ) ELSE size-words 2* found-words OVER CELLS RESIZE ?ALLOCATE THEN TO found-words TO size-words ;P : allocate-string ( addr len -- c-str ) DUP 1+ ALLOCATE ?ALLOCATE PACK ;P \ DUP >R ( addr len caddr ) 2DUP C! CHAR+ SWAP CMOVE R> ;P : insert-word ( n addr len -- ) num-words size-words = IF grow-words THEN allocate-string ( n c-str -- ) SWAP DUP >R CELLS found-words + ( c-str fw+ncells ) DUP DUP CELL+ num-words R> - CELLS MOVE ( c-str fw+ncells ) ! num-words 1+ TO num-words ;P : add-word ( addr len -- ) ( / binary search ) 2>R num-words 0 BEGIN 2DUP - WHILE 2DUP + 2/ DUP 2R@ ROT CELLS found-words + @ COUNT COMPARE DUP 0= IF 4DROP 2R> 2DROP EXIT THEN 0< IF ROT DROP SWAP ELSE 1+ NIP THEN REPEAT DROP 2R> insert-word ;P : add-prefix ( -- ) prefix prefix-len @ add-word ;P : clear-words ( -- ) num-words 0 ?DO I CELLS found-words + @ FREE DROP LOOP 0 TO num-words ;P : .words ( -- ) num-words 0 ?DO I CELLS found-words + @ COUNT TYPE SPACE LOOP CR ; \ smarts : solve-square ( block sq -- block sq ) DUP C@ 0= ?EXIT \ edge or already used \ can traverse to letter on this square? 2DUP C@ c>let SWAP letter-in-block ?DUP 0= ?EXIT \ OK: add letter to prefix ( sq block-node ) OVER C@ prefix prefix-len+ C! 1 prefix-len +! \ found a word? DUP @ EOW IF min-len prefix-len @ <= IF add-prefix THEN THEN \ no more suffixes? @ Ind ?DUP 0= IF -1 prefix-len +! EXIT THEN \ continue to surrounding squares dawg@i OVER ( next-block next-sq ) 0 OVER C! \ mark used 6 CHARS - RECURSE CHAR+ RECURSE CHAR+ RECURSE 3 CHARS + RECURSE 2 CHARS + RECURSE 3 CHARS + RECURSE CHAR+ RECURSE CHAR+ RECURSE 2DROP -1 prefix-len +! \ mark usable again prefix prefix-len+ C@ OVER C! ;P : solve-line ( root sq -- root sq+5 ) solve-square CHAR+ solve-square CHAR+ solve-square CHAR+ solve-square CHAR+ CHAR+ ;P : solve-board 0 prefix-len ! clear-words dawg-root 6 CHARS board + solve-line solve-line solve-line solve-line 2DROP CR .words ; [THEN] :ABOUT CR .~ make-dawg -- converts "ospd.txt" of lowercase words,~ CR .~ sorted alphabetically one per line, into "dawg.out"~ CR ." load-dawg -- load the dawg file made by make-dawg (dawg.out)." [ [DEFINED] testing ] [IF] CR ." Application: BOGGLE ( after load-dawg )" CR ." Usage: random-board " CR ." or fill-board abcd efgh ijkl mnop" CR ." Then: solve-board -- solve the boggle board" [THEN] ; .ABOUT -dawg CR DEPRIVE (* End of Source *)