bash$ ls affixspell.rul footparse.rul spanish.rul tester vocalisms arabic.sym roots spanish.sym ulwa.lx clean spanish.al templates ulwa.sym ## Start with the Spanish example ## The symbol file defines the label set. Things on the left ## are the "superclasses". On the right are labels OR superclasses. bash$ cat spanish.sym C b c d f g h j k l m n p q r s t v w x y z V a e i o u other ' back a o u feat diph c/z ind pres 1 2 3 sg pl vb pret boundary + # ## lexmakelab compiles the label set bash$ lexmakelab spanish ## spanish.lab contains the basic labels bash$ cat spanish.lab 0 b 1 c 2 d 3 f 4 g 5 h 6 j 7 k 8 l 9 m 10 n 11 p 12 q 13 r 14 s 15 t 16 v 17 w 18 x 19 y 20 z 21 a 22 e 23 i 24 o 25 u 26 ' 27 diph 28 c/z 29 ind 30 pres 31 1 32 2 33 3 34 sg 35 pl 36 vb 37 pret 38 + 39 # 40 ## spanish.scl contains the superclass labels bash$ cat spanish.scl other 27 C 1 C 2 C 3 C 4 C 5 C 6 C 7 C 8 C 9 C 10 C 11 C 12 C 13 C 14 C 15 C 16 C 17 C 18 C 19 C 20 C 21 V 22 V 23 V 24 V 25 V 26 feat 28 feat 29 feat 30 feat 31 feat 32 feat 33 feat 34 feat 35 feat 36 feat 37 feat 38 back 22 back 25 back 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 boundary 39 boundary 40 ## spanish.al defines the arclist for this tiny fragment bash$ cat spanish.al START ar (habl) : (hablar [vb] ) START ar (cerr[diph]) : (cerrar [vb]) START er (coc [diph] [c/z]) : (cocer [vb]) ar WORD (\+o\#) : (\+ [ind] [pres] [1] [sg]) ar WORD (\+as\#) : (\+[ind] [pres] [2] [sg]) ar WORD (\+a\#) : (\+ [ind] [pres] [3] [sg]) ar WORD (\+amos\#) : (\+ [ind] [pres] [1] [pl]) ar WORD (\+an\#) : (\+ [ind] [pres] [3] [pl]) ar WORD (\+'e\#) : (\+ [ind] [pret] [1] [sg]) ar WORD (\+aste\#) : (\+ [ind] [pret] [2] [sg]) ar WORD (\+'o\#) : (\+ [ind] [pret] [3] [sg]) ar WORD (\+amos\#) : (\+ [ind] [pret] [1] [pl]) ar WORD (\+asteis\#) : (\+ [ind] [pret] [2] [pl]) ar WORD (\+aron\#) : (\+ [ind] [pret] [3] [pl]) er WORD (\+o\#) : (\+ [ind] [pres] [1] [sg]) er WORD 
(\+es\#) : (\+ [ind] [pres] [2] [sg]) er WORD (\+e\#) : (\+ [ind] [pres] [3] [sg]) er WORD (\+emos\#) : (\+ [ind] [pres] [1] [pl]) er WORD (\+'eis\#) : (\+ [ind] [pres] [2] [pl]) er WORD (\+en\#) : (\+ [ind] [pres] [3] [pl]) er WORD (\+'i\#) : (\+ [ind] [pret] [1] [sg]) er WORD (\+iste\#) : (\+ [ind] [pret] [2] [sg]) er WORD (\+i'o\#) : (\+ [ind] [pret] [3] [sg]) er WORD (\+imos\#) : (\+ [ind] [pret] [1] [pl]) er WORD (\+isteis\#) : (\+ [ind] [pret] [2] [pl]) er WORD (\+ieron\#) : (\+ [ind] [pret] [3] [pl]) WORD ## lexarclist compiles that into an fst bash$ lexarclist -l spanish.lab -S spanish.scl -F spanishal.fst spanish.al ## spanish.rul implements some of the segmental changes we saw bash$ cat spanish.rul e -> ie / ___ (C*) ([feat]*) [diph] ([feat]*) \+ (V*) (C*) \# o -> ue / ___ (C*) ([feat]*) [diph] ([feat]*) \+ (V*) (C*) \# c -> z / ___ ([feat]*) [c/z] \+ [back] [feat] | [boundary] -> [] ## lexrulecomp compiles that into an fst bash$ lexrulecomp -l spanish.lab -S spanish.scl -F spanishrul.fst spanish.rul ## now put it all together: bash$ fsminvert spanishal.fst | fsmcompose - spanishrul.fst | fsminvert | fsmcompact >spanish.fst ## test it. lexcompre compiles regular expressions or strings on the ## command line. fsmcompose does what you would think it does. ## lexfsmstrings prints out strings from acyclic transducers given an alphabet bash$ lexcompre -l spanish.lab -s "cerrasteis" | fsmcompose - spanish.fst | lexfsmstrings -l spanish.lab cerrasteis cerrar[vb]+[ind][pret]2[pl] bash$ lexcompre -l spanish.lab -s "hablo" | fsmcompose - spanish.fst | lexfsmstrings -l spanish.lab hablo hablar[vb]+[ind][pres]1[sg] ## Now let's look at Ulwa ## ulwa. ## Label definitions for ulwa. Note that superclass X is all segments: ## C and V. bash$ cat ulwa.sym V a i o u C b d g h k l m n p s t w y X C V feat 1 2 3 sg pl 1plincl bound + $ english a b c d e f g h i j l m n o p q r s t u v w x y z _ ## Build the label file. 
For the Ulwa rules (below) we need to make ## use of special builtin labels <bos> (beginning of string) and ## <eos> (end of string). Those are not built by default in lexmakelab, but ## you can get them if you give the flag -L. bash$ lexmakelab -L ulwa ## Here's the arclist for Ulwa. Here, I decided to have it ## morphologically analyze AND translate into English. bash$ cat ulwa.lx START WORD baa : excrement START WORD bilam : fish START WORD diimuih : snake START WORD gaad : god START WORD iibin : heaven START WORD iililih : shark START WORD kahma : iguana START WORD kapak : manner START WORD liima : lemon START WORD mistu : cat START WORD onyan : onion START WORD paumak : tomato START WORD sikbilh : horsefly START WORD taim : time START WORD taitai : grey_squirrel START WORD uumak : window START WORD waiku : moon START WORD wasala : possum WORD WORD PERSON \+([1]|[2]|[3]) WORD NUMBER \+[1plincl] PERSON NUMBER \+([sg]|[pl]) NUMBER ## Compile it bash$ lexarclist -l ulwa.lab -S ulwa.scl -F ulwalx.fst ulwa.lx ## In Ulwa you put the infix after the first foot (wherever that ## may end). Foot is disyllabic CVCV, or monosyllabic CVV or CVC. ## (In prosodic terms it's a bimoraic foot.) bash$ cat footparse.rul [] -> $ / [] (C?) (VC | VV | VCV) __ CV [] -> $ / [] (C?) 
(VC | VVC | VCVC | VV) __ ([]|\+) ## Compile the rules bash$ lexrulecomp -l ulwa.lab -S ulwa.scl -F footparse.fst footparse.rul ## Spell out for affixes: spell out the foot boundary "$" depending ## upon what the morphological features are bash$ cat affixspell.rul $ -> ki$ / __ []* [1] $ -> ma$ / __ []* [2] $ -> ka$ / __ []* [3] $ -> ni / __ []* [1plincl] $ -> na / __ []* [pl] [feat] | [bound] -> [] ## Compile the rules bash$ lexrulecomp -l ulwa.lab -S ulwa.scl -F affixspell.fst affixspell.rul ## Put it all together bash$ fsminvert ulwalx.fst | fsmcompose - footparse.fst affixspell.fst | fsminvert | fsmcompact >ulwa.fst ## Test some examples bash$ lexcompre -l ulwa.lab -s "liikanama" | fsmcompose - ulwa.fst | lexfsmstrings -l ulwa.lab liikanama lemon+3+[pl] bash$ lexcompre -l ulwa.lab -s "taimka" | fsmcompose - ulwa.fst | lexfsmstrings -l ulwa.lab taimka time+3+[sg] ## Now for Arabic ## First the symbol file bash$ cat arabic.sym V a i u C k t b n s X V C Feat BinyanI BinyanII BinyanIII BinyanIV BinyanV BinyanVI BinyanVII Feat BinyanVIII BinyanIX BinyanX act pass Bound + ## Compile it bash$ lexmakelab arabic ## Templates. These take a pattern and insert the appropriate Binyan feature bash$ cat templates CVCVC ([]:\+[BinyanI]) CVCCVC ([]:\+[BinyanII]) CVVCVC ([]:\+[BinyanIII]) tVCVVCVC ([]:\+[BinyanVI]) nCVVCVC ([]:\+[BinyanVII]) CtVCVC ([]:\+[BinyanVIII]) stVCCVC ([]:\+[BinyanX]) ## lexcomplex compiles sets of regular expressions bash$ lexcomplex -F templates.fst -l arabic.lab -S arabic.scl templates ## vocalisms is similar: given a particular vowel sequence with ## possibly intervening consonants, insert the appropriate voice ## feature bash$ cat vocalisms (a | C)* (([Bound]|[Feat])*) ([] : \+[act]) (u | C)* i (C*) (([Bound]|[Feat])*) ([] : \+[pass]) ## compile that bash$ lexcomplex -F vocalisms.fst -l arabic.lab -S arabic.scl vocalisms ## roots is a bit complicated. actually here there's just the one root ## "ktb" `write'. 
We need to allow for material that occurs before the ## root, such as the tV- prefix of Binyan VI. We also need to allow ## the medial consonant to occur more than once (e.g., for Binyan II). ## Then we need to allow following features. This regular expression ## then deletes all but the root and the features. bash$ cat roots (X* : []) \ k (V+ : []) \ t (t* : []) (V+ : []) \ b (([Bound]|[Feat])*) ## compile it bash$ lexcomplex -F roots.fst -l arabic.lab -S arabic.scl roots ## combine it all bash$ fsmcompose templates.fst vocalisms.fst roots.fst | fsmcompact >arabic.fst ## print out the transducer: bash$ lexfsmstrings -l arabic.lab arabic.fst kaatab ktb+[BinyanIII]+[act] katab ktb+[BinyanI]+[act] kattab ktb+[BinyanII]+[act] kuutib ktb+[BinyanIII]+[pass] kutib ktb+[BinyanI]+[pass] kuttib ktb+[BinyanII]+[pass] takaatab ktb+[BinyanVI]+[act] tukuutib ktb+[BinyanVI]+[pass] nkaatab ktb+[BinyanVII]+[act] nkuutib ktb+[BinyanVII]+[pass] ######################################################## ### Finally the script "tester" does what I just did bash$ tester Testing Spanish Testing "cerrasteis" cerrasteis cerrar[vb]+[ind][pret]2[pl] Testing "hablo" hablo hablar[vb]+[ind][pres]1[sg] Testing Ulwa Testing "liikanama" liikanama lemon+3+[pl] Testing "taimka" taimka time+3+[sg] Testing Arabic kaatab ktb+[BinyanIII]+[act] katab ktb+[BinyanI]+[act] kattab ktb+[BinyanII]+[act] kuutib ktb+[BinyanIII]+[pass] kutib ktb+[BinyanI]+[pass] kuttib ktb+[BinyanII]+[pass] takaatab ktb+[BinyanVI]+[act] tukuutib ktb+[BinyanVI]+[pass] nkaatab ktb+[BinyanVII]+[act] nkuutib ktb+[BinyanVII]+[pass] bash$