## Create a Unigram text model from the words in the book "Flatland". >>> flatland = DataFile("flat11.txt").read() >>> wordseq = words(flatland) >>> P = UnigramTextModel(wordseq) ## Now do segmentation, using the text model as a prior. >>> s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P) >>> s ['it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces'] >>> 1e-30 < p < 1e-20 True >>> s, p = viterbi_segment('wheninthecourseofhumaneventsitbecomesnecessary', P) >>> s ['when', 'in', 'the', 'course', 'of', 'human', 'events', 'it', 'becomes', 'necessary'] ## Test the decoding system >>> shift_encode("This is a secret message.", 17) 'Kyzj zj r jvtivk dvjjrxv.' >>> ring = ShiftDecoder(flatland) >>> ring.decode('Kyzj zj r jvtivk dvjjrxv.') 'This is a secret message.' >>> ring.decode(rot13('Hello, world!')) 'Hello, world!' ## CountingProbDist ## Add ten thousand samples of a roll of a die to D. >>> D = CountingProbDist() >>> for i in range(10000): ... D.add(random.choice('123456')) >>> ps = [D[n] for n in '123456'] >>> 1./7. <= min(ps) <= max(ps) <= 1./5. True ## demo ## Compare 1-, 2-, and 3-gram word models of the same text. 
>>> flatland = DataFile("flat11.txt").read() >>> wordseq = words(flatland) >>> P1 = UnigramTextModel(wordseq) >>> P2 = NgramTextModel(2, wordseq) >>> P3 = NgramTextModel(3, wordseq) ## Generate random text from the N-gram models >>> P1.samples(20) 'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees' >>> P2.samples(20) 'flatland well then can anything else more into the total destruction and circles teach others confine women must be added' >>> P3.samples(20) 'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle' ## The most frequent entries in each model >>> P1.top(10) [(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')] >>> P2.top(10) [(368, ('of', 'the')), (152, ('to', 'the')), (152, ('in', 'the')), (86, ('of', 'a')), (80, ('it', 'is')), (71, ('by', 'the')), (68, ('for', 'the')), (68, ('and', 'the')), (62, ('on', 'the')), (60, ('to', 'be'))] >>> P3.top(10) [(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')), (16, ('the', 'sense', 'of')), (13, ('by', 'the', 'sense')), (13, ('as', 'well', 'as')), (12, ('of', 'the', 'circles')), (12, ('of', 'sight', 'recognition')), (11, ('the', 'number', 'of')), (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] ## Probabilities of some common n-grams >>> P1['the'] 0.061139348356200607 >>> P2[('of', 'the')] 0.010812081325655188 >>> P3[('', '', 'but')] 0.0 >>> P3[('so', 'as', 'to')] 0.00032318721353860618 ## Distributions given the previous n-1 words >>> P2.cond_prob['went',].dictionary >>> P3.cond_prob['in', 'order'].dictionary {'to': 6} ## Build and test an IR System >>> uc = UnixConsultant() >>> uc.present_results("how do I remove a file") 76.83| ../data/man/rm.txt | RM(1) FSF RM(1) 67.83| ../data/man/tar.txt | TAR(1) TAR(1) 67.79| ../data/man/cp.txt | CP(1) FSF CP(1) 66.58| ../data/man/zip.txt | ZIP(1L) 
ZIP(1L) 64.58| ../data/man/gzip.txt | GZIP(1) GZIP(1) 63.74| ../data/man/pine.txt | pine(1) pine(1) 62.95| ../data/man/shred.txt | SHRED(1) FSF SHRED(1) 57.46| ../data/man/pico.txt | pico(1) pico(1) 43.38| ../data/man/login.txt | LOGIN(1) Linux Programmer's Manual 41.93| ../data/man/ln.txt | LN(1) FSF LN(1) >>> uc.present_results("how do I delete a file") 75.47| ../data/man/diff.txt | DIFF(1) GNU Tools DIFF(1) 69.12| ../data/man/pine.txt | pine(1) pine(1) 63.56| ../data/man/tar.txt | TAR(1) TAR(1) 60.63| ../data/man/zip.txt | ZIP(1L) ZIP(1L) 57.46| ../data/man/pico.txt | pico(1) pico(1) 51.28| ../data/man/shred.txt | SHRED(1) FSF SHRED(1) 26.72| ../data/man/tr.txt | TR(1) User Commands TR(1) >>> uc.present_results("email") 18.39| ../data/man/pine.txt | pine(1) pine(1) 12.01| ../data/man/info.txt | INFO(1) FSF INFO(1) 9.89| ../data/man/pico.txt | pico(1) pico(1) 8.73| ../data/man/grep.txt | GREP(1) GREP(1) 8.07| ../data/man/zip.txt | ZIP(1L) ZIP(1L) >>> uc.present_results("word counts for files") 112.38| ../data/man/grep.txt | GREP(1) GREP(1) 101.84| ../data/man/wc.txt | WC(1) User Commands WC(1) 82.46| ../data/man/find.txt | FIND(1L) FIND(1L) 74.64| ../data/man/du.txt | DU(1) FSF DU(1) >>> uc.present_results("learn: date") >>> uc.present_results("2003") 14.58| ../data/man/pine.txt | pine(1) pine(1) 11.62| ../data/man/jar.txt | FASTJAR(1) GNU FASTJAR(1)