Try using markdown as the default view

rspeer · Aug 26, 2013 · 639d690 · 639d690
1 parent 2d90d51
commit 639d690
Show file tree

Hide file tree

Showing 13 changed files with 972 additions and 5 deletions.
diff --git a/1 - Word splitting.md b/1 - Word splitting.md
@@ -0,0 +1,187 @@
+
+In[1]:
+
+```
+from __future__ import unicode_literals
+```
+
+In[2]:
+
+```
+text = '''"Words," you might say, "are things separated by spaces."'''
+```
+
+In[3]:
+
+```
+text.split(' ')
+```
+
+
+
+
+    [u'"Words,"',
+     u'you',
+     u'might',
+     u'say,',
+     u'"are',
+     u'things',
+     u'separated',
+     u'by',
+     u'spaces."']
+
+
+
+In[4]:
+
+```
+text2 = '''Okay, words are sequences of letters that don't include punctuation.'''
+```
+
+In[5]:
+
+```
+import re
+re.findall(r'[A-Za-z]+', text2)
+```
+
+
+
+
+    [u'Okay',
+     u'words',
+     u'are',
+     u'sequences',
+     u'of',
+     u'letters',
+     u'that',
+     u'don',
+     u't',
+     u'include',
+     u'punctuation']
+
+
+
+In[6]:
+
+```
+text3 = '''Isn't it naïve to not include the apostrophe?'''
+```
+
+In[7]:
+
+```
+re.findall(r"[A-Za-z']+", text3)
+```
+
+
+
+
+    [u"Isn't",
+     u'it',
+     u'na',
+     u've',
+     u'to',
+     u'not',
+     u'include',
+     u'the',
+     u'apostrophe']
+
+
+
+In[8]:
+
+```
+text4 = "Fine, let's use NLTK. That shouldn't be too hard."
+```
+
+In[9]:
+
+```
+import nltk
+```
+
+In[10]:
+
+```
+[nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text4)]
+```
+
+
+
+
+    [[u'Fine', u',', u'let', u"'s", u'use', u'NLTK', u'.'],
+     [u'That', u'should', u"n't", u'be', u'too', u'hard', u'.']]
+
+
+
+In[11]:
+
+```
+text = 'この文も、言葉で構成されています'
+# Translation: "This sentence is also made of words"
+```
+
+In[12]:
+
+```
+for word in nltk.word_tokenize(text): print(word)
+```
+
+
+    この文も、言葉で構成されています
+
+
+In[13]:
+
+```
+from metanl import japanese
+```
+
+In[14]:
+
+```
+for word in japanese.normalize_list(text): print(word)
+```
+
+
+    文
+    言葉
+    構成
+
+
+In[15]:
+
+```
+text2 = 'You might be wondering whether we can deal with suffixes in English'
+```
+
+In[16]:
+
+```
+from metanl import english
+```
+
+In[17]:
+
+```
+english.normalize_list(text2)
+```
+
+
+
+
+    [u'you',
+     u'might',
+     u'be',
+     u'wonder',
+     u'whether',
+     u'we',
+     u'can',
+     u'deal',
+     u'with',
+     u'suffix',
+     u'in',
+     u'english']
+
+
diff --git a/2 - Interesting n-grams.md b/2 - Interesting n-grams.md
@@ -0,0 +1,148 @@
+
+In[3]:
+
+```
+from nltk.book import *
+```
+
+
+    *** Introductory Examples for the NLTK Book ***
+    Loading text1, ..., text9 and sent1, ..., sent9
+    Type the name of the text or sentence to view it.
+    Type: 'texts()' or 'sents()' to list the materials.
+    text1: Moby Dick by Herman Melville 1851
+    text2:
+
+In[4]:
+
+```
+import nltk
+```
+
+In[5]:
+
+```
+nltk.bigrams(text1[4712:4732])
+```
+
+
+
+
+    [('Call', 'me'),
+     ('me', 'Ishmael'),
+     ('Ishmael', '.'),
+     ('.', 'Some'),
+     ('Some', 'years'),
+     ('years', 'ago'),
+     ('ago', '--'),
+     ('--', 'never'),
+     ('never', 'mind'),
+     ('mind', 'how'),
+     ('how', 'long'),
+     ('long', 'precisely'),
+     ('precisely', '--'),
+     ('--', 'having'),
+     ('having', 'little'),
+     ('little', 'or'),
+     ('or', 'no'),
+     ('no', 'money'),
+     ('money', 'in')]
+
+
+
+In[6]:
+
+```
+text1.collocations()
+```
+
+
+    Building collocations list
+    Sperm Whale; Moby Dick; White Whale; old man; Captain Ahab; sperm
+    whale; Right Whale; Captain Peleg; New Bedford; Cape Horn; cried Ahab;
+    years ago; lower jaw; never mind; Father Mapple; cried Stubb; chief
+    mate; white whale; ivory leg; one hand
+
+
+In[7]:
+
+```
+text3.collocations()
+```
+
+
+    Building collocations list
+    said unto; pray thee; thou shalt; thou hast; thy seed; years old;
+    spake unto; thou art; LORD God; every living; God hath; begat sons;
+    seven years; shalt thou; little ones; living creature; creeping thing;
+    savoury meat; thirty years; every beast
+
+
+In[8]:
+
+```
+text2.collocations()
+```
+
+
+    Building collocations list
+    Colonel Brandon; Sir John; Lady Middleton; Miss Dashwood; every thing;
+    thousand pounds; dare say; Miss Steeles; said Elinor; Miss Steele;
+    every body; John Dashwood; great deal; Harley Street; Berkeley Street;
+    Miss Dashwoods; young man; Combe Magna; every day; next morning
+
+
+In[9]:
+
+```
+text6.collocations()
+```
+
+
+    Building collocations list
+    BLACK KNIGHT; HEAD KNIGHT; Holy Grail; FRENCH GUARD; Sir Robin; Run
+    away; CARTOON CHARACTER; King Arthur; Iesu domine; Pie Iesu; DEAD
+    PERSON; Round Table; OLD MAN; dramatic chord; dona eis; eis requiem;
+    LEFT HEAD; FRENCH GUARDS; music stops; Sir Launcelot
+
+
+In[10]:
+
+```
+# Build a conditional frequency distribution over the trigrams of text6.
+# Each condition is the (first, second) word pair of a trigram; its
+# distribution records which third words followed that pair.
+cfd = nltk.ConditionalFreqDist(
+    ((first, second), third)
+    for first, second, third in nltk.trigrams(text6)
+)
+```
+
+In[11]:
+
+```
+# Generate text greedily: print word1, then num times print word2 and advance
+# the pair to (word2, most likely next word given the pair) -- num + 1 words.
+def generate_words(cfdist, word1, word2, num=30):
+    print word1,  # Python 2 print statement; trailing comma keeps output on one line
+    for i in range(num):
+        print word2,
+        next = cfdist[word1, word2].max()  # NOTE(review): shadows builtin next(); an unseen (word1, word2) pair is not handled -- confirm what .max() does on an empty distribution
+        word1, word2 = word2, next  # slide the two-word context forward by one word
+```
+
+In[12]:
+
+```
+generate_words(cfd, 'KING', 'ARTHUR')
+```
+
+
+    KING ARTHUR : What is your name ? TIM : I ' m not dead ! [ clang ] Bring out your dead ! [ clang ] Bring out your dead
+
+
+In[ ]:
+
+```
+
+```