-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Try using markdown as the default view
- Loading branch information
Rob Speer
committed
Aug 26, 2013
1 parent
2d90d51
commit 639d690
Showing
13 changed files
with
972 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,187 @@ | ||
|
||
In[1]: | ||
|
||
``` | ||
from __future__ import unicode_literals | ||
``` | ||
|
||
In[2]: | ||
|
||
``` | ||
text = '''"Words," you might say, "are things separated by spaces."''' | ||
``` | ||
|
||
In[3]: | ||
|
||
``` | ||
text.split(' ') | ||
``` | ||
|
||
|
||
|
||
|
||
[u'"Words,"', | ||
u'you', | ||
u'might', | ||
u'say,', | ||
u'"are', | ||
u'things', | ||
u'separated', | ||
u'by', | ||
u'spaces."'] | ||
|
||
|
||
|
||
In[4]: | ||
|
||
``` | ||
text2 = '''Okay, words are sequences of letters that don't include punctuation.''' | ||
``` | ||
|
||
In[5]: | ||
|
||
``` | ||
import re | ||
re.findall(r'[A-Za-z]+', text2) | ||
``` | ||
|
||
|
||
|
||
|
||
[u'Okay', | ||
u'words', | ||
u'are', | ||
u'sequences', | ||
u'of', | ||
u'letters', | ||
u'that', | ||
u'don', | ||
u't', | ||
u'include', | ||
u'punctuation'] | ||
|
||
|
||
|
||
In[6]: | ||
|
||
``` | ||
text3 = '''Isn't it naïve to not include the apostrophe?''' | ||
``` | ||
|
||
In[7]: | ||
|
||
``` | ||
re.findall(r"[A-Za-z']+", text3) | ||
``` | ||
|
||
|
||
|
||
|
||
[u"Isn't", | ||
u'it', | ||
u'na', | ||
u've', | ||
u'to', | ||
u'not', | ||
u'include', | ||
u'the', | ||
u'apostrophe'] | ||
|
||
|
||
|
||
In[8]: | ||
|
||
``` | ||
text4 = "Fine, let's use NLTK. That shouldn't be too hard." | ||
``` | ||
|
||
In[9]: | ||
|
||
``` | ||
import nltk | ||
``` | ||
|
||
In[10]: | ||
|
||
``` | ||
[nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text4)] | ||
``` | ||
|
||
|
||
|
||
|
||
[[u'Fine', u',', u'let', u"'s", u'use', u'NLTK', u'.'], | ||
[u'That', u'should', u"n't", u'be', u'too', u'hard', u'.']] | ||
|
||
|
||
|
||
In[11]: | ||
|
||
``` | ||
text = 'この文も、言葉で構成されています' | ||
# Translation: "This sentence is also made of words" | ||
``` | ||
|
||
In[12]: | ||
|
||
``` | ||
for word in nltk.word_tokenize(text): print(word) | ||
``` | ||
|
||
|
||
この文も、言葉で構成されています | ||
|
||
|
||
In[13]: | ||
|
||
``` | ||
from metanl import japanese | ||
``` | ||
|
||
In[14]: | ||
|
||
``` | ||
for word in japanese.normalize_list(text): print(word) | ||
``` | ||
|
||
|
||
文 | ||
言葉 | ||
構成 | ||
|
||
|
||
In[15]: | ||
|
||
``` | ||
text2 = 'You might be wondering whether we can deal with suffixes in English' | ||
``` | ||
|
||
In[16]: | ||
|
||
``` | ||
from metanl import english | ||
``` | ||
|
||
In[17]: | ||
|
||
``` | ||
english.normalize_list(text2) | ||
``` | ||
|
||
|
||
|
||
|
||
[u'you', | ||
u'might', | ||
u'be', | ||
u'wonder', | ||
u'whether', | ||
u'we', | ||
u'can', | ||
u'deal', | ||
u'with', | ||
u'suffix', | ||
u'in', | ||
u'english'] | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
|
||
In[3]: | ||
|
||
``` | ||
from nltk.book import * | ||
``` | ||
|
||
|
||
*** Introductory Examples for the NLTK Book *** | ||
Loading text1, ..., text9 and sent1, ..., sent9 | ||
Type the name of the text or sentence to view it. | ||
Type: 'texts()' or 'sents()' to list the materials. | ||
text1: Moby Dick by Herman Melville 1851 | ||
text2: | ||
|
||
In[4]: | ||
|
||
``` | ||
import nltk | ||
``` | ||
|
||
In[5]: | ||
|
||
``` | ||
# Show the adjacent word pairs in a 20-token slice of text1.
# list() materializes the pairs so they display as a list even on NLTK
# versions where bigrams() returns a lazy generator.
list(nltk.bigrams(text1[4712:4732]))
``` | ||
|
||
|
||
|
||
|
||
[('Call', 'me'), | ||
('me', 'Ishmael'), | ||
('Ishmael', '.'), | ||
('.', 'Some'), | ||
('Some', 'years'), | ||
('years', 'ago'), | ||
('ago', '--'), | ||
('--', 'never'), | ||
('never', 'mind'), | ||
('mind', 'how'), | ||
('how', 'long'), | ||
('long', 'precisely'), | ||
('precisely', '--'), | ||
('--', 'having'), | ||
('having', 'little'), | ||
('little', 'or'), | ||
('or', 'no'), | ||
('no', 'money'), | ||
('money', 'in')] | ||
|
||
|
||
|
||
In[6]: | ||
|
||
``` | ||
text1.collocations() | ||
``` | ||
|
||
|
||
Building collocations list | ||
Sperm Whale; Moby Dick; White Whale; old man; Captain Ahab; sperm | ||
whale; Right Whale; Captain Peleg; New Bedford; Cape Horn; cried Ahab; | ||
years ago; lower jaw; never mind; Father Mapple; cried Stubb; chief | ||
mate; white whale; ivory leg; one hand | ||
|
||
|
||
In[7]: | ||
|
||
``` | ||
text3.collocations() | ||
``` | ||
|
||
|
||
Building collocations list | ||
said unto; pray thee; thou shalt; thou hast; thy seed; years old; | ||
spake unto; thou art; LORD God; every living; God hath; begat sons; | ||
seven years; shalt thou; little ones; living creature; creeping thing; | ||
savoury meat; thirty years; every beast | ||
|
||
|
||
In[8]: | ||
|
||
``` | ||
text2.collocations() | ||
``` | ||
|
||
|
||
Building collocations list | ||
Colonel Brandon; Sir John; Lady Middleton; Miss Dashwood; every thing; | ||
thousand pounds; dare say; Miss Steeles; said Elinor; Miss Steele; | ||
every body; John Dashwood; great deal; Harley Street; Berkeley Street; | ||
Miss Dashwoods; young man; Combe Magna; every day; next morning | ||
|
||
|
||
In[9]: | ||
|
||
``` | ||
text6.collocations() | ||
``` | ||
|
||
|
||
Building collocations list | ||
BLACK KNIGHT; HEAD KNIGHT; Holy Grail; FRENCH GUARD; Sir Robin; Run | ||
away; CARTOON CHARACTER; King Arthur; Iesu domine; Pie Iesu; DEAD | ||
PERSON; Round Table; OLD MAN; dramatic chord; dona eis; eis requiem; | ||
LEFT HEAD; FRENCH GUARDS; music stops; Sir Launcelot | ||
|
||
|
||
In[10]: | ||
|
||
``` | ||
# Build a conditional frequency distribution from the trigrams of text6.
# Each condition is the pair (first word, second word) of a trigram, and
# its distribution records what the third word could be.
cfd = nltk.ConditionalFreqDist(
    ((w1, w2), w3) for (w1, w2, w3) in nltk.trigrams(text6)
)
``` | ||
|
||
In[11]: | ||
|
||
``` | ||
def generate_words(cfdist, word1, word2, num=30):
    """Print text generated by repeatedly adding the most likely next word.

    Starting from the pair (word1, word2), look up the entry of `cfdist`
    for the current pair, take the value returned by its `.max()` method
    as the next word, and slide the pair forward by one word.

    cfdist -- mapping indexed by a (word, word) pair whose values provide
              a `.max()` method (e.g. a trigram ConditionalFreqDist)
    word1, word2 -- the two string tokens that seed the generated text
    num -- maximum number of words to emit after word1 (default 30)

    The words are printed on a single line, separated by spaces.
    Generation stops early if the current pair has no recorded
    continuation, instead of failing on an empty distribution.
    """
    words = [word1]
    for _ in range(num):
        words.append(word2)
        followers = cfdist[word1, word2]
        if not followers:
            # Nothing was ever seen after this pair, so there is no
            # most-likely next word to pick.
            break
        # Avoid binding the result to `next`, which would shadow the builtin.
        word1, word2 = word2, followers.max()
    # A single parenthesized argument prints the same way under the
    # Python 2 print statement and the Python 3 print function.
    print(' '.join(words))
``` | ||
|
||
In[12]: | ||
|
||
``` | ||
generate_words(cfd, 'KING', 'ARTHUR') | ||
``` | ||
|
||
|
||
KING ARTHUR : What is your name ? TIM : I ' m not dead ! [ clang ] Bring out your dead ! [ clang ] Bring out your dead | ||
|
||
|
||
In[ ]: | ||
|
||
``` | ||
``` |
Oops, something went wrong.