{
"metadata": {
"name": "",
"signature": "sha256:976b053b3fc9365d88c7eb49e0505472456f537c77473ecef8011caaf34121f2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.1 Supervised Classification"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Classification is the task of choosing the correct class label for a given input.<BR>\n",
"A classifier is called supervised if it is built based on training corpora containing the correct label for each input."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"http://www.nltk.org/images/supervised-classification.png\" width=\"700\"><BR>\n",
"(Figure 6-1) Supervised Classification"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Gender Identification"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"import sys\n",
"\n",
"\n",
"# Build the input data: (name, gender) pairs.\n",
"names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"[(name,'female') for name in name2gender.words('female.txt')])\n",
"random.shuffle(names)\n",
"\n",
"print('len(names):', len(names))\n",
"pprint(names[:10])\n",
"print()\n",
"\n",
"# Define the feature extractor: (name) -> (last letter).\n",
"def gender_features(word):\n",
"    return {'last_letter': word[-1]}\n",
"\n",
"print(\"gender_features('Shrek'):\", gender_features('Shrek'))\n",
"print(\"names ended with 'k':\")\n",
"pprint([(name, gender) for (name,gender) in names if gender_features(name)['last_letter']=='k'][:10])\n",
"print()\n",
"\n",
"# Build the training and test sets: (features, gender).\n",
"featuresets = [(gender_features(name), gender) for (name,gender) in names]\n",
"train_set = featuresets[500:]\n",
"test_set = featuresets[:500] # hold out 500 examples for the test set\n",
"print('len(train_set):', len(train_set))\n",
"print('len(test_set):', len(test_set))\n",
"pprint(test_set[:10])\n",
"print()\n",
"\n",
"# Train the classifier.\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"print()\n",
"\n",
"# Test the classifier.\n",
"print(\"classifier.classify(gender_features('Neo')):\", classifier.classify(gender_features('Neo')))\n",
"print(\"classifier.classify(gender_features('Trinity')):\", classifier.classify(gender_features('Trinity')))\n",
"print(\"classifier.classify(gender_features('Tony')):\", classifier.classify(gender_features('Tony')))\n",
"print()\n",
"\n",
"# Evaluate the classifier.\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set)) # accuracy on the test set\n",
"print()\n",
"\n",
"classifier.show_most_informative_features(10) # ranked by likelihood ratio\n",
"print()\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(names): 7944\n",
"[(u'Hynda', u'female'),\n",
" (u'Jabez', u'male'),\n",
" (u'Tracy', u'female'),\n",
" (u'Isahella', u'female'),\n",
" (u'Wayland', u'male'),\n",
" (u'Vally', u'female'),\n",
" (u'Dee Dee', u'female'),\n",
" (u'Anastassia', u'female'),\n",
" (u'Sophie', u'female'),\n",
" (u'Mariele', u'female')]\n",
"\n",
"gender_features('Shrek'): {u'last_letter': u'k'}\n",
"names ended with 'k':\n",
"[(u'Izak', u'male'),\n",
" (u'Vick', u'male'),\n",
" (u'Dirk', u'male'),\n",
" (u'Merrick', u'male'),\n",
" (u'Roderick', u'male'),\n",
" (u'Tuck', u'male'),\n",
" (u'Erik', u'male'),\n",
" (u'Ulrick', u'male'),\n",
" (u'Kirk', u'male'),\n",
" (u'Jack', u'male')]\n",
"\n",
"len(train_set): 7444\n",
"len(test_set): 500\n",
"[({u'last_letter': u'a'}, u'female'),\n",
" ({u'last_letter': u'z'}, u'male'),\n",
" ({u'last_letter': u'y'}, u'female'),\n",
" ({u'last_letter': u'a'}, u'female'),\n",
" ({u'last_letter': u'd'}, u'male'),\n",
" ({u'last_letter': u'y'}, u'female'),\n",
" ({u'last_letter': u'e'}, u'female'),\n",
" ({u'last_letter': u'a'}, u'female'),\n",
" ({u'last_letter': u'e'}, u'female'),\n",
" ({u'last_letter': u'e'}, u'female')]\n",
"\n",
"\n",
"classifier.classify(gender_features('Neo')): male\n",
"classifier.classify(gender_features('Trinity')): female\n",
"classifier.classify(gender_features('Tony')): female\n",
"\n",
"accuracy: 0.756\n",
"\n",
"Most Informative Features\n",
" last_letter = u'a' female : male = 36.1 : 1.0\n",
" last_letter = u'k' male : female = 32.1 : 1.0\n",
" last_letter = u'f' male : female = 16.6 : 1.0\n",
" last_letter = u'p' male : female = 12.5 : 1.0\n",
" last_letter = u'v' male : female = 11.2 : 1.0\n",
" last_letter = u'd' male : female = 9.6 : 1.0\n",
" last_letter = u'm' male : female = 9.0 : 1.0\n",
" last_letter = u'o' male : female = 8.6 : 1.0\n",
" last_letter = u'w' male : female = 7.5 : 1.0\n",
" last_letter = u'r' male : female = 6.6 : 1.0\n",
"\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (Tip) Use LazyMap to minimize memory usage.\n",
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"from nltk.classify import apply_features \n",
"import sys\n",
"\n",
"\n",
"# Build the input data: (name, gender) pairs. (already built above)\n",
"# names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"# [(name,'female') for name in name2gender.words('female.txt')])\n",
"# random.shuffle(names)\n",
"\n",
"print('len(names):', len(names))\n",
"pprint(names[:10])\n",
"print()\n",
"\n",
"# Define the feature extractor: (name) -> (last letter).\n",
"def gender_features(word):\n",
"    return {'last_letter': word[-1]}\n",
"\n",
"print(\"gender_features('Shrek'):\", gender_features('Shrek'))\n",
"pprint([(name, gender) for (name,gender) in names if gender_features(name)['last_letter']=='k'][:10])\n",
"print()\n",
"\n",
"\n",
"%reload_ext memory_profiler\n",
"\n",
"# Build the training set as a plain list. (fine when the dataset is small)\n",
"%memit train_set = [(gender_features(name), gender) for (name,gender) in names][500:]\n",
"print(\"train_set:\", type(train_set), sys.getsizeof(train_set), 'bytes')\n",
"%memit classifier=nltk.NaiveBayesClassifier.train(train_set)\n",
"print()\n",
"print()\n",
"\n",
"# Build the training set as a LazyMap. (better when the dataset is large)\n",
"%memit train_set2 = apply_features(gender_features, names[500:])\n",
"print(\"train_set2:\", type(train_set2), sys.getsizeof(train_set2), 'bytes')\n",
"%memit classifier=nltk.NaiveBayesClassifier.train(train_set2)\n",
"print()\n",
"print()\n",
"\n",
"\n",
"\n",
"# List the names that end with a given letter. (not in the book)\n",
"def list_from_last_letter(names, letter): \n",
"    li = []\n",
"    for name, gender in names:\n",
"        if name.endswith(letter):\n",
"            li.append((name, gender))\n",
"    return li\n",
"\n",
"print(\"ends with 'k'\")\n",
"pprint(list_from_last_letter(names, 'k')[:10])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(names): 7944\n",
"[(u'Hynda', u'female'),\n",
" (u'Jabez', u'male'),\n",
" (u'Tracy', u'female'),\n",
" (u'Isahella', u'female'),\n",
" (u'Wayland', u'male'),\n",
" (u'Vally', u'female'),\n",
" (u'Dee Dee', u'female'),\n",
" (u'Anastassia', u'female'),\n",
" (u'Sophie', u'female'),\n",
" (u'Mariele', u'female')]\n",
"\n",
"gender_features('Shrek'): {u'last_letter': u'k'}\n",
"[(u'Izak', u'male'),\n",
" (u'Vick', u'male'),\n",
" (u'Dirk', u'male'),\n",
" (u'Merrick', u'male'),\n",
" (u'Roderick', u'male'),\n",
" (u'Tuck', u'male'),\n",
" (u'Erik', u'male'),\n",
" (u'Ulrick', u'male'),\n",
" (u'Kirk', u'male'),\n",
" (u'Jack', u'male')]\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.50 MiB, increment: 5.88 MiB\n",
"train_set: <type 'list'> 59624 bytes\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.51 MiB, increment: 0.00 MiB\n",
"\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.51 MiB, increment: 0.00 MiB\n",
"train_set2: <class 'nltk.util.LazyMap'> 64 bytes\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.51 MiB, increment: 0.00 MiB\n",
"\n",
"\n",
"ends with 'k'\n",
"[(u'Izak', u'male'),\n",
" (u'Vick', u'male'),\n",
" (u'Dirk', u'male'),\n",
" (u'Merrick', u'male'),\n",
" (u'Roderick', u'male'),\n",
" (u'Tuck', u'male'),\n",
" (u'Erik', u'male'),\n",
" (u'Ulrick', u'male'),\n",
" (u'Kirk', u'male'),\n",
" (u'Jack', u'male')]\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Choosing the Right Features"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"import sys\n",
"\n",
"\n",
"# Build the input data: (name, gender) pairs. (already built above)\n",
"# names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"# [(name,'female') for name in name2gender.words('female.txt')])\n",
"# random.shuffle(names)\n",
"\n",
"print('len(names):', len(names))\n",
"pprint(names[:10])\n",
"print()\n",
"\n",
"# Define the feature extractor: (name) -> (first/last letter, per-letter counts, presence flags).\n",
"def gender_features2(name):\n",
"    features = {}\n",
"    features['firstletter'] = name[0].lower()\n",
"    features['lastletter'] = name[-1].lower()\n",
"    for letter in 'abcdefghijklmnopqrstuvwxyz':\n",
"        features['count(%s)' % letter] = name.lower().count(letter)\n",
"        features['has(%s)' % letter] = (letter in name.lower())\n",
"    return features\n",
"\n",
"print(\"gender_features2('Shrek'):\")\n",
"pprint(gender_features2('Shrek'))\n",
"print()\n",
"\n",
"# Build the training and test sets: (features, gender).\n",
"featuresets = [(gender_features2(name), gender) for (name, gender) in names]\n",
"train_set = featuresets[500:]\n",
"test_set = featuresets[:500]\n",
"\n",
"# Train the classifier.\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# Evaluate the classifier.\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set))\n",
"classifier.show_most_informative_features(100)\n",
"print()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(names): 7944\n",
"[(u'Hynda', u'female'),\n",
" (u'Jabez', u'male'),\n",
" (u'Tracy', u'female'),\n",
" (u'Isahella', u'female'),\n",
" (u'Wayland', u'male'),\n",
" (u'Vally', u'female'),\n",
" (u'Dee Dee', u'female'),\n",
" (u'Anastassia', u'female'),\n",
" (u'Sophie', u'female'),\n",
" (u'Mariele', u'female')]\n",
"\n",
"gender_features2('Shrek'):\n",
"{u'count(a)': 0,\n",
" u'count(b)': 0,\n",
" u'count(c)': 0,\n",
" u'count(d)': 0,\n",
" u'count(e)': 1,\n",
" u'count(f)': 0,\n",
" u'count(g)': 0,\n",
" u'count(h)': 1,\n",
" u'count(i)': 0,\n",
" u'count(j)': 0,\n",
" u'count(k)': 1,\n",
" u'count(l)': 0,\n",
" u'count(m)': 0,\n",
" u'count(n)': 0,\n",
" u'count(o)': 0,\n",
" u'count(p)': 0,\n",
" u'count(q)': 0,\n",
" u'count(r)': 1,\n",
" u'count(s)': 1,\n",
" u'count(t)': 0,\n",
" u'count(u)': 0,\n",
" u'count(v)': 0,\n",
" u'count(w)': 0,\n",
" u'count(x)': 0,\n",
" u'count(y)': 0,\n",
" u'count(z)': 0,\n",
" u'firstletter': u's',\n",
" u'has(a)': False,\n",
" u'has(b)': False,\n",
" u'has(c)': False,\n",
" u'has(d)': False,\n",
" u'has(e)': True,\n",
" u'has(f)': False,\n",
" u'has(g)': False,\n",
" u'has(h)': True,\n",
" u'has(i)': False,\n",
" u'has(j)': False,\n",
" u'has(k)': True,\n",
" u'has(l)': False,\n",
" u'has(m)': False,\n",
" u'has(n)': False,\n",
" u'has(o)': False,\n",
" u'has(p)': False,\n",
" u'has(q)': False,\n",
" u'has(r)': True,\n",
" u'has(s)': True,\n",
" u'has(t)': False,\n",
" u'has(u)': False,\n",
" u'has(v)': False,\n",
" u'has(w)': False,\n",
" u'has(x)': False,\n",
" u'has(y)': False,\n",
" u'has(z)': False,\n",
" u'lastletter': u'k'}\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.778\n",
"Most Informative Features\n",
" lastletter = u'a' female : male = 36.1 : 1.0\n",
" lastletter = u'k' male : female = 32.1 : 1.0\n",
" lastletter = u'f' male : female = 16.6 : 1.0\n",
" lastletter = u'p' male : female = 12.5 : 1.0\n",
" lastletter = u'v' male : female = 11.2 : 1.0\n",
" lastletter = u'd' male : female = 9.6 : 1.0\n",
" lastletter = u'm' male : female = 9.0 : 1.0\n",
" count(v) = 2 female : male = 8.9 : 1.0\n",
" lastletter = u'o' male : female = 8.6 : 1.0\n",
" lastletter = u'w' male : female = 7.5 : 1.0\n",
" lastletter = u'r' male : female = 6.6 : 1.0\n",
" lastletter = u'g' male : female = 4.9 : 1.0\n",
" firstletter = u'w' male : female = 4.7 : 1.0\n",
" count(a) = 3 female : male = 4.5 : 1.0\n",
" count(w) = 1 male : female = 4.5 : 1.0\n",
" has(w) = True male : female = 4.5 : 1.0\n",
" lastletter = u't' male : female = 4.4 : 1.0\n",
" lastletter = u'b' male : female = 4.3 : 1.0\n",
" lastletter = u's' male : female = 4.0 : 1.0\n",
" lastletter = u'j' male : female = 3.9 : 1.0\n",
" lastletter = u'i' female : male = 3.7 : 1.0\n",
" lastletter = u'z' male : female = 3.6 : 1.0\n",
" count(o) = 2 male : female = 3.5 : 1.0\n",
" count(f) = 2 male : female = 3.3 : 1.0\n",
" firstletter = u'u' male : female = 3.3 : 1.0\n",
" count(e) = 3 female : male = 3.2 : 1.0\n",
" count(a) = 2 female : male = 3.1 : 1.0\n",
" count(w) = 2 male : female = 3.0 : 1.0\n",
" lastletter = u'u' male : female = 3.0 : 1.0\n",
" count(d) = 3 male : female = 2.8 : 1.0\n",
" count(i) = 3 male : female = 2.7 : 1.0\n",
" count(l) = 3 female : male = 2.6 : 1.0\n",
" count(u) = 2 male : female = 2.6 : 1.0\n",
" firstletter = u'q' male : female = 2.6 : 1.0\n",
" count(p) = 3 female : male = 2.6 : 1.0\n",
" count(y) = 2 female : male = 2.4 : 1.0\n",
" count(m) = 3 male : female = 2.4 : 1.0\n",
" firstletter = u'k' female : male = 2.3 : 1.0\n",
" firstletter = u'h' male : female = 2.2 : 1.0\n",
" count(h) = 2 male : female = 2.1 : 1.0\n",
" lastletter = u'n' male : female = 2.1 : 1.0\n",
" count(p) = 2 male : female = 2.0 : 1.0\n",
" count(k) = 2 female : male = 2.0 : 1.0\n",
" firstletter = u'x' male : female = 2.0 : 1.0\n",
" count(r) = 2 male : female = 1.9 : 1.0\n",
" count(i) = 2 female : male = 1.9 : 1.0\n",
" count(d) = 2 male : female = 1.9 : 1.0\n",
" lastletter = u'x' male : female = 1.9 : 1.0\n",
" count(n) = 3 female : male = 1.8 : 1.0\n",
" lastletter = u'l' male : female = 1.8 : 1.0\n",
" firstletter = u'z' male : female = 1.8 : 1.0\n",
" has(u) = True male : female = 1.8 : 1.0\n",
" count(u) = 1 male : female = 1.8 : 1.0\n",
" lastletter = u'e' female : male = 1.8 : 1.0\n",
" count(t) = 3 female : male = 1.8 : 1.0\n",
" firstletter = u'l' female : male = 1.8 : 1.0\n",
" count(p) = 1 male : female = 1.8 : 1.0\n",
" has(f) = True male : female = 1.7 : 1.0\n",
" has(p) = True male : female = 1.7 : 1.0\n",
" count(e) = 2 female : male = 1.7 : 1.0\n",
" count(n) = 2 female : male = 1.7 : 1.0\n",
" count(t) = 2 female : male = 1.7 : 1.0\n",
" count(h) = 3 male : female = 1.7 : 1.0\n",
" firstletter = u'c' female : male = 1.7 : 1.0\n",
" firstletter = u't' male : female = 1.6 : 1.0\n",
" count(l) = 2 female : male = 1.6 : 1.0\n",
" firstletter = u'y' male : female = 1.6 : 1.0\n",
" count(f) = 1 male : female = 1.6 : 1.0\n",
" has(a) = False male : female = 1.6 : 1.0\n",
" count(a) = 0 male : female = 1.6 : 1.0\n",
" has(o) = True male : female = 1.5 : 1.0\n",
" count(b) = 2 female : male = 1.5 : 1.0\n",
" lastletter = u'h' male : female = 1.5 : 1.0\n",
" count(v) = 1 male : female = 1.5 : 1.0\n",
" count(g) = 1 male : female = 1.5 : 1.0\n",
" has(g) = True male : female = 1.5 : 1.0\n",
" count(z) = 1 male : female = 1.4 : 1.0\n",
" firstletter = u'o' male : female = 1.4 : 1.0\n",
" count(m) = 2 male : female = 1.4 : 1.0\n",
" count(o) = 1 male : female = 1.4 : 1.0\n",
" firstletter = u'r' male : female = 1.4 : 1.0\n",
" firstletter = u'm' female : male = 1.4 : 1.0\n",
" has(z) = True male : female = 1.4 : 1.0\n",
" count(b) = 3 male : female = 1.4 : 1.0\n",
" has(i) = True female : male = 1.4 : 1.0\n",
" has(h) = True male : female = 1.4 : 1.0\n",
" firstletter = u'p' male : female = 1.4 : 1.0\n",
" count(s) = 2 female : male = 1.4 : 1.0\n",
" has(v) = True male : female = 1.4 : 1.0\n",
" has(a) = True female : male = 1.4 : 1.0\n",
" count(h) = 1 male : female = 1.3 : 1.0\n",
" count(i) = 1 female : male = 1.3 : 1.0\n",
" has(i) = False male : female = 1.3 : 1.0\n",
" count(i) = 0 male : female = 1.3 : 1.0\n",
" has(x) = True male : female = 1.3 : 1.0\n",
" count(t) = 1 male : female = 1.3 : 1.0\n",
" has(l) = True female : male = 1.3 : 1.0\n",
" count(s) = 1 male : female = 1.3 : 1.0\n",
" count(b) = 1 male : female = 1.3 : 1.0\n",
" firstletter = u's' male : female = 1.3 : 1.0\n",
"\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"If there are many features and the training set is small, overfitting\n",
"can occur, so be careful; comparing training-set accuracy with held-out accuracy, as sketched below, is a quick check."
]
},
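{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (not in the book) of how overfitting shows up: train once with the single-feature gender_features and once with the much larger gender_features2 feature set (both defined in the cells above), then compare accuracy on the training data against accuracy on the held-out test data. A noticeably larger train/test gap for the richer feature set is the overfitting signal.\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch: compare train vs. held-out accuracy for a small and a large feature set.\n",
"for extractor in (gender_features, gender_features2):\n",
"    featuresets = [(extractor(n), g) for (n, g) in names]\n",
"    train_set, test_set = featuresets[500:], featuresets[:500]\n",
"    classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"    print(extractor.__name__,\n",
"          'train accuracy:', nltk.classify.accuracy(classifier, train_set),\n",
"          'test accuracy:', nltk.classify.accuracy(classifier, test_set))"
],
"language": "python",
"metadata": {},
"outputs": []
},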
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Error Analysis"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"For error analysis, the training data is split into a training set and a dev-test set.<BR>\n",
"<img src=\"http://www.nltk.org/images/corpus-org.png\" width=\"700\">"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Prepare several dev-test sets and run a separate error analysis on each one; a small sketch of rotating splits follows."
]
},
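{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sketch (not in the book) of rotating dev-test sets: carve several 500-name folds out of names, train on the remaining names each time, and compare the error counts. It assumes the names list and the gender_features extractor from the cells above; the book's single train/dev-test split follows in the next cell.\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch: run the same error analysis over several rotating dev-test folds.\n",
"fold_size = 500\n",
"for k in range(3): # three different dev-test sets\n",
"    devtest_names = names[k*fold_size:(k+1)*fold_size]\n",
"    train_names = names[:k*fold_size] + names[(k+1)*fold_size:]\n",
"    train_set = [(gender_features(n), g) for (n, g) in train_names]\n",
"    classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"    errors = []\n",
"    for (n, g) in devtest_names:\n",
"        guess = classifier.classify(gender_features(n))\n",
"        if guess != g:\n",
"            errors.append((g, guess, n))\n",
"    print('fold %d: %d errors out of %d' % (k, len(errors), fold_size))"
],
"language": "python",
"metadata": {},
"outputs": []
},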
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"import sys\n",
"\n",
"\n",
"# Build the input data: (name, gender) pairs. (already built above)\n",
"# names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"# [(name,'female') for name in name2gender.words('female.txt')])\n",
"# random.shuffle(names)\n",
"\n",
"# Define the feature extractor: (name) -> (first/last letter, per-letter counts, presence flags).\n",
"def gender_features2(name):\n",
"    features = {}\n",
"    features['firstletter'] = name[0].lower()\n",
"    features['lastletter'] = name[-1].lower()\n",
"    for letter in 'abcdefghijklmnopqrstuvwxyz':\n",
"        features['count(%s)' % letter] = name.lower().count(letter)\n",
"        features['has(%s)' % letter] = (letter in name.lower())\n",
"    return features\n",
"\n",
"# Split the data for error analysis.\n",
"train_names = names[1500:] # training set\n",
"devtest_names = names[500:1500] # dev-test set of 1,000 names\n",
"# test_names = names[:500] # final test set of 500 names\n",
"\n",
"# Build the training and dev-test sets: (features, gender).\n",
"train_set = [(gender_features2(n), g) for (n,g) in train_names]\n",
"devtest_set = [(gender_features2(n), g) for (n,g) in devtest_names]\n",
"# test_set = [(gender_features2(n), g) for (n,g) in test_names]\n",
"\n",
"# Train the classifier.\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# Record the errors. (use the same feature extractor the classifier was trained with)\n",
"errors = []\n",
"for (name, tag) in devtest_names:\n",
"    guess = classifier.classify(gender_features2(name))\n",
"    if guess != tag:\n",
"        errors.append((tag, guess, name))\n",
"\n",
"# Error analysis.\n",
"print(\"error analysis (names ending with 'n')\")\n",
"for (tag, guess, name) in sorted(errors):\n",
"    if name.endswith('n'):\n",
"        print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)) # correct label, classifier's guess, input name"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"error analysis (names ending with 'n')\n",
"correct=male guess=female name=Alwin \n",
"correct=male guess=female name=Archon \n",
"correct=male guess=female name=Baron \n",
"correct=male guess=female name=Bjorn \n",
"correct=male guess=female name=Brandon \n",
"correct=male guess=female name=Clinton \n",
"correct=male guess=female name=Coleman \n",
"correct=male guess=female name=Darien \n",
"correct=male guess=female name=Darin \n",
"correct=male guess=female name=Darren \n",
"correct=male guess=female name=Donn \n",
"correct=male guess=female name=Elton \n",
"correct=male guess=female name=Erwin \n",
"correct=male guess=female name=Evan \n",
"correct=male guess=female name=Fabian \n",
"correct=male guess=female name=Ferguson \n",
"correct=male guess=female name=Gideon \n",
"correct=male guess=female name=Gretchen \n",
"correct=male guess=female name=Hanan \n",
"correct=male guess=female name=Hassan \n",
"correct=male guess=female name=Huntington \n",
"correct=male guess=female name=Juan \n",
"correct=male guess=female name=Ken \n",
"correct=male guess=female name=Lynn \n",
"correct=male guess=female name=Milton \n",
"correct=male guess=female name=Muffin \n",
"correct=male guess=female name=Nathan \n",
"correct=male guess=female name=Oran \n",
"correct=male guess=female name=Orion \n",
"correct=male guess=female name=Orrin \n",
"correct=male guess=female name=Patin \n",
"correct=male guess=female name=Quintin \n",
"correct=male guess=female name=Ramon \n",
"correct=male guess=female name=Reagan \n",
"correct=male guess=female name=Reuben \n",
"correct=male guess=female name=Reuven \n",
"correct=male guess=female name=Robin \n",
"correct=male guess=female name=Ron \n",
"correct=male guess=female name=Ronen \n",
"correct=male guess=female name=Shaughn \n",
"correct=male guess=female name=Shawn \n",
"correct=male guess=female name=Shimon \n",
"correct=male guess=female name=Simeon \n",
"correct=male guess=female name=Simon \n",
"correct=male guess=female name=Stan \n",
"correct=male guess=female name=Tarzan \n",
"correct=male guess=female name=Tedman \n",
"correct=male guess=female name=Torin \n",
"correct=male guess=female name=Trenton \n",
"correct=male guess=female name=Tristan \n",
"correct=male guess=female name=Tyson \n",
"correct=male guess=female name=Vaughan \n",
"correct=male guess=female name=Vernon \n",
"correct=male guess=female name=Washington \n",
"correct=male guess=female name=Waylen \n",
"correct=male guess=female name=Waylon \n",
"correct=male guess=female name=Weston \n",
"correct=male guess=female name=Weylin \n",
"correct=male guess=female name=Wilburn \n",
"correct=male guess=female name=Wyatan \n",
"correct=male guess=female name=Wynn \n",
"correct=male guess=female name=Zebulen \n",
"correct=male guess=female name=Zebulon \n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Document Classification"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Document classification (sentiment analysis of movie reviews)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"from nltk.corpus import movie_reviews\n",
"import random\n",
"import nltk\n",
"\n",
"\n",
"print(\"movie_reviews.categories():\", movie_reviews.categories()) # the categories are pos and neg\n",
"print(\"movie_reviews.fileids('pos'):\", movie_reviews.fileids('pos')[:10], \"...\")\n",
"print()\n",
"\n",
"# Build the input data: (document, pos/neg).\n",
"documents = [(list(movie_reviews.words(fileid)), category)\n",
"             for category in movie_reviews.categories()\n",
"             for fileid in movie_reviews.fileids(category)]\n",
"# random.shuffle(documents)\n",
"\n",
"print(\"documents[0]:\", documents[0][0][:10], \"...\", documents[0][1])\n",
"print()\n",
"\n",
"all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) # words in the corpus, ordered by frequency\n",
"print(\"len(all_words):\", len(all_words))\n",
"\n",
"print(\"nltk.__version__:\", nltk.__version__)\n",
"if nltk.__version__.startswith('3.'):\n",
"    word_features = [k for (k,v) in all_words.most_common(2000)] # the 2,000 most frequent words (nltk 3.x)\n",
"else:\n",
"    word_features = all_words.keys()[:2000] # the 2,000 most frequent words (nltk 2.x)\n",
"\n",
"print(\"word_features:\", word_features[:10], \"...\")\n",
"\n",
"# Define the feature extractor: (document) -> (word presence flags).\n",
"def document_features(document): \n",
"    document_words = set(document)\n",
"    features = {}\n",
"    for word in word_features:\n",
"        features['contains(%s)' % word] = (word in document_words) # membership tests on a set are faster than on a list (see Chapter 4)\n",
"    return features\n",
"pprint(document_features(movie_reviews.words('pos/cv957_8737.txt')).items()[:10])\n",
"print()\n",
"\n",
"# Build the training and test sets: (features, pos/neg).\n",
"featuresets = [(document_features(doc), category) for (doc, category) in documents]\n",
"train_set, test_set = featuresets[100:], featuresets[:100]\n",
"print(\"featuresets[0]:\", featuresets[0][0].items()[:20], \"...\", featuresets[0][1])\n",
"print()\n",
"\n",
"# Train the classifier.\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# Evaluate the classifier.\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set))\n",
"print(classifier.show_most_informative_features(5))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"movie_reviews.categories(): [u'neg', u'pos']\n",
"movie_reviews.fileids('pos'): [u'pos/cv000_29590.txt', u'pos/cv001_18431.txt', u'pos/cv002_15918.txt', u'pos/cv003_11664.txt', u'pos/cv004_11636.txt', u'pos/cv005_29443.txt', u'pos/cv006_15448.txt', u'pos/cv007_4968.txt', u'pos/cv008_29435.txt', u'pos/cv009_29592.txt'] ...\n",
"\n",
"documents[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" [u'plot', u':', u'two', u'teen', u'couples', u'go', u'to', u'a', u'church', u'party'] ... neg\n",
"\n",
"len(all_words):"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 39768\n",
"nltk.__version__: 3.0.1\n",
"word_features: [u',', u'the', u'.', u'a', u'and', u'of', u'to', u\"'\", u'is', u'in'] ...\n",
"[(u'contains(waste)', False),\n",
" (u'contains(lot)', False),\n",
" (u'contains(*)', True),\n",
" (u'contains(black)', False),\n",
" (u'contains(rated)', False),\n",
" (u'contains(potential)', False),\n",
" (u'contains(m)', False),\n",
" (u'contains(understand)', False),\n",
" (u'contains(drug)', True),\n",
" (u'contains(case)', False)]\n",
"\n",
"featuresets[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" [(u'contains(waste)', False), (u'contains(lot)', False), (u'contains(*)', False), (u'contains(black)', False), (u'contains(rated)', False), (u'contains(potential)', False), (u'contains(m)', False), (u'contains(understand)', False), (u'contains(drug)', False), (u'contains(case)', False), (u'contains(created)', False), (u'contains(kiss)', False), (u'contains(needed)', False), (u'contains(c)', False), (u'contains(about)', True), (u'contains(toy)', False), (u'contains(longer)', False), (u'contains(ready)', False), (u'contains(certainly)', False), (u'contains(lame)', False)] ... neg\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.86\n",
"Most Informative Features\n",
" contains(outstanding) = True pos : neg = 10.4 : 1.0\n",
" contains(seagal) = True neg : pos = 8.7 : 1.0\n",
" contains(mulan) = True pos : neg = 8.1 : 1.0\n",
" contains(wonderfully) = True pos : neg = 6.3 : 1.0\n",
" contains(damon) = True pos : neg = 5.7 : 1.0\n",
"None\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import gc; gc.collect() # release memory."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"0"
]
}
],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Document classification (sentiment analysis of movie reviews) #2 (not in the book): what happens if we extract features only from names (actors) instead of from all words?"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import movie_reviews\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"\n",
"# Build the input data: (document, pos/neg). (already built above)\n",
"# documents = [(list(movie_reviews.words(fileid)), category)\n",
"#              for category in movie_reviews.categories()\n",
"#              for fileid in movie_reviews.fileids(category)]\n",
"# random.shuffle(documents)\n",
"\n",
"all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())\n",
"# print(\"len(all_words):\", len(all_words))\n",
"\n",
"_names = set([name.lower() for name in name2gender.words('male.txt')] + \\\n",
"[name.lower() for name in name2gender.words('female.txt')]) # the list of known names\n",
" \n",
"if nltk.__version__.startswith('3.'): \n",
"    actor_names = [name.lower() for (name,v) in all_words.most_common() if name in _names] # names that occur in the movie reviews\n",
"else:\n",
"    actor_names = [name.lower() for name in all_words.keys() if name in _names] # names that occur in the movie reviews\n",
" \n",
"actor_names = actor_names[:2000] # limit to 2,000 name features, matching the earlier experiment\n",
"print(\"len(actor_names):\", len(actor_names), actor_names[:100], \"...\")\n",
"print('jolie in actor_names:', 'jolie' in actor_names)\n",
"print()\n",
"\n",
"# Define the feature extractor: (document) -> (actor-name presence flags).\n",
"def document_features2(document): \n",
"    document_words = set(document)\n",
"    features = {}\n",
"    for word in actor_names:\n",
"        features['contains(%s)' % word] = (word in document_words) # membership tests on a set are faster than on a list (see Chapter 4)\n",
"    return features\n",
"\n",
"# Build the training and test sets: (features, pos/neg).\n",
"featuresets = [(document_features2(doc), category) for (doc, category) in documents]\n",
"train_set, test_set = featuresets[100:], featuresets[:100]\n",
"print(\"featuresets[0]:\", featuresets[0][0].items()[:20], \"...\", featuresets[0][1])\n",
"print()\n",
"\n",
"# Train the classifier.\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# Evaluate the classifier.\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set))\n",
"print(classifier.show_most_informative_features(5))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(actor_names): 2000 [u'will', u'see', u'way', u'don', u'love', u'may', u'say', u'john', u'star', u'guy', u'job', u'james', u'case', u'michael', u'town', u'david', u'else', u'son', u'kevin', u'joe', u'worth', u'jack', u'major', u'robert', u'jackie', u'tom', u'lee', u'peter', u'hope', u'king', u'oscar', u'saw', u'paul', u'van', u'smith', u'george', u'chance', u'chris', u'happy', u'art', u'robin', u'ryan', u'william', u'ben', u'red', u'rock', u'rich', u'jim', u'harry', u'bob', u'bill', u'sam', u'martin', u'murphy', u'mark', u'scott', u'truman', u'cameron', u'bruce', u'frank', u'richard', u'chase', u'carter', u'fan', u'haven', u'allen', u'tim', u'west', u'park', u'steve', u'eddie', u'chan', u'max', u'woody', u'wait', u'simon', u'mary', u'steven', u'nick', u'willis', u'grace', u'mike', u'carry', u'sean', u'french', u'jerry', u'jackson', u'tarzan', u'pace', u'trip', u'billy', u'julia', u'la', u'christopher', u'matthew', u'danny', u'win', u'fox', u'julie', u'jennifer'] ...\n",
"jolie in actor_names: True\n",
"\n",
"featuresets[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" [(u'contains(andre)', False), (u'contains(malka)', False), (u'contains(torry)', False), (u'contains(terence)', False), (u'contains(shaw)', False), (u'contains(lex)', False), (u'contains(philippe)', False), (u'contains(jolie)', False), (u'contains(rea)', False), (u'contains(petra)', False), (u'contains(di)', False), (u'contains(nanni)', False), (u'contains(case)', False), (u'contains(laure)', False), (u'contains(lydia)', False), (u'contains(rick)', False), (u'contains(mathilda)', False), (u'contains(kelsey)', False), (u'contains(chip)', False), (u'contains(leila)', False)] ... neg\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.65\n",
"Most Informative Features\n",
" contains(hudson) = True neg : pos = 10.7 : 1.0\n",
" contains(elliot) = True pos : neg = 9.9 : 1.0\n",
" contains(ivy) = True neg : pos = 7.8 : 1.0\n",
" contains(terri) = True neg : pos = 7.8 : 1.0\n",
" contains(hugo) = True pos : neg = 6.9 : 1.0\n",
"None\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import gc; gc.collect() # release memory."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"0"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Part-of-Speech Tagging"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"brown corpus pos tags: http://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import brown\n",
"\n",
"\n",
"# Define the feature extractor: (word) -> (suffix presence flags).\n",
"suffix_fdist = nltk.FreqDist()\n",
"print(\"len(brown.words()):\", len(brown.words()))\n",
"for word in brown.words()[:100000]: # use only part of the data; the full corpus takes too much memory and time\n",
"    word = word.lower()\n",
"    suffix_fdist[word[-1:]] += 1\n",
"    suffix_fdist[word[-2:]] += 1\n",
"    suffix_fdist[word[-3:]] += 1\n",
"print(\"nltk.__version__:\", nltk.__version__)\n",
"if nltk.__version__.startswith('3.'): \n",
"    common_suffixes = [k for (k,v) in suffix_fdist.most_common(100)] # the 100 most common suffixes (nltk 3.x)\n",
"else:\n",
"    common_suffixes = suffix_fdist.keys()[:100] # the 100 most common suffixes (nltk 2.x)\n",
"suffix_fdist = None\n",
"print(\"common_suffixes:\", common_suffixes) \n",
"print()\n",
"\n",
"def pos_features(word):\n",
"    features = {}\n",
"    for suffix in common_suffixes:\n",
"        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)\n",
"    return features\n",
" \n",
"# For testing: print only the features that are True. (not in the book)\n",
"def pos_features_print(word):\n",
"    print(\"pos_features('\"+word+\"'):\", [(k, v) for (k, v) in pos_features(word).items() if v is True])\n",
" \n",
"pos_features_print('studied') \n",
"print()\n",
"\n",
"# Build the input data: (word, POS tag).\n",
"tagged_words = brown.tagged_words(categories='news')\n",
"print(\"len(tagged_words):\", len(tagged_words))\n",
"tagged_words = tagged_words[:10000] # use only part of the data; the full corpus takes too much memory and time\n",
"print(\"tagged_words:\", tagged_words[:10], \"...\")\n",
"print()\n",
"\n",
"# Build the training and test sets: (features, POS tag).\n",
"featuresets = [(pos_features(word), tag) for (word, tag) in tagged_words]\n",
"size = int(len(featuresets) * 0.1) # test set size\n",
"train_set, test_set = featuresets[size:], featuresets[:size]\n",
"tagged_words = None\n",
"print(\"featuresets:\")\n",
"pprint(featuresets[0])\n",
"print()\n",
"\n",
"# Train the classifier.\n",
"classifier = nltk.DecisionTreeClassifier.train(train_set)\n",
"\n",
"# Evaluate the classifier.\n",
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n",
"print()\n",
"\n",
"# Test the classifier.\n",
"print(\"classifier.classify(pos_features('cats')):\", classifier.classify(pos_features('cats'))) # NNS = plural noun\n",
"print()\n",
"\n",
"print(classifier.pseudocode(depth=4))\n",
"print(classifier.pp(depth=4))\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(brown.words()): 1161192\n",
"nltk.__version__:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 3.0.1\n",
"common_suffixes: [u'e', u',', u'.', u's', u'd', u'n', u't', u'he', u'a', u'the', u'of', u'r', u'y', u'to', u'in', u'o', u'ed', u'on', u'f', u'l', u'nd', u'er', u'g', u'ng', u'and', u'is', u'at', u'as', u'ing', u'h', u'es', u'or', u're', u'an', u'``', u\"''\", u'ion', u'al', u'm', u'nt', u'st', u'll', u'en', u'it', u'be', u'ly', u'by', u'rs', u'th', u'ent', u'ts', u'for', u'k', u\"'\", u';', u'hat', u'le', u'ce', u'ay', u'ted', u'ld', u've', u'w', u'te', u'me', u'ry', u'his', u'se', u'ns', u'ut', u'`', u'ch', u'was', u'i', u\"'s\", u'ers', u'ere', u'id', u'ty', u'--', u'ith', u'ne', u'ter', u'her', u'ill', u'p', u')', u'(', u'ey', u'0', u'ate', u'aid', u'ar', u'day', u'ad', u':', u'et', u'om', u'nce', u's.']\n",
"\n",
"pos_features('studied'): [(u'endswith(d)', True), (u'endswith(ed)', True)]\n",
"\n",
"len(tagged_words):"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 100554\n",
"tagged_words: [(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN')] ...\n",
"\n",
"featuresets:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"({u\"endswith('')\": False,\n",
" u\"endswith(')\": False,\n",
" u\"endswith('s)\": False,\n",
" u'endswith(()': False,\n",
" u'endswith())': False,\n",
" u'endswith(,)': False,\n",
" u'endswith(--)': False,\n",
" u'endswith(.)': False,\n",
" u'endswith(0)': False,\n",
" u'endswith(:)': False,\n",
" u'endswith(;)': False,\n",
" u'endswith(`)': False,\n",
" u'endswith(``)': False,\n",
" u'endswith(a)': False,\n",
" u'endswith(ad)': False,\n",
" u'endswith(aid)': False,\n",
" u'endswith(al)': False,\n",
" u'endswith(an)': False,\n",
" u'endswith(and)': False,\n",
" u'endswith(ar)': False,\n",
" u'endswith(as)': False,\n",
" u'endswith(at)': False,\n",
" u'endswith(ate)': False,\n",
" u'endswith(ay)': False,\n",
" u'endswith(be)': False,\n",
" u'endswith(by)': False,\n",
" u'endswith(ce)': False,\n",
" u'endswith(ch)': False,\n",
" u'endswith(d)': False,\n",
" u'endswith(day)': False,\n",
" u'endswith(e)': True,\n",
" u'endswith(ed)': False,\n",
" u'endswith(en)': False,\n",
" u'endswith(ent)': False,\n",
" u'endswith(er)': False,\n",
" u'endswith(ere)': False,\n",
" u'endswith(ers)': False,\n",
" u'endswith(es)': False,\n",
" u'endswith(et)': False,\n",
" u'endswith(ey)': False,\n",
" u'endswith(f)': False,\n",
" u'endswith(for)': False,\n",
" u'endswith(g)': False,\n",
" u'endswith(h)': False,\n",
" u'endswith(hat)': False,\n",
" u'endswith(he)': True,\n",
" u'endswith(her)': False,\n",
" u'endswith(his)': False,\n",
" u'endswith(i)': False,\n",
" u'endswith(id)': False,\n",
" u'endswith(ill)': False,\n",
" u'endswith(in)': False,\n",
" u'endswith(ing)': False,\n",
" u'endswith(ion)': False,\n",
" u'endswith(is)': False,\n",
" u'endswith(it)': False,\n",
" u'endswith(ith)': False,\n",
" u'endswith(k)': False,\n",
" u'endswith(l)': False,\n",
" u'endswith(ld)': False,\n",
" u'endswith(le)': False,\n",
" u'endswith(ll)': False,\n",
" u'endswith(ly)': False,\n",
" u'endswith(m)': False,\n",
" u'endswith(me)': False,\n",
" u'endswith(n)': False,\n",
" u'endswith(nce)': False,\n",
" u'endswith(nd)': False,\n",
" u'endswith(ne)': False,\n",
" u'endswith(ng)': False,\n",
" u'endswith(ns)': False,\n",
" u'endswith(nt)': False,\n",
" u'endswith(o)': False,\n",
" u'endswith(of)': False,\n",
" u'endswith(om)': False,\n",
" u'endswith(on)': False,\n",
" u'endswith(or)': False,\n",
" u'endswith(p)': False,\n",
" u'endswith(r)': False,\n",
" u'endswith(re)': False,\n",
" u'endswith(rs)': False,\n",
" u'endswith(ry)': False,\n",
" u'endswith(s)': False,\n",
" u'endswith(s.)': False,\n",
" u'endswith(se)': False,\n",
" u'endswith(st)': False,\n",
" u'endswith(t)': False,\n",
" u'endswith(te)': False,\n",
" u'endswith(ted)': False,\n",
" u'endswith(ter)': False,\n",
" u'endswith(th)': False,\n",
" u'endswith(the)': True,\n",
" u'endswith(to)': False,\n",
" u'endswith(ts)': False,\n",
" u'endswith(ty)': False,\n",
" u'endswith(ut)': False,\n",
" u'endswith(ve)': False,\n",
" u'endswith(w)': False,\n",
" u'endswith(was)': False,\n",
" u'endswith(y)': False},\n",
" u'AT')\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.68\n",
"\n",
"classifier.classify(pos_features('cats')): NNS\n",
"\n",
"if endswith(he) == False: \n",
"  if endswith(s) == False: \n",
"    if endswith(,) == False: \n",
"      if endswith(.) == False: return u'``'\n",
"      if endswith(.) == True: return u'.'\n",
"    if endswith(,) == True: return u','\n",
"  if endswith(s) == True: \n",
"    if endswith(was) == False: \n",
"      if endswith(is) == False: return u'NN'\n",
"      if endswith(is) == True: return u'BEZ'\n",
"    if endswith(was) == True: return u'BEDZ'\n",
"if endswith(he) == True: \n",
"  if endswith(the) == False: return u'PPS'\n",
"  if endswith(the) == True: return u'AT'\n",
"\n",
"endswith(he)=False? ................................... ``\n",
"  endswith(s)=False? .................................. ``\n",
"    endswith(,)=False? ................................ ``\n",
"      endswith(.)=False? .............................. ``\n",
"      endswith(.)=True? ............................... .\n",
"    endswith(,)=True? ................................. ,\n",
"  endswith(s)=True? ................................... NN\n",
"    endswith(was)=False? .............................. NN\n",
"      endswith(is)=False? ............................. NN\n",
"      endswith(is)=True? .............................. BEZ\n",
"    endswith(was)=True? ............................... BEDZ\n",
"endswith(he)=True? .................................... AT\n",
"  endswith(the)=False? ................................ PPS\n",
"  endswith(the)=True? ................................. AT\n",
"\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import gc; gc.collect() # release memory."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"0"
]
}
],
"prompt_number": 10
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Exploiting Context"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import brown\n",
"\n",
"\n",
"# Define the feature extractor: (sentence, position) -> (suffixes, previous word).\n",
"def pos_features(sentence, i):\n",
"    features = {\"suffix(1)\": sentence[i][-1:],\n",
"                \"suffix(2)\": sentence[i][-2:],\n",
"                \"suffix(3)\": sentence[i][-3:]}\n",
"    if i == 0:\n",
"        features[\"prev-word\"] = \"<START>\"\n",
"    else:\n",
"        features[\"prev-word\"] = sentence[i-1]\n",
"    return features\n",
"\n",
"print(\"brown.sents()[0][7]:\", brown.sents()[0][7])\n",
"print(\"brown.sents()[0][8]:\", brown.sents()[0][8])\n",
"print(\"pos_features(brown.sents()[0], 8):\", pos_features(brown.sents()[0], 8))\n",
"print()\n",
"\n",
"# Build the input data: (word, POS tag).\n",
"tagged_sents = brown.tagged_sents(categories='news')\n",
"print(\"tagged_sents[0]:\", tagged_sents[0])\n",
"print(\"nltk.tag.untag(tagged_sents[0]):\", nltk.tag.untag(tagged_sents[0]))\n",
"print()\n",
"\n",
"# Build the training and test sets: (features, POS tag).\n",
"featuresets = []\n",
"for tagged_sent in tagged_sents:\n",
"    untagged_sent = nltk.tag.untag(tagged_sent)\n",
"    for i, (word, tag) in enumerate(tagged_sent):\n",
"        featuresets.append( (pos_features(untagged_sent, i), tag) )\n",
"size = int(len(featuresets) * 0.1)\n",
"train_set, test_set = featuresets[size:], featuresets[:size]\n",
"print(\"train_set[0]:\", train_set[0])\n",
"\n",
"# Train the classifier.\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"print(\"featuresets[0]:\", featuresets[0])\n",
"print()\n",
"\n",
"# Evaluate the classifier.\n",
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n",
"print()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"brown.sents()[0][7]: an\n",
"brown.sents()[0][8]: investigation\n",
"pos_features(brown.sents()[0], 8): {u'suffix(3)': u'ion', u'prev-word': u'an', u'suffix(2)': u'on', u'suffix(1)': u'n'}\n",
"\n",
"tagged_sents[0]: [(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN'), (u\"Atlanta's\", u'NP$'), (u'recent', u'JJ'), (u'primary', u'NN'), (u'election', u'NN'), (u'produced', u'VBD'), (u'``', u'``'), (u'no', u'AT'), (u'evidence', u'NN'), (u\"''\", u\"''\"), (u'that', u'CS'), (u'any', u'DTI'), (u'irregularities', u'NNS'), (u'took', u'VBD'), (u'place', u'NN'), (u'.', u'.')]\n",
"nltk.tag.untag(tagged_sents[0]): [u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u\"Atlanta's\", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u\"''\", u'that', u'any', u'irregularities', u'took', u'place', u'.']\n",
"\n",
"train_set[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" ({u'suffix(3)': u'our', u'prev-word': u'of', u'suffix(2)': u'ur', u'suffix(1)': u'r'}, u'PP$')\n",
"featuresets[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" ({u'suffix(3)': u'The', u'prev-word': u'<START>', u'suffix(2)': u'he', u'suffix(1)': u'e'}, u'AT')\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.789159622079\n",
"\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"?nltk.NaiveBayesClassifier"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Sequence Classification"
]
},
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals \n", | |
"from pprint import pprint\n", | |
"import nltk\n", | |
"from nltk.corpus import brown\n", | |
"\n", | |
"\n", | |
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4) -> (\uc811\ubbf8\uc0ac, \uc55e\ub2e8\uc5b4, \uc55e\ud488\uc0ac)\n", | |
"def pos_features(sentence, i, history):\n", | |
" features = {\"suffix(1)\": sentence[i][-1:],\n", | |
" \"suffix(2)\": sentence[i][-2:],\n", | |
" \"suffix(3)\": sentence[i][-3:]}\n", | |
" if i == 0:\n", | |
" features[\"prev-word\"] = \"<START>\"\n", | |
" features[\"prev-tag\"] = \"<START>\"\n", | |
" else:\n", | |
" features[\"prev-word\"] = sentence[i-1]\n", | |
" features[\"prev-tag\"] = history[i-1]\n", | |
" return features\n", | |
"\n", | |
"# \ubd84\ub958\uae30 \uc815\uc758 (\ucd08\uae30\ud654\uc5d0\uc11c \uc790\ub3d9\uc73c\ub85c \ud559\uc2b5\ud568) \n", | |
"class ConsecutivePosTagger(nltk.TaggerI):\n", | |
" def __init__(self, train_sents):\n", | |
" train_set = []\n", | |
" for tagged_sent in train_sents:\n", | |
" untagged_sent = nltk.tag.untag(tagged_sent)\n", | |
" history = []\n", | |
" for i, (word, tag) in enumerate(tagged_sent):\n", | |
" featureset = pos_features(untagged_sent, i, history)\n", | |
" train_set.append( (featureset, tag) )\n", | |
" history.append(tag)\n", | |
" self.classifier = nltk.NaiveBayesClassifier.train(train_set)\n", | |
"\n", | |
" def tag(self, sentence):\n", | |
" history = []\n", | |
" for i, word in enumerate(sentence):\n", | |
" featureset = pos_features(sentence, i, history)\n", | |
" tag = self.classifier.classify(featureset)\n", | |
" history.append(tag)\n", | |
" return zip(sentence, history)\n", | |
"\n", | |
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131 (\ub2e8\uc5b4, \ud488\uc0ac) \n", | |
"tagged_sents = brown.tagged_sents(categories='news')\n", | |
"\n", | |
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \ud488\uc0ac)\n", | |
"size = int(len(tagged_sents) * 0.1)\n", | |
"train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]\n", | |
"tagger = ConsecutivePosTagger(train_sents)\n", | |
"\n", | |
"# \ubd84\ub958\uae30 \ud3c9\uac00\n", | |
"print(\"accuracy:\", tagger.evaluate(test_sents))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"accuracy: 0.798052851182\n" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"?nltk.TaggerI" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Other Methods for Sequence Classification" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Hidden Markov Model (HMM) <BR>\n", | |
"Maximum Entropy Markov Model (MEMM) <BR>\n", | |
"Linear-Chain Conditional Random Field Model (CRF) <BR>" | |
] | |
}, | |
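{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"A minimal sketch of one of these alternatives: NLTK ships a supervised HMM trainer (nltk.tag.hmm.HiddenMarkovModelTrainer). The train/test split mirrors the ConsecutivePosTagger cell above; treat this as an illustration, not a tuned model." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals\n", | |
"from nltk.corpus import brown\n", | |
"from nltk.probability import LidstoneProbDist\n", | |
"from nltk.tag.hmm import HiddenMarkovModelTrainer\n", | |
"\n", | |
"# Same news-genre split as the ConsecutivePosTagger cell above.\n", | |
"tagged_sents = brown.tagged_sents(categories='news')\n", | |
"size = int(len(tagged_sents) * 0.1)\n", | |
"train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]\n", | |
"\n", | |
"# Supervised HMM: transitions P(tag | prev tag), emissions P(word | tag).\n", | |
"# Lidstone smoothing keeps unseen words from getting zero probability.\n", | |
"trainer = HiddenMarkovModelTrainer()\n", | |
"hmm_tagger = trainer.train_supervised(train_sents,\n", | |
"        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))\n", | |
"print(\"accuracy:\", hmm_tagger.evaluate(test_sents))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |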
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"6.2 Further Examples of Supervised Classification" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Sentence Segmentation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals \n", | |
"from pprint import pprint\n", | |
"import nltk\n", | |
"\n", | |
"\n", | |
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131 (\ub2e8\uc5b4 \ubaa9\ub85d, \uacbd\uacc4\uc704\uce58)\n", | |
"sents = nltk.corpus.treebank_raw.sents()\n", | |
"tokens = []\n", | |
"boundaries = set() # \ub04a\uc5b4\uc9c0\ub294 \ub2e8\uc5b4 \uc704\uce58. (0\ubd80\ud130 \uc2dc\uc791)\n", | |
"offset = 0\n", | |
"for sent in sents: \n", | |
" tokens.extend(sent)\n", | |
" offset += len(sent)\n", | |
" boundaries.add(offset-1)\n", | |
"print(\"len(sents):\", len(sents), sents[0:3], \"...\")\n", | |
"print()\n", | |
"print(\"len(tokens):\", len(tokens), tokens[0:30], \"...\")\n", | |
"print()\n", | |
"print(\"len(boundaries):\", len(boundaries), sorted(list(boundaries))[0:10], \"...\")\n", | |
"print()\n", | |
"\n", | |
"\n", | |
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4\ubaa9\ub85d) -> (\ub2e4\uc74c\ub2e8\uc5b4\uc758 \ub300\ubb38\uc790\uc2dc\uc791\uc5ec\ubd80, \uc774\uc804\ub2e8\uc5b4, \ub2e8\uc5b4, \uc774\uc804\ub2e8\uc5b4\uac00 \ud55c \ubb38\uc790\uc778\uc9c0)\n", | |
"def punct_features(tokens, i): # by punctuation(\uad6c\ub450\uc810)\n", | |
" try:\n", | |
" return {'next-word-capitalized': tokens[i+1][0].isupper(),\n", | |
" 'prevword': tokens[i-1].lower(),\n", | |
" 'punct': tokens[i],\n", | |
" 'prev-word-is-one-char': len(tokens[i-1]) == 1}\n", | |
" except:\n", | |
" return {'next-word-capitalized': False,\n", | |
" 'prevword': '',\n", | |
" 'punct': tokens[i],\n", | |
" 'prev-word-is-one-char': False}\n", | |
" \n", | |
"featuresets = [(punct_features(tokens, i), (i in boundaries))\n", | |
" for i in range(1, len(tokens)-1)\n", | |
" if tokens[i] in '.?!']\n", | |
"print(\"featuresets:\", featuresets[0])\n", | |
"print()\n", | |
"\n", | |
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \ud488\uc0ac)\n", | |
"size = int(len(featuresets) * 0.1)\n", | |
"train_set, test_set = featuresets[size:], featuresets[:size]\n", | |
"print(\"train_set[0]:\", train_set[0])\n", | |
"print()\n", | |
"\n", | |
"# \ubd84\ub958\uae30 \ud559\uc2b5\n", | |
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n", | |
"\n", | |
"# \ubd84\ub958\uae30 \ud3c9\uac00\n", | |
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n", | |
"print()\n", | |
"\n", | |
"# \ubb38\uc7a5 \ubd84\ub9ac\uae30\n", | |
"def segment_sentences(words):\n", | |
" start = 0\n", | |
" sents = []\n", | |
" for i, word in enumerate(words):\n", | |
" if word in '.?!' and classifier.classify(punct_features(words, i)) == True: \n", | |
" sents.append(words[start:i+1])\n", | |
" start = i+1\n", | |
" if start < len(words):\n", | |
" sents.append(words[start:])\n", | |
" return sents\n", | |
"\n", | |
"# \ubb38\uc7a5 \ubd84\ub9ac\uae30 \ud14c\uc2a4\ud2b8\n", | |
"sents = nltk.corpus.treebank_raw.sents()[:10]\n", | |
"words=[]\n", | |
"for s in sents:\n", | |
" words.extend(s)\n", | |
"# print(\"words:\", words)\n", | |
"# print()\n", | |
"print(\"correct:\\n\", '\\n'.join([' '.join(s) for s in sents ]))\n", | |
"print()\n", | |
"print(\"guess:\\n\", '\\n'.join([' '.join(s) for s in segment_sentences(words)]))\n", | |
"print()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"len(sents): 4193 [[u'.', u'START'], [u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.'], [u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.', u'V', u'.,', u'the', u'Dutch', u'publishing', u'group', u'.']] ...\n", | |
"\n", | |
"len(tokens): 101797 [u'.', u'START', u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.', u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.'] ...\n", | |
"\n", | |
"len(boundaries): 4193 [1, 20, 36, 38, 64, 66, 102, 134, 163, 199] ...\n", | |
"\n", | |
"featuresets: ({u'next-word-capitalized': False, u'punct': u'.', u'prev-word-is-one-char': False, u'prevword': u'nov'}, False)\n", | |
"\n", | |
"train_set[0]: ({u'next-word-capitalized': True, u'punct': u'.', u'prev-word-is-one-char': False, u'prevword': u'popular'}, True)\n", | |
"\n", | |
"accuracy:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 0.936026936027\n", | |
"\n", | |
"correct:\n", | |
" . START\n", | |
"Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 .\n", | |
"Mr . Vinken is chairman of Elsevier N . V ., the Dutch publishing group .\n", | |
". START\n", | |
"Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate .\n", | |
". START\n", | |
"A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago , researchers reported .\n", | |
"The asbestos fiber , crocidolite , is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that show up decades later , researchers said .\n", | |
"Lorillard Inc ., the unit of New York - based Loews Corp . that makes Kent cigarettes , stopped using crocidolite in its Micronite cigarette filters in 1956 .\n", | |
"Although preliminary findings were reported more than a year ago , the latest results appear in today ' s New England Journal of Medicine , a forum likely to bring new attention to the problem .\n", | |
"\n", | |
"guess:\n", | |
" . START Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 . Mr . Vinken is chairman of Elsevier N . V ., the Dutch publishing group .\n", | |
". START Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate . . START A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago , researchers reported .\n", | |
"The asbestos fiber , crocidolite , is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that show up decades later , researchers said .\n", | |
"Lorillard Inc ., the unit of New York - based Loews Corp . that makes Kent cigarettes , stopped using crocidolite in its Micronite cigarette filters in 1956 .\n", | |
"Although preliminary findings were reported more than a year ago , the latest results appear in today ' s New England Journal of Medicine , a forum likely to bring new attention to the problem .\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Identifying Dialogue Act Types (\ud654\ud589 \uc885\ub958 \uc815\uc758)" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Act types: \"Statement,\" \"Emotion,\" \"ynQuestion\", and \"Continuer.\" " | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Accept, Bye, Clarify, Continuer, Emotion, Emphasis, Greet, No Answer, Other, Reject, Statement, System, Wh-Question, Yes Answer, Yes/No Question." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals \n", | |
"from pprint import pprint\n", | |
"import nltk\n", | |
"\n", | |
"\n", | |
"posts = nltk.corpus.nps_chat.xml_posts()[:10000]\n", | |
"print(\"posts[0]:\", posts[0].text)\n", | |
"print()\n", | |
"\n", | |
"def dialogue_act_features(post):\n", | |
" features = {}\n", | |
" for word in nltk.word_tokenize(post):\n", | |
" features['contains(%s)' % word.lower()] = True\n", | |
" return features\n", | |
"\n", | |
"featuresets = [(dialogue_act_features(post.text), post.get('class'))\n", | |
" for post in posts]\n", | |
"size = int(len(featuresets) * 0.1)\n", | |
"train_set, test_set = featuresets[size:], featuresets[:size]\n", | |
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n", | |
"print(\"featuresets[0]:\", featuresets[0])\n", | |
"print()\n", | |
"\n", | |
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n", | |
"print(classifier.classify(dialogue_act_features(\"My name is Hyewoong?\")))\n", | |
"print(classifier.classify(dialogue_act_features(\"What a beautiful girl?\")))\n", | |
"print(classifier.classify(dialogue_act_features(\"Do you want my love?\")))\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"posts[0]: now im left with this gay name\n", | |
"\n", | |
"featuresets[0]:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" ({u'contains(im)': True, u'contains(now)': True, u'contains(this)': True, u'contains(left)': True, u'contains(name)': True, u'contains(with)': True, u'contains(gay)': True}, 'Statement')\n", | |
"\n", | |
"accuracy:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 0.668\n", | |
"whQuestion\n", | |
"whQuestion\n", | |
"ynQuestion\n" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Recognizing Textual Entailment (\ud14d\uc2a4\ud2b8 \ud568\uc758 \uc778\uc2dd)" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Challenge 3, Pair 34 (True) <BR>\n", | |
"<BR>\n", | |
"T: Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.<BR>\n", | |
"<BR>\n", | |
"H: China is a member of SCO.<BR>\n", | |
"<BR>\n", | |
"<BR>\n", | |
"<BR>\n", | |
"Challenge 3, Pair 81 (False)<BR>\n", | |
"<BR>\n", | |
"T: According to NC Articles of Organization, the members of LLC company are H. Nelson Beavers, III, H. Chester Beavers and Jennie Beavers Stewart.<BR>\n", | |
"<BR>\n", | |
"H: Jennie Beavers Stewart is a share-holder of Carolina Analytical Laboratory.<BR>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals \n", | |
"from pprint import pprint\n", | |
"import nltk\n", | |
"\n", | |
"\n", | |
"def rte_features(rtepair):\n", | |
" extractor = nltk.RTEFeatureExtractor(rtepair)\n", | |
" features = {}\n", | |
" features['word_overlap'] = len(extractor.overlap('word'))\n", | |
" features['word_hyp_extra'] = len(extractor.hyp_extra('word'))\n", | |
" features['ne_overlap'] = len(extractor.overlap('ne'))\n", | |
" features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))\n", | |
" return features\n", | |
"\n", | |
"rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]\n", | |
"print(\"rtepair:\", rtepair.__dict__)\n", | |
"print()\n", | |
"print(\"text:\", rtepair.text)\n", | |
"print()\n", | |
"print(\"hypothesis(=keyword) :\", rtepair.hyp)\n", | |
"print()\n", | |
"\n", | |
"extractor = nltk.RTEFeatureExtractor(rtepair)\n", | |
"print(\"text_words:\", extractor.text_words) \n", | |
"print(\"overlap('word'):\", extractor.overlap('word'))\n", | |
"print(\"overlap('ne')\", extractor.overlap('ne'))\n", | |
"print(\"hyp_words:\", extractor.hyp_words)\n", | |
"print(\"hyp_extra('word'):\", extractor.hyp_extra('word'))\n", | |
"?nltk.RTEFeatureExtractor" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"rtepair: {'task': 'IE', 'text': 'Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.', 'challenge': '3', 'value': 1, 'hyp': 'China is a member of SCO.', 'length': 'short', 'gid': u'3-34', 'id': '34'}\n", | |
"\n", | |
"text: Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.\n", | |
"\n", | |
"hypothesis(=keyword) : China is a member of SCO.\n", | |
"\n", | |
"text_words: set(['Russia', 'Organisation', 'Shanghai', 'Asia', 'four', 'at', 'operation', 'SCO', 'Iran', 'Soviet', 'Davudi', 'fight', 'China', 'association', 'fledgling', 'terrorism', 'was', 'that', 'republics', 'Co', 'representing', 'former', 'Parviz', 'central', 'meeting', 'together', 'binds'])\n", | |
"overlap('word'): set([])\n", | |
"overlap('ne') set(['SCO', 'China'])\n", | |
"hyp_words: set(['member', 'SCO', 'China'])\n", | |
"hyp_extra('word'): set(['member'])\n" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print(help(extractor.overlap))\n", | |
"print(help(extractor.hyp_extra))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Help on method overlap in module nltk.classify.rte_classify:\n", | |
"\n", | |
"overlap(self, toktype, debug=False) method of nltk.classify.rte_classify.RTEFeatureExtractor instance\n", | |
" Compute the overlap between text and hypothesis.\n", | |
" \n", | |
" :param toktype: distinguish Named Entities from ordinary words\n", | |
" :type toktype: 'ne' or 'word'\n", | |
"\n", | |
"None\n", | |
"Help on method hyp_extra in module nltk.classify.rte_classify:\n", | |
"\n", | |
"hyp_extra(self, toktype, debug=True) method of nltk.classify.rte_classify.RTEFeatureExtractor instance\n", | |
" Compute the extraneous material in the hypothesis.\n", | |
" \n", | |
" :param toktype: distinguish Named Entities from ordinary words\n", | |
" :type toktype: 'ne' or 'word'\n", | |
"\n", | |
"None\n" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Scaling Up to Large Datasets" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"\ub370\uc774\ud0c0\ub7c9\uc774 \ub9ce\uc744 \uacbd\uc6b0, \uc21c\uc218 python\ubcf4\ub2e4 C\ub85c \uad6c\ud604\ub41c python \ud328\ud0a4\uc9c0\ub97c \uc0ac\uc6a9\ud558\ub294 \uac8c \uc88b\ub2e4. (\uc218\ud589 \uc18d\ub3c4) <BR>\n", | |
"<BR>\n", | |
"we recommend that you explore NLTK's facilities for interfacing with external machine learning packages <BR>\n", | |
"... to train classifier models significantly faster than the pure-Python classifier implementation" | |
] | |
}, | |
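{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"A minimal sketch of one such interface, assuming scikit-learn is installed: nltk.classify.scikitlearn.SklearnClassifier wraps any scikit-learn estimator behind the same train/classify API used in this chapter. The toy featuresets below are made up for illustration." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals\n", | |
"from nltk.classify.scikitlearn import SklearnClassifier # requires scikit-learn\n", | |
"from sklearn.linear_model import LogisticRegression\n", | |
"\n", | |
"# Toy featuresets in the usual (features-dict, label) shape.\n", | |
"train_set = [({'last_letter': 'a'}, 'female'), ({'last_letter': 'k'}, 'male'),\n", | |
"             ({'last_letter': 'e'}, 'female'), ({'last_letter': 'o'}, 'male')]\n", | |
"\n", | |
"# The C-backed estimator does the heavy lifting; NLTK only converts features.\n", | |
"classifier = SklearnClassifier(LogisticRegression()).train(train_set)\n", | |
"print(classifier.classify({'last_letter': 'a'}))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |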
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"6.3 Evaluation (\ud3c9\uac00)" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"The Test Set / Accuracy" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"However, it is very important that the test set be distinct from the training corpus: <BR>\n", | |
"it is common to err on the side of safety by using 10% of the overall data for evaluation \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# \uad50\uc7ac\uc758 \uc18c\uc2a4\uac00 \ub3d9\uc791\ud558\ub3c4\ub85d \uc77c\ubd80 \uc218\uc815\ud568. \uc624\ub798 \uac78\ub9bc.\n", | |
"from __future__ import print_function, unicode_literals \n", | |
"from pprint import pprint\n", | |
"import nltk\n", | |
"import random\n", | |
"from nltk.corpus import brown\n", | |
"\n", | |
"\n", | |
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4) -> (\uc811\ubbf8\uc0ac, \uc55e\ub2e8\uc5b4, \uc55e\ud488\uc0ac)\n", | |
"def pos_features(sentence, i, history):\n", | |
" features = {\"suffix(1)\": sentence[i][-1:],\n", | |
" \"suffix(2)\": sentence[i][-2:],\n", | |
" \"suffix(3)\": sentence[i][-3:]}\n", | |
" if i == 0:\n", | |
" features[\"prev-word\"] = \"<START>\"\n", | |
" features[\"prev-tag\"] = \"<START>\"\n", | |
" else:\n", | |
" features[\"prev-word\"] = sentence[i-1]\n", | |
" features[\"prev-tag\"] = history[i-1]\n", | |
" return features\n", | |
"\n", | |
"# \ubd84\ub958\uae30 \uc815\uc758 (\ucd08\uae30\ud654\uc5d0\uc11c \uc790\ub3d9\uc73c\ub85c \ud559\uc2b5\ud568) \n", | |
"class ConsecutivePosTagger(nltk.TaggerI):\n", | |
" def __init__(self, train_sents):\n", | |
" train_set = []\n", | |
" for tagged_sent in train_sents:\n", | |
" untagged_sent = nltk.tag.untag(tagged_sent)\n", | |
" history = []\n", | |
" for i, (word, tag) in enumerate(tagged_sent):\n", | |
" featureset = pos_features(untagged_sent, i, history)\n", | |
" train_set.append( (featureset, tag) )\n", | |
" history.append(tag)\n", | |
" self.classifier = nltk.NaiveBayesClassifier.train(train_set)\n", | |
"\n", | |
" def tag(self, sentence):\n", | |
" history = []\n", | |
" for i, word in enumerate(sentence):\n", | |
" featureset = pos_features(sentence, i, history)\n", | |
" tag = self.classifier.classify(featureset)\n", | |
" history.append(tag)\n", | |
" return zip(sentence, history)\n", | |
"\n", | |
"# \uc801\uc808\ud558\uc9c0 \uc54a\uc740 \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc758 \uacbd\uc6b0\n", | |
"# 1. \uac19\uc740 \uc7a5\ub974\ub85c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131\ud558\uba74, \ud3c9\uac00 \uacb0\uacfc\ub97c \ud655\uc2e0\ud558\uae30 \uc5b4\ub835\ub2e4. (?)\n", | |
"# 2. random.shuffle()\uc744 \ud558\uba74, \uac19\uc740 \ubb38\uc11c\uc5d0\uc11c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \ubb38\uc7a5\uc774 \uc0dd\uc131\ub420 \uc218 \uc788\uc5b4 \uc88b\uc9c0 \uc54a\ub2e4.\n", | |
"tagged_sents = list(brown.tagged_sents(categories='news'))\n", | |
"print(\"tagged_sents[0]:\", tagged_sents[0])\n", | |
"random.shuffle(tagged_sents)\n", | |
"size = int(len(tagged_sents) * 0.1)\n", | |
"train_sents, test_sents = tagged_sents[size:], tagged_sents[:size] \n", | |
"tagger = ConsecutivePosTagger(train_sents)\n", | |
"print('Accuracy: %4.2f' % tagger.evaluate(test_sents))\n", | |
"print()\n", | |
"\n", | |
"# 1. \ub2e4\ub978 \uc7a5\ub974\uc5d0\uc11c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131\ud558\ub3c4\ub85d \uc218\uc815.\n", | |
"train_sents = brown.tagged_sents(categories='news')\n", | |
"test_sents = brown.tagged_sents(categories='fiction')\n", | |
"tagger = ConsecutivePosTagger(train_sents)\n", | |
"print('Accuracy: %4.2f' % tagger.evaluate(test_sents))\n", | |
"print()\n", | |
"\n", | |
"# 2. \uac19\uc740 \ubb38\uc11c\uc5d0\uc11c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc774 \uc0dd\uc131\ub418\uc9c0 \uc54a\ub3c4\ub85d \uc218\uc815.\n", | |
"file_ids = brown.fileids(categories='news')\n", | |
"size = int(len(file_ids) * 0.1)\n", | |
"train_sents = brown.tagged_sents(file_ids[size:])\n", | |
"test_sents = brown.tagged_sents(file_ids[:size])\n", | |
"tagger = ConsecutivePosTagger(train_sents)\n", | |
"print('Accuracy: %4.2f' % tagger.evaluate(test_sents))\n", | |
"print()\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"tagged_sents[0]: [(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN'), (u\"Atlanta's\", u'NP$'), (u'recent', u'JJ'), (u'primary', u'NN'), (u'election', u'NN'), (u'produced', u'VBD'), (u'``', u'``'), (u'no', u'AT'), (u'evidence', u'NN'), (u\"''\", u\"''\"), (u'that', u'CS'), (u'any', u'DTI'), (u'irregularities', u'NNS'), (u'took', u'VBD'), (u'place', u'NN'), (u'.', u'.')]\n", | |
"Accuracy: 0.79" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"\n", | |
"Accuracy: 0.79" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"\n", | |
"Accuracy: 0.79" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Precision and Recall (\uc815\ud655\ub960, \uc7ac\ud604\uc728)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<img src=\"http://www.nltk.org/images/precision-recall.png\" width=\"700\">\n", | |
"<img src=\"http://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Precisionrecall.svg/700px-Precisionrecall.svg.png\" width=\"700\">\n", | |
"<img src=\"https://fbcdn-sphotos-c-a.akamaihd.net/hphotos-ak-xpa1/v/t1.0-9/10991051_844288612293942_8690474408857494396_n.jpg?oh=f4a68cc3875ebea360d2e2fbb1db68f8&oe=554DA29E&__gda__=1434765606_73492ef515b8cf34ddc9a82af0aff2d4\" width=\"700\">" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"F-Measure (F-Score, F1 score)" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"http://ko.wikipedia.org/wiki/\uc870\ud654_\ud3c9\uade0 <BR>\n", | |
"http://en.wikipedia.org/wiki/F1_score <BR>\n", | |
"<img src=\"http://upload.wikimedia.org/math/9/9/1/991d55cc29b4867c88c6c22d438265f9.png\">" | |
] | |
}, | |
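{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"A worked sketch of the harmonic mean F = 2PR/(P+R), using the set-based scorers in nltk.metrics on made-up reference/test sets." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals\n", | |
"from nltk.metrics import precision, recall, f_measure\n", | |
"\n", | |
"# Toy data: item ids marked relevant by the gold standard vs. returned by a classifier.\n", | |
"reference = set([1, 2, 3, 4]) # 4 relevant items exist\n", | |
"test = set([3, 4, 5, 6])      # 2 hits plus 2 false alarms\n", | |
"\n", | |
"p = precision(reference, test) # 2/4 = 0.5\n", | |
"r = recall(reference, test)    # 2/4 = 0.5\n", | |
"print(\"precision:\", p)\n", | |
"print(\"recall:\", r)\n", | |
"print(\"F = 2PR/(P+R):\", 2 * p * r / (p + r))\n", | |
"print(\"f_measure:\", f_measure(reference, test))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |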
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Confusion Matrices" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"label \uc885\ub958\uac00 3\uac1c \uc774\uc0c1\uc77c \ub54c, label\ubcc4 \uc624\ub958\ube44\uc728\uc744 \ube44\uad50\ud560 \ub54c confusion matrice \uac00 \uc720\uc6a9\ud558\ub2e4" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"?nltk.UnigramTagger\n", | |
"?nltk.BigramTagger" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals \n", | |
"from pprint import pprint\n", | |
"import nltk\n", | |
"from nltk.corpus import brown\n", | |
"\n", | |
"\n", | |
"file_ids = brown.fileids(categories='editorial')\n", | |
"size = int(len(file_ids) * 0.1)\n", | |
"train_sents = brown.tagged_sents(file_ids[size:])\n", | |
" \n", | |
"def tag_list(tagged_sents):\n", | |
" return [tag for sent in tagged_sents for (word, tag) in sent]\n", | |
"def apply_tagger(tagger, corpus):\n", | |
" return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]\n", | |
"\n", | |
"gold = tag_list(brown.tagged_sents(categories='editorial')) # \uc0ac\uc124\n", | |
" \n", | |
"t0 = nltk.DefaultTagger('NN')\n", | |
"test = tag_list(apply_tagger(t0, brown.tagged_sents(categories='editorial')))\n", | |
"cm = nltk.ConfusionMatrix(gold, test)\n", | |
"print(\"nltk.DefaultTagger('NN'):\")\n", | |
"print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))\n", | |
"print()\n", | |
"\n", | |
"t1 = nltk.UnigramTagger(train_sents, backoff=t0)\n", | |
"test = tag_list(apply_tagger(t1, brown.tagged_sents(categories='editorial')))\n", | |
"cm = nltk.ConfusionMatrix(gold, test)\n", | |
"print(\"nltk.UnigramTagger(train_sents):\")\n", | |
"print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))\n", | |
"print()\n", | |
"\n", | |
"t2 = nltk.BigramTagger(train_sents, backoff=t1)\n", | |
"test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))\n", | |
"cm = nltk.ConfusionMatrix(gold, test)\n", | |
"print(\"nltk.BigramTagger(train_sents):\")\n", | |
"print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))\n", | |
"print()\n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"nltk.DefaultTagger('NN'):\n", | |
" | N |\n", | |
" | N I A J N V N |\n", | |
" | N N T J . S , B P |\n", | |
"----+----------------------------------------------------------------+\n", | |
" NN | <12.5%> . . . . . . . . |\n", | |
" IN | 10.1% <.> . . . . . . . |\n", | |
" AT | 8.6% . <.> . . . . . . |\n", | |
" JJ | 5.8% . . <.> . . . . . |\n", | |
" . | 4.9% . . . <.> . . . . |\n", | |
"NNS | 4.8% . . . . <.> . . . |\n", | |
" , | 4.4% . . . . . <.> . . |\n", | |
" VB | 3.5% . . . . . . <.> . |\n", | |
" NP | 3.1% . . . . . . . <.>|\n", | |
"----+----------------------------------------------------------------+\n", | |
"(row = reference; col = test)\n", | |
"\n", | |
"\n", | |
"nltk.UnigramTagger(train_sents):" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
" | N |\n", | |
" | N I A J N V N |\n", | |
" | N N T J . S , B P |\n", | |
"----+----------------------------------------------------------------+\n", | |
" NN | <11.8%> 0.0% . 0.2% . . . 0.3% 0.0% |\n", | |
" IN | 0.0% <8.9%> . 0.0% . 0.0% . . . |\n", | |
" AT | . . <8.6%> . . . . . . |\n", | |
" JJ | 0.2% . . <5.6%> . . . 0.0% 0.0% |\n", | |
" . | . . . . <4.8%> . . . . |\n", | |
"NNS | 0.1% . . . . <4.6%> . . 0.0% |\n", | |
" , | . . . . . . <4.4%> . . |\n", | |
" VB | 0.4% . . 0.0% . . . <3.0%> . |\n", | |
" NP | 0.1% . . 0.0% . . . . <2.9%>|\n", | |
"----+----------------------------------------------------------------+\n", | |
"(row = reference; col = test)\n", | |
"\n", | |
"\n", | |
"nltk.BigramTagger(train_sents):" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
" | N |\n", | |
" | N I A J N V N |\n", | |
" | N N T J . S , B P |\n", | |
"----+----------------------------------------------------------------+\n", | |
" NN | <12.3%> 0.0% . 0.0% . . . 0.1% 0.0% |\n", | |
" IN | 0.0% <9.1%> . 0.0% . 0.0% . . . |\n", | |
" AT | . . <8.6%> . . . . . . |\n", | |
" JJ | 0.2% . . <5.6%> . . . 0.0% 0.0% |\n", | |
" . | . . . . <4.8%> . . . 0.0% |\n", | |
"NNS | 0.1% . . . . <4.7%> . . . |\n", | |
" , | . . . . . . <4.4%> . . |\n", | |
" VB | 0.1% . . . . . . <3.4%> . |\n", | |
" NP | 0.1% . . 0.0% . . . . <2.9%>|\n", | |
"----+----------------------------------------------------------------+\n", | |
"(row = reference; col = test)\n", | |
"\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"K Cross-Validation (K \uad50\ucc28 \uac80\uc99d)" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"\uc77c\ubc18\uc801\uc73c\ub85c \ud559\uc2b5/\ud14c\uc2a4\ud2b8\uc6a9\uc758 \ucda9\ubd84\ud55c \ub370\uc774\ud0c0\uac00 \uc5c6\uae30 \ub54c\ubb38\uc5d0 \uc0ac\uc6a9\ud558\ub294 \ubc29\ubc95. <BR>\n", | |
"\ucf54\ud37c\uc2a4(\ub370\uc774\ud0c0)\ub97c K\uac1c\uc758 folds(subset)\ub85c \ub098\ub208 \ud6c4, \ud55c fold\ub97c \ud14c\uc2a4\ud2b8\uc14b\uc73c\ub85c \uc120\ud0dd\ud558\ub294 \ubc29\ubc95. <BR>\n", | |
"\uc774 \ub54c, \ud574\ub2f9 fold\uc678\uc758 \ub098\uba38\uc9c0 fold\ub294 \ud559\uc2b5\uc14b\uc774 \ub41c\ub2e4. <BR>\n", | |
"<BR>\n", | |
"\ub9cc\uc57d K\ubc88\uc758 \ud3c9\uac00 \uc810\uc218\uac00 \ube44\uc2b7\ud560 \uacbd\uc6b0, \uadf8 \uacb0\uacfc\ub97c \ud655\uc2e0\ud558\uae30 \uc88b\ub2e4.\n", | |
"<BR>\n", | |
"e.g. 10 folds cross-validation <BR>" | |
] | |
}, | |
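{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"A minimal sketch of K-fold evaluation with plain Python fold arithmetic; the toy featuresets are made up, and in practice any of the (features, label) lists built above can be passed in." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals\n", | |
"import nltk\n", | |
"\n", | |
"def cross_validate(featuresets, k=10):\n", | |
"    # Hold each of the k contiguous folds out once as the test set.\n", | |
"    fold_size = len(featuresets) // k\n", | |
"    scores = []\n", | |
"    for i in range(k):\n", | |
"        test_fold = featuresets[i*fold_size:(i+1)*fold_size]\n", | |
"        train_folds = featuresets[:i*fold_size] + featuresets[(i+1)*fold_size:]\n", | |
"        classifier = nltk.NaiveBayesClassifier.train(train_folds)\n", | |
"        scores.append(nltk.classify.accuracy(classifier, test_fold))\n", | |
"    return scores\n", | |
"\n", | |
"# Toy data; similar scores across folds suggest the estimate is stable.\n", | |
"toy = [({'f': i % 3}, i % 2 == 0) for i in range(100)]\n", | |
"scores = cross_validate(toy, k=10)\n", | |
"print(\"fold accuracies:\", ['%.2f' % s for s in scores])\n", | |
"print(\"mean accuracy: %.2f\" % (sum(scores) / len(scores)))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |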
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"6.4 Decision Trees" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"<img src=\"http://www.nltk.org/images/decision-tree.png\" width=\"700\">" | |
] | |
}, | |
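{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"NLTK ships a pure-Python decision tree learner, nltk.DecisionTreeClassifier. A minimal sketch on made-up featuresets; pseudocode() prints the learned tree as nested if/then rules like the figure above." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals\n", | |
"import nltk\n", | |
"\n", | |
"# Toy (features, label) pairs in the chapter's usual shape.\n", | |
"train_set = [({'last_letter': 'a'}, 'female'), ({'last_letter': 'k'}, 'male'),\n", | |
"             ({'last_letter': 'e'}, 'female'), ({'last_letter': 'o'}, 'male'),\n", | |
"             ({'last_letter': 'a'}, 'female'), ({'last_letter': 'k'}, 'male')]\n", | |
"\n", | |
"classifier = nltk.DecisionTreeClassifier.train(train_set)\n", | |
"print(classifier.classify({'last_letter': 'a'}))\n", | |
"print(classifier.pseudocode(depth=4))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |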
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Entropy and Information Gain" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"H = \u2212\u03a3l |in| labelsP(l) \u00d7 log2P(l).\n", | |
"<img src=\"http://www.nltk.org/images/Binary_entropy_plot.png\" width=\"200\"> <BR>\n", | |
"class: a,b \uc77c \ub54c, <BR>\n", | |
"\uac00\ub85c: a\uc758 \ud655\ub960 = a\uc758 \ube48\ub3c4/(a\uc758 \ube48\ub3c4 + b\uc758 \ube48\ub3c4) <BR>\n", | |
"\uc138\ub85c: entropy <BR>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals \n", | |
"from pprint import pprint\n", | |
"import nltk\n", | |
"\n", | |
"import math\n", | |
"def entropy(labels):\n", | |
" freqdist = nltk.FreqDist(labels)\n", | |
" probs = [freqdist.freq(l) for l in nltk.FreqDist(labels)]\n", | |
" return -sum([p * math.log(p,2) for p in probs])\n", | |
"\n", | |
"print(\"entropy(['male', 'male', 'male', 'male']):\", entropy(['male', 'male', 'male', 'male']))\n", | |
"print(\"entropy(['male', 'female', 'male', 'male']):\", entropy(['male', 'female', 'male', 'male']))\n", | |
"print(\"entropy(['female', 'male', 'female', 'male']):\", entropy(['female', 'male', 'female', 'male']))\n", | |
"print(\"entropy(['female', 'female', 'male', 'female']):\", entropy(['female', 'female', 'male', 'female']))\n", | |
"print(\"entropy(['female', 'female', 'female', 'female']):\", entropy(['female', 'female', 'female', 'female']))\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"entropy(['male', 'male', 'male', 'male']): -0.0\n", | |
"entropy(['male', 'female', 'male', 'male']): 0.811278124459\n", | |
"entropy(['female', 'male', 'female', 'male']): 1.0\n", | |
"entropy(['female', 'female', 'male', 'female']): 0.811278124459\n", | |
"entropy(['female', 'female', 'female', 'female']): -0.0\n" | |
] | |
} | |
], | |
"prompt_number": 24 | |
}, | |
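{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Information gain is the drop in entropy after splitting on a feature: IG = H(labels) \u2212 \u03a3<sub>v</sub> (|subset<sub>v</sub>|/N) \u00d7 H(labels in subset<sub>v</sub>). A minimal sketch reusing entropy() from the cell above, on made-up data." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals\n", | |
"# Reuses entropy() defined in the previous cell.\n", | |
"\n", | |
"def information_gain(labeled_items, feature):\n", | |
"    # labeled_items: list of (features-dict, label) pairs.\n", | |
"    labels = [label for (feats, label) in labeled_items]\n", | |
"    base = entropy(labels)\n", | |
"    remainder = 0.0\n", | |
"    values = set(feats[feature] for (feats, label) in labeled_items)\n", | |
"    for v in values:\n", | |
"        subset = [label for (feats, label) in labeled_items if feats[feature] == v]\n", | |
"        remainder += (len(subset) / float(len(labeled_items))) * entropy(subset)\n", | |
"    return base - remainder\n", | |
"\n", | |
"data = [({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'),\n", | |
"        ({'last_letter': 'k'}, 'male'), ({'last_letter': 'k'}, 'male')]\n", | |
"print(\"IG(last_letter):\", information_gain(data, 'last_letter')) # 1.0 for this perfect split" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |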
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"6.5 Naive Bayes Classifiers" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"<img src=\"http://www.nltk.org/images/naive-bayes-triangle.png\" width=\"700\">" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"<img src=\"http://www.nltk.org/images/naive_bayes_bargraph.png\" width=\"700\">" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Underlying Probabilistic Model" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"<img src=\"http://www.nltk.org/images/naive_bayes_graph.png\" width=\"700\">" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Zero Counts and Smoothing" | |
] | |
}, | |
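{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"nltk.NaiveBayesClassifier.train takes an estimator argument that controls smoothing; its default is the expected-likelihood estimator (ELEProbDist, add-0.5). A minimal sketch swapping in Laplace (add-one) smoothing on made-up data." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import print_function, unicode_literals\n", | |
"import nltk\n", | |
"from nltk.probability import LaplaceProbDist\n", | |
"\n", | |
"train_set = [({'last_letter': 'a'}, 'female'), ({'last_letter': 'k'}, 'male'),\n", | |
"             ({'last_letter': 'e'}, 'female'), ({'last_letter': 'o'}, 'male')]\n", | |
"\n", | |
"# Laplace adds 1 to every count, so a feature value never seen with a label\n", | |
"# still gets a small nonzero probability instead of a zero count.\n", | |
"classifier = nltk.NaiveBayesClassifier.train(train_set, estimator=LaplaceProbDist)\n", | |
"print(classifier.classify({'last_letter': 'z'})) # unseen value still classifies" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |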
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Non-Binary Features" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"The Naivete of Independence" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"The Cause of Double-Counting" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"6.6 Maximum Entropy Classifiers" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"The Maximum Entropy Model" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Maximizing Entropy" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Generative Versus Conditional Classifiers" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"6.7 Modeling Linguistic Patterns" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"What Do Models Tell Us?" | |
] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |