Natural Language Processing for Hackers (Chapter 1).ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "########\n",
    "# Deps #\n",
    "########\n",
    "# pip install nltk numpy\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import reuters\n",
    "from random import randrange\n",
    "\n",
    "reuters_words = reuters.words()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#########\n",
    "# Setup #\n",
    "#########\n",
    "\n",
    "# Downloads every NLTK dataset; heavyweight but simple. The packages this\n",
    "# notebook actually needs are 'punkt', 'reuters', 'wordnet', 'stopwords',\n",
    "# 'averaged_perceptron_tagger', 'maxent_ne_chunker', and 'words'.\n",
    "nltk.download('all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 9 sentences in this paragraph:\n",
      "AMPLE SUPPLIES LIMIT U.S. STRIKE'S OIL PRICE IMPACT\n",
      "  Ample supplies of OPEC crude weighing on\n",
      "  world markets helped limit and then reverse oil price gains\n",
      "  that followed the U.S. Strike on an Iranian oil platform in the\n",
      "  Gulf earlier on Monday, analysts said.\n",
      "  December loading rose to 19.65 dlrs, up 45 cents before\n",
      "  falling to around 19.05/15 later, unchanged from last Friday.\n",
      "  \"Fundamentals are awful,\" said Philip Lambert, analyst with\n",
      "  stockbrokers Kleinwort Grieveson, adding that total OPEC\n",
      "  production in the first week of October could be above 18.5 mln\n",
      "  bpd, little changed from September levels.\n",
      "  Peter Nicol, analyst at Chase Manhattan Bank, said OPEC\n",
      "  production could be about 18.5-19.0 mln in October. Reuter and\n",
      "  International Energy Agency (IEA) estimates put OPEC September\n",
      "  production at 18.5 mln bpd.\n",
      "  The U.S. Attack was in retaliation of last Friday's hit of\n",
      "  a Kuwaiti oil products tanker flying the U.S. Flag, the Sea\n",
      "  Isle City. It was struc...\n",
      "\n",
      "1.\n",
      "AMPLE SUPPLIES LIMIT U.S. STRIKE'S OIL PRICE IMPACT\n",
      "  Ample supplies of OPEC crude weighing on\n",
      "  world markets helped limit and then reverse oil price gains\n",
      "  that followed the U.S. Strike on an Iranian oil platform in the\n",
      "  Gulf earlier on Monday, analysts said.\n",
      "2.\n",
      "December loading rose to 19.65 dlrs, up 45 cents before\n",
      "  falling to around 19.05/15 later, unchanged from last Friday.\n",
      "3.\n",
      "\"Fundamentals are awful,\" said Philip Lambert, analyst with\n",
      "  stockbrokers Kleinwort Grieveson, adding that total OPEC\n",
      "  production in the first week of October could be above 18.5 mln\n",
      "  bpd, little changed from September levels.\n",
      "4.\n",
      "Peter Nicol, analyst at Chase Manhattan Bank, said OPEC\n",
      "  production could be about 18.5-19.0 mln in October.\n",
      "5.\n",
      "Reuter and\n",
      "  International Energy Agency (IEA) estimates put OPEC September\n",
      "  production at 18.5 mln bpd.\n",
      "6.\n",
      "The U.S.\n",
      "7.\n",
      "Attack was in retaliation of last Friday's hit of\n",
      "  a Kuwaiti oil products tanker flying the U.S.\n",
      "8.\n",
      "Flag, the Sea\n",
      "  Isle City.\n",
      "9.\n",
      "It was struc\n"
     ]
    }
   ],
   "source": [
    "#############\n",
    "# Sentences #\n",
    "#############\n",
    "\n",
    "paragraph = reuters.raw('test/21131')[:1000]\n",
    "sentences = nltk.sent_tokenize(paragraph)\n",
    "num_sentences = len(sentences)\n",
    "\n",
    "print(f'Found {num_sentences} sentences in this paragraph:\\n{paragraph}...\\n')\n",
    "for i in range(num_sentences):\n",
    "    sentence = sentences[i]\n",
    "    print(f'{i + 1}.\\n{sentence}')"
   ]
  },
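  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The output above shows `sent_tokenize` splitting on the periods in \"U.S.\" (sentences 6-9). A minimal sketch of one fix: seed a Punkt tokenizer with hand-picked abbreviations. The abbreviation list here is illustrative, and this tokenizer starts from scratch rather than from the pretrained English model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer\n",
    "\n",
    "# Abbreviations are stored lowercase, without the trailing period.\n",
    "# This list is illustrative, not exhaustive.\n",
    "punkt_params = PunktParameters()\n",
    "punkt_params.abbrev_types = set(['u.s', 'mln', 'dlrs'])\n",
    "abbrev_tokenizer = PunktSentenceTokenizer(punkt_params)\n",
    "\n",
    "for i, sentence in enumerate(abbrev_tokenizer.tokenize(paragraph), 1):\n",
    "    print(f'{i}.\\n{sentence}')"
   ]
  },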
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"Don't say that Randy Quaid is a good example.\"]\n",
      "['Do', \"n't\", 'say', 'that', 'Randy', 'Quaid', 'is', 'a', 'good', 'example', '.']\n"
     ]
    }
   ],
   "source": [
    "#########\n",
    "# Words #\n",
    "#########\n",
    "\n",
    "easy_sentence = \"Don't say that Randy Quaid is a good example.\"\n",
    "print(nltk.sent_tokenize(easy_sentence))\n",
    "easy_words = nltk.word_tokenize(easy_sentence)\n",
    "print(easy_words)"
   ]
  },
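  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`word_tokenize` is only one of the tokenizers NLTK ships. `wordpunct_tokenize`, for example, splits on every punctuation run, so the contraction comes apart differently; a quick comparison:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# wordpunct_tokenize splits on punctuation boundaries, so \"Don't\"\n",
    "# becomes something like ['Don', \"'\", 't'] rather than ['Do', \"n't\"].\n",
    "print(nltk.wordpunct_tokenize(easy_sentence))"
   ]
  },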
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "most common words are: [('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037)]\n",
      "\"stock\" occurs 2346 times\n",
      "\"stork\" occurs 0 times\n",
      "frequency of the word \"the\" is 0.033849129031826936\n",
      "37.37% (15545 / 41600) of the words occur only once. Here is a random example: ERB\n",
      "samples: 1720901\n"
     ]
    }
   ],
   "source": [
    "############################\n",
    "# Occurrences, basic stats #\n",
    "############################\n",
    "\n",
    "def percent (num):\n",
    "    return '%.2f' % (num * 100)\n",
    "\n",
    "fdist = nltk.FreqDist(reuters_words)\n",
    "most_common = fdist.most_common(n=10)\n",
    "\n",
    "stock_count = fdist['stock']\n",
    "stork_count = fdist['stork']\n",
    "\n",
    "the_freq = fdist.freq('the')\n",
    "\n",
    "singles = fdist.hapaxes()  # \"hapaxes\" is the term for words that occur only once\n",
    "num_singles = len(singles)\n",
    "total = len(fdist.keys())\n",
    "percent_single = percent(num_singles / total)\n",
    "index = randrange(num_singles)\n",
    "example = singles[index]\n",
    "\n",
    "num_samples = fdist.N()\n",
    "\n",
    "print('most common words are: ', most_common)\n",
    "print(f'\"stock\" occurs {stock_count} times')\n",
    "print(f'\"stork\" occurs {stork_count} times')\n",
    "print(f'frequency of the word \"the\" is {the_freq}')\n",
    "print(f'{percent_single}% ({num_singles} / {total}) of the words occur only once. Here is a random example:', example)\n",
    "print('samples:', num_samples)"
   ]
  },
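  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The top-10 list above is mostly punctuation and stopwords. A sketch of a cleaner count, assuming NLTK's English stopword list (included in `nltk.download('all')`):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "\n",
    "# Keep alphabetic tokens only, lowercased, minus English stopwords.\n",
    "stops = set(stopwords.words('english'))\n",
    "content_words = (\n",
    "    w.lower() for w in reuters_words\n",
    "    if w.isalpha() and w.lower() not in stops\n",
    ")\n",
    "fdist_content = nltk.FreqDist(content_words)\n",
    "print(fdist_content.most_common(10))"
   ]
  },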
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bigrams are pairs of words that are adjacent [('Randy', 'works'), ('works', 'at'), ('at', 'Quaid'), ('Quaid', 'Corp'), ('Corp', '.')]\n",
      "trigrams are similar but for 3 words [('Randy', 'works', 'at'), ('works', 'at', 'Quaid'), ('at', 'Quaid', 'Corp'), ('Quaid', 'Corp', '.')]\n"
     ]
    }
   ],
   "source": [
    "###########\n",
    "# n-grams #\n",
    "###########\n",
    "\n",
    "text = \"Randy works at Quaid Corp.\"\n",
    "tokens = nltk.word_tokenize(text)\n",
    "bigrams = nltk.bigrams(tokens)\n",
    "trigrams = nltk.trigrams(tokens)\n",
    "\n",
    "print('bigrams are pairs of words that are adjacent', list(bigrams))\n",
    "print('trigrams are similar but for 3 words', list(trigrams))\n"
   ]
  },
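  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Bigrams and trigrams are special cases of `nltk.ngrams`, which takes the window size as a parameter:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same sentence as 4-grams; any n works.\n",
    "print('4-grams:', list(nltk.ngrams(tokens, 4)))"
   ]
  },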
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50 bigrams with highest pointwise mutual information: [('Het', 'Comite'), ('Lago', 'Agrio'), ('Hoare', 'Govett'), ('Dar', 'es'), ('Ranks', 'Hovis'), ('es', 'Salaam'), ('SRI', 'LANKA'), ('CRAZY', 'EDDIE'), ('LORD', 'ABBETT'), ('Corazon', 'Aquino'), ('MERRILL', 'LYNCH'), ('Ay', 'Expd'), ('Abu', 'Dhabi'), ('Sao', 'Paulo'), ('HONG', 'KONG'), ('THOUS', 'BUSHELS'), ('poison', 'pill'), ('Lear', 'Siegler'), ('Pizza', 'Inn'), ('Kleinwort', 'Benson'), ('Hajime', 'Tamura'), ('Gates', 'Learjet'), ('ORANGE', 'JUICE'), ('Wells', 'Fargo'), ('Real', 'Estate'), ('Margaret', 'Thatcher'), ('SANTA', 'FE'), ('Brace', 'Jovanovich'), ('DIAMOND', 'SHAMROCK'), ('Phelps', 'Dodge'), ('Fort', 'Worth'), ('Puerto', 'Rico'), ('del', 'Este'), ('WALL', 'STREET'), ('REAL', 'ESTATE'), ('Hiram', 'Walker'), ('Du', 'Pont'), ('Partly', 'offsetting'), ('Punta', 'del'), ('Hk', 'Hotels'), ('DAYTON', 'HUDSON'), ('Bra', 'Kanon'), ('HUGHES', 'TOOL'), ('Rank', 'Xerox'), ('Exporting', 'Countries'), ('Marlin', 'Fitzwater'), ('King', 'Fahd'), ('Dean', 'Witter'), ('continental', 'shelf'), ('Costa', 'Rica')]\n",
      "50 trigrams with highest pointwise mutual information: [('Dar', 'es', 'Salaam'), ('Punta', 'del', 'Este'), ('Arturo', 'Hernandez', 'Grisanti'), ('Speaker', 'Jim', 'Wright'), ('SANTA', 'FE', 'SOUTHERN'), ('THOUS', 'BUSHELS', 'SOYBEANS'), ('Denis', 'Bra', 'Kanon'), ('Nil', 'Nil', 'Nil'), ('Harcourt', 'Brace', 'Jovanovich'), ('Fernando', 'Santos', 'Alvite'), ('Drexel', 'Burnham', 'Lambert'), ('WALL', 'STREET', 'STOCKS'), ('Liberal', 'Democratic', 'Party'), ('Prev', 'Wk', 'Named'), ('Dean', 'Witter', 'Reynolds'), ('CUSTOMER', 'REPURCHASE', 'AGREEMENTS'), ('Lloyds', 'Shipping', 'Intelligence'), ('Karl', 'Otto', 'Poehl'), ('DISCOUNT', 'BORROWINGS', 'AVERAGE'), ('ADDS', 'RESERVES', 'VIA'), ('VIA', 'CUSTOMER', 'REPURCHASES'), ('Light', 'Louisiana', 'Sweet'), ('President', 'Corazon', 'Aquino'), ('Export', 'Enhancement', 'Program'), ('Exchequer', 'Nigel', 'Lawson'), ('Governor', 'Satoshi', 'Sumita'), ('Rio', 'de', 'Janeiro'), ('governor', 'Satoshi', 'Sumita'), ('Gross', 'Domestic', 'Product'), ('Enhancement', 'Program', 'initiative'), ('ruling', 'Liberal', 'Democratic'), ('OLD', 'RATE', 'MATURITY'), ('Partly', 'offsetting', 'these'), ('Petroleum', 'Exporting', 'Countries'), ('Minister', 'Margaret', 'Thatcher'), ('House', 'Speaker', 'Jim'), ('CURRENT', 'ACCOUNT', 'DEFICIT'), ('Banco', 'do', 'Brasil'), ('DISTILLATE', 'STOCKS', 'OFF'), ('RESERVES', 'VIA', 'CUSTOMER'), ('Representative', 'Clayton', 'Yeutter'), ('ranges', 'broadly', 'consistent'), ('Secretary', 'Caspar', 'Weinberger'), ('offsetting', 'these', 'outflows'), ('MARKET', 'SHORTAGE', 'FORECAST'), ('REUTER', '^', 'M'), ('Marine', 'Midland', 'Banks'), ('DLR', 'CUSTOMER', 'REPURCHASE'), ('Papua', 'New', 'Guinea'), ('President', 'Karl', 'Otto')]\n"
     ]
    }
   ],
   "source": [
    "from nltk.collocations import (\n",
    "    BigramAssocMeasures,\n",
    "    BigramCollocationFinder,\n",
    "    TrigramAssocMeasures,\n",
    "    TrigramCollocationFinder\n",
    ")\n",
    "\n",
    "bigram_measures = BigramAssocMeasures()\n",
    "trigram_measures = TrigramAssocMeasures()\n",
    "\n",
    "def gramStats (*, measures, Finder, prefix, min_freq = 10, pmi_count = 50):\n",
    "    finder = Finder.from_words(reuters_words)\n",
    "    finder.apply_freq_filter(min_freq)\n",
    "    # score with the measures passed in, not the bigram measures from the\n",
    "    # enclosing scope (the original version always used bigram PMI)\n",
    "    highest_pmi = finder.nbest(measures.pmi, pmi_count)\n",
    "    label = f'{prefix}grams'\n",
    "    print(f'{pmi_count} {label} with highest pointwise mutual information:', highest_pmi)\n",
    "\n",
    "gramStats(\n",
    "    measures = bigram_measures,\n",
    "    Finder = BigramCollocationFinder,\n",
    "    prefix = 'bi'\n",
    ")\n",
    "\n",
    "gramStats(\n",
    "    measures = trigram_measures,\n",
    "    Finder = TrigramCollocationFinder,\n",
    "    prefix = 'tri'\n",
    ")"
   ]
  },
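  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "PMI tends to favor rare but exclusive pairs, which is why so many proper names rank highest above. The likelihood-ratio measure is a common alternative that gives more weight to frequent collocations; a quick sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-rank the same bigrams by likelihood ratio instead of PMI.\n",
    "finder = BigramCollocationFinder.from_words(reuters_words)\n",
    "finder.apply_freq_filter(10)\n",
    "print(finder.nbest(bigram_measures.likelihood_ratio, 10))"
   ]
  },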
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokens: ['Wikipedia.com', 'says', 'that', 'Randy', 'Randall', 'Rudy', 'Quaid', 'is', 'an', 'American', 'film', 'and', 'television', 'actor', 'and', 'Oscar', 'nominee', 'known', 'for', 'his', 'roles', 'in', 'both', 'serious', 'drama', 'and', 'light', 'comedy', '.']\n",
      "Tagged Tokens: [('Wikipedia.com', 'NNP'), ('says', 'VBZ'), ('that', 'IN'), ('Randy', 'NNP'), ('Randall', 'NNP'), ('Rudy', 'NNP'), ('Quaid', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('American', 'JJ'), ('film', 'NN'), ('and', 'CC'), ('television', 'NN'), ('actor', 'NN'), ('and', 'CC'), ('Oscar', 'NNP'), ('nominee', 'RB'), ('known', 'VBN'), ('for', 'IN'), ('his', 'PRP$'), ('roles', 'NNS'), ('in', 'IN'), ('both', 'DT'), ('serious', 'JJ'), ('drama', 'NN'), ('and', 'CC'), ('light', 'JJ'), ('comedy', 'NN'), ('.', '.')]\n",
      "Named entities (S\n",
      "  Wikipedia.com/NNP\n",
      "  says/VBZ\n",
      "  that/IN\n",
      "  (PERSON Randy/NNP Randall/NNP Rudy/NNP Quaid/NNP)\n",
      "  is/VBZ\n",
      "  an/DT\n",
      "  (GPE American/JJ)\n",
      "  film/NN\n",
      "  and/CC\n",
      "  television/NN\n",
      "  actor/NN\n",
      "  and/CC\n",
      "  (PERSON Oscar/NNP)\n",
      "  nominee/RB\n",
      "  known/VBN\n",
      "  for/IN\n",
      "  his/PRP$\n",
      "  roles/NNS\n",
      "  in/IN\n",
      "  both/DT\n",
      "  serious/JJ\n",
      "  drama/NN\n",
      "  and/CC\n",
      "  light/JJ\n",
      "  comedy/NN\n",
      "  ./.)\n"
     ]
    }
   ],
   "source": [
    "####################################\n",
    "# Tokenization and parts of speech #\n",
    "####################################\n",
    "\n",
    "randy_sentence = \"Wikipedia.com says that Randy Randall Rudy Quaid is an American film and television actor and Oscar nominee known for his roles in both serious drama and light comedy.\"\n",
    "randy_tokens = nltk.word_tokenize(randy_sentence)\n",
    "tagged_tokens = nltk.pos_tag(randy_tokens)\n",
    "ner_annotated_tree = nltk.ne_chunk(tagged_tokens)\n",
    "\n",
    "print(\"Tokens: \", randy_tokens)\n",
    "print(\"Tagged Tokens: \", tagged_tokens)\n",
    "print(\"Named entities\", ner_annotated_tree)\n"
   ]
  },
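  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`ne_chunk` returns an `nltk.Tree`, so the labeled entities can be pulled out by walking its subtrees. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every non-root subtree is a named entity; its leaves are (token, tag) pairs.\n",
    "for subtree in ner_annotated_tree.subtrees(filter=lambda t: t.label() != 'S'):\n",
    "    entity = ' '.join(token for token, tag in subtree.leaves())\n",
    "    print(subtree.label(), '=>', entity)"
   ]
  },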
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S\n",
      "  (PERSON Randy/NNP)\n",
      "  (PERSON Quaid/NNP)\n",
      "  does/VBZ\n",
      "  not/RB\n",
      "  go/VB\n",
      "  to/TO\n",
      "  (ORGANIZATION Harvard/NNP)\n",
      "  ./.)\n"
     ]
    }
   ],
   "source": [
    "sentence = \"\"\"Randy Quaid does not go to Harvard.\"\"\"\n",
    "tokens = nltk.word_tokenize(sentence)\n",
    "tagged_tokens = nltk.pos_tag(tokens)\n",
    "ner_annotated_tree = nltk.ne_chunk(tagged_tokens)\n",
    "print(ner_annotated_tree)"
   ]
  },
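  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that \"Randy\" and \"Quaid\" were chunked as two separate PERSON entities here. Passing `binary=True` collapses all entity types into a single NE label, which can change how adjacent name tokens are grouped:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# binary=True tags chunks as plain NE instead of PERSON/ORGANIZATION/etc.\n",
    "print(nltk.ne_chunk(tagged_tokens, binary=True))"
   ]
  },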
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Synsets for car (filter = None):\n",
      "\n",
      "car.n.01:\n",
      "['car', 'auto', 'automobile', 'machine', 'motorcar']\n",
      "a motor vehicle with four wheels; usually propelled by an internal combustion engine\n",
      "\n",
      "car.n.02:\n",
      "['car', 'railcar', 'railway_car', 'railroad_car']\n",
      "a wheeled vehicle adapted to the rails of railroad\n",
      "\n",
      "car.n.03:\n",
      "['car', 'gondola']\n",
      "the compartment that is suspended from an airship and that carries personnel and the cargo and the power plant\n",
      "\n",
      "car.n.04:\n",
      "['car', 'elevator_car']\n",
      "where passengers ride up and down\n",
      "\n",
      "cable_car.n.01:\n",
      "['cable_car', 'car']\n",
      "a conveyance for passengers or freight on a cable railway\n",
      "\n",
      "------\n",
      "\n",
      "Synsets for fight (filter = None):\n",
      "\n",
      "battle.n.01:\n",
      "['battle', 'conflict', 'fight', 'engagement']\n",
      "a hostile meeting of opposing military forces in the course of a war\n",
      "\n",
      "fight.n.02:\n",
      "['fight', 'fighting', 'combat', 'scrap']\n",
      "the act of fighting; any contest or struggle\n",
      "\n",
      "competitiveness.n.01:\n",
      "['competitiveness', 'fight']\n",
      "an aggressive willingness to compete\n",
      "\n",
      "fight.n.04:\n",
      "['fight']\n",
      "an intense verbal dispute\n",
      "\n",
      "fight.n.05:\n",
      "['fight']\n",
      "a boxing or wrestling match\n",
      "\n",
      "contend.v.06:\n",
      "['contend', 'fight', 'struggle']\n",
      "be engaged in a fight; carry on a fight\n",
      "\n",
      "fight.v.02:\n",
      "['fight', 'oppose', 'fight_back', 'fight_down', 'defend']\n",
      "fight against or resist strongly\n",
      "\n",
      "fight.v.03:\n",
      "['fight', 'struggle']\n",
      "make a strenuous or labored effort\n",
      "\n",
      "crusade.v.01:\n",
      "['crusade', 'fight', 'press', 'campaign', 'push', 'agitate']\n",
      "exert oneself continuously, vigorously, or obtrusively to gain an end or engage in a crusade for a certain cause or person; be an advocate for\n",
      "\n",
      "------\n",
      "\n",
      "Synsets for fight (filter = n):\n",
      "\n",
      "battle.n.01:\n",
      "['battle', 'conflict', 'fight', 'engagement']\n",
      "a hostile meeting of opposing military forces in the course of a war\n",
      "\n",
      "fight.n.02:\n",
      "['fight', 'fighting', 'combat', 'scrap']\n",
      "the act of fighting; any contest or struggle\n",
      "\n",
      "competitiveness.n.01:\n",
      "['competitiveness', 'fight']\n",
      "an aggressive willingness to compete\n",
      "\n",
      "fight.n.04:\n",
      "['fight']\n",
      "an intense verbal dispute\n",
      "\n",
      "fight.n.05:\n",
      "['fight']\n",
      "a boxing or wrestling match\n",
      "\n",
      "------\n",
      "\n"
     ]
    }
   ],
   "source": [
    "###########\n",
    "# Wordnet #\n",
    "###########\n",
    "\n",
    "wordnet = nltk.corpus.wordnet\n",
    "\n",
    "def printSynsets (*, word, filter = None):\n",
    "    print(f'Synsets for {word} (filter = {filter}):')\n",
    "    synsets = wordnet.synsets(word, filter)\n",
    "    for synset in synsets:\n",
    "        print()\n",
    "        print(f'{synset.name()}:')\n",
    "        print([l.name() for l in synset.lemmas()])\n",
    "        print(synset.definition())\n",
    "    print('\\n------\\n')\n",
    "\n",
    "printSynsets(\n",
    "    word = 'car'\n",
    ")\n",
    "\n",
    "printSynsets(\n",
    "    word = 'fight'\n",
    ")\n",
    "\n",
    "# just the nouns\n",
    "printSynsets(\n",
    "    word = 'fight',\n",
    "    filter = wordnet.NOUN\n",
    ")\n"
   ]
  },
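  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because synsets sit in a shared hypernym graph, WordNet can also score how related two senses are. `path_similarity` returns a value between 0 and 1 (1 means the same node); a small sketch using a few real synset names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distance-based relatedness over the hypernym graph.\n",
    "car = wordnet.synset('car.n.01')\n",
    "truck = wordnet.synset('truck.n.01')\n",
    "boat = wordnet.synset('boat.n.01')\n",
    "print('car ~ truck:', car.path_similarity(truck))\n",
    "print('car ~ boat: ', car.path_similarity(boat))"
   ]
  },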
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Synset('walk.v.01')\n"
     ]
    }
   ],
   "source": [
    "# Get a single synset by name\n",
    "walk_verb = wordnet.synset('walk.v.01')\n",
    "print(walk_verb)"
   ]
  },
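  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A synset object carries more than its name; for example its definition, its lemmas, and its hypernyms (more general senses):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(walk_verb.definition())\n",
    "print([l.name() for l in walk_verb.lemmas()])\n",
    "print(walk_verb.hypernyms())"
   ]
  },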
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(vehicle.n.01\n",
      "  (bumper_car.n.01 )\n",
      "  (craft.n.02\n",
      "    (aircraft.n.01\n",
      "      (bogy.n.01 )\n",
      "      (cruise_missile.n.01 )\n",
      "      (heavier-than-air_craft.n.01\n",
      "        (airplane.n.01\n",
      "          (airliner.n.01\n",
      "            (airbus.n.01 )\n",
      "            (narrowbody_aircraft.n.01 )\n",
      "            (widebody_aircraft.n.01 ))\n",
      "          (amphibian.n.02 )\n",
      "          (biplane.n.01 )\n",
      "          (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
      "          (delta_wing.n.01 )\n",
      "          (fighter.n.02\n",
      "            (interceptor.n.01 )\n",
      "            (kamikaze.n.01 )\n",
      "            (stealth_fighter.n.01 ))\n",
      "          (hangar_queen.n.01 )\n",
      "          (jet.n.01\n",
      "            (fanjet.n.01 )\n",
      "            (jetliner.n.01 )\n",
      "            (jumbojet.n.01 )\n",
      "            (twinjet.n.01 ))\n",
      "          (monoplane.n.01 )\n",
      "          (multiengine_airplane.n.01 ))\n",
      "        (autogiro.n.01 )\n",
      "        (drone.n.04 )\n",
      "        (glider.n.01 (hang_glider.n.02 ))\n",
      "        (helicopter.n.01\n",
      "          (cargo_helicopter.n.01 )\n",
      "          (shuttle_helicopter.n.01 )\n",
      "          (single-rotor_helicopter.n.01 )\n",
      "          (skyhook.n.01 ))\n",
      "        (orthopter.n.01 )\n",
      "        (warplane.n.01\n",
      "          (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
      "          (fighter.n.02\n",
      "            (interceptor.n.01 )\n",
      "            (kamikaze.n.01 )\n",
      "            (stealth_fighter.n.01 ))\n",
      "          (reconnaissance_plane.n.01 )))\n",
      "      (lighter-than-air_craft.n.01\n",
      "        (airship.n.01\n",
      "          (barrage_balloon.n.01 (kite_balloon.n.01 ))\n",
      "          (blimp.n.02 ))\n",
      "        (balloon.n.01\n",
      "          (hot-air_balloon.n.01 )\n",
      "          (meteorological_balloon.n.01 (pilot_balloon.n.01 ))\n",
      "          (trial_balloon.n.02 )))\n",
      "      (stealth_aircraft.n.01\n",
      "        (stealth_bomber.n.01 )\n",
      "        (stealth_fighter.n.01 )))\n",
      "    (hovercraft.n.01 )\n",
      "    (landing_craft.n.01 )\n",
      "    (spacecraft.n.01\n",
      "      (lander.n.02 )\n",
      "      (lunar_excursion_module.n.01 )\n",
      "      (space_capsule.n.01 )\n",
      "      (space_shuttle.n.01 )\n",
      "      (starship.n.01 ))\n",
      "    (vessel.n.02\n",
      "      (bareboat.n.01 )\n",
      "      (boat.n.01\n",
      "        (ark.n.02 )\n",
      "        (barge.n.01\n",
      "          (dredger.n.01 )\n",
      "          (houseboat.n.01 )\n",
      "          (pontoon.n.01 )\n",
      "          (scow.n.02 )\n",
      "          (wherry.n.01 ))\n",
      "        (bumboat.n.01 )\n",
      "        (canal_boat.n.01 )\n",
      "        (ferry.n.01 (car-ferry.n.01 ))\n",
      "        (fireboat.n.01 )\n",
      "        (gondola.n.02 )\n",
      "        (guard_boat.n.01 )\n",
      "        (gunboat.n.01 )\n",
      "        (junk.n.02 ))\n",
      "      (fishing_boat.n.01 (trawler.n.02 ))\n",
      "      (galley.n.01 )\n",
      "      (galley.n.02 (trireme.n.01 ))\n",
      "      (iceboat.n.02 )\n",
      "      (patrol_boat.n.01 )\n",
      "      (sailing_vessel.n.01\n",
      "        (bark.n.03 )\n",
      "        (brig.n.01 )\n",
      "        (brigantine.n.01 )\n",
      "        (clipper.n.02 )\n",
      "        (cutter.n.05 )\n",
      "        (dhow.n.01 )\n",
      "        (felucca.n.01 )\n",
      "        (fore-and-after.n.01 )\n",
      "        (galleon.n.01 (carrack.n.01 ))\n",
      "        (indiaman.n.01 ))\n",
      "      (ship.n.01\n",
      "        (abandoned_ship.n.01 )\n",
      "        (blockade-runner.n.01 )\n",
      "        (cargo_ship.n.01\n",
      "          (banana_boat.n.01 )\n",
      "          (bottom.n.07 )\n",
      "          (cattleship.n.01 )\n",
      "          (container_ship.n.01 )\n",
      "          (liberty_ship.n.01 )\n",
      "          (oil_tanker.n.01 (supertanker.n.01 )))\n",
      "        (flagship.n.02 )\n",
      "        (gas-turbine_ship.n.01 )\n",
      "        (hospital_ship.n.01 )\n",
      "        (hulk.n.02 )\n",
      "        (icebreaker.n.01 )\n",
      "        (lightship.n.01 )\n",
      "        (minelayer.n.01 ))\n",
      "      (shrimper.n.01 )))\n",
      "  (military_vehicle.n.01\n",
      "    (caisson.n.02 )\n",
      "    (half_track.n.01 )\n",
      "    (humvee.n.01 )\n",
      "    (personnel_carrier.n.01 )\n",
      "    (picket.n.04 (picket_boat.n.01 ) (picket_ship.n.01 ))\n",
      "    (reconnaissance_vehicle.n.01 )\n",
      "    (tank.n.01 (panzer.n.01 ))\n",
      "    (technical.n.01 )\n",
      "    (troop_carrier.n.01 (troopship.n.01 ))\n",
      "    (warplane.n.01\n",
      "      (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
      "      (fighter.n.02\n",
      "        (interceptor.n.01 )\n",
      "        (kamikaze.n.01 )\n",
      "        (stealth_fighter.n.01 ))\n",
      "      (reconnaissance_plane.n.01 )))\n",
      "  (rocket.n.01\n",
      "    (missile.n.01\n",
      "      (air-to-air_missile.n.01 )\n",
      "      (air-to-ground_missile.n.01 )\n",
      "      (ballistic_missile.n.01\n",
      "        (intercontinental_ballistic_missile.n.01 (minuteman.n.02 )))\n",
      "      (guided_missile.n.01\n",
      "        (antiballistic_missile.n.01 )\n",
      "        (buzz_bomb.n.01 )\n",
      "        (exocet.n.01 )\n",
      "        (space_probe.n.01 )\n",
      "        (surface-to-air_missile.n.01 (manpad.n.01 ) (stinger.n.03 )))\n",
      "      (heat-seeking_missile.n.01\n",
      "        (brilliant_pebble.n.01 )\n",
      "        (stinger.n.03 ))\n",
      "      (sidewinder.n.02 ))\n",
      "    (multistage_rocket.n.01 )\n",
      "    (test_rocket.n.01 (sounding_rocket.n.01 )))\n",
      "  (skibob.n.01 )\n",
      "  (sled.n.01\n",
      "    (bobsled.n.01 )\n",
      "    (bobsled.n.02 )\n",
      "    (dogsled.n.01 )\n",
      "    (luge.n.01 )\n",
      "    (pung.n.01 )\n",
      "    (toboggan.n.01 ))\n",
      "  (steamroller.n.02 )\n",
      "  (wheeled_vehicle.n.01\n",
      "    (baby_buggy.n.01 (bassinet.n.02 ))\n",
      "    (bicycle.n.01\n",
      "      (bicycle-built-for-two.n.01 )\n",
      "      (mountain_bike.n.01 )\n",
      "      (ordinary.n.04 )\n",
      "      (push-bike.n.01 )\n",
      "      (safety_bicycle.n.01 )\n",
      "      (velocipede.n.01 ))\n",
      "    (boneshaker.n.01 )\n",
      "    (car.n.02\n",
      "      (baggage_car.n.01 )\n",
      "      (cabin_car.n.01 )\n",
      "      (club_car.n.01 )\n",
      "      (freight_car.n.01\n",
      "        (boxcar.n.01 (stockcar.n.01 ))\n",
      "        (cattle_car.n.01 )\n",
      "        (coal_car.n.01 )\n",
      "        (flatcar.n.01 )\n",
      "        (gondola_car.n.01 )\n",
      "        (refrigerator_car.n.01 )\n",
      "        (tank_car.n.01 ))\n",
      "      (guard's_van.n.01 )\n",
      "      (handcar.n.01 )\n",
      "      (mail_car.n.01 )\n",
      "      (passenger_car.n.01\n",
      "        (dining_car.n.01 )\n",
      "        (nonsmoker.n.02 )\n",
      "        (parlor_car.n.01 )\n",
      "        (pullman.n.01 )\n",
      "        (sleeping_car.n.01 )\n",
      "        (smoker.n.03 ))\n",
      "      (slip_coach.n.01 )\n",
      "      (tender.n.04 ))\n",
      "    (handcart.n.01\n",
      "      (applecart.n.02 )\n",
      "      (barrow.n.03 )\n",
      "      (hand_truck.n.01 )\n",
      "      (laundry_cart.n.01 )\n",
      "      (serving_cart.n.01 (pastry_cart.n.01 ) (tea_cart.n.01 ))\n",
      "      (shopping_cart.n.01 ))\n",
      "    (horse-drawn_vehicle.n.01\n",
      "      (carriage.n.02\n",
      "        (barouche.n.01 )\n",
      "        (brougham.n.01 )\n",
      "        (buckboard.n.01 )\n",
      "        (buggy.n.01 )\n",
      "        (cab.n.02 )\n",
      "        (caroche.n.01 )\n",
      "        (chaise.n.02 )\n",
      "        (chariot.n.01 )\n",
      "        (clarence.n.01 )\n",
      "        (coach.n.04 (stagecoach.n.01 )))\n",
      "      (chariot.n.02 )\n",
      "      (limber.n.01 )\n",
      "      (sulky.n.01 ))\n",
      "    (motor_scooter.n.01 )\n",
      "    (rolling_stock.n.01 )\n",
      "    (scooter.n.02 )\n",
      "    (self-propelled_vehicle.n.01\n",
      "      (armored_vehicle.n.01\n",
      "        (armored_car.n.01 )\n",
      "        (armored_car.n.02 )\n",
      "        (armored_personnel_carrier.n.01 )\n",
      "        (assault_gun.n.02 )\n",
      "        (tank.n.01 (panzer.n.01 ))\n",
      "        (tank_destroyer.n.01 ))\n",
      "      (carrier.n.02 )\n",
      "      (forklift.n.01 )\n",
      "      (locomotive.n.01\n",
      "        (choo-choo.n.01 )\n",
      "        (diesel_locomotive.n.01\n",
      "          (diesel-electric_locomotive.n.01 )\n",
      "          (diesel-hydraulic_locomotive.n.01 ))\n",
      "        (dinky.n.01 )\n",
      "        (electric_locomotive.n.01 )\n",
      "        (iron_horse.n.01 )\n",
      "        (pilot_engine.n.01 )\n",
      "        (shunter.n.01 )\n",
      "        (steam_locomotive.n.01 )\n",
      "        (switch_engine.n.01 )\n",
      "        (tank_engine.n.01 ))\n",
      "      (motor_vehicle.n.01\n",
      "        (amphibian.n.01 (swamp_buggy.n.01 ))\n",
      "        (bloodmobile.n.01 )\n",
      "        (car.n.01\n",
      "          (ambulance.n.01 (funny_wagon.n.01 ))\n",
      "          (beach_wagon.n.01 (shooting_brake.n.01 ))\n",
      "          (bus.n.04 )\n",
      "          (cab.n.03 (gypsy_cab.n.01 ) (minicab.n.01 ))\n",
      "          (compact.n.03 )\n",
      "          (convertible.n.01 )\n",
      "          (coupe.n.01 )\n",
      "          (cruiser.n.01 (panda_car.n.01 ))\n",
      "          (electric.n.01 )\n",
      "          (gas_guzzler.n.01 ))\n",
      "        (doodlebug.n.01 )\n",
      "        (four-wheel_drive.n.01 )\n",
      "        (go-kart.n.01 )\n",
      "        (golfcart.n.01 )\n",
      "        (hearse.n.01 )\n",
      "        (motorcycle.n.01\n",
      "          (minibike.n.01 (moped.n.01 ))\n",
      "          (trail_bike.n.01 ))\n",
      "        (snowplow.n.01 ))\n",
      "      (personnel_carrier.n.01 )\n",
      "      (reconnaissance_vehicle.n.01 )\n",
      "      (recreational_vehicle.n.01\n",
      "        (camper.n.02 (van.n.04 ))\n",
      "        (dune_buggy.n.01 ))\n",
      "      (streetcar.n.01 (horsecar.n.01 ))\n",
      "      (tracked_vehicle.n.01\n",
      "        (caterpillar.n.02 )\n",
      "        (half_track.n.01 )\n",
      "        (snowmobile.n.01 (sno-cat.n.01 ))\n",
      "        (tank.n.01 (panzer.n.01 ))))))\n"
     ]
    }
   ],
   "source": [
    "def treeify (word):\n",
    "    def treeifyHyponyms (*, hyponym, max_breadth = 10, max_depth = 10, depth = 0):\n",
    "        def _treeify (hyponym):\n",
    "            return treeifyHyponyms(\n",
    "                hyponym = hyponym,\n",
    "                depth = depth + 1\n",
    "            )\n",
    "\n",
    "        children = []\n",
    "        if (depth < max_depth):\n",
    "            hyponyms = hyponym.hyponyms()[:max_breadth]\n",
    "            children = [_treeify(h) for h in hyponyms]\n",
    "\n",
    "        name = hyponym.name()\n",
    "        return nltk.Tree(name, children)\n",
    "\n",
    "    # the original called a bare synsets(), which is undefined here\n",
    "    return treeifyHyponyms(hyponym = wordnet.synsets(word)[0])\n",
    "\n",
    "# The above is a generalization of this example from the text:\n",
    "#\n",
    "# vehicle = synsets('car')[0]\n",
    "# t = nltk.Tree(vehicle.name(), children=[\n",
    "#     nltk.Tree(vehicle.hyponyms()[3].name(), children=[]),\n",
    "#     nltk.Tree(vehicle.hyponyms()[4].name(), children=[]),\n",
    "#     nltk.Tree(vehicle.hyponyms()[5].name(), children=[]),\n",
    "#     nltk.Tree(vehicle.hyponyms()[7].name(), children=[\n",
    "#         nltk.Tree(vehicle.hyponyms()[7].hyponyms()[1].name(), children=[]),\n",
    "#         nltk.Tree(vehicle.hyponyms()[7].hyponyms()[3].name(), children=[]),\n",
    "#         nltk.Tree(vehicle.hyponyms()[7].hyponyms()[4].name(), children=[]),\n",
    "#         nltk.Tree(vehicle.hyponyms()[7].hyponyms()[5].name(), children=[]),\n",
    "#         nltk.Tree(vehicle.hyponyms()[7].hyponyms()[6].name(), children=[]),\n",
    "#     ]),\n",
    "# ])\n",
    "# print(t)\n",
    "\n",
    "vehicle_tree = treeify('vehicle')\n",
    "print(vehicle_tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(car.n.01\n",
      "  (ambulance.n.01 (funny_wagon.n.01 ))\n",
      "  (beach_wagon.n.01 (shooting_brake.n.01 ))\n",
      "  (bus.n.04 )\n",
      "  (cab.n.03 (gypsy_cab.n.01 ) (minicab.n.01 ))\n",
      "  (compact.n.03 )\n",
      "  (convertible.n.01 )\n",
      "  (coupe.n.01 )\n",
      "  (cruiser.n.01 (panda_car.n.01 ))\n",
      "  (electric.n.01 )\n",
      "  (gas_guzzler.n.01 ))\n"
     ]
    }
   ],
   "source": [
    "car_tree = treeify('car')\n",
    "print(car_tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(vomit.n.01 )\n"
     ]
    }
   ],
   "source": [
    "barf_tree = treeify('barf')\n",
    "print(barf_tree)"
   ]
  },
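  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`hyponyms()` walks down the taxonomy; `hypernym_paths()` goes the other way, listing every chain from a synset up to the root:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each path runs from the root of the taxonomy down to car.n.01.\n",
    "for path in wordnet.synset('car.n.01').hypernym_paths():\n",
    "    print(' -> '.join(s.name() for s in path))"
   ]
  },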
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "friend       => friend\n",
      "friends      => friend\n",
      "friendly     => friend\n",
      "drink        => drink\n",
      "drinks       => drink\n",
      "drunk        => drunk\n",
      "drank        => drank\n",
      "slow         => slow\n",
      "slowing      => slow\n",
      "slowly       => slowli\n",
      "slower       => slower\n",
      "slowest      => slowest\n",
      "quaid        => quaid\n",
      "xyzing       => xyze\n",
      "exerxzyzing  => exerxzyz\n"
     ]
    }
   ],
   "source": [
    "###########\n",
    "# Stemmer #\n",
    "###########\n",
    "\n",
    "from functools import reduce\n",
    "\n",
    "stemmer = nltk.stem.SnowballStemmer('english')\n",
    "\n",
    "words = [\n",
    "    'friend',\n",
    "    'friends',\n",
    "    'friendly',\n",
    "    'drink',\n",
    "    'drinks',\n",
    "    'drunk',\n",
    "    'drank',\n",
    "    'slow',\n",
    "    'slowing',\n",
    "    'slowly',\n",
    "    'slower',\n",
    "    'slowest',\n",
    "    'quaid',\n",
    "    'xyzing',\n",
    "    'exerxzyzing'\n",
    "]\n",
    "\n",
    "def printDict (mapping):\n",
    "    def maxlen (a, b):\n",
    "        a_len = len(a)\n",
    "        b_len = len(b)\n",
    "        return a if (a_len > b_len) else b\n",
    "\n",
    "    max_length = len(reduce(maxlen, mapping.keys()))\n",
    "\n",
    "    for k, v in mapping.items():\n",
    "        label = k.ljust(max_length + 1)\n",
    "        print(f'{label} => {v}')\n",
    "\n",
    "def mapWords (*, words, mapper):\n",
    "    return dict((word, mapper(word)) for word in words)\n",
    "\n",
    "printDict(mapWords(\n",
    "    words = words,\n",
    "    mapper = lambda word: stemmer.stem(word)\n",
    "))\n"
   ]
  },
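  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Snowball (\"Porter2\") is a revision of the original Porter algorithm; the two mostly agree but differ on some words. Reusing the helpers above to compare:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "porter = nltk.stem.PorterStemmer()\n",
    "\n",
    "printDict(mapWords(\n",
    "    words = words,\n",
    "    mapper = lambda word: porter.stem(word)\n",
    "))"
   ]
  },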
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "lemmatizing as noun\n",
      "friend       => friend\n",
      "friends      => friend\n",
      "friendly     => friendly\n",
      "drink        => drink\n",
      "drinks       => drink\n",
      "drunk        => drunk\n",
      "drank        => drank\n",
      "slow         => slow\n",
      "slowing      => slowing\n",
      "slowly       => slowly\n",
      "slower       => slower\n",
      "slowest      => slowest\n",
      "quaid        => quaid\n",
      "xyzing       => xyzing\n",
      "exerxzyzing  => exerxzyzing\n",
      "\n",
      "lemmatizing as verb\n",
      "friend       => friend\n",
      "friends      => friends\n",
      "friendly     => friendly\n",
      "drink        => drink\n",
      "drinks       => drink\n",
      "drunk        => drink\n",
      "drank        => drink\n",
      "slow         => slow\n",
      "slowing      => slow\n",
      "slowly       => slowly\n",
      "slower       => slower\n",
      "slowest      => slowest\n",
      "quaid        => quaid\n",
      "xyzing       => xyzing\n",
      "exerxzyzing  => exerxzyzing\n",
      "\n",
      "lemmatizing as adjective\n",
      "friend       => friend\n",
      "friends      => friends\n",
      "friendly     => friendly\n",
      "drink        => drink\n",
      "drinks       => drinks\n",
      "drunk        => drunk\n",
      "drank        => drank\n",
      "slow         => slow\n",
      "slowing      => slowing\n",
      "slowly       => slowly\n",
      "slower       => slow\n",
      "slowest      => slow\n",
      "quaid        => quaid\n",
      "xyzing       => xyzing\n",
      "exerxzyzing  => exerxzyzing\n",
      "\n"
     ]
    }
   ],
   "source": [
    "##############\n",
    "# Lemmatizer #\n",
    "##############\n",
    "\n",
    "lemmatizer = nltk.stem.WordNetLemmatizer()\n",
    "\n",
    "for part_of_speech in ['noun', 'verb', 'adjective']:\n",
    "    print(f'lemmatizing as {part_of_speech}')\n",
    "    pos = part_of_speech[0]\n",
    "    printDict(mapWords(\n",
    "        words = words,\n",
    "        mapper = lambda word: lemmatizer.lemmatize(word, pos)\n",
    "    ))\n",
    "    print()\n"
   ]
  },
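  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The tables above show that the lemma depends on which part of speech you ask for. A sketch that picks the POS automatically from `nltk.pos_tag` output, using the usual first-letter mapping from Penn Treebank tags to WordNet constants (the example sentence is made up):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wordnetPos (treebank_tag):\n",
    "    # Map Treebank tag prefixes (JJ/VB/NN/RB) to WordNet POS constants,\n",
    "    # defaulting to noun for everything else.\n",
    "    mapping = {\n",
    "        'J': wordnet.ADJ,\n",
    "        'V': wordnet.VERB,\n",
    "        'N': wordnet.NOUN,\n",
    "        'R': wordnet.ADV\n",
    "    }\n",
    "    return mapping.get(treebank_tag[0], wordnet.NOUN)\n",
    "\n",
    "sentence = 'The drinks were drunk while friends drove slower cars.'\n",
    "for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):\n",
    "    print(token, '=>', lemmatizer.lemmatize(token, wordnetPos(tag)))"
   ]
  },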
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}