Natural Language Processing for Hackers (Chapter 1).ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"########\n",
"# Deps #\n",
"########\n",
"# pip install nltk numpy\n",
"\n",
"import nltk\n",
"from nltk.corpus import reuters\n",
"from random import randrange\n",
"\n",
"reuters_words = nltk.corpus.reuters.words()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#########\n",
"# Setup #\n",
"#########\n",
"\n",
"nltk.download('all')"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 9 sentences in this paragraph:\n",
"AMPLE SUPPLIES LIMIT U.S. STRIKE'S OIL PRICE IMPACT\n",
" Ample supplies of OPEC crude weighing on\n",
" world markets helped limit and then reverse oil price gains\n",
" that followed the U.S. Strike on an Iranian oil platform in the\n",
" Gulf earlier on Monday, analysts said.\n",
" December loading rose to 19.65 dlrs, up 45 cents before\n",
" falling to around 19.05/15 later, unchanged from last Friday.\n",
" \"Fundamentals are awful,\" said Philip Lambert, analyst with\n",
" stockbrokers Kleinwort Grieveson, adding that total OPEC\n",
" production in the first week of October could be above 18.5 mln\n",
" bpd, little changed from September levels.\n",
" Peter Nicol, analyst at Chase Manhattan Bank, said OPEC\n",
" production could be about 18.5-19.0 mln in October. Reuter and\n",
" International Energy Agency (IEA) estimates put OPEC September\n",
" production at 18.5 mln bpd.\n",
" The U.S. Attack was in retaliation of last Friday's hit of\n",
" a Kuwaiti oil products tanker flying the U.S. Flag, the Sea\n",
" Isle City. It was struc...\n",
"\n",
"1.\n",
"AMPLE SUPPLIES LIMIT U.S. STRIKE'S OIL PRICE IMPACT\n",
" Ample supplies of OPEC crude weighing on\n",
" world markets helped limit and then reverse oil price gains\n",
" that followed the U.S. Strike on an Iranian oil platform in the\n",
" Gulf earlier on Monday, analysts said.\n",
"2.\n",
"December loading rose to 19.65 dlrs, up 45 cents before\n",
" falling to around 19.05/15 later, unchanged from last Friday.\n",
"3.\n",
"\"Fundamentals are awful,\" said Philip Lambert, analyst with\n",
" stockbrokers Kleinwort Grieveson, adding that total OPEC\n",
" production in the first week of October could be above 18.5 mln\n",
" bpd, little changed from September levels.\n",
"4.\n",
"Peter Nicol, analyst at Chase Manhattan Bank, said OPEC\n",
" production could be about 18.5-19.0 mln in October.\n",
"5.\n",
"Reuter and\n",
" International Energy Agency (IEA) estimates put OPEC September\n",
" production at 18.5 mln bpd.\n",
"6.\n",
"The U.S.\n",
"7.\n",
"Attack was in retaliation of last Friday's hit of\n",
" a Kuwaiti oil products tanker flying the U.S.\n",
"8.\n",
"Flag, the Sea\n",
" Isle City.\n",
"9.\n",
"It was struc\n"
]
}
],
"source": [
"#############\n",
"# Sentences #\n",
"#############\n",
"\n",
"paragraph = reuters.raw('test/21131')[:1000]\n",
"sentences = nltk.sent_tokenize(paragraph) \n",
"num_sentences = len(sentences)\n",
"\n",
"print(f'Found {num_sentences} sentences in this paragraph:\\n{paragraph}...\\n')\n",
"for i in range(num_sentences):\n",
" sentence = sentences[i]\n",
" print(f'{i + 1}.\\n{sentence}')"
]
},
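{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative sketch, not from the book): a naive split on '.'\n",
"# shows why a trained sentence tokenizer is worth having -- it breaks on\n",
"# abbreviations like \"U.S.\" that sent_tokenize mostly handles.\n",
"naive_sentences = [s.strip() for s in paragraph.split('.') if s.strip()]\n",
"print(f'naive split on \".\" finds {len(naive_sentences)} pieces vs {num_sentences} from sent_tokenize')"
]
},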
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\"Don't say that Randy Quaid is a good example.\"]\n",
"['Do', \"n't\", 'say', 'that', 'Randy', 'Quaid', 'is', 'a', 'good', 'example', '.']\n"
]
}
],
"source": [
"#########\n",
"# Words #\n",
"#########\n",
"\n",
"easy_sentence = \"Don't say that Randy Quaid is a good example.\"\n",
"print(nltk.sent_tokenize(easy_sentence))\n",
"easy_words = nltk.word_tokenize(easy_sentence)\n",
"print(easy_words)"
]
},
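{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative, not from the book): plain str.split leaves the\n",
"# trailing period attached and keeps \"Don't\" whole, while the Treebank\n",
"# tokenizer behind word_tokenize splits the contraction into 'Do' + \"n't\".\n",
"print(easy_sentence.split())\n",
"print(easy_words)"
]
},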
{
"cell_type": "code",
"execution_count": 176,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"most common words are: [('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037)]\n",
"\"stock\" occurs 2346 times\n",
"\"stork\" occurs 0 times\n",
"frequency of the word \"the\" is 0.033849129031826936\n",
"37.37% (15545 / 41600) of the words occur only once. Here is a random example: ERB\n",
"samples: 1720901\n"
]
}
],
"source": [
"############################\n",
"# Occurrences, basic stats #\n",
"############################\n",
"\n",
"def percent (num): \n",
" return '%.2f' % (num * 100)\n",
"\n",
"fdist = nltk.FreqDist(reuters_words)\n",
"most_common = fdist.most_common(n=10)\n",
"\n",
"stock_count = fdist['stock']\n",
"stork_count = fdist['stork']\n",
"\n",
"the_freq = fdist.freq('the')\n",
"\n",
"singles = fdist.hapaxes() # \"hapaxes\" is term for words occurring once\n",
"num_singles = len(singles)\n",
"total = len(fdist.keys())\n",
"percent_single = percent((num_singles / total));\n",
"index = randrange(num_singles)\n",
"example = singles[index]\n",
"\n",
"num_samples = fdist.N()\n",
"\n",
"print('most common words are: ', most_common)\n",
"print(f'\"stock\" occurs {stock_count} times')\n",
"print(f'\"stork\" occurs {stork_count} times')\n",
"print(f'frequency of the word \"the\" is {the_freq}')\n",
"print(f'{percent_single}% ({num_singles} / {total}) of the words occur only once. Here is a random example:', example)\n",
"print('samples:', num_samples)"
]
},
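{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (a sketch, not from the book): the few most common tokens account\n",
"# for a large share of the corpus. fdist.freq gives per-token relative\n",
"# frequency, so summing it over the top ten gives their combined share.\n",
"top_ten_share = sum(fdist.freq(word) for word, count in most_common)\n",
"print(f'top 10 tokens cover {percent(top_ten_share)}% of all {num_samples} samples')"
]
},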
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bigrams are pairs of words that are adjacent [('Randy', 'works'), ('works', 'at'), ('at', 'Quaid'), ('Quaid', 'Corp'), ('Corp', '.')]\n",
"trigrams are similar but for 3 words [('Randy', 'works', 'at'), ('works', 'at', 'Quaid'), ('at', 'Quaid', 'Corp'), ('Quaid', 'Corp', '.')]\n"
]
}
],
"source": [
"###########\n",
"# n-grams #\n",
"###########\n",
"\n",
"text = \"Randy works at Quaid Corp.\"\n",
"tokens = nltk.word_tokenize(text)\n",
"bigrams = nltk.bigrams(tokens)\n",
"trigrams = nltk.trigrams(tokens)\n",
"\n",
"print('bigrams are pairs of words that are adjacent', list(bigrams))\n",
"print('trigrams are similar but for 3 words', list(trigrams))\n"
]
},
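{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (assumes nltk.ngrams, the general n-gram helper, is available in\n",
"# this NLTK version): bigrams and trigrams are just the n = 2 and n = 3\n",
"# cases of the same sliding-window idea.\n",
"fourgrams = nltk.ngrams(tokens, 4)\n",
"print('4-grams:', list(fourgrams))"
]
},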
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50 bigrams with highest pointwise mututal information: [('Het', 'Comite'), ('Lago', 'Agrio'), ('Hoare', 'Govett'), ('Dar', 'es'), ('Ranks', 'Hovis'), ('es', 'Salaam'), ('SRI', 'LANKA'), ('CRAZY', 'EDDIE'), ('LORD', 'ABBETT'), ('Corazon', 'Aquino'), ('MERRILL', 'LYNCH'), ('Ay', 'Expd'), ('Abu', 'Dhabi'), ('Sao', 'Paulo'), ('HONG', 'KONG'), ('THOUS', 'BUSHELS'), ('poison', 'pill'), ('Lear', 'Siegler'), ('Pizza', 'Inn'), ('Kleinwort', 'Benson'), ('Hajime', 'Tamura'), ('Gates', 'Learjet'), ('ORANGE', 'JUICE'), ('Wells', 'Fargo'), ('Real', 'Estate'), ('Margaret', 'Thatcher'), ('SANTA', 'FE'), ('Brace', 'Jovanovich'), ('DIAMOND', 'SHAMROCK'), ('Phelps', 'Dodge'), ('Fort', 'Worth'), ('Puerto', 'Rico'), ('del', 'Este'), ('WALL', 'STREET'), ('REAL', 'ESTATE'), ('Hiram', 'Walker'), ('Du', 'Pont'), ('Partly', 'offsetting'), ('Punta', 'del'), ('Hk', 'Hotels'), ('DAYTON', 'HUDSON'), ('Bra', 'Kanon'), ('HUGHES', 'TOOL'), ('Rank', 'Xerox'), ('Exporting', 'Countries'), ('Marlin', 'Fitzwater'), ('King', 'Fahd'), ('Dean', 'Witter'), ('continental', 'shelf'), ('Costa', 'Rica')]\n",
"50 trigrams with highest pointwise mututal information: [('Dar', 'es', 'Salaam'), ('Punta', 'del', 'Este'), ('Arturo', 'Hernandez', 'Grisanti'), ('Speaker', 'Jim', 'Wright'), ('SANTA', 'FE', 'SOUTHERN'), ('THOUS', 'BUSHELS', 'SOYBEANS'), ('Denis', 'Bra', 'Kanon'), ('Nil', 'Nil', 'Nil'), ('Harcourt', 'Brace', 'Jovanovich'), ('Fernando', 'Santos', 'Alvite'), ('Drexel', 'Burnham', 'Lambert'), ('WALL', 'STREET', 'STOCKS'), ('Liberal', 'Democratic', 'Party'), ('Prev', 'Wk', 'Named'), ('Dean', 'Witter', 'Reynolds'), ('CUSTOMER', 'REPURCHASE', 'AGREEMENTS'), ('Lloyds', 'Shipping', 'Intelligence'), ('Karl', 'Otto', 'Poehl'), ('DISCOUNT', 'BORROWINGS', 'AVERAGE'), ('ADDS', 'RESERVES', 'VIA'), ('VIA', 'CUSTOMER', 'REPURCHASES'), ('Light', 'Louisiana', 'Sweet'), ('President', 'Corazon', 'Aquino'), ('Export', 'Enhancement', 'Program'), ('Exchequer', 'Nigel', 'Lawson'), ('Governor', 'Satoshi', 'Sumita'), ('Rio', 'de', 'Janeiro'), ('governor', 'Satoshi', 'Sumita'), ('Gross', 'Domestic', 'Product'), ('Enhancement', 'Program', 'initiative'), ('ruling', 'Liberal', 'Democratic'), ('OLD', 'RATE', 'MATURITY'), ('Partly', 'offsetting', 'these'), ('Petroleum', 'Exporting', 'Countries'), ('Minister', 'Margaret', 'Thatcher'), ('House', 'Speaker', 'Jim'), ('CURRENT', 'ACCOUNT', 'DEFICIT'), ('Banco', 'do', 'Brasil'), ('DISTILLATE', 'STOCKS', 'OFF'), ('RESERVES', 'VIA', 'CUSTOMER'), ('Representative', 'Clayton', 'Yeutter'), ('ranges', 'broadly', 'consistent'), ('Secretary', 'Caspar', 'Weinberger'), ('offsetting', 'these', 'outflows'), ('MARKET', 'SHORTAGE', 'FORECAST'), ('REUTER', '^', 'M'), ('Marine', 'Midland', 'Banks'), ('DLR', 'CUSTOMER', 'REPURCHASE'), ('Papua', 'New', 'Guinea'), ('President', 'Karl', 'Otto')]\n"
]
}
],
"source": [
"from nltk.collocations import (\n",
" BigramAssocMeasures, \n",
" BigramCollocationFinder,\n",
" TrigramAssocMeasures,\n",
" TrigramCollocationFinder\n",
")\n",
"\n",
"bigram_measures = BigramAssocMeasures()\n",
"trigram_measures = TrigramAssocMeasures()\n",
"\n",
"def gramStats (*, measures, Finder, prefix, min_freq = 10, pmi_count = 50):\n",
" finder = Finder.from_words(reuters_words)\n",
" finder.apply_freq_filter(min_freq)\n",
" highest_pmi = finder.nbest(bigram_measures.pmi, pmi_count)\n",
" label = f'{prefix}grams'\n",
" print(f'{pmi_count} {label} with highest pointwise mututal information:', highest_pmi)\n",
" \n",
"gramStats(\n",
" measures = bigram_measures,\n",
" Finder = BigramCollocationFinder,\n",
" prefix = 'bi'\n",
")\n",
"\n",
"gramStats(\n",
" measures = trigram_measures,\n",
" Finder = TrigramCollocationFinder,\n",
" prefix = 'tri'\n",
")"
]
},
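{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (a sketch, not from the book): PMI favors rare-but-exclusive pairs.\n",
"# BigramAssocMeasures also provides likelihood_ratio, which is less\n",
"# sensitive to low counts and tends to surface more frequent collocations.\n",
"finder = BigramCollocationFinder.from_words(reuters_words)\n",
"finder.apply_freq_filter(10)\n",
"print('top bigrams by likelihood ratio:', finder.nbest(bigram_measures.likelihood_ratio, 10))"
]
},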
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tokens: ['Wikipedia.com', 'says', 'that', 'Randy', 'Randall', 'Rudy', 'Quaid', 'is', 'an', 'American', 'film', 'and', 'television', 'actor', 'and', 'Oscar', 'nominee', 'known', 'for', 'his', 'roles', 'in', 'both', 'serious', 'drama', 'and', 'light', 'comedy', '.']\n",
"Tagged Tokens: [('Wikipedia.com', 'NNP'), ('says', 'VBZ'), ('that', 'IN'), ('Randy', 'NNP'), ('Randall', 'NNP'), ('Rudy', 'NNP'), ('Quaid', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('American', 'JJ'), ('film', 'NN'), ('and', 'CC'), ('television', 'NN'), ('actor', 'NN'), ('and', 'CC'), ('Oscar', 'NNP'), ('nominee', 'RB'), ('known', 'VBN'), ('for', 'IN'), ('his', 'PRP$'), ('roles', 'NNS'), ('in', 'IN'), ('both', 'DT'), ('serious', 'JJ'), ('drama', 'NN'), ('and', 'CC'), ('light', 'JJ'), ('comedy', 'NN'), ('.', '.')]\n",
"Named entities (S\n",
" Wikipedia.com/NNP\n",
" says/VBZ\n",
" that/IN\n",
" (PERSON Randy/NNP Randall/NNP Rudy/NNP Quaid/NNP)\n",
" is/VBZ\n",
" an/DT\n",
" (GPE American/JJ)\n",
" film/NN\n",
" and/CC\n",
" television/NN\n",
" actor/NN\n",
" and/CC\n",
" (PERSON Oscar/NNP)\n",
" nominee/RB\n",
" known/VBN\n",
" for/IN\n",
" his/PRP$\n",
" roles/NNS\n",
" in/IN\n",
" both/DT\n",
" serious/JJ\n",
" drama/NN\n",
" and/CC\n",
" light/JJ\n",
" comedy/NN\n",
" ./.)\n"
]
}
],
"source": [
"####################################\n",
"# Tokenization and parts of speech #\n",
"####################################\n",
"\n",
"randy_sentence = \"Wikipedia.com says that Randy Randall Rudy Quaid is an American film and television actor and Oscar nominee known for his roles in both serious drama and light comedy.\" \n",
"randy_tokens = nltk.word_tokenize(randy_sentence)\n",
"tagged_tokens = nltk.pos_tag(randy_tokens) \n",
"ner_annotated_tree = nltk.ne_chunk(tagged_tokens)\n",
"\n",
"print(\"Tokens: \", randy_tokens)\n",
"print(\"Tagged Tokens: \", tagged_tokens)\n",
"print(\"Named entities\", ner_annotated_tree)\n"
]
},
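{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative sketch): ne_chunk returns an nltk.Tree, so the\n",
"# labeled entities can be pulled out by filtering its subtrees and\n",
"# joining the (word, tag) leaves back into strings.\n",
"labels = ('PERSON', 'GPE', 'ORGANIZATION')\n",
"for subtree in ner_annotated_tree.subtrees(lambda t: t.label() in labels):\n",
"    entity = ' '.join(word for word, tag in subtree.leaves())\n",
"    print(subtree.label(), '=>', entity)"
]
},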
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S\n",
" (PERSON Randy/NNP)\n",
" (PERSON Quaid/NNP)\n",
" does/VBZ\n",
" not/RB\n",
" go/VB\n",
" to/TO\n",
" (ORGANIZATION Harvard/NNP)\n",
" ./.)\n"
]
}
],
"source": [
"sentence = \"\"\"Randy Quaid does not go to Harvard.\"\"\"\n",
"tokens = nltk.word_tokenize(sentence)\n",
"tagged_tokens = nltk.pos_tag(tokens) \n",
"ner_annotated_tree = nltk.ne_chunk(tagged_tokens) \n",
"print(ner_annotated_tree)"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Synsets for car (filter = None):\n",
"\n",
"car.n.01:\n",
"['car', 'auto', 'automobile', 'machine', 'motorcar']\n",
"a motor vehicle with four wheels; usually propelled by an internal combustion engine\n",
"\n",
"car.n.02:\n",
"['car', 'railcar', 'railway_car', 'railroad_car']\n",
"a wheeled vehicle adapted to the rails of railroad\n",
"\n",
"car.n.03:\n",
"['car', 'gondola']\n",
"the compartment that is suspended from an airship and that carries personnel and the cargo and the power plant\n",
"\n",
"car.n.04:\n",
"['car', 'elevator_car']\n",
"where passengers ride up and down\n",
"\n",
"cable_car.n.01:\n",
"['cable_car', 'car']\n",
"a conveyance for passengers or freight on a cable railway\n",
"\n",
"------\n",
"\n",
"Synsets for fight (filter = None):\n",
"\n",
"battle.n.01:\n",
"['battle', 'conflict', 'fight', 'engagement']\n",
"a hostile meeting of opposing military forces in the course of a war\n",
"\n",
"fight.n.02:\n",
"['fight', 'fighting', 'combat', 'scrap']\n",
"the act of fighting; any contest or struggle\n",
"\n",
"competitiveness.n.01:\n",
"['competitiveness', 'fight']\n",
"an aggressive willingness to compete\n",
"\n",
"fight.n.04:\n",
"['fight']\n",
"an intense verbal dispute\n",
"\n",
"fight.n.05:\n",
"['fight']\n",
"a boxing or wrestling match\n",
"\n",
"contend.v.06:\n",
"['contend', 'fight', 'struggle']\n",
"be engaged in a fight; carry on a fight\n",
"\n",
"fight.v.02:\n",
"['fight', 'oppose', 'fight_back', 'fight_down', 'defend']\n",
"fight against or resist strongly\n",
"\n",
"fight.v.03:\n",
"['fight', 'struggle']\n",
"make a strenuous or labored effort\n",
"\n",
"crusade.v.01:\n",
"['crusade', 'fight', 'press', 'campaign', 'push', 'agitate']\n",
"exert oneself continuously, vigorously, or obtrusively to gain an end or engage in a crusade for a certain cause or person; be an advocate for\n",
"\n",
"------\n",
"\n",
"Synsets for fight (filter = n):\n",
"\n",
"battle.n.01:\n",
"['battle', 'conflict', 'fight', 'engagement']\n",
"a hostile meeting of opposing military forces in the course of a war\n",
"\n",
"fight.n.02:\n",
"['fight', 'fighting', 'combat', 'scrap']\n",
"the act of fighting; any contest or struggle\n",
"\n",
"competitiveness.n.01:\n",
"['competitiveness', 'fight']\n",
"an aggressive willingness to compete\n",
"\n",
"fight.n.04:\n",
"['fight']\n",
"an intense verbal dispute\n",
"\n",
"fight.n.05:\n",
"['fight']\n",
"a boxing or wrestling match\n",
"\n",
"------\n",
"\n"
]
}
],
"source": [
"###########\n",
"# Wordnet #\n",
"###########\n",
"\n",
"wordnet = nltk.corpus.wordnet\n",
" \n",
"def printSynsets (*, word, filter = None):\n",
" print(f'Synsets for {word} (filter = {filter}):')\n",
" synsets = wordnet.synsets(word, filter)\n",
" for synset in synsets:\n",
" print()\n",
" print(f'{synset.name()}:')\n",
" print([l.name() for l in synset.lemmas()])\n",
" print(synset.definition())\n",
" print('\\n------\\n')\n",
"\n",
"printSynsets(\n",
" word = 'car'\n",
")\n",
" \n",
"printSynsets(\n",
" word = 'fight'\n",
")\n",
" \n",
"# just the nouns\n",
"printSynsets(\n",
" word = 'fight',\n",
" filter = wordnet.NOUN \n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Synset('walk.v.01')\n"
]
}
],
"source": [
"# Get a single synset by name \n",
"walk_verb = wordnet.synset('walk.v.01')\n",
"print(walk_verb)"
]
},
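{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (not from the book): a synset also knows its hypernyms, the more\n",
"# general concepts one step up -- the inverse of the hyponym relation\n",
"# walked by treeify in the next cell.\n",
"print(walk_verb.hypernyms())\n",
"print(wordnet.synset('car.n.01').hypernyms())"
]
},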
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(vehicle.n.01\n",
" (bumper_car.n.01 )\n",
" (craft.n.02\n",
" (aircraft.n.01\n",
" (bogy.n.01 )\n",
" (cruise_missile.n.01 )\n",
" (heavier-than-air_craft.n.01\n",
" (airplane.n.01\n",
" (airliner.n.01\n",
" (airbus.n.01 )\n",
" (narrowbody_aircraft.n.01 )\n",
" (widebody_aircraft.n.01 ))\n",
" (amphibian.n.02 )\n",
" (biplane.n.01 )\n",
" (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
" (delta_wing.n.01 )\n",
" (fighter.n.02\n",
" (interceptor.n.01 )\n",
" (kamikaze.n.01 )\n",
" (stealth_fighter.n.01 ))\n",
" (hangar_queen.n.01 )\n",
" (jet.n.01\n",
" (fanjet.n.01 )\n",
" (jetliner.n.01 )\n",
" (jumbojet.n.01 )\n",
" (twinjet.n.01 ))\n",
" (monoplane.n.01 )\n",
" (multiengine_airplane.n.01 ))\n",
" (autogiro.n.01 )\n",
" (drone.n.04 )\n",
" (glider.n.01 (hang_glider.n.02 ))\n",
" (helicopter.n.01\n",
" (cargo_helicopter.n.01 )\n",
" (shuttle_helicopter.n.01 )\n",
" (single-rotor_helicopter.n.01 )\n",
" (skyhook.n.01 ))\n",
" (orthopter.n.01 )\n",
" (warplane.n.01\n",
" (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
" (fighter.n.02\n",
" (interceptor.n.01 )\n",
" (kamikaze.n.01 )\n",
" (stealth_fighter.n.01 ))\n",
" (reconnaissance_plane.n.01 )))\n",
" (lighter-than-air_craft.n.01\n",
" (airship.n.01\n",
" (barrage_balloon.n.01 (kite_balloon.n.01 ))\n",
" (blimp.n.02 ))\n",
" (balloon.n.01\n",
" (hot-air_balloon.n.01 )\n",
" (meteorological_balloon.n.01 (pilot_balloon.n.01 ))\n",
" (trial_balloon.n.02 )))\n",
" (stealth_aircraft.n.01\n",
" (stealth_bomber.n.01 )\n",
" (stealth_fighter.n.01 )))\n",
" (hovercraft.n.01 )\n",
" (landing_craft.n.01 )\n",
" (spacecraft.n.01\n",
" (lander.n.02 )\n",
" (lunar_excursion_module.n.01 )\n",
" (space_capsule.n.01 )\n",
" (space_shuttle.n.01 )\n",
" (starship.n.01 ))\n",
" (vessel.n.02\n",
" (bareboat.n.01 )\n",
" (boat.n.01\n",
" (ark.n.02 )\n",
" (barge.n.01\n",
" (dredger.n.01 )\n",
" (houseboat.n.01 )\n",
" (pontoon.n.01 )\n",
" (scow.n.02 )\n",
" (wherry.n.01 ))\n",
" (bumboat.n.01 )\n",
" (canal_boat.n.01 )\n",
" (ferry.n.01 (car-ferry.n.01 ))\n",
" (fireboat.n.01 )\n",
" (gondola.n.02 )\n",
" (guard_boat.n.01 )\n",
" (gunboat.n.01 )\n",
" (junk.n.02 ))\n",
" (fishing_boat.n.01 (trawler.n.02 ))\n",
" (galley.n.01 )\n",
" (galley.n.02 (trireme.n.01 ))\n",
" (iceboat.n.02 )\n",
" (patrol_boat.n.01 )\n",
" (sailing_vessel.n.01\n",
" (bark.n.03 )\n",
" (brig.n.01 )\n",
" (brigantine.n.01 )\n",
" (clipper.n.02 )\n",
" (cutter.n.05 )\n",
" (dhow.n.01 )\n",
" (felucca.n.01 )\n",
" (fore-and-after.n.01 )\n",
" (galleon.n.01 (carrack.n.01 ))\n",
" (indiaman.n.01 ))\n",
" (ship.n.01\n",
" (abandoned_ship.n.01 )\n",
" (blockade-runner.n.01 )\n",
" (cargo_ship.n.01\n",
" (banana_boat.n.01 )\n",
" (bottom.n.07 )\n",
" (cattleship.n.01 )\n",
" (container_ship.n.01 )\n",
" (liberty_ship.n.01 )\n",
" (oil_tanker.n.01 (supertanker.n.01 )))\n",
" (flagship.n.02 )\n",
" (gas-turbine_ship.n.01 )\n",
" (hospital_ship.n.01 )\n",
" (hulk.n.02 )\n",
" (icebreaker.n.01 )\n",
" (lightship.n.01 )\n",
" (minelayer.n.01 ))\n",
" (shrimper.n.01 )))\n",
" (military_vehicle.n.01\n",
" (caisson.n.02 )\n",
" (half_track.n.01 )\n",
" (humvee.n.01 )\n",
" (personnel_carrier.n.01 )\n",
" (picket.n.04 (picket_boat.n.01 ) (picket_ship.n.01 ))\n",
" (reconnaissance_vehicle.n.01 )\n",
" (tank.n.01 (panzer.n.01 ))\n",
" (technical.n.01 )\n",
" (troop_carrier.n.01 (troopship.n.01 ))\n",
" (warplane.n.01\n",
" (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
" (fighter.n.02\n",
" (interceptor.n.01 )\n",
" (kamikaze.n.01 )\n",
" (stealth_fighter.n.01 ))\n",
" (reconnaissance_plane.n.01 )))\n",
" (rocket.n.01\n",
" (missile.n.01\n",
" (air-to-air_missile.n.01 )\n",
" (air-to-ground_missile.n.01 )\n",
" (ballistic_missile.n.01\n",
" (intercontinental_ballistic_missile.n.01 (minuteman.n.02 )))\n",
" (guided_missile.n.01\n",
" (antiballistic_missile.n.01 )\n",
" (buzz_bomb.n.01 )\n",
" (exocet.n.01 )\n",
" (space_probe.n.01 )\n",
" (surface-to-air_missile.n.01 (manpad.n.01 ) (stinger.n.03 )))\n",
" (heat-seeking_missile.n.01\n",
" (brilliant_pebble.n.01 )\n",
" (stinger.n.03 ))\n",
" (sidewinder.n.02 ))\n",
" (multistage_rocket.n.01 )\n",
" (test_rocket.n.01 (sounding_rocket.n.01 )))\n",
" (skibob.n.01 )\n",
" (sled.n.01\n",
" (bobsled.n.01 )\n",
" (bobsled.n.02 )\n",
" (dogsled.n.01 )\n",
" (luge.n.01 )\n",
" (pung.n.01 )\n",
" (toboggan.n.01 ))\n",
" (steamroller.n.02 )\n",
" (wheeled_vehicle.n.01\n",
" (baby_buggy.n.01 (bassinet.n.02 ))\n",
" (bicycle.n.01\n",
" (bicycle-built-for-two.n.01 )\n",
" (mountain_bike.n.01 )\n",
" (ordinary.n.04 )\n",
" (push-bike.n.01 )\n",
" (safety_bicycle.n.01 )\n",
" (velocipede.n.01 ))\n",
" (boneshaker.n.01 )\n",
" (car.n.02\n",
" (baggage_car.n.01 )\n",
" (cabin_car.n.01 )\n",
" (club_car.n.01 )\n",
" (freight_car.n.01\n",
" (boxcar.n.01 (stockcar.n.01 ))\n",
" (cattle_car.n.01 )\n",
" (coal_car.n.01 )\n",
" (flatcar.n.01 )\n",
" (gondola_car.n.01 )\n",
" (refrigerator_car.n.01 )\n",
" (tank_car.n.01 ))\n",
" (guard's_van.n.01 )\n",
" (handcar.n.01 )\n",
" (mail_car.n.01 )\n",
" (passenger_car.n.01\n",
" (dining_car.n.01 )\n",
" (nonsmoker.n.02 )\n",
" (parlor_car.n.01 )\n",
" (pullman.n.01 )\n",
" (sleeping_car.n.01 )\n",
" (smoker.n.03 ))\n",
" (slip_coach.n.01 )\n",
" (tender.n.04 ))\n",
" (handcart.n.01\n",
" (applecart.n.02 )\n",
" (barrow.n.03 )\n",
" (hand_truck.n.01 )\n",
" (laundry_cart.n.01 )\n",
" (serving_cart.n.01 (pastry_cart.n.01 ) (tea_cart.n.01 ))\n",
" (shopping_cart.n.01 ))\n",
" (horse-drawn_vehicle.n.01\n",
" (carriage.n.02\n",
" (barouche.n.01 )\n",
" (brougham.n.01 )\n",
" (buckboard.n.01 )\n",
" (buggy.n.01 )\n",
" (cab.n.02 )\n",
" (caroche.n.01 )\n",
" (chaise.n.02 )\n",
" (chariot.n.01 )\n",
" (clarence.n.01 )\n",
" (coach.n.04 (stagecoach.n.01 )))\n",
" (chariot.n.02 )\n",
" (limber.n.01 )\n",
" (sulky.n.01 ))\n",
" (motor_scooter.n.01 )\n",
" (rolling_stock.n.01 )\n",
" (scooter.n.02 )\n",
" (self-propelled_vehicle.n.01\n",
" (armored_vehicle.n.01\n",
" (armored_car.n.01 )\n",
" (armored_car.n.02 )\n",
" (armored_personnel_carrier.n.01 )\n",
" (assault_gun.n.02 )\n",
" (tank.n.01 (panzer.n.01 ))\n",
" (tank_destroyer.n.01 ))\n",
" (carrier.n.02 )\n",
" (forklift.n.01 )\n",
" (locomotive.n.01\n",
" (choo-choo.n.01 )\n",
" (diesel_locomotive.n.01\n",
" (diesel-electric_locomotive.n.01 )\n",
" (diesel-hydraulic_locomotive.n.01 ))\n",
" (dinky.n.01 )\n",
" (electric_locomotive.n.01 )\n",
" (iron_horse.n.01 )\n",
" (pilot_engine.n.01 )\n",
" (shunter.n.01 )\n",
" (steam_locomotive.n.01 )\n",
" (switch_engine.n.01 )\n",
" (tank_engine.n.01 ))\n",
" (motor_vehicle.n.01\n",
" (amphibian.n.01 (swamp_buggy.n.01 ))\n",
" (bloodmobile.n.01 )\n",
" (car.n.01\n",
" (ambulance.n.01 (funny_wagon.n.01 ))\n",
" (beach_wagon.n.01 (shooting_brake.n.01 ))\n",
" (bus.n.04 )\n",
" (cab.n.03 (gypsy_cab.n.01 ) (minicab.n.01 ))\n",
" (compact.n.03 )\n",
" (convertible.n.01 )\n",
" (coupe.n.01 )\n",
" (cruiser.n.01 (panda_car.n.01 ))\n",
" (electric.n.01 )\n",
" (gas_guzzler.n.01 ))\n",
" (doodlebug.n.01 )\n",
" (four-wheel_drive.n.01 )\n",
" (go-kart.n.01 )\n",
" (golfcart.n.01 )\n",
" (hearse.n.01 )\n",
" (motorcycle.n.01\n",
" (minibike.n.01 (moped.n.01 ))\n",
" (trail_bike.n.01 ))\n",
" (snowplow.n.01 ))\n",
" (personnel_carrier.n.01 )\n",
" (reconnaissance_vehicle.n.01 )\n",
" (recreational_vehicle.n.01\n",
" (camper.n.02 (van.n.04 ))\n",
" (dune_buggy.n.01 ))\n",
" (streetcar.n.01 (horsecar.n.01 ))\n",
" (tracked_vehicle.n.01\n",
" (caterpillar.n.02 )\n",
" (half_track.n.01 )\n",
" (snowmobile.n.01 (sno-cat.n.01 ))\n",
" (tank.n.01 (panzer.n.01 ))))))\n"
]
}
],
"source": [
"def treeify (word):\n",
" def treeifyHyponyms (*, hyponym, max_breadth = 10, max_depth = 10, depth = 0):\n",
" def _treeify (hyponym):\n",
" return treeifyHyponyms(\n",
" hyponym = hyponym,\n",
" depth = depth + 1\n",
" )\n",
" \n",
" children = []\n",
" if (depth < max_depth):\n",
" hyponyms = hyponym.hyponyms()[:max_breadth]\n",
" children = [_treeify(h) for h in hyponyms]\n",
" \n",
" name = hyponym.name()\n",
" return nltk.Tree(name, children)\n",
" \n",
" return treeifyHyponyms(hyponym = synsets(word)[0])\n",
"\n",
"# The above is a generalization of this example from the text:\n",
"#\n",
"# vehicle = synsets('car')[0]\\\n",
"# t = nltk.Tree(vehicle.name(), children=[\n",
"# nltk.Tree(vehicle.hyponyms()[3].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[4].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[5].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].name(), children=[\n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[1].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[3].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[4].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[5].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[6].name(), children=[]),\n",
"# ]), \n",
"# ])\n",
"# print(t)\n",
"\n",
"vehicle_tree = treeify('vehicle')\n",
"print(vehicle_tree)"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(car.n.01\n",
" (ambulance.n.01 (funny_wagon.n.01 ))\n",
" (beach_wagon.n.01 (shooting_brake.n.01 ))\n",
" (bus.n.04 )\n",
" (cab.n.03 (gypsy_cab.n.01 ) (minicab.n.01 ))\n",
" (compact.n.03 )\n",
" (convertible.n.01 )\n",
" (coupe.n.01 )\n",
" (cruiser.n.01 (panda_car.n.01 ))\n",
" (electric.n.01 )\n",
" (gas_guzzler.n.01 ))\n"
]
}
],
"source": [
"car_tree = treeify('car')\n",
"print(car_tree)"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(vomit.n.01 )\n"
]
}
],
"source": [
"barf_tree = treeify('barf')\n",
"print(barf_tree)"
]
},
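{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative sketch): treeify walks down the hyponym relation;\n",
"# hypernym_paths walks up instead, returning every chain of increasingly\n",
"# general synsets from the root down to the synset itself.\n",
"for path in wordnet.synset('car.n.01').hypernym_paths():\n",
"    print(' -> '.join(s.name() for s in path))"
]
},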
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"friend => friend\n",
"friends => friend\n",
"friendly => friend\n",
"drink => drink\n",
"drinks => drink\n",
"drunk => drunk\n",
"drank => drank\n",
"slow => slow\n",
"slowing => slow\n",
"slowly => slowli\n",
"slower => slower\n",
"slowest => slowest\n",
"quaid => quaid\n",
"xyzing => xyze\n",
"exerxzyzing => exerxzyz\n"
]
}
],
"source": [
"############\n",
"# Stemmer #\n",
"############\n",
"\n",
"from functools import reduce\n",
"\n",
"stemmer = nltk.stem.SnowballStemmer('english')\n",
"\n",
"words = [\n",
" 'friend', \n",
" 'friends', \n",
" 'friendly',\n",
" 'drink',\n",
" 'drinks',\n",
" 'drunk',\n",
" 'drank',\n",
" 'slow',\n",
" 'slowing',\n",
" 'slowly',\n",
" 'slower',\n",
" 'slowest',\n",
" 'quaid',\n",
" 'xyzing',\n",
" 'exerxzyzing'\n",
"]\n",
"\n",
"def printDict (dict):\n",
" def maxlen (a, b):\n",
" a_len = len(a)\n",
" b_len = len(b)\n",
" return a if (a_len > b_len) else b\n",
"\n",
" max_length = len(reduce(maxlen, dict.keys()))\n",
"\n",
" for k, v in dict.items():\n",
" label = k.ljust(max_length + 1)\n",
" print(f'{label} => {v}')\n",
" \n",
"def mapWords (*, words, mapper):\n",
" return dict((word, mapper(word)) for word in words)\n",
" \n",
"printDict(mapWords(\n",
" words = stems,\n",
" mapper = lambda word: stemmer.stem(word)\n",
"))\n"
]
},
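{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (not from the book): the older Porter algorithm, also shipped with\n",
"# NLTK, makes slightly different cuts than Snowball on the same word list.\n",
"porter = nltk.stem.PorterStemmer()\n",
"printDict(mapWords(\n",
"    words = words,\n",
"    mapper = lambda word: porter.stem(word)\n",
"))"
]
},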
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"lemmatizing as noun\n",
"friend => friend\n",
"friends => friend\n",
"friendly => friendly\n",
"drink => drink\n",
"drinks => drink\n",
"drunk => drunk\n",
"drank => drank\n",
"slow => slow\n",
"slowing => slowing\n",
"slowly => slowly\n",
"slower => slower\n",
"slowest => slowest\n",
"quaid => quaid\n",
"xyzing => xyzing\n",
"exerxzyzing => exerxzyzing\n",
"\n",
"lemmatizing as verb\n",
"friend => friend\n",
"friends => friends\n",
"friendly => friendly\n",
"drink => drink\n",
"drinks => drink\n",
"drunk => drink\n",
"drank => drink\n",
"slow => slow\n",
"slowing => slow\n",
"slowly => slowly\n",
"slower => slower\n",
"slowest => slowest\n",
"quaid => quaid\n",
"xyzing => xyzing\n",
"exerxzyzing => exerxzyzing\n",
"\n",
"lemmatizing as adjective\n",
"friend => friend\n",
"friends => friends\n",
"friendly => friendly\n",
"drink => drink\n",
"drinks => drinks\n",
"drunk => drunk\n",
"drank => drank\n",
"slow => slow\n",
"slowing => slowing\n",
"slowly => slowly\n",
"slower => slow\n",
"slowest => slow\n",
"quaid => quaid\n",
"xyzing => xyzing\n",
"exerxzyzing => exerxzyzing\n",
"\n"
]
}
],
"source": [
"##############\n",
"# Lemmatizer #\n",
"##############\n",
"\n",
"lemmatizer = nltk.stem.WordNetLemmatizer()\n",
"\n",
"for part_of_speech in ['noun', 'verb', 'adjective']:\n",
" print(f'lemmatizing as {part_of_speech}')\n",
" pos = part_of_speech[0]\n",
" printDict(mapWords(\n",
" words = stems,\n",
" mapper = lambda word: lemmatizer.lemmatize(word, pos)\n",
" ))\n",
" print()\n"
]
},
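{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (a minimal sketch, not from the book): in practice the lemmatizer's\n",
"# pos argument usually comes from a tagger. penn_to_wordnet below is a\n",
"# hypothetical helper mapping Penn Treebank tags to WordNet tags.\n",
"def penn_to_wordnet (tag):\n",
"    if tag.startswith('V'): return 'v'\n",
"    if tag.startswith('J'): return 'a'\n",
"    if tag.startswith('R'): return 'r'\n",
"    return 'n'\n",
"\n",
"for word, tag in nltk.pos_tag(nltk.word_tokenize('He drank slowly')):\n",
"    print(word, '=>', lemmatizer.lemmatize(word, penn_to_wordnet(tag)))"
]
},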
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}