Natural Language Processing for Hackers (Chapter 1).ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"########\n",
"# Deps #\n",
"########\n",
"# pip install nltk numpy\n",
"\n",
"import nltk\n",
"from nltk.corpus import reuters\n",
"from random import randrange\n",
"\n",
"reuters_words = nltk.corpus.reuters.words()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#########\n",
"# Setup #\n",
"#########\n",
"\n",
"nltk.download('all')"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 9 sentences in this paragraph:\n",
"AMPLE SUPPLIES LIMIT U.S. STRIKE'S OIL PRICE IMPACT\n",
" Ample supplies of OPEC crude weighing on\n",
" world markets helped limit and then reverse oil price gains\n",
" that followed the U.S. Strike on an Iranian oil platform in the\n",
" Gulf earlier on Monday, analysts said.\n",
" December loading rose to 19.65 dlrs, up 45 cents before\n",
" falling to around 19.05/15 later, unchanged from last Friday.\n",
" \"Fundamentals are awful,\" said Philip Lambert, analyst with\n",
" stockbrokers Kleinwort Grieveson, adding that total OPEC\n",
" production in the first week of October could be above 18.5 mln\n",
" bpd, little changed from September levels.\n",
" Peter Nicol, analyst at Chase Manhattan Bank, said OPEC\n",
" production could be about 18.5-19.0 mln in October. Reuter and\n",
" International Energy Agency (IEA) estimates put OPEC September\n",
" production at 18.5 mln bpd.\n",
" The U.S. Attack was in retaliation of last Friday's hit of\n",
" a Kuwaiti oil products tanker flying the U.S. Flag, the Sea\n",
" Isle City. It was struc...\n",
"\n",
"1.\n",
"AMPLE SUPPLIES LIMIT U.S. STRIKE'S OIL PRICE IMPACT\n",
" Ample supplies of OPEC crude weighing on\n",
" world markets helped limit and then reverse oil price gains\n",
" that followed the U.S. Strike on an Iranian oil platform in the\n",
" Gulf earlier on Monday, analysts said.\n",
"2.\n",
"December loading rose to 19.65 dlrs, up 45 cents before\n",
" falling to around 19.05/15 later, unchanged from last Friday.\n",
"3.\n",
"\"Fundamentals are awful,\" said Philip Lambert, analyst with\n",
" stockbrokers Kleinwort Grieveson, adding that total OPEC\n",
" production in the first week of October could be above 18.5 mln\n",
" bpd, little changed from September levels.\n",
"4.\n",
"Peter Nicol, analyst at Chase Manhattan Bank, said OPEC\n",
" production could be about 18.5-19.0 mln in October.\n",
"5.\n",
"Reuter and\n",
" International Energy Agency (IEA) estimates put OPEC September\n",
" production at 18.5 mln bpd.\n",
"6.\n",
"The U.S.\n",
"7.\n",
"Attack was in retaliation of last Friday's hit of\n",
" a Kuwaiti oil products tanker flying the U.S.\n",
"8.\n",
"Flag, the Sea\n",
" Isle City.\n",
"9.\n",
"It was struc\n"
]
}
],
"source": [
"#############\n",
"# Sentences #\n",
"#############\n",
"\n",
"paragraph = reuters.raw('test/21131')[:1000]\n",
"sentences = nltk.sent_tokenize(paragraph) \n",
"num_sentences = len(sentences)\n",
"\n",
"print(f'Found {num_sentences} sentences in this paragraph:\\n{paragraph}...\\n')\n",
"for i in range(num_sentences):\n",
" sentence = sentences[i]\n",
" print(f'{i + 1}.\\n{sentence}')"
]
},
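{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative sketch, not from the book): a naive split on '.'\n",
"# shows why a trained sentence tokenizer is worth having -- it breaks on\n",
"# abbreviations like \"U.S.\" that sent_tokenize mostly handles.\n",
"naive_sentences = [s.strip() for s in paragraph.split('.') if s.strip()]\n",
"print(f'naive split on \".\" finds {len(naive_sentences)} pieces vs {num_sentences} from sent_tokenize')"
]
},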
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\"Don't say that Randy Quaid is a good example.\"]\n",
"['Do', \"n't\", 'say', 'that', 'Randy', 'Quaid', 'is', 'a', 'good', 'example', '.']\n"
]
}
],
"source": [
"#########\n",
"# Words #\n",
"#########\n",
"\n",
"easy_sentence = \"Don't say that Randy Quaid is a good example.\"\n",
"print(nltk.sent_tokenize(easy_sentence))\n",
"easy_words = nltk.word_tokenize(easy_sentence)\n",
"print(easy_words)"
]
},
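{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative, not from the book): plain str.split leaves the\n",
"# trailing period attached and keeps \"Don't\" whole, while the Treebank\n",
"# tokenizer behind word_tokenize splits the contraction into 'Do' + \"n't\".\n",
"print(easy_sentence.split())\n",
"print(easy_words)"
]
},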
{
"cell_type": "code",
"execution_count": 176,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"most common words are: [('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037)]\n",
"\"stock\" occurs 2346 times\n",
"\"stork\" occurs 0 times\n",
"frequency of the word \"the\" is 0.033849129031826936\n",
"37.37% (15545 / 41600) of the words occur only once. Here is a random example: ERB\n",
"samples: 1720901\n"
]
}
],
"source": [
"############################\n",
"# Occurrences, basic stats #\n",
"############################\n",
"\n",
"def percent (num): \n",
" return '%.2f' % (num * 100)\n",
"\n",
"fdist = nltk.FreqDist(reuters_words)\n",
"most_common = fdist.most_common(n=10)\n",
"\n",
"stock_count = fdist['stock']\n",
"stork_count = fdist['stork']\n",
"\n",
"the_freq = fdist.freq('the')\n",
"\n",
"singles = fdist.hapaxes() # \"hapaxes\" is term for words occurring once\n",
"num_singles = len(singles)\n",
"total = len(fdist.keys())\n",
"percent_single = percent((num_singles / total));\n",
"index = randrange(num_singles)\n",
"example = singles[index]\n",
"\n",
"num_samples = fdist.N()\n",
"\n",
"print('most common words are: ', most_common)\n",
"print(f'\"stock\" occurs {stock_count} times')\n",
"print(f'\"stork\" occurs {stork_count} times')\n",
"print(f'frequency of the word \"the\" is {the_freq}')\n",
"print(f'{percent_single}% ({num_singles} / {total}) of the words occur only once. Here is a random example:', example)\n",
"print('samples:', num_samples)"
]
},
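{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (a sketch, not from the book): the few most common tokens account\n",
"# for a large share of the corpus. fdist.freq gives per-token relative\n",
"# frequency, so summing it over the top ten gives their combined share.\n",
"top_ten_share = sum(fdist.freq(word) for word, count in most_common)\n",
"print(f'top 10 tokens cover {percent(top_ten_share)}% of all {num_samples} samples')"
]
},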
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bigrams are pairs of words that are adjacent [('Randy', 'works'), ('works', 'at'), ('at', 'Quaid'), ('Quaid', 'Corp'), ('Corp', '.')]\n",
"trigrams are similar but for 3 words [('Randy', 'works', 'at'), ('works', 'at', 'Quaid'), ('at', 'Quaid', 'Corp'), ('Quaid', 'Corp', '.')]\n"
]
}
],
"source": [
"###########\n",
"# n-grams #\n",
"###########\n",
"\n",
"text = \"Randy works at Quaid Corp.\"\n",
"tokens = nltk.word_tokenize(text)\n",
"bigrams = nltk.bigrams(tokens)\n",
"trigrams = nltk.trigrams(tokens)\n",
"\n",
"print('bigrams are pairs of words that are adjacent', list(bigrams))\n",
"print('trigrams are similar but for 3 words', list(trigrams))\n"
]
},
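{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (assumes nltk.ngrams, the general n-gram helper, is available in\n",
"# this NLTK version): bigrams and trigrams are just the n = 2 and n = 3\n",
"# cases of the same sliding-window idea.\n",
"fourgrams = nltk.ngrams(tokens, 4)\n",
"print('4-grams:', list(fourgrams))"
]
},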
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50 bigrams with highest pointwise mututal information: [('Het', 'Comite'), ('Lago', 'Agrio'), ('Hoare', 'Govett'), ('Dar', 'es'), ('Ranks', 'Hovis'), ('es', 'Salaam'), ('SRI', 'LANKA'), ('CRAZY', 'EDDIE'), ('LORD', 'ABBETT'), ('Corazon', 'Aquino'), ('MERRILL', 'LYNCH'), ('Ay', 'Expd'), ('Abu', 'Dhabi'), ('Sao', 'Paulo'), ('HONG', 'KONG'), ('THOUS', 'BUSHELS'), ('poison', 'pill'), ('Lear', 'Siegler'), ('Pizza', 'Inn'), ('Kleinwort', 'Benson'), ('Hajime', 'Tamura'), ('Gates', 'Learjet'), ('ORANGE', 'JUICE'), ('Wells', 'Fargo'), ('Real', 'Estate'), ('Margaret', 'Thatcher'), ('SANTA', 'FE'), ('Brace', 'Jovanovich'), ('DIAMOND', 'SHAMROCK'), ('Phelps', 'Dodge'), ('Fort', 'Worth'), ('Puerto', 'Rico'), ('del', 'Este'), ('WALL', 'STREET'), ('REAL', 'ESTATE'), ('Hiram', 'Walker'), ('Du', 'Pont'), ('Partly', 'offsetting'), ('Punta', 'del'), ('Hk', 'Hotels'), ('DAYTON', 'HUDSON'), ('Bra', 'Kanon'), ('HUGHES', 'TOOL'), ('Rank', 'Xerox'), ('Exporting', 'Countries'), ('Marlin', 'Fitzwater'), ('King', 'Fahd'), ('Dean', 'Witter'), ('continental', 'shelf'), ('Costa', 'Rica')]\n",
"50 trigrams with highest pointwise mututal information: [('Dar', 'es', 'Salaam'), ('Punta', 'del', 'Este'), ('Arturo', 'Hernandez', 'Grisanti'), ('Speaker', 'Jim', 'Wright'), ('SANTA', 'FE', 'SOUTHERN'), ('THOUS', 'BUSHELS', 'SOYBEANS'), ('Denis', 'Bra', 'Kanon'), ('Nil', 'Nil', 'Nil'), ('Harcourt', 'Brace', 'Jovanovich'), ('Fernando', 'Santos', 'Alvite'), ('Drexel', 'Burnham', 'Lambert'), ('WALL', 'STREET', 'STOCKS'), ('Liberal', 'Democratic', 'Party'), ('Prev', 'Wk', 'Named'), ('Dean', 'Witter', 'Reynolds'), ('CUSTOMER', 'REPURCHASE', 'AGREEMENTS'), ('Lloyds', 'Shipping', 'Intelligence'), ('Karl', 'Otto', 'Poehl'), ('DISCOUNT', 'BORROWINGS', 'AVERAGE'), ('ADDS', 'RESERVES', 'VIA'), ('VIA', 'CUSTOMER', 'REPURCHASES'), ('Light', 'Louisiana', 'Sweet'), ('President', 'Corazon', 'Aquino'), ('Export', 'Enhancement', 'Program'), ('Exchequer', 'Nigel', 'Lawson'), ('Governor', 'Satoshi', 'Sumita'), ('Rio', 'de', 'Janeiro'), ('governor', 'Satoshi', 'Sumita'), ('Gross', 'Domestic', 'Product'), ('Enhancement', 'Program', 'initiative'), ('ruling', 'Liberal', 'Democratic'), ('OLD', 'RATE', 'MATURITY'), ('Partly', 'offsetting', 'these'), ('Petroleum', 'Exporting', 'Countries'), ('Minister', 'Margaret', 'Thatcher'), ('House', 'Speaker', 'Jim'), ('CURRENT', 'ACCOUNT', 'DEFICIT'), ('Banco', 'do', 'Brasil'), ('DISTILLATE', 'STOCKS', 'OFF'), ('RESERVES', 'VIA', 'CUSTOMER'), ('Representative', 'Clayton', 'Yeutter'), ('ranges', 'broadly', 'consistent'), ('Secretary', 'Caspar', 'Weinberger'), ('offsetting', 'these', 'outflows'), ('MARKET', 'SHORTAGE', 'FORECAST'), ('REUTER', '^', 'M'), ('Marine', 'Midland', 'Banks'), ('DLR', 'CUSTOMER', 'REPURCHASE'), ('Papua', 'New', 'Guinea'), ('President', 'Karl', 'Otto')]\n"
]
}
],
"source": [
"from nltk.collocations import (\n",
" BigramAssocMeasures, \n",
" BigramCollocationFinder,\n",
" TrigramAssocMeasures,\n",
" TrigramCollocationFinder\n",
")\n",
"\n",
"bigram_measures = BigramAssocMeasures()\n",
"trigram_measures = TrigramAssocMeasures()\n",
"\n",
"def gramStats (*, measures, Finder, prefix, min_freq = 10, pmi_count = 50):\n",
" finder = Finder.from_words(reuters_words)\n",
" finder.apply_freq_filter(min_freq)\n",
" highest_pmi = finder.nbest(bigram_measures.pmi, pmi_count)\n",
" label = f'{prefix}grams'\n",
" print(f'{pmi_count} {label} with highest pointwise mututal information:', highest_pmi)\n",
" \n",
"gramStats(\n",
" measures = bigram_measures,\n",
" Finder = BigramCollocationFinder,\n",
" prefix = 'bi'\n",
")\n",
"\n",
"gramStats(\n",
" measures = trigram_measures,\n",
" Finder = TrigramCollocationFinder,\n",
" prefix = 'tri'\n",
")"
]
},
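{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (a sketch, not from the book): PMI favors rare-but-exclusive pairs.\n",
"# BigramAssocMeasures also provides likelihood_ratio, which is less\n",
"# sensitive to low counts and tends to surface more frequent collocations.\n",
"finder = BigramCollocationFinder.from_words(reuters_words)\n",
"finder.apply_freq_filter(10)\n",
"print('top bigrams by likelihood ratio:', finder.nbest(bigram_measures.likelihood_ratio, 10))"
]
},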
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tokens: ['Wikipedia.com', 'says', 'that', 'Randy', 'Randall', 'Rudy', 'Quaid', 'is', 'an', 'American', 'film', 'and', 'television', 'actor', 'and', 'Oscar', 'nominee', 'known', 'for', 'his', 'roles', 'in', 'both', 'serious', 'drama', 'and', 'light', 'comedy', '.']\n",
"Tagged Tokens: [('Wikipedia.com', 'NNP'), ('says', 'VBZ'), ('that', 'IN'), ('Randy', 'NNP'), ('Randall', 'NNP'), ('Rudy', 'NNP'), ('Quaid', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('American', 'JJ'), ('film', 'NN'), ('and', 'CC'), ('television', 'NN'), ('actor', 'NN'), ('and', 'CC'), ('Oscar', 'NNP'), ('nominee', 'RB'), ('known', 'VBN'), ('for', 'IN'), ('his', 'PRP$'), ('roles', 'NNS'), ('in', 'IN'), ('both', 'DT'), ('serious', 'JJ'), ('drama', 'NN'), ('and', 'CC'), ('light', 'JJ'), ('comedy', 'NN'), ('.', '.')]\n",
"Named entities (S\n",
" Wikipedia.com/NNP\n",
" says/VBZ\n",
" that/IN\n",
" (PERSON Randy/NNP Randall/NNP Rudy/NNP Quaid/NNP)\n",
" is/VBZ\n",
" an/DT\n",
" (GPE American/JJ)\n",
" film/NN\n",
" and/CC\n",
" television/NN\n",
" actor/NN\n",
" and/CC\n",
" (PERSON Oscar/NNP)\n",
" nominee/RB\n",
" known/VBN\n",
" for/IN\n",
" his/PRP$\n",
" roles/NNS\n",
" in/IN\n",
" both/DT\n",
" serious/JJ\n",
" drama/NN\n",
" and/CC\n",
" light/JJ\n",
" comedy/NN\n",
" ./.)\n"
]
}
],
"source": [
"####################################\n",
"# Tokenization and parts of speech #\n",
"####################################\n",
"\n",
"randy_sentence = \"Wikipedia.com says that Randy Randall Rudy Quaid is an American film and television actor and Oscar nominee known for his roles in both serious drama and light comedy.\" \n",
"randy_tokens = nltk.word_tokenize(randy_sentence)\n",
"tagged_tokens = nltk.pos_tag(randy_tokens) \n",
"ner_annotated_tree = nltk.ne_chunk(tagged_tokens)\n",
"\n",
"print(\"Tokens: \", randy_tokens)\n",
"print(\"Tagged Tokens: \", tagged_tokens)\n",
"print(\"Named entities\", ner_annotated_tree)\n"
]
},
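{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative sketch): ne_chunk returns an nltk.Tree, so the\n",
"# labeled entities can be pulled out by filtering its subtrees and\n",
"# joining the (word, tag) leaves back into strings.\n",
"labels = ('PERSON', 'GPE', 'ORGANIZATION')\n",
"for subtree in ner_annotated_tree.subtrees(lambda t: t.label() in labels):\n",
"    entity = ' '.join(word for word, tag in subtree.leaves())\n",
"    print(subtree.label(), '=>', entity)"
]
},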
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S\n",
" (PERSON Randy/NNP)\n",
" (PERSON Quaid/NNP)\n",
" does/VBZ\n",
" not/RB\n",
" go/VB\n",
" to/TO\n",
" (ORGANIZATION Harvard/NNP)\n",
" ./.)\n"
]
}
],
"source": [
"sentence = \"\"\"Randy Quaid does not go to Harvard.\"\"\"\n",
"tokens = nltk.word_tokenize(sentence)\n",
"tagged_tokens = nltk.pos_tag(tokens) \n",
"ner_annotated_tree = nltk.ne_chunk(tagged_tokens) \n",
"print(ner_annotated_tree)"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Synsets for car (filter = None):\n",
"\n",
"car.n.01:\n",
"['car', 'auto', 'automobile', 'machine', 'motorcar']\n",
"a motor vehicle with four wheels; usually propelled by an internal combustion engine\n",
"\n",
"car.n.02:\n",
"['car', 'railcar', 'railway_car', 'railroad_car']\n",
"a wheeled vehicle adapted to the rails of railroad\n",
"\n",
"car.n.03:\n",
"['car', 'gondola']\n",
"the compartment that is suspended from an airship and that carries personnel and the cargo and the power plant\n",
"\n",
"car.n.04:\n",
"['car', 'elevator_car']\n",
"where passengers ride up and down\n",
"\n",
"cable_car.n.01:\n",
"['cable_car', 'car']\n",
"a conveyance for passengers or freight on a cable railway\n",
"\n",
"------\n",
"\n",
"Synsets for fight (filter = None):\n",
"\n",
"battle.n.01:\n",
"['battle', 'conflict', 'fight', 'engagement']\n",
"a hostile meeting of opposing military forces in the course of a war\n",
"\n",
"fight.n.02:\n",
"['fight', 'fighting', 'combat', 'scrap']\n",
"the act of fighting; any contest or struggle\n",
"\n",
"competitiveness.n.01:\n",
"['competitiveness', 'fight']\n",
"an aggressive willingness to compete\n",
"\n",
"fight.n.04:\n",
"['fight']\n",
"an intense verbal dispute\n",
"\n",
"fight.n.05:\n",
"['fight']\n",
"a boxing or wrestling match\n",
"\n",
"contend.v.06:\n",
"['contend', 'fight', 'struggle']\n",
"be engaged in a fight; carry on a fight\n",
"\n",
"fight.v.02:\n",
"['fight', 'oppose', 'fight_back', 'fight_down', 'defend']\n",
"fight against or resist strongly\n",
"\n",
"fight.v.03:\n",
"['fight', 'struggle']\n",
"make a strenuous or labored effort\n",
"\n",
"crusade.v.01:\n",
"['crusade', 'fight', 'press', 'campaign', 'push', 'agitate']\n",
"exert oneself continuously, vigorously, or obtrusively to gain an end or engage in a crusade for a certain cause or person; be an advocate for\n",
"\n",
"------\n",
"\n",
"Synsets for fight (filter = n):\n",
"\n",
"battle.n.01:\n",
"['battle', 'conflict', 'fight', 'engagement']\n",
"a hostile meeting of opposing military forces in the course of a war\n",
"\n",
"fight.n.02:\n",
"['fight', 'fighting', 'combat', 'scrap']\n",
"the act of fighting; any contest or struggle\n",
"\n",
"competitiveness.n.01:\n",
"['competitiveness', 'fight']\n",
"an aggressive willingness to compete\n",
"\n",
"fight.n.04:\n",
"['fight']\n",
"an intense verbal dispute\n",
"\n",
"fight.n.05:\n",
"['fight']\n",
"a boxing or wrestling match\n",
"\n",
"------\n",
"\n"
]
}
],
"source": [
"###########\n",
"# Wordnet #\n",
"###########\n",
"\n",
"wordnet = nltk.corpus.wordnet\n",
" \n",
"def printSynsets (*, word, filter = None):\n",
" print(f'Synsets for {word} (filter = {filter}):')\n",
" synsets = wordnet.synsets(word, filter)\n",
" for synset in synsets:\n",
" print()\n",
" print(f'{synset.name()}:')\n",
" print([l.name() for l in synset.lemmas()])\n",
" print(synset.definition())\n",
" print('\\n------\\n')\n",
"\n",
"printSynsets(\n",
" word = 'car'\n",
")\n",
" \n",
"printSynsets(\n",
" word = 'fight'\n",
")\n",
" \n",
"# just the nouns\n",
"printSynsets(\n",
" word = 'fight',\n",
" filter = wordnet.NOUN \n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Synset('walk.v.01')\n"
]
}
],
"source": [
"# Get a single synset by name \n",
"walk_verb = wordnet.synset('walk.v.01')\n",
"print(walk_verb)"
]
},
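{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (not from the book): a synset also knows its hypernyms, the more\n",
"# general concepts one step up -- the inverse of the hyponym relation\n",
"# walked by treeify in the next cell.\n",
"print(walk_verb.hypernyms())\n",
"print(wordnet.synset('car.n.01').hypernyms())"
]
},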
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(vehicle.n.01\n",
" (bumper_car.n.01 )\n",
" (craft.n.02\n",
" (aircraft.n.01\n",
" (bogy.n.01 )\n",
" (cruise_missile.n.01 )\n",
" (heavier-than-air_craft.n.01\n",
" (airplane.n.01\n",
" (airliner.n.01\n",
" (airbus.n.01 )\n",
" (narrowbody_aircraft.n.01 )\n",
" (widebody_aircraft.n.01 ))\n",
" (amphibian.n.02 )\n",
" (biplane.n.01 )\n",
" (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
" (delta_wing.n.01 )\n",
" (fighter.n.02\n",
" (interceptor.n.01 )\n",
" (kamikaze.n.01 )\n",
" (stealth_fighter.n.01 ))\n",
" (hangar_queen.n.01 )\n",
" (jet.n.01\n",
" (fanjet.n.01 )\n",
" (jetliner.n.01 )\n",
" (jumbojet.n.01 )\n",
" (twinjet.n.01 ))\n",
" (monoplane.n.01 )\n",
" (multiengine_airplane.n.01 ))\n",
" (autogiro.n.01 )\n",
" (drone.n.04 )\n",
" (glider.n.01 (hang_glider.n.02 ))\n",
" (helicopter.n.01\n",
" (cargo_helicopter.n.01 )\n",
" (shuttle_helicopter.n.01 )\n",
" (single-rotor_helicopter.n.01 )\n",
" (skyhook.n.01 ))\n",
" (orthopter.n.01 )\n",
" (warplane.n.01\n",
" (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
" (fighter.n.02\n",
" (interceptor.n.01 )\n",
" (kamikaze.n.01 )\n",
" (stealth_fighter.n.01 ))\n",
" (reconnaissance_plane.n.01 )))\n",
" (lighter-than-air_craft.n.01\n",
" (airship.n.01\n",
" (barrage_balloon.n.01 (kite_balloon.n.01 ))\n",
" (blimp.n.02 ))\n",
" (balloon.n.01\n",
" (hot-air_balloon.n.01 )\n",
" (meteorological_balloon.n.01 (pilot_balloon.n.01 ))\n",
" (trial_balloon.n.02 )))\n",
" (stealth_aircraft.n.01\n",
" (stealth_bomber.n.01 )\n",
" (stealth_fighter.n.01 )))\n",
" (hovercraft.n.01 )\n",
" (landing_craft.n.01 )\n",
" (spacecraft.n.01\n",
" (lander.n.02 )\n",
" (lunar_excursion_module.n.01 )\n",
" (space_capsule.n.01 )\n",
" (space_shuttle.n.01 )\n",
" (starship.n.01 ))\n",
" (vessel.n.02\n",
" (bareboat.n.01 )\n",
" (boat.n.01\n",
" (ark.n.02 )\n",
" (barge.n.01\n",
" (dredger.n.01 )\n",
" (houseboat.n.01 )\n",
" (pontoon.n.01 )\n",
" (scow.n.02 )\n",
" (wherry.n.01 ))\n",
" (bumboat.n.01 )\n",
" (canal_boat.n.01 )\n",
" (ferry.n.01 (car-ferry.n.01 ))\n",
" (fireboat.n.01 )\n",
" (gondola.n.02 )\n",
" (guard_boat.n.01 )\n",
" (gunboat.n.01 )\n",
" (junk.n.02 ))\n",
" (fishing_boat.n.01 (trawler.n.02 ))\n",
" (galley.n.01 )\n",
" (galley.n.02 (trireme.n.01 ))\n",
" (iceboat.n.02 )\n",
" (patrol_boat.n.01 )\n",
" (sailing_vessel.n.01\n",
" (bark.n.03 )\n",
" (brig.n.01 )\n",
" (brigantine.n.01 )\n",
" (clipper.n.02 )\n",
" (cutter.n.05 )\n",
" (dhow.n.01 )\n",
" (felucca.n.01 )\n",
" (fore-and-after.n.01 )\n",
" (galleon.n.01 (carrack.n.01 ))\n",
" (indiaman.n.01 ))\n",
" (ship.n.01\n",
" (abandoned_ship.n.01 )\n",
" (blockade-runner.n.01 )\n",
" (cargo_ship.n.01\n",
" (banana_boat.n.01 )\n",
" (bottom.n.07 )\n",
" (cattleship.n.01 )\n",
" (container_ship.n.01 )\n",
" (liberty_ship.n.01 )\n",
" (oil_tanker.n.01 (supertanker.n.01 )))\n",
" (flagship.n.02 )\n",
" (gas-turbine_ship.n.01 )\n",
" (hospital_ship.n.01 )\n",
" (hulk.n.02 )\n",
" (icebreaker.n.01 )\n",
" (lightship.n.01 )\n",
" (minelayer.n.01 ))\n",
" (shrimper.n.01 )))\n",
" (military_vehicle.n.01\n",
" (caisson.n.02 )\n",
" (half_track.n.01 )\n",
" (humvee.n.01 )\n",
" (personnel_carrier.n.01 )\n",
" (picket.n.04 (picket_boat.n.01 ) (picket_ship.n.01 ))\n",
" (reconnaissance_vehicle.n.01 )\n",
" (tank.n.01 (panzer.n.01 ))\n",
" (technical.n.01 )\n",
" (troop_carrier.n.01 (troopship.n.01 ))\n",
" (warplane.n.01\n",
" (bomber.n.01 (dive_bomber.n.01 ) (stealth_bomber.n.01 ))\n",
" (fighter.n.02\n",
" (interceptor.n.01 )\n",
" (kamikaze.n.01 )\n",
" (stealth_fighter.n.01 ))\n",
" (reconnaissance_plane.n.01 )))\n",
" (rocket.n.01\n",
" (missile.n.01\n",
" (air-to-air_missile.n.01 )\n",
" (air-to-ground_missile.n.01 )\n",
" (ballistic_missile.n.01\n",
" (intercontinental_ballistic_missile.n.01 (minuteman.n.02 )))\n",
" (guided_missile.n.01\n",
" (antiballistic_missile.n.01 )\n",
" (buzz_bomb.n.01 )\n",
" (exocet.n.01 )\n",
" (space_probe.n.01 )\n",
" (surface-to-air_missile.n.01 (manpad.n.01 ) (stinger.n.03 )))\n",
" (heat-seeking_missile.n.01\n",
" (brilliant_pebble.n.01 )\n",
" (stinger.n.03 ))\n",
" (sidewinder.n.02 ))\n",
" (multistage_rocket.n.01 )\n",
" (test_rocket.n.01 (sounding_rocket.n.01 )))\n",
" (skibob.n.01 )\n",
" (sled.n.01\n",
" (bobsled.n.01 )\n",
" (bobsled.n.02 )\n",
" (dogsled.n.01 )\n",
" (luge.n.01 )\n",
" (pung.n.01 )\n",
" (toboggan.n.01 ))\n",
" (steamroller.n.02 )\n",
" (wheeled_vehicle.n.01\n",
" (baby_buggy.n.01 (bassinet.n.02 ))\n",
" (bicycle.n.01\n",
" (bicycle-built-for-two.n.01 )\n",
" (mountain_bike.n.01 )\n",
" (ordinary.n.04 )\n",
" (push-bike.n.01 )\n",
" (safety_bicycle.n.01 )\n",
" (velocipede.n.01 ))\n",
" (boneshaker.n.01 )\n",
" (car.n.02\n",
" (baggage_car.n.01 )\n",
" (cabin_car.n.01 )\n",
" (club_car.n.01 )\n",
" (freight_car.n.01\n",
" (boxcar.n.01 (stockcar.n.01 ))\n",
" (cattle_car.n.01 )\n",
" (coal_car.n.01 )\n",
" (flatcar.n.01 )\n",
" (gondola_car.n.01 )\n",
" (refrigerator_car.n.01 )\n",
" (tank_car.n.01 ))\n",
" (guard's_van.n.01 )\n",
" (handcar.n.01 )\n",
" (mail_car.n.01 )\n",
" (passenger_car.n.01\n",
" (dining_car.n.01 )\n",
" (nonsmoker.n.02 )\n",
" (parlor_car.n.01 )\n",
" (pullman.n.01 )\n",
" (sleeping_car.n.01 )\n",
" (smoker.n.03 ))\n",
" (slip_coach.n.01 )\n",
" (tender.n.04 ))\n",
" (handcart.n.01\n",
" (applecart.n.02 )\n",
" (barrow.n.03 )\n",
" (hand_truck.n.01 )\n",
" (laundry_cart.n.01 )\n",
" (serving_cart.n.01 (pastry_cart.n.01 ) (tea_cart.n.01 ))\n",
" (shopping_cart.n.01 ))\n",
" (horse-drawn_vehicle.n.01\n",
" (carriage.n.02\n",
" (barouche.n.01 )\n",
" (brougham.n.01 )\n",
" (buckboard.n.01 )\n",
" (buggy.n.01 )\n",
" (cab.n.02 )\n",
" (caroche.n.01 )\n",
" (chaise.n.02 )\n",
" (chariot.n.01 )\n",
" (clarence.n.01 )\n",
" (coach.n.04 (stagecoach.n.01 )))\n",
" (chariot.n.02 )\n",
" (limber.n.01 )\n",
" (sulky.n.01 ))\n",
" (motor_scooter.n.01 )\n",
" (rolling_stock.n.01 )\n",
" (scooter.n.02 )\n",
" (self-propelled_vehicle.n.01\n",
" (armored_vehicle.n.01\n",
" (armored_car.n.01 )\n",
" (armored_car.n.02 )\n",
" (armored_personnel_carrier.n.01 )\n",
" (assault_gun.n.02 )\n",
" (tank.n.01 (panzer.n.01 ))\n",
" (tank_destroyer.n.01 ))\n",
" (carrier.n.02 )\n",
" (forklift.n.01 )\n",
" (locomotive.n.01\n",
" (choo-choo.n.01 )\n",
" (diesel_locomotive.n.01\n",
" (diesel-electric_locomotive.n.01 )\n",
" (diesel-hydraulic_locomotive.n.01 ))\n",
" (dinky.n.01 )\n",
" (electric_locomotive.n.01 )\n",
" (iron_horse.n.01 )\n",
" (pilot_engine.n.01 )\n",
" (shunter.n.01 )\n",
" (steam_locomotive.n.01 )\n",
" (switch_engine.n.01 )\n",
" (tank_engine.n.01 ))\n",
" (motor_vehicle.n.01\n",
" (amphibian.n.01 (swamp_buggy.n.01 ))\n",
" (bloodmobile.n.01 )\n",
" (car.n.01\n",
" (ambulance.n.01 (funny_wagon.n.01 ))\n",
" (beach_wagon.n.01 (shooting_brake.n.01 ))\n",
" (bus.n.04 )\n",
" (cab.n.03 (gypsy_cab.n.01 ) (minicab.n.01 ))\n",
" (compact.n.03 )\n",
" (convertible.n.01 )\n",
" (coupe.n.01 )\n",
" (cruiser.n.01 (panda_car.n.01 ))\n",
" (electric.n.01 )\n",
" (gas_guzzler.n.01 ))\n",
" (doodlebug.n.01 )\n",
" (four-wheel_drive.n.01 )\n",
" (go-kart.n.01 )\n",
" (golfcart.n.01 )\n",
" (hearse.n.01 )\n",
" (motorcycle.n.01\n",
" (minibike.n.01 (moped.n.01 ))\n",
" (trail_bike.n.01 ))\n",
" (snowplow.n.01 ))\n",
" (personnel_carrier.n.01 )\n",
" (reconnaissance_vehicle.n.01 )\n",
" (recreational_vehicle.n.01\n",
" (camper.n.02 (van.n.04 ))\n",
" (dune_buggy.n.01 ))\n",
" (streetcar.n.01 (horsecar.n.01 ))\n",
" (tracked_vehicle.n.01\n",
" (caterpillar.n.02 )\n",
" (half_track.n.01 )\n",
" (snowmobile.n.01 (sno-cat.n.01 ))\n",
" (tank.n.01 (panzer.n.01 ))))))\n"
]
}
],
"source": [
"def treeify (word):\n",
" def treeifyHyponyms (*, hyponym, max_breadth = 10, max_depth = 10, depth = 0):\n",
" def _treeify (hyponym):\n",
" return treeifyHyponyms(\n",
" hyponym = hyponym,\n",
" depth = depth + 1\n",
" )\n",
" \n",
" children = []\n",
" if (depth < max_depth):\n",
" hyponyms = hyponym.hyponyms()[:max_breadth]\n",
" children = [_treeify(h) for h in hyponyms]\n",
" \n",
" name = hyponym.name()\n",
" return nltk.Tree(name, children)\n",
" \n",
" return treeifyHyponyms(hyponym = synsets(word)[0])\n",
"\n",
"# The above is a generalization of this example from the text:\n",
"#\n",
"# vehicle = synsets('car')[0]\\\n",
"# t = nltk.Tree(vehicle.name(), children=[\n",
"# nltk.Tree(vehicle.hyponyms()[3].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[4].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[5].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].name(), children=[\n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[1].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[3].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[4].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[5].name(), children=[]), \n",
"# nltk.Tree(vehicle.hyponyms()[7].hyponyms()[6].name(), children=[]),\n",
"# ]), \n",
"# ])\n",
"# print(t)\n",
"\n",
"vehicle_tree = treeify('vehicle')\n",
"print(vehicle_tree)"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(car.n.01\n",
" (ambulance.n.01 (funny_wagon.n.01 ))\n",
" (beach_wagon.n.01 (shooting_brake.n.01 ))\n",
" (bus.n.04 )\n",
" (cab.n.03 (gypsy_cab.n.01 ) (minicab.n.01 ))\n",
" (compact.n.03 )\n",
" (convertible.n.01 )\n",
" (coupe.n.01 )\n",
" (cruiser.n.01 (panda_car.n.01 ))\n",
" (electric.n.01 )\n",
" (gas_guzzler.n.01 ))\n"
]
}
],
"source": [
"car_tree = treeify('car')\n",
"print(car_tree)"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(vomit.n.01 )\n"
]
}
],
"source": [
"barf_tree = treeify('barf')\n",
"print(barf_tree)"
]
},
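{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (illustrative sketch): treeify walks down the hyponym relation;\n",
"# hypernym_paths walks up instead, returning every chain of increasingly\n",
"# general synsets from the root down to the synset itself.\n",
"for path in wordnet.synset('car.n.01').hypernym_paths():\n",
"    print(' -> '.join(s.name() for s in path))"
]
},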
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"friend => friend\n",
"friends => friend\n",
"friendly => friend\n",
"drink => drink\n",
"drinks => drink\n",
"drunk => drunk\n",
"drank => drank\n",
"slow => slow\n",
"slowing => slow\n",
"slowly => slowli\n",
"slower => slower\n",
"slowest => slowest\n",
"quaid => quaid\n",
"xyzing => xyze\n",
"exerxzyzing => exerxzyz\n"
]
}
],
"source": [
"############\n",
"# Stemmer #\n",
"############\n",
"\n",
"from functools import reduce\n",
"\n",
"stemmer = nltk.stem.SnowballStemmer('english')\n",
"\n",
"words = [\n",
" 'friend', \n",
" 'friends', \n",
" 'friendly',\n",
" 'drink',\n",
" 'drinks',\n",
" 'drunk',\n",
" 'drank',\n",
" 'slow',\n",
" 'slowing',\n",
" 'slowly',\n",
" 'slower',\n",
" 'slowest',\n",
" 'quaid',\n",
" 'xyzing',\n",
" 'exerxzyzing'\n",
"]\n",
"\n",
"def printDict (dict):\n",
" def maxlen (a, b):\n",
" a_len = len(a)\n",
" b_len = len(b)\n",
" return a if (a_len > b_len) else b\n",
"\n",
" max_length = len(reduce(maxlen, dict.keys()))\n",
"\n",
" for k, v in dict.items():\n",
" label = k.ljust(max_length + 1)\n",
" print(f'{label} => {v}')\n",
" \n",
"def mapWords (*, words, mapper):\n",
" return dict((word, mapper(word)) for word in words)\n",
" \n",
"printDict(mapWords(\n",
" words = stems,\n",
" mapper = lambda word: stemmer.stem(word)\n",
"))\n"
]
},
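{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (not from the book): the older Porter algorithm, also shipped with\n",
"# NLTK, makes slightly different cuts than Snowball on the same word list.\n",
"porter = nltk.stem.PorterStemmer()\n",
"printDict(mapWords(\n",
"    words = words,\n",
"    mapper = lambda word: porter.stem(word)\n",
"))"
]
},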
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"lemmatizing as noun\n",
"friend => friend\n",
"friends => friend\n",
"friendly => friendly\n",
"drink => drink\n",
"drinks => drink\n",
"drunk => drunk\n",
"drank => drank\n",
"slow => slow\n",
"slowing => slowing\n",
"slowly => slowly\n",
"slower => slower\n",
"slowest => slowest\n",
"quaid => quaid\n",
"xyzing => xyzing\n",
"exerxzyzing => exerxzyzing\n",
"\n",
"lemmatizing as verb\n",
"friend => friend\n",
"friends => friends\n",
"friendly => friendly\n",
"drink => drink\n",
"drinks => drink\n",
"drunk => drink\n",
"drank => drink\n",
"slow => slow\n",
"slowing => slow\n",
"slowly => slowly\n",
"slower => slower\n",
"slowest => slowest\n",
"quaid => quaid\n",
"xyzing => xyzing\n",
"exerxzyzing => exerxzyzing\n",
"\n",
"lemmatizing as adjective\n",
"friend => friend\n",
"friends => friends\n",
"friendly => friendly\n",
"drink => drink\n",
"drinks => drinks\n",
"drunk => drunk\n",
"drank => drank\n",
"slow => slow\n",
"slowing => slowing\n",
"slowly => slowly\n",
"slower => slow\n",
"slowest => slow\n",
"quaid => quaid\n",
"xyzing => xyzing\n",
"exerxzyzing => exerxzyzing\n",
"\n"
]
}
],
"source": [
"##############\n",
"# Lemmatizer #\n",
"##############\n",
"\n",
"lemmatizer = nltk.stem.WordNetLemmatizer()\n",
"\n",
"for part_of_speech in ['noun', 'verb', 'adjective']:\n",
" print(f'lemmatizing as {part_of_speech}')\n",
" pos = part_of_speech[0]\n",
" printDict(mapWords(\n",
" words = stems,\n",
" mapper = lambda word: lemmatizer.lemmatize(word, pos)\n",
" ))\n",
" print()\n"
]
},
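{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aside (a minimal sketch, not from the book): in practice the lemmatizer's\n",
"# pos argument usually comes from a tagger. penn_to_wordnet below is a\n",
"# hypothetical helper mapping Penn Treebank tags to WordNet tags.\n",
"def penn_to_wordnet (tag):\n",
"    if tag.startswith('V'): return 'v'\n",
"    if tag.startswith('J'): return 'a'\n",
"    if tag.startswith('R'): return 'r'\n",
"    return 'n'\n",
"\n",
"for word, tag in nltk.pos_tag(nltk.word_tokenize('He drank slowly')):\n",
"    print(word, '=>', lemmatizer.lemmatize(word, penn_to_wordnet(tag)))"
]
},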
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}