{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Web Scraping using XPath and Python\n",
    "\n",
    "This notebook downloads the Wikipedia *Outline of the Marvel Cinematic Universe* page and uses an XPath query to collect the links to the individual film articles.\n",
    "\n",
    "> XPath is a query language for selecting nodes from an XML document.\n",
    "\n",
    "> The XPath language is based on a tree representation of the XML document, and provides the ability to navigate around the tree, selecting nodes by a variety of criteria\n",
    "\n",
    "## References\n",
    "\n",
    "- XPath overview: https://en.wikipedia.org/wiki/XPath\n",
    "- XPath syntax: https://www.w3schools.com/xml/xpath_syntax.asp\n",
    "- Online XPath tester: https://codebeautify.org/Xpath-Tester\n",
    "- Page being scraped: https://en.wikipedia.org/wiki/Outline_of_the_Marvel_Cinematic_Universe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from urllib.parse import urljoin\n",
    "\n",
    "import requests\n",
    "from lxml import html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://en.wikipedia.org/wiki/Outline_of_the_Marvel_Cinematic_Universe\"\n",
    "base_url = \"https://en.wikipedia.org\"\n",
    "REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can wait indefinitely"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch and parse the page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "resp = requests.get(url, timeout=REQUEST_TIMEOUT)\n",
    "resp.raise_for_status()  # fail here on an HTTP error instead of parsing an error page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tree = html.fromstring(resp.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select the film links\n",
    "\n",
    "The XPath below selects the anchor inside the italic text of each row header in the second table of the article body."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "elements = tree.xpath('//*[@id=\"mw-content-text\"]/div/table[2]/tbody/tr[*]/th/i/a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The XPath is tied to the page layout; stop early if it no longer matches anything.\n",
    "assert elements, \"XPath matched no elements - the page structure may have changed\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# urljoin resolves site-relative hrefs against base_url and leaves absolute hrefs intact.\n",
    "links = [\n",
    "    urljoin(base_url, element.get(\"href\"))\n",
    "    for element in elements\n",
    "    if element.get(\"href\")  # skip anchors that carry no href attribute\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://en.wikipedia.org/wiki/Iron_Man_(2008_film)',\n",
       " 'https://en.wikipedia.org/wiki/The_Incredible_Hulk_(film)',\n",
       " 'https://en.wikipedia.org/wiki/Iron_Man_2',\n",
       " 'https://en.wikipedia.org/wiki/Thor_(film)',\n",
       " 'https://en.wikipedia.org/wiki/Captain_America:_The_First_Avenger',\n",
       " 'https://en.wikipedia.org/wiki/The_Avengers_(2012_film)',\n",
       " 'https://en.wikipedia.org/wiki/Iron_Man_3',\n",
       " 'https://en.wikipedia.org/wiki/Thor:_The_Dark_World',\n",
       " 'https://en.wikipedia.org/wiki/Captain_America:_The_Winter_Soldier',\n",
       " 'https://en.wikipedia.org/wiki/Guardians_of_the_Galaxy_(film)',\n",
       " 'https://en.wikipedia.org/wiki/Avengers:_Age_of_Ultron',\n",
       " 'https://en.wikipedia.org/wiki/Ant-Man_(film)',\n",
       " 'https://en.wikipedia.org/wiki/Captain_America:_Civil_War',\n",
       " 'https://en.wikipedia.org/wiki/Doctor_Strange_(2016_film)',\n",
       " 'https://en.wikipedia.org/wiki/Guardians_of_the_Galaxy_Vol._2',\n",
       " 'https://en.wikipedia.org/wiki/Spider-Man:_Homecoming',\n",
       " 'https://en.wikipedia.org/wiki/Thor:_Ragnarok',\n",
       " 'https://en.wikipedia.org/wiki/Black_Panther_(film)',\n",
       " 'https://en.wikipedia.org/wiki/Avengers:_Infinity_War',\n",
       " 'https://en.wikipedia.org/wiki/Ant-Man_and_the_Wasp',\n",
       " 'https://en.wikipedia.org/wiki/Captain_Marvel_(film)',\n",
       " 'https://en.wikipedia.org/wiki/Avengers:_Endgame',\n",
       " 'https://en.wikipedia.org/wiki/Spider-Man:_Far_From_Home']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}