suranands · October 2, 2016 17:12 · BuvanasriAK · Jun 23, 2020
diff --git a/wabScrape1.py b/wabScrape1.py
 """
 Following Links in Python

 In this assignment you will write a Python program that expands on
 http://www.pythonlearn.com/code/urllinks.py (http://www.pythonlearn.com/code/urllinks.py). The program will
 use urllib to read the HTML from the data files below, extract the href= values from the anchor tags, scan for a
 tag that is in a particular position relative to the first name in the list, follow that link and repeat the process a
 number of times and report the last name you find.

 We provide two files for this assignment. One is a sample file where we give you the name for your testing and
 the other is the actual data you need to process for the assignment

 - Sample problem: Start at http://python-data.dr-chuck.net/known_by_Fikret.html
   (http://python-data.dr-chuck.net/known_by_Fikret.html)
 Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer
 is the last name that you retrieve.
 Sequence of names: Fikret Montgomery Mhairade Butchi Anayah 
 Last name in sequence: Anayah

 - Actual problem: Start at: http://python-data.dr-chuck.net/known_by_Inaara.html
   (http://python-data.dr-chuck.net/known_by_Inaara.html)
 Find the link at position 18 (the first name is 1). Follow that link. Repeat this process 7 times. The
 answer is the last name that you retrieve.
 Hint: The first character of the name of the last page that you will load is: R

 Strategy
 The web pages tweak the height between the links and hide the page after a few seconds to make it difficult for
 you to do the assignment without writing a Python program. But frankly with a little effort and patience you can
 overcome these attempts to make it a little harder to complete the assignment without writing a Python
 program. But that is not the point. The point is to write a clever Python program to solve the problem.
 """

 import re, urllib
 from BeautifulSoup import *

 # Assignment parameters: on each page follow the link at this 1-based
 # position, and repeat the hop this many times.
 link_position = 18
 repeat_count = 7

 all_links = []  # URL rebuilt for every page reached by following a link
 all_names = []  # name extracted from every followed link, in order

 url_first_part = 'http://python-data.dr-chuck.net/known_by_'
 url_last_part = '.html'
 first_entry = 'Inaara'

 for i in range(repeat_count):
     url = url_first_part + first_entry + url_last_part

     html = urllib.urlopen(url).read()
     soup = BeautifulSoup(html)

     # Collect the href of every anchor tag, in document order.
     links = [tag.get('href', None) for tag in soup('a')]
     if len(links) < link_position:
         # Same exception type as the bare index lookup, but with context.
         raise IndexError('%s has only %d links, need %d'
                          % (url, len(links), link_position))
     url = links[link_position - 1]
     print(url)

     # Cut the name out of the link using the actual prefix/suffix lengths
     # instead of hard-coded offsets. Assumes every link has the form
     # url_first_part + name + url_last_part.
     next_entry = url[len(url_first_part):-len(url_last_part)]
     all_names.append(next_entry)
     first_entry = next_entry
     url = url_first_part + first_entry + url_last_part
     all_links.append(url)
     print(all_names[-1])
	"""
	Following Links in Python

	In this assignment you will write a Python program that expands on
	http://www.pythonlearn.com/code/urllinks.py (http://www.pythonlearn.com/code/urllinks.py). The program will
	use urllib to read the HTML from the data files below, extract the href= values from the anchor tags, scan for a
	tag that is in a particular position relative to the first name in the list, follow that link and repeat the process a
	number of times and report the last name you find.

	We provide two files for this assignment. One is a sample file where we give you the name for your testing and
	the other is the actual data you need to process for the assignment

	- Sample problem: Start at http://python-data.dr-chuck.net/known_by_Fikret.html
	  (http://python-data.dr-chuck.net/known_by_Fikret.html)
	Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer
	is the last name that you retrieve.
	Sequence of names: Fikret Montgomery Mhairade Butchi Anayah
	Last name in sequence: Anayah

	- Actual problem: Start at: http://python-data.dr-chuck.net/known_by_Inaara.html
	  (http://python-data.dr-chuck.net/known_by_Inaara.html)
	Find the link at position 18 (the first name is 1). Follow that link. Repeat this process 7 times. The
	answer is the last name that you retrieve.
	Hint: The first character of the name of the last page that you will load is: R

	Strategy
	The web pages tweak the height between the links and hide the page after a few seconds to make it difficult for
	you to do the assignment without writing a Python program. But frankly with a little effort and patience you can
	overcome these attempts to make it a little harder to complete the assignment without writing a Python
	program. But that is not the point. The point is to write a clever Python program to solve the problem.
	"""

	import re, urllib
	from BeautifulSoup import *

	# Assignment parameters: on each page follow the link at this 1-based
	# position, and repeat the hop this many times.
	link_position = 18
	repeat_count = 7

	all_links = []  # URL rebuilt for every page reached by following a link
	all_names = []  # name extracted from every followed link, in order

	url_first_part = 'http://python-data.dr-chuck.net/known_by_'
	url_last_part = '.html'
	first_entry = 'Inaara'

	for i in range(repeat_count):
	    url = url_first_part + first_entry + url_last_part

	    html = urllib.urlopen(url).read()
	    soup = BeautifulSoup(html)

	    # Collect the href of every anchor tag, in document order.
	    links = [tag.get('href', None) for tag in soup('a')]
	    if len(links) < link_position:
	        # Same exception type as the bare index lookup, but with context.
	        raise IndexError('%s has only %d links, need %d'
	                         % (url, len(links), link_position))
	    url = links[link_position - 1]
	    print(url)

	    # Cut the name out of the link using the actual prefix/suffix lengths
	    # instead of hard-coded offsets. Assumes every link has the form
	    # url_first_part + name + url_last_part.
	    next_entry = url[len(url_first_part):-len(url_last_part)]
	    all_names.append(next_entry)
	    first_entry = next_entry
	    url = url_first_part + first_entry + url_last_part
	    all_links.append(url)
	    print(all_names[-1])