lineker · April 15, 2015 01:43
diff --git a/extract_url.py b/extract_url.py
 from lxml import html, etree
 import requests
 import pprint

 # Exploratory scraper: download the park listing page, build a mapping of
 # park URL -> {"Name": link text} from its anchors, then fetch one park page
 # and print a few nodes from it.

 # Browser-like User-Agent strings; only the first one is actually sent.
 user_agents = [
     'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
     'Opera/9.25 (Windows NT 5.1; U; en)',
     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
     'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
     'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
     'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
 ]

 headers = {
     'User-Agent': user_agents[0]
 }
 baseurl = "http://www.canardscanins.ca"
 # Seconds to wait for the server; without a timeout requests can block forever.
 REQUEST_TIMEOUT = 30

 # Download the listing page and fail loudly on an HTTP error status instead
 # of silently parsing an error page.
 response = requests.get(
     baseurl + '/canins/portail.php?action=liste',
     headers=headers,
     timeout=REQUEST_TIMEOUT,
 )
 response.raise_for_status()

 # Convert to an lxml tree and resolve every href against the page URL, so
 # relative hrefs and already-absolute hrefs both end up as valid URLs
 # (plain baseurl + href only works for hrefs that start with "/").
 tree = html.fromstring(response.text)
 tree.make_links_absolute(response.url)

 # Read the href and the text from the SAME <a> element. Two independent
 # queries (//a/@href and //a/text()) go out of step as soon as one anchor has
 # no text, several text nodes, or no href.
 anchors = tree.xpath('//a[@href]')
 links = [anchor.get('href') for anchor in anchors]
 titles = [anchor.text_content() for anchor in anchors]
 print(titles)

 parks = {}

 for link, title in zip(links, titles):
     # Skip mailto:, javascript: and similar links that cannot be downloaded.
     if not link.startswith(('http://', 'https://')):
         continue
     parks[link] = {"Name": title}
 pprint.pprint(parks)

 # Only a single park page is inspected for now; popitem() removes one entry
 # from parks. Guarded so an empty listing does not raise KeyError.
 if parks:
     key, value = parks.popitem()

     page = requests.get(key, headers=headers, timeout=REQUEST_TIMEOUT)
     page.raise_for_status()
     ptree = html.fromstring(page.text)

     # Print the text and tail of the element that follows the first <img>,
     # when the page has an image and that image has a next sibling.
     imgs = ptree.xpath('//img')
     after_img = imgs[0].getnext() if imgs else None
     if after_img is not None:
         print(after_img.text)
         print(after_img.tail)

     # Find <b> nodes whose text contains "Fondation" and dump the element
     # that follows the first match, if there is one.
     founded = ptree.xpath('.//b[contains(text(),"Fondation")]')
     print(len(founded))
     after_founded = founded[0].getnext() if founded else None
     if after_founded is not None:
         print(etree.tostring(after_founded))
	from lxml import html, etree
	import requests
	import pprint

	# Exploratory scraper: download the park listing page, build a mapping of
	# park URL -> {"Name": link text} from its anchors, then fetch one park page
	# and print a few nodes from it.

	# Browser-like User-Agent strings; only the first one is actually sent.
	user_agents = [
		'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
		'Opera/9.25 (Windows NT 5.1; U; en)',
		'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
		'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
		'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
		'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
	]

	headers = {
		'User-Agent': user_agents[0]
	}
	baseurl = "http://www.canardscanins.ca"
	# Seconds to wait for the server; without a timeout requests can block forever.
	REQUEST_TIMEOUT = 30

	# Download the listing page and fail loudly on an HTTP error status instead
	# of silently parsing an error page.
	response = requests.get(
		baseurl + '/canins/portail.php?action=liste',
		headers=headers,
		timeout=REQUEST_TIMEOUT,
	)
	response.raise_for_status()

	# Convert to an lxml tree and resolve every href against the page URL, so
	# relative hrefs and already-absolute hrefs both end up as valid URLs
	# (plain baseurl + href only works for hrefs that start with "/").
	tree = html.fromstring(response.text)
	tree.make_links_absolute(response.url)

	# Read the href and the text from the SAME <a> element. Two independent
	# queries (//a/@href and //a/text()) go out of step as soon as one anchor has
	# no text, several text nodes, or no href.
	anchors = tree.xpath('//a[@href]')
	links = [anchor.get('href') for anchor in anchors]
	titles = [anchor.text_content() for anchor in anchors]
	print(titles)

	parks = {}

	for link, title in zip(links, titles):
		# Skip mailto:, javascript: and similar links that cannot be downloaded.
		if not link.startswith(('http://', 'https://')):
			continue
		parks[link] = {"Name": title}
	pprint.pprint(parks)

	# Only a single park page is inspected for now; popitem() removes one entry
	# from parks. Guarded so an empty listing does not raise KeyError.
	if parks:
		key, value = parks.popitem()

		page = requests.get(key, headers=headers, timeout=REQUEST_TIMEOUT)
		page.raise_for_status()
		ptree = html.fromstring(page.text)

		# Print the text and tail of the element that follows the first <img>,
		# when the page has an image and that image has a next sibling.
		imgs = ptree.xpath('//img')
		after_img = imgs[0].getnext() if imgs else None
		if after_img is not None:
			print(after_img.text)
			print(after_img.tail)

		# Find <b> nodes whose text contains "Fondation" and dump the element
		# that follows the first match, if there is one.
		founded = ptree.xpath('.//b[contains(text(),"Fondation")]')
		print(len(founded))
		after_founded = founded[0].getnext() if founded else None
		if after_founded is not None:
			print(etree.tostring(after_founded))