Created
April 15, 2015 01:43
-
-
Save lineker/2aaf41fc2672f19885a9 to your computer and use it in GitHub Desktop.
Scraping the web with Python and lxml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

import requests
from lxml import html, etree
# Scrape the park listing on canardscanins.ca, build a {url: {"Name": title}}
# table of the links found there, then fetch one of those pages and print a
# few details from it.

# Browser-like User-Agent strings; only the first one is sent with requests.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]
headers = {
    'User-Agent': user_agents[0],
}
baseurl = "http://www.canardscanins.ca"
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def fetch_tree(url):
    """Download *url* and return the page parsed as an lxml HTML tree.

    Raises requests.RequestException on network failures and on HTTP error
    statuses, so an error page is never parsed as if it were real content.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return html.fromstring(response.text)


def build_parks(anchors, page_url):
    """Return ``{absolute_url: {"Name": title}}`` for the given links.

    *anchors* is an iterable of ``(href, title)`` pairs taken from the same
    ``<a>`` element, so a link can never be paired with another link's text.
    Each href is resolved against *page_url*; same-page fragments and
    non-HTTP links (``mailto:``, ``javascript:`` ...) are skipped because
    they cannot be fetched as pages.
    """
    parks = {}
    for href, title in anchors:
        href = (href or '').strip()
        if not href or href.startswith('#'):
            continue
        url = urljoin(page_url, href)
        if not url.startswith(('http://', 'https://')):
            continue
        parks[url] = {"Name": (title or '').strip()}
    return parks


def main():
    """Scrape the listing page, then print details from one linked page."""
    listing_url = baseurl + '/canins/portail.php?action=liste'
    tree = fetch_tree(listing_url)

    # Read href and text from the same element to keep them aligned.
    anchors = [(a.get('href'), a.text_content())
               for a in tree.xpath('//a[@href]')]
    parks = build_parks(anchors, listing_url)
    print([info["Name"] for info in parks.values()])
    pprint.pprint(parks)

    if not parks:
        print("No links found on " + listing_url)
        return

    # Only a single, arbitrary entry is inspected for now.
    park_url, _ = parks.popitem()
    ptree = fetch_tree(park_url)

    # Print the text of whatever element follows the first image, if any.
    imgs = ptree.xpath('//img')
    after_img = imgs[0].getnext() if imgs else None
    if after_img is not None:
        print(after_img.text)
        print(after_img.tail)

    # Extract <b> nodes whose text contains "Fondation" and show the element
    # that follows the first match.
    founded = ptree.xpath('.//b[contains(text(),"Fondation")]')
    print(len(founded))
    after_label = founded[0].getnext() if founded else None
    if after_label is not None:
        print(etree.tostring(after_label))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment