Skip to content

Instantly share code, notes, and snippets.

@mekler
Created November 15, 2012 04:55
Show Gist options
  • Save mekler/4076744 to your computer and use it in GitHub Desktop.
Save mekler/4076744 to your computer and use it in GitHub Desktop.
Scraper de diputados en México (noviembre-2012) con Scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from diputados.items import DiputadosItem
from time import time
import re
class DiputadosSpider(CrawlSpider):
    """Crawl the LXII-legislature deputy listing and scrape each profile page.

    The crawl starts at the full deputy listing on ``sitl.diputados.gob.mx``
    and follows every link whose URL matches ``LXII_leg/curricula...``; each
    such page is handed to :meth:`parse_diputados`, which emits one
    ``DiputadosItem``.
    """

    name = 'diputados'
    allowed_domains = ["sitl.diputados.gob.mx"]
    start_urls = [
        'http://sitl.diputados.gob.mx/LXII_leg/listado_diputados_gpnp.php?tipot=TOTAL'
    ]
    rules = [
        Rule(
            SgmlLinkExtractor(allow=[r'LXII_leg/curricula([\w\W])+$']),
            callback='parse_diputados',
            follow=True,
        ),
    ]

    # Prefix prepended to the relative photo path to build an absolute URL.
    _BASE_URL = 'http://sitl.diputados.gob.mx/LXII_leg/'

    # (item field, XPath) pairs for the text cells that are stored UTF-8
    # encoded.  The XPaths are positional, so they are tied to the exact
    # table layout of the profile page.
    _TEXT_FIELDS = (
        ('tipo_eleccion', '//html/body/table/tr[3]/td/table[2]/tr[2]/td[2]/text()'),
        ('nombre', '/html/body/table/tr[3]/td/table[2]/tr/td[3]/span/text()'),
        ('entidad', '/html/body/table/tr[3]/td/table[2]/tr[3]/td/text()'),
        ('distrito', '/html/body/table/tr[3]/td/table[2]/tr[4]/td[2]/text()'),
        ('suplente', '/html/body/table/tr[3]/td/table[2]/tr[7]/td/span/text()'),
        ('cumpleanios', '/html/body/table/tr[3]/td/table[2]/tr[8]/td/text()'),
    )
    _EMAIL_XPATH = '/html/body/table/tr[3]/td/table[2]/tr[9]/td/a/text()'
    _IMAGES_XPATH = '/html/body/table/tr[3]/td/table[2]/tr/td/img/@src'

    # Maps the ``src`` of the second image found on the page to the party
    # name stored in the item.
    _PARTY_BY_LOGO = {
        'images/pri01.png': 'pri',
        'images/pan.png': 'pan',
        'images/prd01.png': 'prd',
        'images/logvrd.jpg': 'partido verde',
        'images/logo_movimiento_ciudadano.png': 'movimiento ciudadano',
        'images/logpt.jpg': 'pt',
        'images/panal.gif': 'panal',
    }

    @staticmethod
    def _first_text(hxs, xpath):
        """Return the first match of *xpath*, stripped, or ``u''`` if none."""
        matches = hxs.select(xpath).extract()
        return matches[0].strip() if matches else u''

    def parse_diputados(self, response):
        """Build one ``DiputadosItem`` from a deputy profile page.

        A text field whose XPath matches nothing is stored as an empty
        string instead of raising ``IndexError``, so a single missing cell
        no longer discards the whole item.  ``foto`` is only set when at
        least one image was found, and ``partido`` only when the second
        image is a known party logo.

        Returns a one-element list containing the populated item.
        """
        hxs = HtmlXPathSelector(response)
        item = DiputadosItem()
        # Parenthesized so the statement is valid on Python 2 and 3 alike;
        # with a single argument the printed text is identical.
        print(response.url)

        for field, xpath in self._TEXT_FIELDS:
            item[field] = self._first_text(hxs, xpath).encode('utf-8')

        # NOTE(review): unlike the other text fields, the e-mail was never
        # UTF-8 encoded in the original code; kept that way so the stored
        # type does not change.
        item['correo'] = self._first_text(hxs, self._EMAIL_XPATH)

        images = hxs.select(self._IMAGES_XPATH).extract()
        item['imagenes'] = images

        if images:
            # Drops the first two characters of the src before joining;
            # presumably a leading "./" -- TODO confirm against a live page.
            item['foto'] = self._BASE_URL + images[0][2:]
        else:
            self.log('No images found on %s' % response.url)

        if len(images) > 1:
            party = self._PARTY_BY_LOGO.get(images[1])
            if party is not None:
                item['partido'] = party
            else:
                # Unknown logos leave 'partido' unset, as before, but the
                # page is now reported so the table can be extended.
                self.log('Unknown party logo %r on %s' % (images[1], response.url))

        return [item]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment