Created
March 13, 2013 18:07
-
-
Save mekler/5154638 to your computer and use it in GitHub Desktop.
Scraper hecho en Scrapy para descargar datos del SNIE de la SEP.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define here the models for your scraped items | |
# | |
# See documentation in: | |
# http://doc.scrapy.org/en/latest/topics/items.html | |
from scrapy.item import Item, Field | |
class SnieSepItem(Item):
    """Item holding the data scraped for one school from the SNIE/SEP site.

    Field semantics (as used by the spider in this project):
      - tablas_estadistica: list of tables from the basic-statistics page
      - mi_escuela / instalaciones / indicadores / estadistica_basica:
        tables from the full detail page
      - cct: school identifier (presumably the "clave de centro de trabajo"
        — confirm against the site)
      - estado: '1' when the page had data, '0' when the school was not found
      - completo: '1' when the full detail page was scraped, '0' otherwise
    """

    estadistica_basica = Field()
    indicadores = Field()
    mi_escuela = Field()
    instalaciones = Field()
    tablas_estadistica = Field()
    cct = Field()
    estado = Field()
    completo = Field()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pymongo | |
from scrapy.exceptions import DropItem | |
from scrapy.conf import settings | |
from scrapy import log | |
class MongoDBPipeline(object):
    """Pipeline that validates scraped items and inserts them into MongoDB.

    Connection parameters (server, port, database, collection) are read from
    the project's Scrapy settings at instantiation time.
    """

    def __init__(self):
        # One connection per pipeline instance; settings keys are defined in
        # the project's settings.py (MONGODB_SERVER, MONGODB_PORT, ...).
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """Drop items with empty field values; insert the rest into MongoDB.

        Raises:
            DropItem: when any populated field holds an empty/falsy value.

        Returns:
            The item unchanged, so later pipelines can keep processing it.
        """
        for field in item:
            # BUG FIX: the original tested `if not data` where `data` is the
            # *field name* (always a non-empty string), so validation never
            # triggered; and its DropItem message read item['url'], a key
            # SnieSepItem does not define, which would raise KeyError.
            # Check the field's value and identify the item by its 'cct' key.
            if not item[field]:
                raise DropItem("Missing %s of item %s" % (field, item.get('cct')))
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapy settings for the snie_sep project.
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'snie_sep'

SPIDER_MODULES = ['snie_sep.spiders']
NEWSPIDER_MODULE = 'snie_sep.spiders'

# Route every scraped item through the MongoDB storage pipeline.
ITEM_PIPELINES = ['snie_sep.pipelines.MongoDBPipeline',]

# MongoDB connection parameters read by MongoDBPipeline.
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "snie"
MONGODB_COLLECTION = "scrapeado"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'snie_sep (+http://www.yourdomain.com)'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.spider import BaseSpider | |
from scrapy.selector import HtmlXPathSelector | |
import lxml.html | |
import pprint | |
from snie_sep.items import SnieSepItem | |
class SnieSepSpider(BaseSpider):
    """Spider that scrapes per-school data from the SNIE site of Mexico's SEP.

    For each start URL it tries the full detail page first, falls back to the
    basic-statistics page, and otherwise records the school as not found.
    """
    name = "snie"
    allowed_domains = ["www.snie.sep.gob.mx"]
    download_delay = 2
    # Example URL: http://www.snie.sep.gob.mx/SNIESC/detalles.aspx?vcct=12DPR0075S&vsubn=070&vturno=1
    # NOTE(review): the URL list is read at class-definition (import) time from
    # a hard-coded absolute path; importing this module fails if the file is
    # missing — confirm this is acceptable for how the spider is deployed.
    f = open("/root/mexprim/archivos/snie_f")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def pass_table_to_list(self, s):
        # Parse an HTML <table> fragment into a list of rows, each row being
        # a list of the UTF-8-encoded, stripped text content of its <td> cells.
        html = lxml.html.fromstring(s)
        tbl = []
        rows = html.cssselect("tr")
        for row in rows:
            tbl.append(list())
            for td in row.cssselect("td"):
                tbl[-1].append(td.text_content().strip().encode('utf-8'))
        return tbl

    def info_basica(self, hxs, cct):
        # Build an item from the "tbl_estadistica" tables of the basic page;
        # completo='0' marks that the full detail page was not available.
        tablas = hxs.select('//table[@class="tbl_estadistica"]')
        item = SnieSepItem()
        items = []
        for tabla in tablas:
            items.append( self.pass_table_to_list(tabla.extract() ) )
        item['tablas_estadistica'] = items
        item['cct'] = cct
        item['estado']= '1'
        item['completo'] = '0'
        return item

    def info_completa(self, hxs, cct):
        # Build an item from the four tables of the full detail page
        # (completo='1'). Each XPath targets one table by its ASP.NET id.
        item = SnieSepItem()
        s = hxs.select('//table[@id="FormView3_tblPrincipal3"]/tr/td[1]/table').extract()
        item['mi_escuela'] = self.pass_table_to_list(s[0])
        s = hxs.select('//table[@id="FormView3_tblPrincipal3"]/tr/td[2]/table').extract()
        item['instalaciones'] = self.pass_table_to_list(s[0])
        s = hxs.select('//table[@id="FormView4_tblprincipal4"]/tr/td[1]/table').extract()
        item['indicadores'] = self.pass_table_to_list(s[0])
        s = hxs.select('//table[@id="FormView4_tblprincipal4"]/tr/td[2]/table').extract()
        item['estadistica_basica'] = self.pass_table_to_list(s[0])
        item['cct'] = cct
        item['estado']= '1'
        item['completo'] = '1'
        return item

    def parse(self, response):
        # Decide which page variant we got by probing marker spans:
        # FormView1_Label8 -> full detail page, FormView1_CCTLabel -> basic
        # page; neither -> school not found (estado='0').
        # NOTE(review): extract() returns a *list*, so 'cct' is stored as a
        # list rather than a string — confirm downstream consumers expect this.
        hxs = HtmlXPathSelector(response)
        clave = hxs.select('//span[@id="FormView1_Label8"]/font/text()').extract()
        if len(clave)>0:
            item = self.info_completa(hxs,clave)
        else:
            clave = hxs.select('//span[@id="FormView1_CCTLabel"]/text()').extract()
            if len(clave)>0:
                item = self.info_basica(hxs,clave)
            else:
                item = SnieSepItem()
                item['cct'] = clave
                item['estado']= '0'
        return item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Funciona al 13/Marzo/2013