Scraper written in Scrapy to download data from the SEP's SNIE.
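The spider reads its start URLs from a plain-text file, one detalles.aspx URL per line, following the example URL shown in the spider below. Here is a minimal sketch of how such a file could be generated, assuming the vcct/vsubn/vturno query parameters from that example; the key list and output path are placeholders, not part of the original gist.

# generate_urls.py -- helper sketch, not part of the original project
BASE = "http://www.snie.sep.gob.mx/SNIESC/detalles.aspx?vcct=%s&vsubn=%s&vturno=%s"

# placeholder (cct, subnivel, turno) combinations; the real list comes from elsewhere
claves = [("12DPR0075S", "070", "1")]

with open("snie_f", "w") as f:
    for cct, subn, turno in claves:
        f.write(BASE % (cct, subn, turno) + "\n")

With the URL file in place (the spider expects it at /root/mexprim/archivos/snie_f), the crawl is started with "scrapy crawl snie" and the results land in the MongoDB collection configured in settings.py.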
# snie_sep/items.py
#
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class SnieSepItem(Item):
    # one field per table scraped from the SNIE school pages
    estadistica_basica = Field()
    indicadores = Field()
    mi_escuela = Field()
    instalaciones = Field()
    tablas_estadistica = Field()
    cct = Field()          # clave de centro de trabajo (school key)
    estado = Field()       # '1' if the page was parsed, '0' otherwise
    completo = Field()     # '1' if the full detail page was available

# snie_sep/pipelines.py

import pymongo

from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    """Stores every scraped item in the MongoDB collection named in settings.py."""

    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # here we only check that no populated field is empty,
        # but we could do any validation we want
        for field in item:
            if not item[field]:
                raise DropItem("Missing %s in item %s" % (field, item.get('cct')))
        self.collection.insert(dict(item))
        log.msg("Item written to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item

# snie_sep/settings.py
#
# Scrapy settings for the snie_sep project.
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'snie_sep'
SPIDER_MODULES = ['snie_sep.spiders']
NEWSPIDER_MODULE = 'snie_sep.spiders'
ITEM_PIPELINES = ['snie_sep.pipelines.MongoDBPipeline',]
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "snie"
MONGODB_COLLECTION = "scrapeado"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'snie_sep (+http://www.yourdomain.com)'

# snie_sep/spiders/ -- spider definition

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import lxml.html
import urlparse

from snie_sep.items import SnieSepItem


class SnieSepSpider(BaseSpider):
    name = "snie"
    allowed_domains = ["www.snie.sep.gob.mx"]
    download_delay = 2

    # example URL: http://www.snie.sep.gob.mx/SNIESC/detalles.aspx?vcct=12DPR0075S&vsubn=070&vturno=1
    # start URLs are read from a plain-text file, one detalles.aspx URL per line
    f = open("/root/mexprim/archivos/snie_f")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def pass_table_to_list(self, s):
        # converts an HTML <table> string into a list of rows (lists of cell strings)
        html = lxml.html.fromstring(s)
        tbl = []
        rows = html.cssselect("tr")
        for row in rows:
            tbl.append(list())
            for td in row.cssselect("td"):
                tbl[-1].append(td.text_content().strip().encode('utf-8'))
        return tbl

    def info_basica(self, hxs, cct):
        # pages that only expose the basic statistics tables
        tablas = hxs.select('//table[@class="tbl_estadistica"]')
        item = SnieSepItem()
        items = []
        for tabla in tablas:
            items.append(self.pass_table_to_list(tabla.extract()))
        item['tablas_estadistica'] = items
        item['cct'] = cct
        item['estado'] = '1'
        item['completo'] = '0'
        return item

    def info_completa(self, hxs, cct):
        # pages with the full detail layout: school data, facilities, indicators and statistics
        item = SnieSepItem()
        s = hxs.select('//table[@id="FormView3_tblPrincipal3"]/tr/td[1]/table').extract()
        item['mi_escuela'] = self.pass_table_to_list(s[0])
        s = hxs.select('//table[@id="FormView3_tblPrincipal3"]/tr/td[2]/table').extract()
        item['instalaciones'] = self.pass_table_to_list(s[0])
        s = hxs.select('//table[@id="FormView4_tblprincipal4"]/tr/td[1]/table').extract()
        item['indicadores'] = self.pass_table_to_list(s[0])
        s = hxs.select('//table[@id="FormView4_tblprincipal4"]/tr/td[2]/table').extract()
        item['estadistica_basica'] = self.pass_table_to_list(s[0])
        item['cct'] = cct
        item['estado'] = '1'
        item['completo'] = '1'
        return item

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        clave = hxs.select('//span[@id="FormView1_Label8"]/font/text()').extract()
        if len(clave) > 0:
            item = self.info_completa(hxs, clave[0])
        else:
            clave = hxs.select('//span[@id="FormView1_CCTLabel"]/text()').extract()
            if len(clave) > 0:
                item = self.info_basica(hxs, clave[0])
            else:
                # neither layout was recognized; fall back to the vcct query
                # parameter so the failed page can still be identified
                item = SnieSepItem()
                query = urlparse.parse_qs(urlparse.urlparse(response.url).query)
                item['cct'] = query.get('vcct', [''])[0]
                item['estado'] = '0'
        return item
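Once a crawl has finished, every item ends up as a document in the "scrapeado" collection of the "snie" database. Below is a quick inspection sketch using pymongo 2.x-style calls (MongoClient is used here instead of the older Connection class seen in the pipeline); host and port are assumed to match the settings above.

# inspect_results.py -- sketch for checking what was stored
import pymongo

collection = pymongo.MongoClient("localhost", 27017)["snie"]["scrapeado"]

print collection.count()                            # schools scraped in total
print collection.find({"completo": "1"}).count()    # schools with the full detail page
print collection.find({"estado": "0"}).count()      # pages where no CCT label was found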
mekler commented Mar 13, 2013

Works as of March 13, 2013.