
@mekler
Created January 10, 2014 23:46
Scrape of the SEP's SNIE site (second part)
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class NewDataDownloaderItem(Item):
    # define the fields for your item here like:
    # name = Field()
    primaria_pub_cual = Field()
    primaria_pub_donde = Field()
    primaria_pub_docentes = Field()
    primaria_pub_infraestructura = Field()
    primaria_pub_indicadores = Field()
    primaria_pub_estadistica = Field()
    secundaria_primaria_publica = Field()
    cct = Field()
    ruta = Field()
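
# Illustrative sketch (not in the original gist): a NewDataDownloaderItem
# behaves like a dict restricted to the fields declared above, e.g.
#
#   item = NewDataDownloaderItem()
#   item['cct'] = '09DPR0001A'                   # hypothetical CCT value
#   item['ruta'] = 'http://snie.sep.gob.mx/...'  # hypothetical URL
#   print dict(item)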
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
#class NewDataDownloaderPipeline(object):
#    def process_item(self, item, spider):
#        return item
import pymongo
from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log
class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # here we only check that each scraped value is not null,
        # but we could do any crazy validation we want
        for data in item:
            if item[data] is None:
                raise DropItem("Missing %s of school page from %s" % (data, item['ruta']))
        self.collection.insert(dict(item))
        log.msg("Item written to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item
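
# Note (sketch, not part of the original code): pymongo.Connection and
# scrapy.conf still worked when this gist was written; on pymongo 3+ the
# equivalent connection would be opened roughly like this:
#
#   import pymongo
#   client = pymongo.MongoClient("localhost", 27017)
#   collection = client["snie"]["escuelas"]
#   collection.insert_one(dict(item))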
# Scrapy settings for new_data_downloader project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'new_data_downloader'
SPIDER_MODULES = ['new_data_downloader.spiders']
NEWSPIDER_MODULE = 'new_data_downloader.spiders'
ITEM_PIPELINES = ['new_data_downloader.pipelines.MongoDBPipeline',]
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "snie"
MONGODB_COLLECTION = "escuelas"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'new_data_downloader (+http://www.yourdomain.com)'
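
# With these settings in place, the spider defined below is started from the
# project root with the standard Scrapy command:
#
#   scrapy crawl snie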
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
#from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from new_data_downloader.items import NewDataDownloaderItem
from time import time
import csv
import re
import string
import sys
def limpia_tablas(rows):
    # Normalize every cell (replace non-breaking spaces, strip whitespace)
    # and drop empty cells and empty rows.
    datos = []
    for row in rows:
        n = len(row)
        i = 0
        aux = []
        while i < n:
            uniString = unicode(row[i], "UTF-8")
            uniString = uniString.replace(u"\u00A0", " ")
            row[i] = uniString.encode('utf-8').strip()
            if len(row[i]):
                aux.append(row[i])
            i = i + 1
        if len(aux) > 0:
            datos.append(aux)
    return datos
def extract_data_between_html(cadena, salida):
    # Recursively strip HTML tags from `cadena`, accumulating the text nodes in `salida`.
    flag = cadena.find('>')
    if flag < 0:
        return salida.strip()
    aux = cadena[flag+1:]
    return extract_data_between_html(aux[aux.find('<')+1:], salida + " " + aux[:aux.find('<')].strip())
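
# Illustration (hypothetical fragment, not from the site):
#
#   extract_data_between_html('<td>Alumnos: <b>120</b></td>', '')
#
# returns 'Alumnos: 120'.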
def tabla_cual(tabla):
    rows = []
    for row in tabla:
        #print len( row.xpath('td/span/text()').extract() )
        rows.append(row.xpath('td/span/text()').extract() + row.xpath('td/span/font/text()').extract())
    #print len(rows)
    return rows
def tabla_donde(tabla):
    rows = []
    for row in tabla:
        rows.append(row.xpath('td/span/text()').extract())
    return rows
def tabla_docentes(tabla):
    # Extract the raw <td> markup of each row and reduce it to plain text,
    # mapping newlines, tabs and carriage returns to spaces first.
    rows = []
    for row in tabla:
        lista = row.xpath('td').extract()
        row_aux = []
        for x in lista:
            x = x.encode('iso-8859-1').decode('iso-8859-1').encode('utf-8')
            row_aux.append(extract_data_between_html(x.translate(string.maketrans("\n\t\r", "   ")), ''))
        rows.append(row_aux)
    return rows
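
# Illustration (hypothetical markup, not from the site): tabla_donde applied to
# the <tr> nodes of
#
#   <table><tr><td><span>Calle</span></td><td><span>Reforma 10</span></td></tr></table>
#
# yields [[u'Calle', u'Reforma 10']], while tabla_docentes would return the
# same text extracted from the raw <td> markup.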
def escuela_primaria_secundaria_publica(sel, response):
    # Page layout used by public primary/secondary schools.
    item = NewDataDownloaderItem()
    tabla_dato_gral = sel.xpath('//table[@id="FormView1_tbl_principal2"]/tr/td[1]/table/tr')
    item['primaria_pub_cual'] = tabla_cual(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@id="FormView1_tbl_principal2"]/tr/td[2]/table/tr[2]/td/table/tr')
    item['primaria_pub_donde'] = tabla_donde(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@id="FormView3"]/tr/td[1]/table/tr/td[1]/table/tr')
    item['primaria_pub_docentes'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    tabla_dato_gral = sel.xpath('//table[@id="FormView3"]/tr/td[1]/table/tr/td[2]/table/tr')
    item['primaria_pub_infraestructura'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    tabla_dato_gral = sel.xpath('//table[@id="FormView4_tblprincipal4"]/tr/td[1]/table/tr')
    item['primaria_pub_indicadores'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    tabla_dato_gral = sel.xpath('//table[@id="FormView4_tblprincipal4"]/tr/td[2]/table/tr')
    item['primaria_pub_estadistica'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    item['secundaria_primaria_publica'] = 1
    item['cct'] = response.url[response.url.find('vcct=')+5:response.url.find('&')]
    item['ruta'] = response.url
    return item
def escuela_general(sel, response):
    # Alternate page layout used by other school types.
    item = NewDataDownloaderItem()
    tabla_dato_gral = sel.xpath('//table[@id="FormView1"]/tr/td[1]/table/tr')
    item['primaria_pub_cual'] = tabla_docentes(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@id="FormView10"]/tr/td[1]/table/tr')
    item['primaria_pub_indicadores'] = tabla_docentes(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@class="tbl_estadistica"]/tr')
    item['primaria_pub_estadistica'] = tabla_docentes(tabla_dato_gral)
    item['secundaria_primaria_publica'] = 0
    item['cct'] = response.url[response.url.find('vcct=')+5:response.url.find('&')]
    item['ruta'] = response.url
    return item
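
# Note (illustrative URL, not a real record): both helpers pull the CCT
# (school key) out of the query string, so for a URL of the form
#
#   http://snie.sep.gob.mx/...?vcct=09DPR0001A&param=1
#
# the slice response.url[find('vcct=')+5 : find('&')] yields '09DPR0001A',
# assuming vcct is the first query parameter.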
class NewDataDownloaderSpider(CrawlSpider):
    name = 'snie'
    allowed_domains = ["snie.sep.gob.mx"]

    def __init__(self, *args, **kwargs):
        super(NewDataDownloaderSpider, self).__init__(*args, **kwargs)
        # Load the start URLs from a local CSV file (first column of each row).
        start_urls = []
        with open('/root/data/enlace-urls.txt', 'rb') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
            for row in spamreader:
                start_urls.append(row[0])
        self.start_urls = start_urls
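
    # The URL file is expected to hold one target page per row, with the URL
    # in the first column, e.g. (illustrative line):
    #
    #   "http://snie.sep.gob.mx/...?vcct=09DPR0001A&param=1"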
    def parse(self, response):
        #hxs = HtmlXPathSelector(response)
        sel = Selector(response)
        items = []
        # Try the public primary/secondary layout first, then the general
        # layout; flag the item with -1 if neither table is present.
        tabla_dato_gral = sel.xpath('//table[@id="FormView1_tbl_principal2"]/tr/td[1]/table/tr')
        if len(tabla_dato_gral) > 0:
            items.append(escuela_primaria_secundaria_publica(sel, response))
        else:
            tabla_dato_gral = sel.xpath('//table[@id="FormView1"]/tr/td[1]/table/tr')
            if len(tabla_dato_gral) > 0:
                items.append(escuela_general(sel, response))
            else:
                item = NewDataDownloaderItem()
                item['secundaria_primaria_publica'] = -1
                item['cct'] = response.url[response.url.find('vcct=')+5:response.url.find('&')]
                item['ruta'] = response.url
                items.append(item)
        return items
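
# Sketch of the resulting MongoDB document (field values are illustrative,
# not real data): each stored item is the plain dict form of a
# NewDataDownloaderItem, roughly
#
#   {
#       'cct': '09DPR0001A',
#       'ruta': 'http://snie.sep.gob.mx/...?vcct=09DPR0001A&...',
#       'secundaria_primaria_publica': 1,
#       'primaria_pub_cual': [['Nombre', '...'], ...],
#       ...
#   }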