Created January 10, 2014 23:46
Scraping SEP's SNIE (part two)
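A Scrapy project that crawls school pages on snie.sep.gob.mx and stores the extracted tables in MongoDB. Four files follow: the item definition, the MongoDB pipeline, the project settings, and the spider.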
new_data_downloader/items.py:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class NewDataDownloaderItem(Item):
    # define the fields for your item here like:
    # name = Field()

    # Tables scraped from the school's detail page
    primaria_pub_cual = Field()
    primaria_pub_donde = Field()
    primaria_pub_docentes = Field()
    primaria_pub_infraestructura = Field()
    primaria_pub_indicadores = Field()
    primaria_pub_estadistica = Field()

    # 1 = public primary/secondary layout, 0 = general layout, -1 = unrecognized
    secundaria_primaria_publica = Field()
    cct = Field()   # school key (clave de centro de trabajo), taken from the URL
    ruta = Field()  # source URL
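For context, a Scrapy Item behaves like a dict restricted to the fields declared above; a minimal sketch (the values are placeholders, not real school data):

item = NewDataDownloaderItem()
item['cct'] = '09DPR0001X'               # placeholder value, not a real school key
item['secundaria_primaria_publica'] = 1
print dict(item)                          # -> {'cct': '09DPR0001X', 'secundaria_primaria_publica': 1}
# item['url'] = '...'                     # KeyError: only declared fields can be set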
new_data_downloader/pipelines.py:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # Here we only check that no field is empty, but any other
        # validation could go here as well.
        for field in item:
            if not item.get(field):
                raise DropItem("Missing %s in item from %s" % (field, item['ruta']))
        self.collection.insert(dict(item))
        log.msg("Item written to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item
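A portability note: pymongo.Connection was removed in PyMongo 3.x, and insert() was split into insert_one()/insert_many(). A minimal sketch of the equivalent connection setup under a modern PyMongo, reusing the same settings keys:

import pymongo

client = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
collection = client[settings['MONGODB_DB']][settings['MONGODB_COLLECTION']]
collection.insert_one(dict(item))        # insert() became insert_one() in 3.x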
new_data_downloader/settings.py:
# Scrapy settings for new_data_downloader project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'new_data_downloader'

SPIDER_MODULES = ['new_data_downloader.spiders']
NEWSPIDER_MODULE = 'new_data_downloader.spiders'

ITEM_PIPELINES = ['new_data_downloader.pipelines.MongoDBPipeline']

# MongoDB connection used by MongoDBPipeline
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "snie"
MONGODB_COLLECTION = "escuelas"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'new_data_downloader (+http://www.yourdomain.com)'
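Note that from Scrapy 0.24 onward, ITEM_PIPELINES is expected to be a dict mapping each pipeline path to an order number (0-1000); the list form above only works on older versions. The equivalent newer form would be:

ITEM_PIPELINES = {
    'new_data_downloader.pipelines.MongoDBPipeline': 300,
}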
The spider, under new_data_downloader/spiders/:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector
from new_data_downloader.items import NewDataDownloaderItem
import csv
import string


def limpia_tablas(rows):
    # Strip non-breaking spaces and surrounding whitespace from every cell,
    # drop empty cells, and drop rows that end up completely empty.
    datos = []
    for row in rows:
        aux = []
        for celda in row:
            uniString = unicode(celda, "UTF-8")
            uniString = uniString.replace(u"\u00A0", " ")
            celda = uniString.encode('utf-8').strip()
            if len(celda):
                aux.append(celda)
        if len(aux) > 0:
            datos.append(aux)
    return datos


def extract_data_between_html(cadena, salida):
    # Recursively accumulate the text found between HTML tags in `cadena`.
    flag = cadena.find('>')
    if flag < 0:
        return salida.strip()
    aux = cadena[flag + 1:]
    return extract_data_between_html(aux[aux.find('<') + 1:],
                                     salida + " " + aux[:aux.find('<')].strip())


def tabla_cual(tabla):
    # "Which school" table: text may sit in <span> or in <span><font>.
    rows = []
    for row in tabla:
        rows.append(row.xpath('td/span/text()').extract() +
                    row.xpath('td/span/font/text()').extract())
    return rows


def tabla_donde(tabla):
    # "Where" table: plain <span> text cells.
    rows = []
    for row in tabla:
        rows.append(row.xpath('td/span/text()').extract())
    return rows


def tabla_docentes(tabla):
    # Generic table: take the raw <td> HTML, flatten newlines/tabs to spaces
    # and strip the markup with extract_data_between_html().
    rows = []
    for row in tabla:
        lista = row.xpath('td').extract()
        row_aux = []
        for x in lista:
            x = x.encode('iso-8859-1').decode('iso-8859-1').encode('utf-8')
            # maketrans needs both strings the same length: three chars -> three spaces
            row_aux.append(extract_data_between_html(
                x.translate(string.maketrans("\n\t\r", "   ")), ''))
        rows.append(row_aux)
    return rows


def escuela_primaria_secundaria_publica(sel, response):
    # Layout used by public primary/secondary schools.
    item = NewDataDownloaderItem()
    tabla_dato_gral = sel.xpath('//table[@id="FormView1_tbl_principal2"]/tr/td[1]/table/tr')
    item['primaria_pub_cual'] = tabla_cual(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@id="FormView1_tbl_principal2"]/tr/td[2]/table/tr[2]/td/table/tr')
    item['primaria_pub_donde'] = tabla_donde(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@id="FormView3"]/tr/td[1]/table/tr/td[1]/table/tr')
    item['primaria_pub_docentes'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    tabla_dato_gral = sel.xpath('//table[@id="FormView3"]/tr/td[1]/table/tr/td[2]/table/tr')
    item['primaria_pub_infraestructura'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    tabla_dato_gral = sel.xpath('//table[@id="FormView4_tblprincipal4"]/tr/td[1]/table/tr')
    item['primaria_pub_indicadores'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    tabla_dato_gral = sel.xpath('//table[@id="FormView4_tblprincipal4"]/tr/td[2]/table/tr')
    item['primaria_pub_estadistica'] = limpia_tablas(tabla_docentes(tabla_dato_gral))
    item['secundaria_primaria_publica'] = 1
    item['cct'] = response.url[response.url.find('vcct=') + 5:response.url.find('&')]
    item['ruta'] = response.url
    return item


def escuela_general(sel, response):
    # Fallback layout used by other school types.
    item = NewDataDownloaderItem()
    tabla_dato_gral = sel.xpath('//table[@id="FormView1"]/tr/td[1]/table/tr')
    item['primaria_pub_cual'] = tabla_docentes(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@id="FormView10"]/tr/td[1]/table/tr')
    item['primaria_pub_indicadores'] = tabla_docentes(tabla_dato_gral)
    tabla_dato_gral = sel.xpath('//table[@class="tbl_estadistica"]/tr')
    item['primaria_pub_estadistica'] = tabla_docentes(tabla_dato_gral)
    item['secundaria_primaria_publica'] = 0
    item['cct'] = response.url[response.url.find('vcct=') + 5:response.url.find('&')]
    item['ruta'] = response.url
    return item


class NewDataDownloaderSpider(CrawlSpider):
    name = 'snie'
    allowed_domains = ["snie.sep.gob.mx"]

    def __init__(self, *args, **kwargs):
        super(NewDataDownloaderSpider, self).__init__(*args, **kwargs)
        # Start URLs come from a CSV with one URL in the first column of each row.
        with open('/root/data/enlace-urls.txt', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            self.start_urls = [row[0] for row in reader]

    # No crawl rules are defined, so overriding parse() here makes the class
    # behave like a plain Spider.
    def parse(self, response):
        sel = Selector(response)
        items = []
        tabla_dato_gral = sel.xpath('//table[@id="FormView1_tbl_principal2"]/tr/td[1]/table/tr')
        if len(tabla_dato_gral) > 0:
            items.append(escuela_primaria_secundaria_publica(sel, response))
        else:
            tabla_dato_gral = sel.xpath('//table[@id="FormView1"]/tr/td[1]/table/tr')
            if len(tabla_dato_gral) > 0:
                items.append(escuela_general(sel, response))
            else:
                # Unrecognized layout: still record the CCT and URL so the
                # page can be revisited later.
                item = NewDataDownloaderItem()
                item['secundaria_primaria_publica'] = -1
                item['cct'] = response.url[response.url.find('vcct=') + 5:response.url.find('&')]
                item['ruta'] = response.url
                items.append(item)
        return items
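To run the spider from the project root (it reads its start URLs from the hard-coded CSV at /root/data/enlace-urls.txt, one URL in the first column of each row):

scrapy crawl snie

Each page yields one item, which MongoDBPipeline then writes to the snie/escuelas collection.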