Downloads PDFs from the SEP (Secretaría de Educación Pública): a small Scrapy project consisting of an item definition, a MongoDB pipeline, and a crawl spider.
# sep_pdf/items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field


class SepPdfItem(Item):
    url = Field()      # page the links were found on
    titulo = Field()   # title of a single PDF (declared but unused below)
    archivo = Field()  # list of [title, link] pairs scraped from the page
# sep_pdf/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        # MongoClient replaces the deprecated pymongo.Connection.
        connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                         settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # Here we only check that no field is empty, but we could do any
        # validation we want. Note that iterating an Item yields field
        # names, so the value has to be looked up explicitly.
        for field in item:
            if not item.get(field):
                raise DropItem("Missing %s in item from %s" % (field, item['url']))
        self.collection.insert(dict(item))
        log.msg("Item written to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item
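The MONGODB_* values above come from the project settings. A minimal settings.py sketch that wires the pipeline in; the server, port, database, and collection values are assumptions, not taken from the gist:

# sep_pdf/settings.py (sketch; MONGODB_* values are placeholders)
BOT_NAME = 'sep_pdf'
SPIDER_MODULES = ['sep_pdf.spiders']

ITEM_PIPELINES = {
    'sep_pdf.pipelines.MongoDBPipeline': 300,
}

MONGODB_SERVER = 'localhost'  # assumed: local MongoDB instance
MONGODB_PORT = 27017          # assumed: default MongoDB port
MONGODB_DB = 'sep'            # hypothetical database name
MONGODB_COLLECTION = 'pdfs'   # hypothetical collection name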
# sep_pdf/spiders/sep_spider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from sep_pdf.items import SepPdfItem


class SepSpider(CrawlSpider):
    name = 'sep_crawler'
    allowed_domains = ['sep.gob.mx']
    start_urls = ['http://sep.gob.mx/es/sep1/Fondo_de_Aportaciones_para_la_educacion_Basica_y_Normal_FAEB_#.UxkLTNtIqY4']
    rules = [Rule(SgmlLinkExtractor(allow=[r'/es/sep1/[a-zA-Z_]+_4']),
                  callback='parse_pdf')]

    def parse_pdf(self, response):
        sel = Selector(response)
        item = SepPdfItem()
        item['url'] = response.url
        archivos = []
        # Each <p> inside div#content may wrap an anchor pointing to a
        # PDF; collect its text (title) and href when both are present.
        for site in sel.xpath('//div[@id="content"]/p'):
            title = site.xpath('a/text()').extract()
            link = site.xpath('a/@href').extract()
            if title and link:
                archivos.append([title[0], link[0]])
        item['archivo'] = archivos
        return item
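With the project in place and a reachable MongoDB instance, the spider is run by name with Scrapy's CLI; the optional feed-export flags shown here additionally dump the items to a local JSON file:

scrapy crawl sep_crawler -o archivos.json -t json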