Downloads PDFs from the SEP (Secretaría de Educación Pública): a small Scrapy project consisting of an item definition, a MongoDB pipeline, and a crawl spider.
# sep_pdf/items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field


class SepPdfItem(Item):
    url = Field()      # page the links were found on
    titulo = Field()   # title of a single PDF (declared but unused below)
    archivo = Field()  # list of [title, link] pairs scraped from the page
# sep_pdf/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        # MongoClient replaces the deprecated pymongo.Connection.
        connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                         settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # Here we only check that no field is empty, but we could do any
        # validation we want. Note that iterating an Item yields field
        # names, so the value has to be looked up explicitly.
        for field in item:
            if not item.get(field):
                raise DropItem("Missing %s in item from %s" % (field, item['url']))
        self.collection.insert(dict(item))
        log.msg("Item written to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item
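The MONGODB_* values above come from the project settings. A minimal settings.py sketch that wires the pipeline in; the server, port, database, and collection values are assumptions, not taken from the gist:

# sep_pdf/settings.py (sketch; MONGODB_* values are placeholders)
BOT_NAME = 'sep_pdf'
SPIDER_MODULES = ['sep_pdf.spiders']

ITEM_PIPELINES = {
    'sep_pdf.pipelines.MongoDBPipeline': 300,
}

MONGODB_SERVER = 'localhost'  # assumed: local MongoDB instance
MONGODB_PORT = 27017          # assumed: default MongoDB port
MONGODB_DB = 'sep'            # hypothetical database name
MONGODB_COLLECTION = 'pdfs'   # hypothetical collection name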
# sep_pdf/spiders/sep_spider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from sep_pdf.items import SepPdfItem


class SepSpider(CrawlSpider):
    name = 'sep_crawler'
    allowed_domains = ['sep.gob.mx']
    start_urls = ['http://sep.gob.mx/es/sep1/Fondo_de_Aportaciones_para_la_educacion_Basica_y_Normal_FAEB_#.UxkLTNtIqY4']
    rules = [Rule(SgmlLinkExtractor(allow=[r'/es/sep1/[a-zA-Z_]+_4']),
                  callback='parse_pdf')]

    def parse_pdf(self, response):
        sel = Selector(response)
        item = SepPdfItem()
        item['url'] = response.url
        archivos = []
        # Each <p> inside div#content may wrap an anchor pointing to a
        # PDF; collect its text (title) and href when both are present.
        for site in sel.xpath('//div[@id="content"]/p'):
            title = site.xpath('a/text()').extract()
            link = site.xpath('a/@href').extract()
            if title and link:
                archivos.append([title[0], link[0]])
        item['archivo'] = archivos
        return item
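With the project in place and a reachable MongoDB instance, the spider is run by name with Scrapy's CLI; the optional feed-export flags shown here additionally dump the items to a local JSON file:

scrapy crawl sep_crawler -o archivos.json -t json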