custom_settings = {
    'ROBOTSTXT_OBEY': False,
    'LOG_LEVEL': 'INFO',
    'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
    'RETRY_TIMES': 5,
}
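# Note: custom_settings only takes effect as a class attribute on the spider,
# not at module level. A minimal sketch (the class name GoogleSpider is an
# assumption, inferred from the Google-search callbacks below):
#
# class GoogleSpider(scrapy.Spider):
#     name = 'google'
#     custom_settings = {'ROBOTSTXT_OBEY': False, ...}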
import scrapy
from urllib.parse import urlencode
from urllib.parse import urlparse
import json
from datetime import datetime

API_KEY = 'YOUR_KEY'


def get_url(url):
    # Wrap the target URL in a ScraperAPI proxy request
    payload = {'api_key': API_KEY, 'url': url, 'autoparse': 'true', 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
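# For example, get_url('http://www.google.com/search?q=scrapy&num=100')
# returns roughly:
#   'http://api.scraperapi.com/?api_key=YOUR_KEY&url=http%3A%2F%2Fwww.google.com%2Fsearch%3Fq%3Dscrapy%26num%3D100&autoparse=true&country_code=us'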
def parse(self, response):
    # With autoparse enabled, ScraperAPI returns the Google results as JSON
    di = json.loads(response.text)
    pos = response.meta['pos']
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for result in di['organic_results']:
        title = result['title']
        snippet = result['snippet']
        link = result['link']
        item = {'title': title, 'snippet': snippet, 'link': link, 'position': pos, 'date': dt}
        pos += 1
        yield item  # emit the scraped item (missing in the original snippet)
def start_requests(self):
    queries = ['scrapy', 'beautifulsoup']
    for query in queries:
        url = create_google_url(query)
        yield scrapy.Request(get_url(url), callback=self.parse, meta={'pos': 0})
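# Flow sketch: each query becomes a Google search URL (create_google_url,
# defined below), which is then wrapped in the ScraperAPI proxy URL;
# meta={'pos': 0} seeds the running result position that parse increments.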
def create_google_url(query, site=''):
    # Build a Google search URL; optionally restrict results to a single site
    google_dict = {'q': query, 'num': 100}
    if site:
        web = urlparse(site).netloc
        google_dict['as_sitesearch'] = web
    return 'http://www.google.com/search?' + urlencode(google_dict)
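# Example outputs (dict insertion order is preserved, so urlencode emits
# q, num, then as_sitesearch):
#   create_google_url('scrapy')
#     -> 'http://www.google.com/search?q=scrapy&num=100'
#   create_google_url('scrapy', site='https://stackoverflow.com')
#     -> 'http://www.google.com/search?q=scrapy&num=100&as_sitesearch=stackoverflow.com'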
import scrapy


class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/']

    def parse(self, response):
        pass
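# The skeleton above matches what `scrapy genspider amazon amazon.com`
# generates; its parse stub is presumably replaced by the callbacks shown
# further below (parse_keyword_response and parse_product_page).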
class TutorialPipeline:
    def process_item(self, item, spider):
        for k, v in item.items():
            if not v:
                item[k] = ''  # replace empty list or None with empty string
                continue
            if k == 'Title':
                item[k] = v.strip()
            elif k == 'Rating':
                # Truncated in the original snippet; presumably the rating
                # string is normalized in this branch.
                pass
        return item  # pipelines must return the item (or raise DropItem)
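# To activate the pipeline, register it in settings.py (or in the spider's
# custom_settings). The module path 'tutorial.pipelines' below is an
# assumption based on the class name:
#
# ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 300}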
def get_url(url):
    # Same ScraperAPI wrapper for the Amazon spider, without autoparse;
    # API holds the ScraperAPI key (called API_KEY in the Google spider above)
    payload = {'api_key': API, 'url': url, 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
def start_requests(self):
    ...
    yield scrapy.Request(url=get_url(url), callback=self.parse_keyword_response)

def parse_keyword_response(self, response):
    ...
    yield scrapy.Request(url=get_url(product_url), callback=self.parse_product_page,
                         meta={'asin': asin})
...
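# With the project set up, a run like the following crawls and exports the
# scraped items (the output filename is arbitrary):
#
#   scrapy crawl amazon -o products.json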