This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setting overrides (robots.txt handling, log level, per-domain concurrency,
# retry count).
# NOTE(review): presumably meant to live as a class attribute on a Scrapy
# spider -- the enclosing class is not shown in this snippet; confirm.
custom_settings = {
    'ROBOTSTXT_OBEY': False,
    'LOG_LEVEL': 'INFO',
    'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
    'RETRY_TIMES': 5,
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
from urllib.parse import urlencode | |
from urllib.parse import urlparse | |
import json | |
from datetime import datetime | |
API_KEY = 'YOUR_KEY'  # placeholder value; substitute a real key before running


def get_url(url):
    """Return a ScraperAPI endpoint URL that wraps the target *url*.

    The API key, the target URL and the fixed ``autoparse`` / ``country_code``
    options are URL-encoded into the query string of
    ``http://api.scraperapi.com/``.

    Args:
        url: The URL that should be fetched through the proxy endpoint.

    Returns:
        The full proxy request URL as a string.
    """
    payload = {'api_key': API_KEY, 'url': url, 'autoparse': 'true', 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    # The snippet ended before this line, which left the function returning None.
    return proxy_url
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse(self, response):
    """Yield one result dict per entry in the response's organic results.

    ``response.text`` is decoded as JSON and its ``'organic_results'`` list is
    walked in order. ``response.meta['pos']`` is the position given to the
    first result; each following result gets the next integer. Every item
    produced from one response carries the same timestamp string.

    Yields:
        dict with keys ``title``, ``snippet``, ``link``, ``position``, ``date``.

    Raises:
        KeyError: if the payload has no ``'organic_results'`` key, or a result
            lacks ``title`` / ``snippet`` / ``link``.
    """
    data = json.loads(response.text)
    first_position = response.meta['pos']
    scraped_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for position, result in enumerate(data['organic_results'], start=first_position):
        # The snippet built this dict but never emitted it, so the callback
        # produced no items; yield it.
        yield {
            'title': result['title'],
            'snippet': result['snippet'],
            'link': result['link'],
            'position': position,
            'date': scraped_at,
        }
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def start_requests(self):
    """Yield one proxied Google search request for each hard-coded query.

    Each query is turned into a Google search URL, wrapped with ``get_url``,
    and scheduled with ``self.parse`` as the callback. ``meta['pos']`` starts
    at 0 for every request.
    """
    # Plain ASCII quotes: the snippet used typographic quotes (’ ‘), which
    # leave the first string literal unterminated and fail to parse.
    queries = ['scrapy', 'beautifulsoup']
    for query in queries:
        url = create_google_url(query)
        yield scrapy.Request(get_url(url), callback=self.parse, meta={'pos': 0})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_url(url):
    """Return a ScraperAPI endpoint URL that wraps the target *url*.

    The module-level ``API_KEY``, the target URL and the fixed ``autoparse`` /
    ``country_code`` options are URL-encoded into the query string of
    ``http://api.scraperapi.com/``.

    Args:
        url: The URL that should be fetched through the proxy endpoint.

    Returns:
        The full proxy request URL as a string.
    """
    payload = {'api_key': API_KEY, 'url': url, 'autoparse': 'true', 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlparse | |
from urllib.parse import urlencode | |
def create_google_url(query, site=''):
    """Build a Google search URL for *query*, optionally limited to one site.

    Args:
        query: The search terms.
        site: Optional URL or bare host name. When non-empty, its host part is
            sent as the ``as_sitesearch`` parameter.

    Returns:
        ``http://www.google.com/search?`` followed by the URL-encoded
        parameters ``q``, ``num`` (always 100) and, when *site* is given,
        ``as_sitesearch``.
    """
    google_dict = {'q': query, 'num': 100}
    if site:
        # urlparse() only fills netloc when the input has a scheme or a
        # leading '//'; retry with '//' so a bare host like 'example.com'
        # does not end up as an empty as_sitesearch value.
        web = urlparse(site).netloc or urlparse('//' + site).netloc
        google_dict['as_sitesearch'] = web
    return 'http://www.google.com/search?' + urlencode(google_dict)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class AmazonSpider(scrapy.Spider):
    """Skeleton spider for amazon.com; the ``parse`` callback is still a stub."""

    name = 'amazon'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/']

    def parse(self, response):
        """Placeholder callback: currently does nothing with *response*."""
        pass
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pipeline class whose process_item() cleans an item's field values in place.
# NOTE(review): this snippet has lost its indentation, carries trailing "| |"
# markers, and is cut off after the 'Rating' branch -- all of which look like
# page-extraction residue. Confirm against the original file before relying on it.
class TutorialPipeline: | |
# Walks every key/value pair of `item`; `spider` is not used in the visible part.
# NOTE(review): presumably Scrapy's item-pipeline hook -- confirm.
def process_item(self, item, spider): | |
for k, v in item.items(): | |
# Any falsy value is normalised to '' and the key needs no further handling.
if not v: | |
item[k] = '' # replace empty list or None with empty string | |
continue | |
# 'Title' values have surrounding whitespace stripped.
if k == 'Title': | |
item[k] = v.strip() | |
# The handling for 'Rating' is not visible here (snippet ends on this line).
elif k == 'Rating': |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_url(url):
    """Return a ScraperAPI endpoint URL that wraps the target *url*.

    The key held in ``API``, the target URL and a fixed ``country_code`` of
    ``'us'`` are URL-encoded into the query string of
    ``http://api.scraperapi.com/``.

    Args:
        url: The URL that should be fetched through the proxy endpoint.

    Returns:
        The full proxy request URL as a string.
    """
    # NOTE(review): `API` is not defined in this snippet -- presumably a
    # module-level key constant; confirm it exists where this is used.
    payload = {'api_key': API, 'url': url, 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def start_requests(self):
    """Skeleton: schedule a proxied request handled by ``parse_keyword_response``.

    The steps that produce ``url`` are elided in this snippet.
    """
    ...
    # (further steps elided in the original snippet; the Unicode ellipsis
    # character that stood here is not valid Python, so it is now a comment)
    yield scrapy.Request(url=get_url(url), callback=self.parse_keyword_response)
def parse_keyword_response(self, response):
    """Skeleton: schedule a proxied product-page request.

    The request is handled by ``self.parse_product_page`` and carries ``asin``
    in its ``meta``. The steps that produce ``product_url`` and ``asin`` are
    elided in this snippet.
    """
    ...
    # (further steps elided in the original snippet; the Unicode ellipsis
    # character that stood here is not valid Python, so it is now a comment)
    yield scrapy.Request(url=get_url(product_url), callback=self.parse_product_page, meta={'asin': asin})
    ...
NewerOlder