custom_settings = {'ROBOTSTXT_OBEY': False, 'LOG_LEVEL': 'INFO',
                   'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
                   'RETRY_TIMES': 5}
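For context, custom_settings is a class attribute on a Scrapy spider, so the overrides above apply only to that spider. A minimal sketch of where they would live (the GoogleSpider name is hypothetical, used here only for illustration):

import scrapy

class GoogleSpider(scrapy.Spider):  # hypothetical spider name for illustration
    name = 'google'
    # per-spider overrides of the project-wide values in settings.py
    custom_settings = {'ROBOTSTXT_OBEY': False, 'LOG_LEVEL': 'INFO',
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
                       'RETRY_TIMES': 5}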
import scrapy
from urllib.parse import urlencode
from urllib.parse import urlparse
import json
from datetime import datetime

API_KEY = 'YOUR_KEY'

def get_url(url):
    payload = {'api_key': API_KEY, 'url': url, 'autoparse': 'true', 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
def parse(self, response):
    di = json.loads(response.text)
    pos = response.meta['pos']
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for result in di['organic_results']:
        title = result['title']
        snippet = result['snippet']
        link = result['link']
        item = {'title': title, 'snippet': snippet, 'link': link, 'position': pos, 'date': dt}
        pos += 1
        yield item  # emit one record per organic result
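For reference, the loop above only relies on the keys it indexes; a purely illustrative sketch of the autoparse JSON shape it assumes (field names taken from the code, values invented):

# Illustrative only: real ScraperAPI autoparse responses carry many more fields.
sample_payload = {
    'organic_results': [
        {'title': 'Example result title',
         'snippet': 'Example result snippet ...',
         'link': 'https://example.com/'},
    ]
}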
def start_requests(self):
    queries = ['scrapy', 'beautifulsoup']
    for query in queries:
        url = create_google_url(query)
        yield scrapy.Request(get_url(url), callback=self.parse, meta={'pos': 0})
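A minimal sketch of running the spider from a script and exporting the scraped items, assuming the hypothetical GoogleSpider class sketched earlier; the scrapy crawl CLI with -o does the same job:

from scrapy.crawler import CrawlerProcess

# FEEDS is Scrapy's built-in feed-export setting; items land in results.json
process = CrawlerProcess(settings={'FEEDS': {'results.json': {'format': 'json'}}})
process.crawl(GoogleSpider)   # hypothetical spider class from the sketch above
process.start()               # blocks until the crawl finishes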
def get_url(url):
    payload = {'api_key': API_KEY, 'url': url, 'autoparse': 'true', 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
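As a quick check, the helper simply wraps the target URL into a ScraperAPI request URL; a hedged example of what it returns (query-string order follows the payload dict, and the key is the placeholder from above):

print(get_url('http://www.google.com/search?q=scrapy'))
# -> http://api.scraperapi.com/?api_key=YOUR_KEY&url=http%3A%2F%2Fwww.google.com%2Fsearch%3Fq%3Dscrapy&autoparse=true&country_code=us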
from urllib.parse import urlparse
from urllib.parse import urlencode

def create_google_url(query, site=''):
    google_dict = {'q': query, 'num': 100, }
    if site:
        web = urlparse(site).netloc
        google_dict['as_sitesearch'] = web
        return 'http://www.google.com/search?' + urlencode(google_dict)
    return 'http://www.google.com/search?' + urlencode(google_dict)
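For illustration, here is what the helper produces with and without a site filter (the inputs are hypothetical):

print(create_google_url('scrapy tutorial'))
# -> http://www.google.com/search?q=scrapy+tutorial&num=100
print(create_google_url('scrapy tutorial', site='https://www.scraperapi.com/blog/'))
# -> http://www.google.com/search?q=scrapy+tutorial&num=100&as_sitesearch=www.scraperapi.com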
import scrapy

class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/']

    def parse(self, response):
        pass
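This is the kind of empty skeleton that scrapy genspider amazon amazon.com generates. A purely illustrative way to fill in the stub so it yields something (the selector is a placeholder, not the tutorial's real extraction logic):

import scrapy

class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/']

    def parse(self, response):
        # placeholder extraction for illustration; real selectors depend on the page scraped
        yield {'url': response.url, 'title': response.css('title::text').get()}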
class TutorialPipeline:
    def process_item(self, item, spider):
        for k, v in item.items():
            if not v:
                item[k] = ''  # replace empty list or None with empty string
                continue
            if k == 'Title':
                item[k] = v.strip()
            elif k == 'Rating':
                item[k] = v.replace(' out of 5 stars', '')  # assumed cleanup: keep just the numeric rating
        return item
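For the pipeline to run, it has to be registered in the project's settings.py. A minimal sketch, assuming the Scrapy project is named tutorial (adjust the dotted path to your own project):

# settings.py: lower numbers run earlier when several pipelines are enabled
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}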
def get_url(url):
    payload = {'api_key': API, 'url': url, 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
def start_requests(self):
    ...
    yield scrapy.Request(url=get_url(url), callback=self.parse_keyword_response)

def parse_keyword_response(self, response):
    ...
    yield scrapy.Request(url=get_url(product_url), callback=self.parse_product_page, meta={'asin': asin})
    ...
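A sketch (hypothetical continuation, only the callback name and meta key come from the snippet above) of the product-page callback reading the ASIN passed through meta:

def parse_product_page(self, response):
    asin = response.meta['asin']   # the ASIN attached in parse_keyword_response
    # placeholder fields for illustration; real parsing extracts the product details
    yield {'asin': asin, 'url': response.url}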