custom_settings = {
    'ROBOTSTXT_OBEY': False,
    'LOG_LEVEL': 'INFO',
    'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
    'RETRY_TIMES': 5,
}
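# Note: custom_settings only takes effect as a class attribute on the spider,
# not at module level. A minimal sketch (the class name GoogleSpider is an
# assumption, inferred from the Google-search callbacks below):
#
# class GoogleSpider(scrapy.Spider):
#     name = 'google'
#     custom_settings = {'ROBOTSTXT_OBEY': False, ...}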
import scrapy
from urllib.parse import urlencode
from urllib.parse import urlparse
import json
from datetime import datetime

API_KEY = 'YOUR_KEY'


def get_url(url):
    # Wrap the target URL in a ScraperAPI proxy request
    payload = {'api_key': API_KEY, 'url': url, 'autoparse': 'true', 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
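# For example, get_url('http://www.google.com/search?q=scrapy&num=100')
# returns roughly:
#   'http://api.scraperapi.com/?api_key=YOUR_KEY&url=http%3A%2F%2Fwww.google.com%2Fsearch%3Fq%3Dscrapy%26num%3D100&autoparse=true&country_code=us'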
def parse(self, response):
    # With autoparse enabled, ScraperAPI returns the Google results as JSON
    di = json.loads(response.text)
    pos = response.meta['pos']
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for result in di['organic_results']:
        title = result['title']
        snippet = result['snippet']
        link = result['link']
        item = {'title': title, 'snippet': snippet, 'link': link, 'position': pos, 'date': dt}
        pos += 1
        yield item  # emit the scraped item (missing in the original snippet)
def start_requests(self):
    queries = ['scrapy', 'beautifulsoup']
    for query in queries:
        url = create_google_url(query)
        yield scrapy.Request(get_url(url), callback=self.parse, meta={'pos': 0})
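# Flow sketch: each query becomes a Google search URL (create_google_url,
# defined below), which is then wrapped in the ScraperAPI proxy URL;
# meta={'pos': 0} seeds the running result position that parse increments.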
def create_google_url(query, site=''):
    # Build a Google search URL; optionally restrict results to a single site
    google_dict = {'q': query, 'num': 100}
    if site:
        web = urlparse(site).netloc
        google_dict['as_sitesearch'] = web
    return 'http://www.google.com/search?' + urlencode(google_dict)
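# Example outputs (dict insertion order is preserved, so urlencode emits
# q, num, then as_sitesearch):
#   create_google_url('scrapy')
#     -> 'http://www.google.com/search?q=scrapy&num=100'
#   create_google_url('scrapy', site='https://stackoverflow.com')
#     -> 'http://www.google.com/search?q=scrapy&num=100&as_sitesearch=stackoverflow.com'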
import scrapy


class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/']

    def parse(self, response):
        pass
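# The skeleton above matches what `scrapy genspider amazon amazon.com`
# generates; its parse stub is presumably replaced by the callbacks shown
# further below (parse_keyword_response and parse_product_page).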
class TutorialPipeline:
    def process_item(self, item, spider):
        for k, v in item.items():
            if not v:
                item[k] = ''  # replace empty list or None with empty string
                continue
            if k == 'Title':
                item[k] = v.strip()
            elif k == 'Rating':
                # Truncated in the original snippet; presumably the rating
                # string is normalized in this branch.
                pass
        return item  # pipelines must return the item (or raise DropItem)
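# To activate the pipeline, register it in settings.py (or in the spider's
# custom_settings). The module path 'tutorial.pipelines' below is an
# assumption based on the class name:
#
# ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 300}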
def get_url(url):
    # Same ScraperAPI wrapper for the Amazon spider, without autoparse;
    # API holds the ScraperAPI key (called API_KEY in the Google spider above)
    payload = {'api_key': API, 'url': url, 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
def start_requests(self):
    ...
    yield scrapy.Request(url=get_url(url), callback=self.parse_keyword_response)

def parse_keyword_response(self, response):
    ...
    yield scrapy.Request(url=get_url(product_url), callback=self.parse_product_page,
                         meta={'asin': asin})
...
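# With the project set up, a run like the following crawls and exports the
# scraped items (the output filename is arbitrary):
#
#   scrapy crawl amazon -o products.json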