Skip to content

Instantly share code, notes, and snippets.

@PKartavkin
Last active January 8, 2021 10:03
Show Gist options
  • Save PKartavkin/75fa29138cb5c74a159f2ccacd73d35d to your computer and use it in GitHub Desktop.
Proxy scraper for https://free-proxy-list.net
"""
This script retrieves all proxy addresses from https://free-proxy-list.net (all pages).
Proxy list can be used for ip rotation to get around bot protection.
"""
from selenium import webdriver
from bs4 import BeautifulSoup
class Proxy:
    """A single proxy entry scraped from free-proxy-list.net.

    Attributes:
        ip: "host:port" string of the proxy endpoint.
        code: country-code column of the listing.
        type: anonymity-level column (parameter name kept for backward
            compatibility even though it shadows the builtin).
        https: HTTPS-support column of the listing.
    """

    def __init__(self, ip, code, type, https):
        self.ip = ip
        self.code = code
        self.type = type
        self.https = https

    def __repr__(self):
        # Debug-friendly representation; original class had none.
        return (f"{self.__class__.__name__}(ip={self.ip!r}, code={self.code!r}, "
                f"type={self.type!r}, https={self.https!r})")
class ProxyScrapper:
    """Scrapes every listing page of https://free-proxy-list.net using headless Chrome."""

    def init(self):
        """Start a headless Chrome driver with images and most caching disabled.

        Images are skipped (pref value 2 = block) because only the HTML table
        is needed, which speeds up page loads considerably.
        """
        chromeOptions = webdriver.ChromeOptions()
        prefs = {'profile.managed_default_content_settings.images': 2, 'disk-cache-size': 4096}
        chromeOptions.add_experimental_option('prefs', prefs)
        chromeOptions.add_argument("--headless")
        self.driver = webdriver.Chrome(executable_path='drivers/chromedriver', options=chromeOptions)
        self.driver.implicitly_wait(10)

    def tear_down(self):
        """Shut the browser down completely.

        Uses quit() rather than close(): close() only closes the current
        window and leaves the chromedriver process running.
        """
        self.driver.quit()

    def _is_next_disabled(self):
        """Return True when the pager's 'Next' control is disabled (last page reached)."""
        pager = self.driver.find_element_by_css_selector('#proxylisttable_next')
        return 'disabled' in pager.get_attribute("class")

    def _parse_current_page(self):
        """Parse the proxy table rows of the currently loaded page into Proxy objects."""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        rows = soup.findAll('tr', {'class': 'odd'}) + soup.findAll('tr', {'class': 'even'})
        proxies = []
        for row in rows:
            data = row.findAll('td')
            # Columns: 0=IP, 1=port, 2=country code, 4=anonymity, 6=https.
            # Extract cell text for every column — the original passed raw
            # bs4 Tag objects for columns 2/4/6, which breaks any downstream
            # string comparison on Proxy.code/type/https.
            proxies.append(Proxy(data[0].string + ':' + data[1].string,
                                 data[2].string, data[4].string, data[6].string))
        return proxies

    def scrape_proxies(self):
        """Walk all listing pages and return the accumulated list of Proxy objects.

        Each page is scraped BEFORE the 'next disabled' check, so the last
        page — and a single-page listing — is included. The original checked
        first and therefore always skipped the final page.
        """
        home_url = 'https://free-proxy-list.net'
        self.driver.get(home_url)
        proxy_list = []
        while True:
            proxy_list.extend(self._parse_current_page())
            if self._is_next_disabled():
                break
            self.driver.find_element_by_css_selector('#proxylisttable_next>a').click()
        return proxy_list

    def get_proxy_list(self):
        """Entry point: start the browser, scrape all pages, always tear down.

        Returns an empty list if scraping raised before producing results;
        the browser is shut down in all cases via the finally block.
        """
        self.init()
        result = []
        try:
            result = self.scrape_proxies()
        finally:
            self.tear_down()
        return result
def main():
    """Scrape free-proxy-list.net and print each proxy's ip:port, one per line."""
    proxy_scrapper = ProxyScrapper()
    for proxy in proxy_scrapper.get_proxy_list():
        print(proxy.ip)


# Guard the entry point so importing this module does not launch a browser.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment