Last active
January 8, 2021 10:03
-
-
Save PKartavkin/75fa29138cb5c74a159f2ccacd73d35d to your computer and use it in GitHub Desktop.
Proxy scraper for https://free-proxy-list.net
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script retrieves all proxy addresses from https://free-proxy-list.net (all pages). | |
Proxy list can be used for ip rotation to get around bot protection. | |
""" | |
from selenium import webdriver | |
from bs4 import BeautifulSoup | |
class Proxy:
    """A single proxy server entry scraped from free-proxy-list.net."""

    def __init__(self, ip, code, type, https):
        # NOTE: parameter name `type` shadows the builtin, but is kept
        # unchanged for backward compatibility with existing callers.
        self.ip = ip        # "host:port" string
        self.code = code    # country-code table column
        self.type = type    # anonymity-level table column
        self.https = https  # HTTPS-support table column

    def __repr__(self):
        # Debug-friendly representation showing all four fields.
        return (f"{type(self).__name__}(ip={self.ip!r}, code={self.code!r}, "
                f"type={self.type!r}, https={self.https!r})")
class ProxyScrapper:
    """Scrapes every page of https://free-proxy-list.net with headless Chrome."""

    def init(self):
        """Start a headless Chrome driver (images disabled to speed up loads)."""
        chrome_options = webdriver.ChromeOptions()
        # Disable image loading and shrink the disk cache: only the HTML matters.
        prefs = {'profile.managed_default_content_settings.images': 2, 'disk-cache-size': 4096}
        chrome_options.add_experimental_option('prefs', prefs)
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(executable_path='drivers/chromedriver', options=chrome_options)
        self.driver.implicitly_wait(10)

    def tear_down(self):
        """Shut down the browser and end the WebDriver session."""
        # quit() (not close()) also terminates the chromedriver process;
        # close() only closes the window and leaks the driver process.
        self.driver.quit()

    def _is_next_disabled(self):
        """Return True when the paginator's "Next" button is inactive (last page)."""
        next_container = self.driver.find_element_by_css_selector('#proxylisttable_next')
        return 'disabled' in next_container.get_attribute("class")

    def _parse_current_page(self, proxy_list):
        """Append one Proxy per table row on the currently loaded page."""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        rows = soup.findAll('tr', {'class': 'odd'}) + soup.findAll('tr', {'class': 'even'})
        for row in rows:
            cells = row.findAll('td')
            # Columns: 0=IP, 1=port, 2=country code, 4=anonymity, 6=HTTPS.
            # Use .string so Proxy fields hold text, not bs4 Tag objects.
            proxy_list.append(Proxy(cells[0].string + ':' + cells[1].string,
                                    cells[2].string, cells[4].string, cells[6].string))

    def scrape_proxies(self):
        """Walk every page of the proxy table and return a list of Proxy objects."""
        self.driver.get('https://free-proxy-list.net')
        proxy_list = []
        while True:
            # Parse BEFORE checking the button so the final page is included
            # (the original loop exited without parsing the last page).
            self._parse_current_page(proxy_list)
            if self._is_next_disabled():
                break
            self.driver.find_element_by_css_selector('#proxylisttable_next>a').click()
        return proxy_list

    def get_proxy_list(self):
        """Scrape proxies with a fresh driver; the driver is always torn down."""
        self.init()
        result = []
        try:
            result = self.scrape_proxies()
        finally:
            self.tear_down()
        return result
if __name__ == '__main__':
    # Guarded entry point: run the scraper and print one "ip:port" per line,
    # without launching a browser as a side effect of a mere import.
    proxy_scrapper = ProxyScrapper()
    for proxy in proxy_scrapper.get_proxy_list():
        print(proxy.ip)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment