Last active
May 22, 2022 21:40
-
-
Save squidfunk/bbb5e1195d9773e6a7a7 to your computer and use it in GitHub Desktop.
A web scraper to obtain lists of anonymous proxies for web scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
* Copyright (c) 2016 Martin Donath
*
* All rights reserved. No part of this computer program(s) may be used,
* reproduced, stored in any retrieval system, or transmitted, in any form or
* by any means, electronic, mechanical, photocopying, recording, or otherwise
* without prior written permission.
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* ----------------------------------------------------------------------------
* Disclaimer
* ----------------------------------------------------------------------------
*
* This script is for demonstration purposes only, not for actual use. It
* demonstrates how easy it is to scrape something with artoo.js, even though
* the corresponding HTML might be obfuscated. Web scraping is a grey area.
*
* ----------------------------------------------------------------------------
* Usage
* ----------------------------------------------------------------------------
*
* 1. Visit https://medialab.github.io/artoo/
* 2. Install the artoo.js bookmarklet following the instructions
* 3. Visit http://proxylist.hidemyass.com/
* 4. Click the bookmarklet to activate artoo.js (watch console)
* 5. Open Chrome Inspector > Sources > Snippets and create a new snippet
* 6. Run the snippet and watch the console
*
* artoo.js also has a server-side scraping companion called sandcrawler:
* http://medialab.github.io/sandcrawler/
*/
/* ---------------------------------------------------------------------------- | |
* Scraping | |
* ------------------------------------------------------------------------- */ | |
/* Scrape proxy list */ | |
var raw = artoo.scrape('#listable tbody tr', { | |
/* Unique identifier */ | |
id: 'rel', | |
/* Timestamp of last check */ | |
date: { | |
sel: 'td:nth-child(1)', | |
attr: 'rel' | |
}, | |
/* IP address */ | |
ip: { | |
sel: 'td:nth-child(2)', | |
method: function($) { | |
$(this).find('span :not(style)').each(function(index, part) { | |
if (!part.offsetParent) /* Hack: invisible == null */ | |
part.remove(); | |
}); | |
$(this).find('span style').remove(); | |
return $(this).text(); | |
} | |
}, | |
/* Port number */ | |
port: { | |
sel: 'td:nth-child(3)' | |
}, | |
/* Country of origin */ | |
country: { | |
sel: 'td:nth-child(4)', | |
attr: 'rel' | |
}, | |
/* Relative speed */ | |
speed: { | |
sel: 'td:nth-child(5) .indicator', | |
method: function($) { | |
var matches = $(this).attr('style').match('width\: ?([0-9]+)%'); | |
return matches.length > 1 ? matches[1] : '0'; | |
} | |
}, | |
/* Relative connection time */ | |
connection: { | |
sel: 'td:nth-child(6) .indicator', | |
method: function($) { | |
var matches = $(this).attr('style').match('width\: ?([0-9]+)%'); | |
return matches.length > 1 ? matches[1] : '0'; | |
} | |
}, | |
/* Connection type */ | |
type: { | |
sel: 'td:nth-child(7)' | |
}, | |
/* Anonymity level */ | |
anonymity: { | |
sel: 'td:nth-child(8)' | |
} | |
}); | |
/* ---------------------------------------------------------------------------- | |
* Processing | |
* ------------------------------------------------------------------------- */ | |
/* Map levels of anonymity to value (via index) */ | |
var anonymity = ['None', 'Low', 'Medium', 'High', 'High +KA']; | |
/* Finalize proxy list */ | |
var proxies = raw.reduce(function(proxies, proxy) { | |
proxies[proxy.id] = { | |
server: { | |
ip: proxy.ip, | |
port: parseInt(proxy.port), | |
type: proxy.type | |
}, | |
rating: { | |
speed: parseInt(proxy.speed), | |
connection: parseInt(proxy.connection), | |
anonymity: anonymity.indexOf(proxy.anonymity) * 25 | |
}, | |
country: proxy.country, | |
date: parseInt(proxy.date) | |
} | |
return proxies; | |
}, {}); | |
/* Save prettified JSON or log to console */ | |
// artoo.savePrettyJson(proxies); | |
console.log(proxies); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment