Created
April 18, 2019 22:06
-
-
Save englehardt/53b7b585ef38067540b4a547363b81df to your computer and use it in GitHub Desktop.
Generate a list of safebrowsing hashes from the raw Disconnect list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
import hashlib | |
import json | |
import re | |
import urllib2 | |
from trackingprotection_tools import DisconnectParser | |
# Disconnect list categories whose domains main() hashes.
TRACKER_CATEGORIES = [
    'Advertising', 'Analytics', 'Social', 'Content', 'Disconnect'
]

# URLs handed to DisconnectParser as blocklist_url / disconnect_mapping_url.
BLOCKLIST = 'https://raw.githubusercontent.com/mozilla-services/shavar-prod-lists/master/disconnect-blacklist.json'  # noqa
MAPPING = 'https://raw.githubusercontent.com/mozilla-services/shavar-list-creation/master/disconnect_mapping.json'  # noqa

# Known-answer pairs (domain -> expected base64 SHA-256 hash).  main()
# compares the hash it computes for any of these domains against the value
# stored here.
TEST_SET = {
    'google.ee': 'BIjZakI3ChrLy6S7OTUampyXO+WUJFp4dYEfvXDQTtc=',
    'adservice.google.com': 'ORbzeOvsHcz/DEuwyZXenmorqa79AoIdfjEXXKIxKIY=',
    'doubleclick.net': 'uXNT1PzjAVau8b402OMAIGDejKbiXfQX5iXvPASfO/s=',
    'youtube.com': 'LvOZqM9U3cK9V1r05/4lr38ecDvgztKSGdyzL4bvE8c=',
}
# From https://github.com/mozilla-services/shavar-list-creation/blob/93924188fee1c2b708217154524c90ed40d568f3/lists2safebrowsing.py#L154 # noqa
# Splits an already-unescaped URL into hostname (group 1) and path (group 2):
#   (scheme://)(username(:password)@)hostname(:port)(/...)
_URL_COMPONENTS_RE = re.compile(
    r"^(?:[a-z]+\:\/\/)?(?:[a-z]+(?:\:[a-z0-9]+)?@)?([^\/^\?^\:]+)(?:\:[0-9]+)?(\/(.*)|$)"  # noqa
)


def canonicalize(d):
    """Canonicalize a URL or bare domain into ``host/path`` form.

    Steps, in order: strip tab/CR/LF characters, drop any ``#fragment``,
    percent-unescape repeatedly until the string stops changing, extract
    the hostname and path, normalise the hostname (trim leading/trailing
    dots, collapse runs of dots, lowercase) and percent-escape every path
    character that is <= ASCII 32, >= ASCII 127, ``#`` or ``%``.

    Args:
        d: The URL or domain string to canonicalize.

    Returns:
        ``host + "/" + path`` without scheme, credentials or port.  A falsy
        ``d`` (``None`` or ``""``) is returned unchanged.

    Raises:
        ValueError: If no hostname can be extracted, e.g. the input holds
            only a fragment or the host is followed directly by ``?``.
    """
    if not d:
        return d
    original = d

    # remove tab (0x09), CR (0x0d), LF (0x0a)
    d = re.sub(r"[\t\r\n]", "", d)

    # remove any URL fragment
    d = d.partition("#")[0]

    # repeatedly unescape until no more hex encodings
    while True:
        unescaped = urllib2.unquote(d)
        # if decoding had no effect, stop
        if unescaped == d:
            break
        d = unescaped

    # extract hostname and path
    # TODO?: use urlparse ?
    url_components = _URL_COMPONENTS_RE.match(d)
    if url_components is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError(
            "cannot extract a hostname from %r" % (original,)
        )
    host = url_components.group(1)
    path = (url_components.group(2) or "").lstrip("/")

    # remove leading and trailing dots (kept as a regex so that `$` keeps
    # its exact end-of-string semantics)
    host = re.sub(r"^\.+|\.+$", "", host)
    # replace consecutive dots with a single dot
    host = re.sub(r"\.+", ".", host)
    # lowercase the whole thing
    host = host.lower()

    # percent-escape any characters <= ASCII 32, >= 127, or '#' or '%'
    escaped_path = "".join(
        urllib2.quote(ch)
        if (ord(ch) <= 32 or ord(ch) >= 127 or ch in "#%")
        else ch
        for ch in path
    )

    # Note: we do NOT append the scheme
    # because safebrowsing lookups ignore it
    return host + "/" + escaped_path
def get_safebrowsing_hash(domain):
    """Return the base64-encoded SHA-256 digest of the canonicalized domain.

    Args:
        domain: The URL or domain string; it is passed through
            ``canonicalize`` before hashing.

    Returns:
        The base64 text of the SHA-256 digest, as a native ``str``.
    """
    canonical = canonicalize(domain)
    # hashlib needs bytes.  type(u"") is `unicode` on Python 2 and `str` on
    # Python 3, so text is encoded explicitly instead of relying on an
    # implicit ASCII conversion; byte strings pass through untouched.
    if isinstance(canonical, type(u"")):
        canonical = canonical.encode("utf-8")
    encoded = base64.b64encode(hashlib.sha256(canonical).digest())
    # b64encode yields bytes on Python 3; callers compare against and
    # JSON-serialise text, so hand back a native string.
    if not isinstance(encoded, str):
        encoded = encoded.decode("ascii")
    return encoded
def main():
    """Hash every tracker domain and write the result to ``output.json``.

    Builds a ``DisconnectParser`` from ``BLOCKLIST`` and ``MAPPING``, asks
    it for the domains in ``TRACKER_CATEGORIES``, computes the hash of each
    one with ``get_safebrowsing_hash`` and dumps the ``{domain: hash}``
    mapping as JSON to ``output.json`` in the current directory.

    Raises:
        AssertionError: If the hash computed for a domain listed in
            ``TEST_SET`` differs from the expected value stored there.
    """
    dc = DisconnectParser(
        blocklist_url=BLOCKLIST,
        disconnect_mapping_url=MAPPING
    )
    output = {}
    for domain in dc.get_domains_with_category(TRACKER_CATEGORIES):
        digest = get_safebrowsing_hash(domain)
        # Known-answer check.  Raised explicitly (rather than via `assert`)
        # so it still runs under `python -O` and reports what went wrong.
        if domain in TEST_SET and digest != TEST_SET[domain]:
            raise AssertionError(
                "hash mismatch for %r: expected %r, got %r"
                % (domain, TEST_SET[domain], digest)
            )
        output[domain] = digest
    with open('output.json', 'w') as f:
        json.dump(output, f)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment