Created
November 5, 2014 02:25
-
-
Save axiak/8e7a920f53fa45a253a4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import re
from collections import defaultdict
from contextlib import closing
from urllib2 import urlopen
def main():
    """Download the IANA TLD list and print a regex built from it.

    The printed text is the literal prefix ``.+\\\\.``, followed by the
    alternation produced by ``build_regex`` and a closing ``$`` anchor.
    """
    domains = set()
    # The Public Suffix List is an alternative source that is currently
    # disabled:
    #   https://publicsuffix.org/list/effective_tld_names.dat
    domains.update(
        get_domains_from_url('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    )
    tree = build_tree(domains)
    # NOTE(review): the raw-string prefix emits TWO backslashes.  Read directly
    # as a regex that is "literal backslash, then any character"; presumably
    # the output is meant to be pasted into a string literal that halves the
    # backslashes -- confirm before changing.
    print(r'.+\\.' + build_regex(tree) + '$')
# Sentinel placed in front of every domain; it is the root key of the prefix
# tree and the default starting point for build_regex.
START = ';'
# Sentinel placed after every domain; reaching it means a complete name ends
# at this point in the tree.
STOP = ':'
def build_regex(tree, start=START):
    """Return a regex fragment matching every continuation stored under *start*.

    Args:
        tree: Mapping from a prefix string to the collection of items that may
            follow it.  ``tree[start]`` is read for the current prefix and
            ``start + item`` is used as the key for the next level.
        start: Prefix whose continuations are rendered; defaults to ``START``,
            i.e. the root of the tree.

    Returns:
        A pattern string.  A prefix with exactly one continuation is rendered
        inline; any other number of continuations is wrapped in a
        non-capturing group ``(?:a|b|...)``.  The ``STOP`` item contributes an
        empty alternative, meaning "the name may end here".
    """
    # Iterate in sorted order so the generated pattern is identical from run
    # to run; the iteration order of a set is otherwise arbitrary.
    branches = [
        '' if item == STOP else escape(item) + build_regex(tree, start + item)
        for item in sorted(tree[start])
    ]
    if len(branches) == 1:
        # A single continuation needs no grouping.
        return branches[0]
    return '(?:' + '|'.join(branches) + ')'
def build_tree(domains):
    """Index *domains* as a prefix tree stored in a flat dictionary.

    Each domain is framed as ``START + domain + STOP``.  For every proper
    prefix of that framed string, the character that follows the prefix is
    recorded in the set stored under the prefix.

    Args:
        domains: Iterable of domain strings.

    Returns:
        A ``defaultdict(set)`` mapping prefix -> set of following characters.
    """
    tree = defaultdict(set)
    for name in domains:
        framed = START + name + STOP
        for seen, following in prefixes(framed):
            tree[seen].add(following)
    return tree
def is_ascii(s):
    """Return True when every character of *s* has a code point below 128.

    An empty *s* yields True.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True
def prefixes(input):
    """Yield ``(prefix, next_item)`` pairs for each non-empty proper prefix.

    For index ``i`` from 1 up to ``len(input) - 1`` this yields
    ``(input[:i], input[i])``.  Inputs shorter than two items yield nothing.
    """
    total = len(input)
    position = 1
    while position < total:
        yield input[:position], input[position]
        position += 1
def escape(item):
    """Return *item* escaped with ``re.escape`` so it matches literally."""
    quoted = re.escape(item)
    return quoted
def get_domains_from_url(url):
    """Fetch a domain list from *url* and return its entries as a set.

    Each line of the response is stripped of surrounding whitespace.  Blank
    lines and lines starting with ``//``, ``#`` or ``!`` are skipped.  Every
    remaining entry is lower-cased, decoded as UTF-8, encoded with the
    ``idna`` codec, and stripped of any leading ``*`` and ``.`` characters.

    NOTE(review): the ``.decode(...)``/``.encode(...)`` chain assumes the
    lines are Python 2 byte strings, as returned by ``urllib2`` -- confirm
    before porting.

    Args:
        url: Location of the list to download.

    Returns:
        A set of normalized domain strings.
    """
    # closing() guarantees the response is released even if the read fails.
    with closing(urlopen(url)) as response:
        lines = response.readlines()

    domains = set()
    for line in lines:
        entry = line.strip()  # strip once instead of once per check
        if not entry or entry.startswith(('//', '#', '!')):
            continue
        domains.add(
            entry.lower().decode('utf8').encode('idna').lstrip('*.')
        )
    return domains
# Run the generator only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment