This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_option_dict(url, top_level_url, content_type=None): | |
"""Build an options dict for BlockListParser. | |
These options are checked here: | |
* https://github.com/englehardt/abp-blocklist-parser/blob/40f6bb5b91ea403b7b9852a16d6c57d5ec26cf7f/abp_blocklist_parser/RegexParser.py#L104-L117 | |
* https://github.com/englehardt/abp-blocklist-parser/blob/40f6bb5b91ea403b7b9852a16d6c57d5ec26cf7f/abp_blocklist_parser/RegexParser.py#L240-L248 | |
Parameters | |
---------- | |
url : string |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urlparse import urlparse | |
from Crypto.Hash import MD2 | |
import pandas as pd | |
import cookies as ck | |
import hackercodecs # noqa | |
import hashlib | |
import pyblake2 | |
import urllib | |
import sha3 | |
import mmh3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
103092804.com | |
1rx.io | |
247realmedia.com | |
2leep.com | |
2mdn.net | |
2o7.net | |
33across.com | |
360yield.com | |
365media.com | |
3dstats.com |
We can make this file beautiful and searchable if this error is corrected: Unclosed quoted field in line 2.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
top_level_hostname,url,redirect_chain | |
www.prabhasakshi.com,https://fcmatch.google.com/pixel?google_gm=AMnCDorgMK87r03e115XCkX55u3NGIsHQVGSw3sfqlf2vTyPC1FUd-EWM0O9WxVM7-EvH31H1yx2L5xh-p78KY4cOU_R6Gekf76P6ukigIDMufCzxoqAWbwfYeNFjxxmLcH56fmsIuWl,"[""https://e.dlx.addthis.com/e/a-1189/s-3614?redirect_provider_id=3614&ru=https%3A%2F%2Fcm.g.doubleclick.net%2Fpixel%3Fgoogle_nid%3Ddatalogix_dmp%26google_hm%3D%3CNA_ID%3E%26google_push%3DAHNF13If3D87PP63h-DtKCOgSghwXpmcwg4r08mF1ZsSUQ&google_gid=CAESEAbb_EW8Fb8b1FCVaJP9kFc&google_cver=1"",""https://e.dlx.addthis.com/e/a-1189/s-3614?redirect_provider_id=3614&ru=https%3A%2F%2Fcm.g.doubleclick.net%2Fpixel%3Fgoogle_nid%3Ddatalogix_dmp%26google_hm%3D%3CNA_ID%3E%26google_push%3DAHNF13If3D87PP63h-DtKCOgSghwXpmcwg4r08mF1ZsSUQ&google_gid=CAESEAbb_EW8Fb8b1FCVaJP9kFc&google_cver=1&rd=Y"",""https://cm.g.doubleclick.net/pixel?google_nid=datalogix_dmp&google_hm=MjAxOTA4MjcwNDI1MDk5ODY2ODEwMTQ0Mzk5Ng%3D%3D&google_push=AHNF13If3D87PP63h-DtKCOgSghwXpmcwg4r08mF1ZsSUQ"",""https://fcmatch. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5min.com | |
abmr.net | |
aboutecho.com | |
accounts.google.com | |
activengage.com | |
adap.tv | |
adobe.com | |
aim.com | |
akamai.com | |
akqa.com |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" This script reads a sqlite database and writes the content to a parquet | |
database on S3 formatted as OpenWPM would format. It's best to just run this | |
on AWS as it bottlenecks on the S3 upload. This is a lightly modified version | |
of OpenWPM's S3Aggregator class. | |
""" | |
import os | |
import sqlite3 | |
import sys | |
from collections import defaultdict |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from trackingprotection_tools import DisconnectParser | |
BLOCKLIST_URL = 'https://raw.githubusercontent.com/mozilla-services/shavar-prod-lists/master/disconnect-blacklist.json' # noqa | |
REMAPPING_URL = 'https://raw.githubusercontent.com/mozilla-services/shavar-list-creation/master/disconnect_mapping.json' # noqa | |
dc = DisconnectParser( | |
blocklist_url=BLOCKLIST_URL, | |
disconnect_mapping_url=REMAPPING_URL, | |
verbose=True |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
import hashlib | |
import json | |
import re | |
import urllib2 | |
from trackingprotection_tools import DisconnectParser | |
TRACKER_CATEGORIES = [ | |
'Advertising', 'Analytics', 'Social', 'Content', 'Disconnect' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from StringIO import StringIO | |
import requests | |
import zipfile | |
import random | |
import json | |
import os | |
EC2_LIST = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip' | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import multiprocessing | |
import urlparse | |
import requests | |
import json | |
import os | |
# Available: https://github.com/citp/OpenWPM/blob/master/automation/utilities/domain_utils.py # noqa | |
import domain_utils as du | |
# Available: https://gist.github.com/englehardt/802d1872d6bda2084723489a82540cb3 # noqa |
NewerOlder