Created
July 8, 2014 22:48
-
-
Save gwgundersen/e6cd307ba2320c418a3f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import namedtuple | |
import re | |
import pdb | |
# http://tools.ietf.org/html/rfc3986#section-3.3 | |
""" | |
>> c = request.urlparse("http://gregorygundersen.com") | |
>> c | |
ParseResult(scheme='http', netloc='gregorygundersen.com', path='', params='', query='', fragment='') | |
""" | |
def urlparse(url):
    """Split *url* into a ``(scheme, netloc, path, query)`` tuple of strings.

    The scheme and netloc are lower-cased; the path and query keep the
    caller's casing because they may be case-sensitive. The path is returned
    WITHOUT its leading ``/`` (``"http://a.com/b/c"`` gives ``"b/c"``) and the
    query without its leading ``?``. Any ``#fragment`` is discarded because
    the returned tuple has no slot for it. Missing components are returned as
    empty strings.

    >>> urlparse("http://gregorygundersen.com")
    ('http', 'gregorygundersen.com', '', '')
    >>> urlparse("HTTP://Example.com/A/b?X=1")
    ('http', 'example.com', 'A/b', 'X=1')
    """
    # Work from the right-hand side first: the fragment and query may
    # themselves contain '://', '/' or '?', so they must be removed before
    # looking for the scheme or the netloc/path boundary.
    without_fragment = url.partition('#')[0]
    head, _, query = without_fragment.partition('?')

    # Check for protocol. A '/' before '://' means the marker sits inside
    # the path, not after a scheme.
    scheme, marker, remainder = head.partition('://')
    if not marker or '/' in scheme:
        scheme = ''
        remainder = head

    # Netloc (the domain) runs up to the first '/'; everything after that
    # slash is the path, however many segments it has.
    netloc, _, path = remainder.partition('/')

    return (scheme.lower(), netloc.lower(), path, query)
def urlparse2(url):
    """Return a six-field ``ParseResult`` built from ``url_parse_strings``.

    The URL is lower-cased, split by ``url_parse_strings``, and the first six
    pieces are assigned positionally to ``scheme``, ``netloc``, ``path``,
    ``params``, ``query`` and ``fragment``. URLs that yield fewer than six
    pieces get the missing fields filled with ``''`` instead of raising
    ``IndexError``; pieces beyond the sixth are ignored.

    NOTE(review): the assignment is purely positional, so a field name does
    not necessarily hold the matching RFC 3986 component - confirm that this
    is what callers expect.
    """
    ParseResult = namedtuple('ParseResult', 'scheme netloc path params query fragment')
    width = len(ParseResult._fields)
    pieces = url_parse_strings(url.lower())
    # Pad on the right so short URLs still fill every field, then trim so
    # long ones do not overflow the tuple.
    padded = (pieces + [''] * width)[:width]
    return ParseResult(*padded)
def url_parse_strings(url):
    """Split *url* into a list of pieces using plain string replacement.

    Each of ``:``, ``//``, ``/``, ``www.`` and ``.`` is replaced, in that
    order, by ``|``; the result is then split on ``|``. Adjacent separators
    therefore produce empty strings in the returned list.
    """
    # Order matters: '//' must be handled before '/', and 'www.' before '.'.
    separators = (':', '//', '/', 'www.', '.')
    marked = url
    for token in separators:
        marked = marked.replace(token, '|')
    return marked.split('|')
def url_parse_re(url):
    """Split *url* into a list of pieces with a single regular expression.

    The separators are ``:``, ``.``, ``//``, ``/`` and ``?``. Adjacent
    separators (for example the ``:`` and ``//`` in ``http://``) produce an
    empty string in the returned list.
    """
    # Raw string: '\.' and '\?' are regex escapes, not Python string escapes.
    return re.split(r':|\.|//|/|\?', url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment