Created
June 20, 2012 19:39
-
-
Save omeinusch/2961760 to your computer and use it in GitHub Desktop.
Replaces every URL in a given text with a number surrounded by brackets and returns a tuple with the new text and a dict that maps each number to its URL.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def append_urls_to_text_bottom(text):
    """Replace every URL in *text* with a bracketed reference number.

    URLs are numbered from 1 in order of first appearance; repeated
    occurrences of the same URL share one number.

    Args:
        text: The string to scan for http, https and ftp URLs.

    Returns:
        A tuple ``(new_text, urls_by_number)`` where ``new_text`` is *text*
        with each URL replaced by ``"[n]"`` and ``urls_by_number`` is a dict
        mapping each number ``n`` to the URL it stands for.
    """
    import re

    url_expression = (
        # Scheme.
        "(?:(?:https?|ftp)://)"
        # Optional "user:password@" part.
        "(?:\\S+(?::\\S*)?@)?"
        "(?:"
        # Dotted-quad host, skipping the 10.x, 127.x, 169.254.x, 192.168.x
        # and 172.16-31.x ranges.
        "(?!10(?:\\.\\d{1,3}){3})"
        "(?!127(?:\\.\\d{1,3}){3})"
        "(?!169\\.254(?:\\.\\d{1,3}){2})"
        "(?!192\\.168(?:\\.\\d{1,3}){2})"
        "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})"
        "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])"
        "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}"
        "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))"
        "|"
        # Host name, further dot-separated labels, then a TLD of 2+ letters.
        "(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)"
        "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*"
        "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))"
        ")"
        # Optional port.
        "(?::\\d{2,5})?"
        # Optional path/query: everything up to the next whitespace.
        "(?:/[^\\s]*)?"
    )

    number_by_url = {}

    def _reference_for(match):
        # Give a URL the next free number the first time it is seen and
        # reuse that number for every later occurrence.
        url = match.group(0)
        number = number_by_url.setdefault(url, len(number_by_url) + 1)
        return "[%d]" % number

    # Substitute in a single pass over the matches themselves. Feeding each
    # found URL back into re.sub as a pattern would treat characters such as
    # "?", "." or "+" as regex syntax, and a URL that is a prefix of another
    # would corrupt the longer one.
    text = re.sub(url_expression, _reference_for, text)

    appending_list = {number: url for url, number in number_by_url.items()}
    return (text, appending_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment