Created
May 22, 2018 06:43
-
-
Save jeremydouglass/3bbde0e6ce87bd54dd115eda395c2c93 to your computer and use it in GitHub Desktop.
Processing Vanilla forum bookmark page link extractor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""Processing Vanilla forum bookmark page link extractor | |
1. Manually save HTML pages of bookmarks from the forum before it is decommissioned
2. Run this to extract link data
""" | |
import fnmatch | |
import os | |
from bs4 import BeautifulSoup | |
def scrape_bookmarks(filename):
    """Extract personal bookmark data from a saved Processing Vanilla forum page.

    Args:
        filename: Path to a saved HTML bookmarks page.

    Returns:
        A list of ``(title, url)`` tuples, one for each ``div.Title`` element
        that contains a link with an ``href``, in the order returned by
        ``soup.select``. Titles have surrounding whitespace stripped.
    """
    # Read raw bytes and let BeautifulSoup detect the page encoding instead
    # of depending on the platform's default text encoding.
    with open(filename, 'rb') as inputfile:
        soup = BeautifulSoup(inputfile.read(), 'html.parser')

    bookmark_list = []
    for psg in soup.select('div.Title'):
        anchor = psg.a
        # A title block without a link (or a link without an href) carries
        # no bookmark; skip it rather than crash the whole extraction.
        plink = anchor.get('href') if anchor is not None else None
        if plink is None:
            continue
        bookmark_list.append((psg.get_text().strip(), plink))
    return bookmark_list
def fpath_to_fnamelist(fpath, fnpattern):
    """Collect the paths of files under a directory whose names match a pattern.

    The directory tree rooted at ``fpath`` is walked with ``os.walk``; in each
    directory the file names are filtered with ``fnmatch.filter`` using
    ``fnpattern``, which accepts Unix shell-style wildcards
    (``*``, ``?``, ``[abc]``, ``[!abc]``).

    Args:
        fpath: Directory to search (subdirectories are included).
        fnpattern: Wildcard pattern applied to each file name.

    Returns:
        A list of matching file paths, each joined onto the directory in
        which the file was found.
    """
    matches = []
    for root, _subdirs, names in os.walk(fpath):
        for name in fnmatch.filter(names, fnpattern):
            matches.append(os.path.join(root, name))
    return matches
def save_bookmarks(list_, filename):
    """Save bookmark data to a tab-separated, UTF-8 encoded text file.

    Each bookmark is written as one ``title<TAB>url`` line.

    Args:
        list_: Non-empty sequence of bookmark groups; each group is an
            iterable of ``(title, url)`` pairs.
        filename: Path of the output file. It is opened in ``'w'`` mode, so
            existing content is replaced.

    Raises:
        ValueError: If ``list_`` or ``filename`` is empty.

    If the file cannot be opened or written, the error is reported with a
    printed message instead of being raised.
    """
    # Function-scope import: io.open accepts an encoding on Python 2 and 3.
    import io

    if not list_:
        raise ValueError('No data to write.')
    if not filename:
        raise ValueError('No filename given.')
    try:
        with io.open(filename, 'w', encoding='utf-8') as outputfile:
            for item in list_:
                for title, url in item:
                    # Format as text and let the file object do the UTF-8
                    # encoding, so non-ASCII titles and URLs both survive.
                    outputfile.write(u"{}\t{}\n".format(title, url))
    except (IOError, OSError):
        # On Python 2, open() raises IOError, which is not an OSError.
        print("File not written.")
if __name__ == '__main__':
    # Find every saved HTML page under the current directory, scrape each
    # one, and write the per-page bookmark lists to a single text file.
    fname_list = fpath_to_fnamelist('./', '*.html')
    results = [scrape_bookmarks(page) for page in fname_list]
    save_bookmarks(results, 'bookmarks.txt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment