Created
April 4, 2018 18:17
-
-
Save QuantumCalzone/7c728be7cd44e732e7487ffa0ff90a54 to your computer and use it in GitHub Desktop.
Scrapes your saved Facebook links, cleans them up, and exports them as a list to a txt file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from selenium.common.exceptions import TimeoutException | |
from selenium.webdriver.common.by import By | |
import selenium.webdriver.support.ui as ui | |
import selenium.webdriver.support.expected_conditions as EC | |
import os | |
import time | |
import sys | |
import bs4 | |
from urllib.parse import unquote | |
import unittest, time, re | |
from bs4 import BeautifulSoup as soup | |
# Output file that will receive one cleaned link per line.
# NOTE: despite the original comment, this is a plain text file, not a CSV.
filename = "SavedFacebookLinks.text"
# f is the conventional name for a file writer; "w" truncates any existing file.
# UTF-8 is forced so non-ASCII characters in scraped links don't raise
# UnicodeEncodeError under Windows' default cp1252 codec.
f = open(filename, "w", encoding="utf-8")

print("\n")
# Number of scroll-to-bottom iterations to perform on the saved-links page.
loops = int(input("Enter loops: "))
print("\n")
# Seconds to wait between scrolls so lazily-loaded content can appear.
sleepTime = int(input("Enter sleepTime: "))

# Printed at the very end as a completion marker.
debug = "Done!"
class Sel(unittest.TestCase):
    """Selenium-driven scraper for https://www.facebook.com/saved/.

    Reuses an existing Chrome profile (so the Facebook login is already
    present), scrolls the saved-links page ``loops`` times to trigger lazy
    loading, parses the final DOM with BeautifulSoup, cleans each saved
    link, and writes one link per line to the module-level file ``f``.
    """

    def setUp(self):
        # Chrome settings: tolerate certificate/SSL errors on loaded pages.
        options = webdriver.ChromeOptions()
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--ignore-ssl-errors")
        # Load the real user profile so the existing Facebook session is reused.
        options.add_argument(r"--user-data-dir=C:\Users\georg_000\AppData\Local\Google\Chrome\User Data\Default")
        # fullscreen (disabled)
        # options.add_argument("--start-maximized")
        # Raw string: in a non-raw literal, sequences like "\P" and "\D" are
        # invalid escapes (DeprecationWarning) and e.g. "\t" would silently
        # corrupt the path.
        chromedriver = r"C:\Python27\DownloadedTools\chromedriver_win32\chromedriver.exe"
        os.environ["webdriver.chrome.driver"] = chromedriver
        self.driver = webdriver.Chrome(options=options, executable_path=chromedriver)
        self.verificationErrors = []
        self.accept_next_alert = True

    def tearDown(self):
        # Always release the browser process, even if test_sel raised —
        # without this, every failed run leaks a Chrome instance.
        self.driver.quit()

    def test_sel(self):
        """Scroll, scrape, clean, and persist the saved Facebook links."""
        driver = self.driver
        driver.get("https://www.facebook.com/saved/")
        # driver.find_element_by_link_text("All").click()
        print("\n")
        time.sleep(sleepTime)
        # Bug fix: range(1, loops) performed only loops-1 scrolls while the
        # progress message claimed "loop i / loops"; include the final loop.
        for i in range(1, loops + 1):
            print("loop " + str(i) + " / " + str(loops))
            time.sleep(sleepTime)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        html_source = driver.page_source
        # data = html_source.encode('utf-8')
        # html parsing
        page_soup = soup(html_source, "html.parser")
        # find_all is the non-deprecated bs4 spelling of findAll.
        # NOTE(review): the class string "_4bl9 _5yjp" is a Facebook-internal
        # CSS class and may break whenever Facebook redeploys — verify.
        savedLinks = page_soup.find_all("div", {"class": "_4bl9 _5yjp"})
        print("\n")
        for savedLink in savedLinks:
            anchor = savedLink.a
            # Robustness: skip containers without a usable <a href=...>
            # instead of crashing with AttributeError/KeyError.
            if anchor is None or not anchor.has_attr("href"):
                continue
            print(anchor)
            link = unquote(anchor["href"])
            # Clean up the links: strip Facebook's redirect wrapper and
            # bare-domain prefix.
            link = link.replace("https://l.facebook.com/l.php?u=", "")
            link = link.replace("https://facebook.com", "")
            # Relative /videos/ paths (no dot => no external domain) are
            # Facebook-hosted videos; restore the absolute URL.
            if "/videos/" in link and "." not in link:
                link = "https://facebook.com" + link
            # Clean up the links: drop the "&h=..." tracking signature and
            # any remaining query string (raw strings for the regexes).
            link = re.sub(r"&h=?.*", "", link)
            link = re.sub(r"\?(.*)", "", link)
            print(link)
            print("\n")
            f.write(link + "\n")
        print("\n")
        print("savedLinks: " + str(len(savedLinks)))
        f.close()
        print("\n")
        print(debug)
# Script entry point: hand control to the unittest runner, which will
# instantiate Sel, call setUp, run test_sel, and report the result.
if __name__ == "__main__":
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment