Last active
May 6, 2023 08:49
-
-
Save Jach/85b8b1af50996366b4059016b02259d2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;;; For years I had a script that used Twitter's free API to get my account's (@jachy) likes and, if they were
;;;; images or videos, download a local copy of them, because I was sick of accounts being suspended and losing some memes/art
;;;; forever.
;;;; Anyway, because the free tier is no more, here is a very dirty Common Lisp script to replace it.
;;;; It requires Selenium (expected to be running before this script, e.g. with `java -jar selenium-server-4.1.2.jar standalone` | |
;;;; and I'm assuming Firefox rather than Chrome is being used) to load up the likes page and continue to scroll down the page | |
;;;; for a fixed duration of time, frequently polling the page source and extracting media URLs from it. It handles pictures, | |
;;;; mp4 videos, and gifs. It uses `yt-dlp` (fork of youtube-dl) to get the mp4 URLs, uses `ls` to check for already-downloaded | |
;;;; existence (needed for lazy glob patterns), and `wget` to do the actual download. | |
;;;; By default it runs for 5 minutes; my cronjob is set up to run the script daily, but on the first run I let it go for about
;;;; 10 hours to try to get as much as possible. It actually found stuff not returned by the API call... additionally, it gets
;;;; stuff through quote-tweets as a side effect.
(defpackage #:get-likes | |
(:use #:cl)) | |
(in-package :get-likes) | |
(ql:quickload :cl-webdriver-client) | |
(ql:quickload :cl-ppcre) | |
(ql:quickload :com.inuoe.jzon) | |
(use-package :webdriver-client) | |
(defun parse-bool (s)
  "Map the string \"TRUE\" to T; any other value (including \"FALSE\") maps to NIL."
  (cond ((equal s "TRUE") t)
        (t nil)))
(defmacro updatef (thing function &environment env)
  "Replace the value stored in the place THING with the result of calling
FUNCTION on the current value, e.g. (updatef (elt parts 4) #'parse-integer).

THING    - any SETF-able place.
FUNCTION - a form evaluating to a function designator of one argument.

Returns the new value, like SETF.  The subforms of THING are evaluated exactly
once (the expansion is built from GET-SETF-EXPANSION), so a place such as
(aref v (incf i)) is read and written at the same index."
  (multiple-value-bind (temps value-forms store-vars writer reader)
      (get-setf-expansion thing env)
    `(let* (,@(mapcar #'list temps value-forms)
            ;; Only single-value places are supported, hence the first store var.
            (,(first store-vars) (funcall ,function ,reader)))
       ,writer)))
(defun parse-netscape-cookies (path)
  "Parse the Netscape cookies.txt file at PATH
(http://fileformats.archiveteam.org/wiki/Netscape_cookies.txt).

Returns a list with one 7-element list per cookie, in file order:
  (host include-subdomains? path secure? expiry name value)
where the two flags are T/NIL, EXPIRY is an integer and the rest are strings.

Blank lines and `#' comment lines are skipped wherever they occur, so the
result does not depend on the header being exactly four lines long.  Lines
starting with `#HttpOnly_' are real cookies: that marker and one leading dot
are stripped from the host.  Signals an error for a cookie line that does not
have seven tab-separated fields or whose expiry is not an integer."
  (flet ((cookie-line-p (line)
           ;; True for lines that carry a cookie.
           (let ((trimmed (string-trim '(#\Space #\Tab #\Return) line)))
             (and (plusp (length trimmed))
                  (or (char/= #\# (char trimmed 0))
                      (eql 0 (search "#HttpOnly_" trimmed))))))
         (parse-line (line)
           ;; :limit keeps a trailing empty field, so a cookie with an empty
           ;; value still yields seven parts.
           (destructuring-bind (host subdomains cookie-path secure expiry name value)
               (cl-ppcre:split "\\t" line :limit 7)
             ;; Anchored on purpose: only the HttpOnly marker and a leading
             ;; dot are removed, dots inside the host name are left alone.
             (list (cl-ppcre:regex-replace "^(?:#HttpOnly_)?\\.?" host "")
                   (parse-bool subdomains)
                   cookie-path
                   (parse-bool secure)
                   (parse-integer expiry)
                   name
                   value))))
    (mapcar #'parse-line
            (remove-if-not #'cookie-line-p (uiop:read-file-lines path)))))
(defun set-cookies (cookies)
  "Register every entry of COOKIES through webdriver-client's (SETF COOKIE).
Each entry is a list (host subdomains path secure? expiry name value), the
shape produced by PARSE-NETSCAPE-COOKIES; the subdomains flag is not used.
Returns NIL."
  (loop for entry in cookies
        do (destructuring-bind (host subdomains path secure? expiry name value) entry
             (declare (ignore subdomains))
             (let ((new-cookie (make-cookie name value
                                            :path path
                                            :domain host
                                            :secure secure?
                                            :expiry expiry)))
               (setf (cookie) new-cookie)))))
;; Folder that holds cookies.txt and receives every downloaded file.
;; Must end with a slash because file names are appended to it directly.
(defparameter *likes-folder* "/path/to/twitter_likes/")
;; Literal dots in host names and before file extensions are escaped in the
;; three patterns below so they cannot match an arbitrary character.
;; Registers: 1 = image URL up to the format parameter, 2 = media id, 3 = extension.
(defparameter *img-regex* "src=\"(https://pbs\\.twimg\\.com/media/([a-zA-Z0-9_-]+?)\\?format=(jpg|jpeg|png))&name=")
;; Registers: 1 = the whole href/dir attribute text, 2 = tweet path (/user/status/id),
;; 3 = poster (thumbnail) URL, 4 = thumbnail extension.
(defparameter *vid-regex* "(href=\"(/[a-zA-Z0-9_-]+?/status/[0-9]+?)\" dir=\"ltr\")(?:(?!status).)+?poster=\"(https://pbs\\.twimg\\.com/ext_tw_video_thumb/[a-zA-Z0-9_-]+/pu/img/[a-zA-Z0-9_-]+?\\.(jpg|jpeg|png))\"")
;; Registers: 1 = full mp4 URL of a 'gif', 2 = its id.
(defparameter *gif-regex* "(https://video\\.twimg\\.com/tweet_video/([a-zA-Z0-9_-]+?)\\.mp4)")
;; Start the WebDriver session with Firefox in headless mode.  Use the bare
;; call below instead to get a visible browser window.
;(start-interactive-session)
(start-interactive-session
 (make-capabilities
  :always-match (list (cons 'browser-name "firefox")
                      (cons "moz:firefoxOptions"
                            (list (cons "args" (vector "-headless")))))))
(defparameter *likes-url* "https://twitter.com/jachy/likes"
  "Address of the likes page that is loaded and scrolled.")
(defparameter *resources* '()
  "Collected image resource names such as 123.jpg; 'gifs' are stored as their
full mp4 URL instead.")
(defparameter *vid-resources* '()
  "Collected videos as a list of pairs (tweet-url . thumbnail-url).")
(defun recollect-imgs (src)
  "Scan the page source SRC for images and 'gifs' and record them in
*RESOURCES*.

Every *IMG-REGEX* match is stored as \"<id>.<ext>\"; every *GIF-REGEX* match is
stored as its full mp4 URL.  Entries are compared with the case-sensitive
STRING=, so ids that differ only in letter case are kept apart, and each entry
is added at most once.  Returns the updated *RESOURCES*."
  ;; Newlines become spaces before matching.
  (let ((flat (cl-ppcre:regex-replace-all "\\n" src " ")))
    (cl-ppcre:do-register-groups (full id ext) (*img-regex* flat)
      (declare (ignore full))
      (pushnew (format nil "~a.~a" id ext) *resources* :test #'string=))
    ;; also look for 'gifs'
    (dolist (gif-url (cl-ppcre:all-matches-as-strings *gif-regex* flat))
      (pushnew gif-url *resources* :test #'string=))
    *resources*))
(defun recollect-vids (src)
  "Scan the page source SRC with *VID-REGEX* and record each video in
*VID-RESOURCES* as a pair (tweet-url . thumbnail-url), skipping pairs that are
already present.  Returns NIL."
  ;; Newlines become spaces before matching.
  (let ((flat (cl-ppcre:regex-replace-all "\\n" src " ")))
    (cl-ppcre:do-register-groups (anchor status-path poster-url) (*vid-regex* flat)
      (declare (ignore anchor))
      (pushnew (cons (uiop:strcat "https://twitter.com" status-path)
                     poster-url)
               *vid-resources*
               :test #'equal))))
;; Scrape phase: load the likes page with the saved cookies, then keep sending
;; the down-arrow key for COLLECTION-TIME seconds while harvesting media
;; references from the page source.  UNWIND-PROTECT makes sure the browser
;; session is stopped even when one of the steps signals an error.
(unwind-protect
     (progn
       (setf (url) *likes-url*) ; need to visit domain at least once before we can set cookies...
       (set-cookies (parse-netscape-cookies (uiop:strcat *likes-folder* "cookies.txt")))
       (setf (url) *likes-url*)
       (let ((start (get-universal-time))
             (progress 0)
             (last-progress 0)
             (collection-time (* 60 5)) ; total run time in seconds
             (recollect-every 1)        ; seconds between page-source polls
             (last-page-src ""))
         (loop while (< progress collection-time)
               do (setf progress (- (get-universal-time) start))
                  (when (>= (- (get-universal-time) last-progress) recollect-every)
                    (setf last-progress (get-universal-time))
                    (let ((src (page-source)))
                      ;; Skip the regex scans when the page has not changed.
                      (unless (equal src last-page-src)
                        (setf last-page-src src)
                        (recollect-imgs src)
                        (recollect-vids src))))
                  (webdriver-client-utils:send-key :down-arrow)))
       ;; One last scan; both collectors look at the same snapshot.
       (let ((final-src (page-source)))
         (recollect-imgs final-src)
         (recollect-vids final-src)))
  (stop-interactive-session))
(defun downloaded? (file-id)
  "Return T when a file whose name ends in \"-FILE-ID\" exists in
*LIKES-FOLDER* or in one of the subdirectories listed in DIRS, else NIL.

FILE-ID is a bare file name such as \"abc.jpg\".  The folder is listed from
Lisp instead of running `ls' through a shell, so folder names containing
spaces or shell metacharacters work, and a missing folder simply yields NIL."
  (let ((dirs (list "")) ; I had a lot of images so started to organize some into subdirs post-download, add them here if you want to do the same like "memes/" "info/" etc.
        (tail (uiop:strcat "-" file-id)))
    (flet ((matches-p (file)
             ;; Same test as the glob *-FILE-ID: the file name ends with TAIL.
             (let* ((name (file-namestring file))
                    (start (- (length name) (length tail))))
               (and (>= start 0)
                    (string= tail name :start2 start)))))
      (dolist (dir dirs nil)
        (when (some #'matches-p
                    (uiop:directory-files (uiop:strcat *likes-folder* dir)))
          (return t))))))
(defun last-prefix-n ()
  "Return the highest numeric prefix among the files directly inside
*LIKES-FOLDER*, or 0 when no file is named \"<digits>-<rest>\".

Prefixes are compared as integers, so the result stays correct once they grow
beyond five digits.  Files without such a prefix (cookies.txt, for instance)
are ignored."
  ; note if images are moved to subdirs, don't move the most recent image as that is used for the last-prefix-n...
  (let ((highest 0))
    (dolist (file (uiop:directory-files *likes-folder*) highest)
      (let* ((name (pathname-name file))
             (dash (and name (position #\- name))))
        ;; Only names that start with digits followed by a dash count.
        (when (and dash
                   (plusp dash)
                   (every #'digit-char-p (subseq name 0 dash)))
          (setf highest (max highest (parse-integer name :end dash))))))))
(defun thumb-name (url)
  "Return the last \"/\"-separated segment of URL, i.e. its bare file name."
  (let ((segments (cl-ppcre:split "/" url)))
    (first (last segments))))
(defun get-vid-urls (tweet)
  "Run `yt-dlp -j TWEET', parse its JSON output and return, as a list in
their original order, the \"url\" of every entry under \"formats\" whose
\"protocol\" is \"https\"."
  (let* ((raw (with-output-to-string (out)
                (uiop:run-program (list "yt-dlp" "-j" tweet) :output out)))
         (info (com.inuoe.jzon:parse raw))
         (direct-urls '()))
    (map nil
         (lambda (fmt)
           (when (equal "https" (gethash "protocol" fmt))
             (push (gethash "url" fmt) direct-urls)))
         (gethash "formats" info))
    (nreverse direct-urls)))
(defparameter *name-format-str* "~a~5,'0d-~a"
  "FORMAT control string for a download target; its arguments are the folder,
the numeric prefix (zero-padded to five digits) and the file name.")
(defun save-video (vid-urls prefix)
  "Download every URL in VID-URLS with wget into *LIKES-FOLDER*.  The first
file gets the number PREFIX, the next PREFIX+1 and so on; each file is named
after the last path segment of its URL.  Returns the number of URLs."
  (loop for url in vid-urls
        for number from prefix
        for name = (first (last (cl-ppcre:split "/" url)))
        do (format t "Trying vid ~a~%" name)
           (uiop:run-program
            (list "wget"
                  "-O" (format nil *name-format-str* *likes-folder* number name)
                  url)))
  (length vid-urls))
(defun download-resources ()
  "Download everything collected in *RESOURCES* and *VID-RESOURCES* that is
not on disk yet, numbering new files upwards from (1+ (last-prefix-n)).
Returns NIL.

A failed wget/yt-dlp call no longer aborts the run: the failure is reported
with WARN and the next resource is tried.  The partial file that `wget -O'
leaves behind for a failed image, gif or thumbnail is deleted, so the item is
retried on the next run instead of being mistaken for a finished download.

For a video the thumbnail keeps the lower number but is written last, after
the video files; DOWNLOADED? therefore only reports the tweet as done once
its videos were saved.  A failed video attempt may leave partial video files
behind."
  (let ((prefix (1+ (last-prefix-n))))
    (flet ((fetch (number name url)
             ;; Save URL as <folder><number>-<name>; T on success, NIL on failure.
             (let ((target (format nil *name-format-str* *likes-folder* number name)))
               (handler-case
                   (progn
                     (uiop:run-program (list "wget" "-O" target url))
                     t)
                 (uiop:subprocess-error (condition)
                   (warn "Could not download ~a: ~a" url condition)
                   (uiop:delete-file-if-exists target)
                   nil)))))
      (dolist (resource *resources*)
        ;; 'gifs' are stored as full mp4 URLs, images as bare file names.
        (let* ((gif? (search ".mp4" resource))
               (name (if gif? (thumb-name resource) resource))
               (url (if gif?
                        resource
                        (format nil "https://pbs.twimg.com/media/~a?name=orig" resource))))
          (unless (downloaded? name)
            (format t "Trying ~a~%" name)
            (when (fetch prefix name url)
              (incf prefix)))))
      (dolist (resource *vid-resources*)
        (destructuring-bind (tweet-url . thumb-url) resource
          (let ((thumb (thumb-name thumb-url)))
            (unless (downloaded? thumb)
              (handler-case
                  ;; PREFIX is reserved for the thumbnail, videos start after it.
                  (let ((saved (save-video (get-vid-urls tweet-url) (1+ prefix))))
                    (format t "Trying thumb ~a~%" thumb)
                    (fetch prefix thumb (uiop:strcat thumb-url "?name=orig"))
                    (incf prefix (1+ saved)))
                (uiop:subprocess-error (condition)
                  (warn "Could not download the video of ~a: ~a" tweet-url condition)
                  ;; Some video files may already exist; resync with the disk.
                  (setf prefix (1+ (last-prefix-n))))))))))))
(download-resources)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment