album_fetcher.py (gist koteq/1455886) — download all photos from a Google+
album, or every album of a user, via the Picasa Web Albums (gdata) API.
# -*- coding: utf-8 -*-
"""
Usage:
    python album_fetcher.py https://plus.google.com/photos/118353143366443526186/albums/5626152497309725217
    python album_fetcher.py https://plus.google.com/118353143366443526186
    python album_fetcher.py https://plus.google.com/118353143366443526186 [email protected] yourpassword
    python album_fetcher.py https://plus.google.com/118353143366443526186 [email protected] yourpassword /out_dir/

TODO: use optparse
"""
import os | |
import re | |
import sys | |
import urllib | |
import logging | |
import logging.handlers | |
from Queue import Queue | |
from threading import Thread | |
from collections import namedtuple | |
import gdata.photos.service | |
# True on Windows, where file names are more restricted and need transliteration.
WINDOWS = os.name == 'nt'
if WINDOWS:
    from unidecode import unidecode
# One file to download: remote URL plus local destination path.
DownloadTask = namedtuple('DownloadTask', ['url', 'save_path'])
# Parsed Google+ URL; album_id is None when the URL names a whole profile.
ResultParseUrl = namedtuple('ResultParseUrl', ['user_id', 'album_id'])
DOWNLOAD_THREADS_COUNT = 30  # concurrent image-download worker threads
ALBUM_FETCHER_THREADS_COUNT = 10  # concurrent album-feed worker threads
DOWNLOAD_FULL_SIZED_IMAGES = True  # rewrite URLs to request the original size
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
ALBUMS_SAVE_DIR = os.path.join(PROJECT_ROOT, 'fetch_albums')  # default output root
log = logging.getLogger('album_fetcher')
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S')
class Fetcher(object):
    """Download every photo of one album (or of every album) of a user.

    Two lazily-started pools of daemon threads do the work: one walks album
    photo feeds and one downloads the individual image files.
    """

    def __init__(self, user_id, album_id, email=None, password=None, save_dir=None):
        self.user_id = user_id
        self.album_id = album_id  # None means "fetch all albums of the user"
        self.email = email
        self.password = password
        self.save_dir = save_dir or ALBUMS_SAVE_DIR
        self._gd_client = None  # created on first _gd_auth() call
        self._download_queue = None  # created lazily together with its pool
        self._album_fetch_queue = None  # created lazily together with its pool

    def _strip(self, string):
        """Return *string* cleaned up for use as a file-system name."""
        if WINDOWS:
            # Transliterate the UTF-8 text to ASCII-ish and drop characters
            # that are illegal in Windows file names.
            string = unidecode(string.decode('utf8'))
            string = re.sub(r'[\\/:"*?<>|]+', '', string)
            string = string.decode('latin-1', errors='ignore').encode('latin-1')
        return string.strip()

    def _download_worker(self):
        """Daemon thread body: download queued files forever."""
        while True:
            task = self._download_queue.get()  # blocks until an item arrives
            log.info('GET %s', task.url)
            try:
                # try/finally so handles are closed even when a read fails
                # (urllib.urlopen objects are not context managers in Py2).
                output_file = open(task.save_path, "wb")
                try:
                    input_data = urllib.urlopen(task.url)
                    try:
                        output_file.write(input_data.read())
                    finally:
                        input_data.close()
                finally:
                    output_file.close()
            except Exception:
                # BUG FIX: log.exception() requires a message argument --
                # the original bare call raised TypeError inside the handler.
                log.exception('failed to download %s', task.url)
            self._download_queue.task_done()

    def _add_download_task(self, url, save_path):
        """Queue a download, starting the worker pool on first use."""
        if self._download_queue is None:
            self._download_queue = Queue()
            for _ in range(DOWNLOAD_THREADS_COUNT):
                t = Thread(target=self._download_worker)
                t.daemon = True  # do not block interpreter exit
                t.start()
        self._download_queue.put(DownloadTask(url, save_path))

    def _album_fetch_worker(self):
        """Daemon thread body: fetch album photo feeds and queue downloads."""
        while True:
            album_id = self._album_fetch_queue.get()  # blocks until an item arrives
            log.info("fetching album id: %s", album_id)
            feed = self._gd_client.GetFeed(
                '/data/feed/api/user/%s/albumid/%s?kind=photo' % (self.user_id, album_id))
            out_dir = self._get_album_out_dir(feed)
            for filename, src_url in self._get_all_content_links(feed).iteritems():
                if WINDOWS:
                    # Keep Windows paths short.
                    base, ext = os.path.splitext(filename)
                    filename = base[:32] + ext  # TODO: truncation can collide file names
                filepath = os.path.join(out_dir, filename)
                if self.file_is_exists(filepath):
                    continue  # already downloaded (non-empty file on disk)
                if DOWNLOAD_FULL_SIZED_IMAGES:
                    # Insert the "s0-d" size selector to request the original image.
                    src_url = re.sub(r'(.*)/', r'\1/s0-d/', src_url)
                self._add_download_task(src_url, filepath)
            self._album_fetch_queue.task_done()

    def _add_album_fetch_task(self, album_id):
        """Queue an album fetch, starting the worker pool on first use."""
        if self._album_fetch_queue is None:
            self._album_fetch_queue = Queue()
            for _ in range(ALBUM_FETCHER_THREADS_COUNT):
                t = Thread(target=self._album_fetch_worker)
                t.daemon = True  # do not block interpreter exit
                t.start()
        self._album_fetch_queue.put(album_id)

    def _gd_auth(self):
        """Create (once) and return the gdata Photos client.

        Logs in only when both email and password were supplied; otherwise
        the client is used anonymously.
        """
        if self._gd_client is not None:
            return self._gd_client
        self._gd_client = gdata.photos.service.PhotosService()
        if self.email and self.password:
            self._gd_client.email = self.email
            self._gd_client.password = self.password
            self._gd_client.ProgrammaticLogin()
        return self._gd_client

    def _get_album_out_dir(self, feed):
        """
        Create non-existent directories and return the album save path.
        """
        user_name = self._strip(feed.nickname.text)
        album_name = None
        try:
            # This hack lets us group many dummy albums from the stream
            # into one directory named after the album type.
            for extension_element in feed.extension_elements:
                if extension_element.tag == 'albumType':
                    album_name = extension_element.text
        except AttributeError:
            pass
        if album_name is None:
            album_name = self._strip(feed.title.text)
        out_dir = os.path.join(self.save_dir, user_name, album_name)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # TODO: user can rename himself -- touch a marker file named after
        # the immutable numeric user id so the directory stays identifiable.
        touch_file = os.path.join(self.save_dir, user_name, self.user_id)
        try:
            if not os.path.exists(touch_file):
                open(touch_file, 'a').close()  # `file()` builtin is deprecated
            os.utime(touch_file, None)
        except IOError:
            pass  # marker is best-effort only
        return out_dir

    @staticmethod
    def parse_album_url(url):
        """Parse a Google+ URL into (user_id, album_id).

        album_id is None when the URL names a whole profile, e.g.:
        https://plus.google.com/photos/118353143366443526186/albums/5626152497309725217
        https://plus.google.com/114051696952559973034
        """
        result = re.findall(r'photos/([\d]+)/albums/([\d]+)$', url)
        if not result:
            return ResultParseUrl(re.findall(r'.*/([\d]+)', url)[0], None)
        return ResultParseUrl(*result[0])

    @staticmethod
    def file_is_exists(filepath):
        """Return True when *filepath* exists and is non-empty."""
        return os.path.exists(filepath) and os.stat(filepath).st_size > 0

    def fetch(self):
        """Main class function: queue the work, then wait for both pools."""
        self._gd_client = self._gd_auth()
        if self.album_id:
            self._add_album_fetch_task(self.album_id)
        else:
            self._fetch_all()
        log.info("Finish fetching albums")
        if self._album_fetch_queue is not None:
            log.info("Waiting for fetch queue")
            self._album_fetch_queue.join()
        if self._download_queue is not None:
            log.info("Waiting for download queue")
            self._download_queue.join()

    def _fetch_all(self):
        """Queue a fetch task for every album in the user's feed."""
        albums = self._gd_client.GetUserFeed(user=self.user_id)
        for album in albums.entry:
            # TODO: fetch posts album as single album
            # album.extension_elements[0].text = 'Buzz'
            self._add_album_fetch_task(album.gphoto_id.text)

    def _get_all_content_links(self, feed):
        """return: {filename: content_url} for every photo entry in *feed*."""
        return dict((self._strip(p.title.text), p.content.src) for p in feed.entry)
def main():
    """Command-line entry point.

    Accepted argument shapes (see module docstring):
        URL
        URL OUT_DIR
        URL EMAIL PASSWORD
        URL EMAIL PASSWORD OUT_DIR
    """
    args = sys.argv[1:]
    if not args:
        # BUG FIX: the original crashed with IndexError when run without
        # arguments; print the usage text from the module docstring instead.
        sys.stderr.write(__doc__)
        sys.exit(1)
    url = args[0]
    email = None
    password = None
    save_dir = ALBUMS_SAVE_DIR
    extra = args[1:]
    if len(extra) >= 2:
        email = extra[0]
        password = extra[1]
    if len(extra) == 1:
        save_dir = extra[0]
    elif len(extra) == 3:
        save_dir = extra[2]
    parsed_url = Fetcher.parse_album_url(url)
    f = Fetcher(user_id=parsed_url.user_id, album_id=parsed_url.album_id,
                email=email, password=password, save_dir=save_dir)
    f.fetch()


if __name__ == '__main__':
    main()