Last active
September 26, 2017 07:22
-
-
Save telenieko/6a98b99e33f62a5fbb9a9c188c4575da to your computer and use it in GitHub Desktop.
Script to merge two similar Google Drive directory structures
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# pylint: disable=missing-docstring,wrong-import-position,invalid-name,superfluous-parens | |
""" google_drive_fix_migration.py <[email protected]> | |
License: This code is released to the Public Domain under the "CC0 1.0 Universal" License
Author: Marc Fargas <[email protected]> | |
Background: | |
Our office did a real-life testdrive of Office 365 after 11 years on Google Apps, | |
after 3 months we decided to rollback the affected users which went smoothly | |
except for the OneDrive -> Google Drive migration. | |
We missed that the proprietary migration tool we used put all OneDrive documents
inside a folder called "Documents" on the user's Google Drive, with the result
that a user had:
- Documents before going Office365 inside 'root' (/) *and* inside 'Documents' (/Documents) | |
- Documents modified in Office365 inside /Documents | |
- Documents modified in Google (due to being shared with non testdrive users) in / | |
- Documents created in Office365 inside /Documents | |
- Two folder trees / and /Documents "somehow identical". | |
This script is the quickfix we applied. | |
This is not a one size-fits-all thing, it is published to the Public Domain | |
for reference and example for anyone stumbling with something similar in the future. | |
Code was written in September 2017 using Google Drive v3 API. | |
Details: | |
This tool uses a Service Account to impersonate the users on Google G Suite, | |
if you got here you're likely to know what that means or have the means to find
out and set it up. If you can't ... well, then maybe you should not attempt to | |
use this code!! | |
The tool will first build an in-memory representation of both folder | |
structures / and /Documents via load_trees() then compare both with | |
compare_trees() and finally apply changes with apply_actions()
During comparison the "BREAKUP_DATE" is the Day we moved those users to Office. | |
For convenience, we store the load_trees result in a shelve file and reread | |
it on each call, you can purge this "cache" by deleting the shelve file. | |
Rules: | |
- If a folder is not owned by the user it will not be traversed, it might be moved. | |
- If a file or folder has to be moved a) the former parent is removed, | |
b) the new parent is inserted. | |
That is: if there are more parents those remain unchanged. | |
""" | |
import sys | |
import os | |
import logging | |
import shelve | |
from datetime import date | |
# For convenience I put all dependencies inside a lib/ folder so I do not need a virtualenv | |
# pip install -t lib/ google-api-python-client attrs python-dateutil | |
BASE = os.path.dirname(__file__) | |
sys.path.insert(0, os.path.join(BASE, 'lib')) | |
BREAKUP_DATE = date(2017, 6, 1) | |
from oauth2client.service_account import ServiceAccountCredentials | |
import dateutil.parser | |
import httplib2 | |
import attr | |
import apiclient.discovery | |
import apiclient.http | |
import apiclient.errors | |
# Service-account key file, looked up in the same directory as this script.
KEYFILE = os.path.join(BASE, "google-service-account-key.json")
# The root logger only lets errors through; the script's own 'gdrive' logger
# is set to INFO so the messages logged through it are emitted.
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger('gdrive') # pylint: disable=invalid-name
logger.setLevel(logging.INFO)
# NOTE(review): not referenced by any code visible in this file -- confirm
# it is unused before removing.
FOLDER_ID_TO_ITEM = {}
@attr.s
class DriveItem(object):
    """ Node of the in-memory tree mirroring one Google Drive item. """
    # Raw item data exactly as received from Google.
    source = attr.ib()
    # DriveItem this one hangs from (None for a tree root).
    parent = attr.ib()
    # Whether the current user owns this item.
    owned_by_me = attr.ib()
    # Action decided for this item; stays 'E' until something assigns another.
    action = attr.ib(default='E')
    # Id of the parent to move the item under, when it is being moved.
    new_parent = attr.ib(default=None)
    # Child DriveItem instances (child.parent is this item).
    children = attr.ib(default=attr.Factory(list))
def get_service_creds(email, scope):
    """ Build service-account credentials delegated to `email`.

    The key is read from KEYFILE. Returns a (credentials, http) tuple where
    `http` is the result of authorizing a fresh httplib2.Http with those
    credentials; the credentials are refreshed once before returning.
    """
    delegated = ServiceAccountCredentials.from_json_keyfile_name(
        KEYFILE, scopes=scope).create_delegated(email)
    authorized_http = delegated.authorize(httplib2.Http())
    delegated.refresh(authorized_http)
    return (delegated, authorized_http)
def get_drive_service(email):
    """ Get a Google Drive API v3 service instance acting as `email`. """
    scope = 'https://www.googleapis.com/auth/drive'
    # Only the authorized http object is needed to build the client.
    _, http = get_service_creds(email, scope=scope)
    return apiclient.discovery.build('drive', 'v3', http=http)
def am_i_owner(item):
    """ Tell whether `item` carries a truthy 'ownedByMe' flag. """
    if item['ownedByMe']:
        return True
    return False
def build_tree(service, this, ignore=None, pad=0):
    """ Recursively fill `this.children` with the contents of its Drive folder.

    service is a service from get_drive_service().
    this is the DriveItem of the folder being listed.
    ignore: direct children whose name equals this value are left out;
        it is not passed down to sub-folders.
    pad is a padding (number of spaces) for print calls.

    Folders owned by the user are traversed before being added. Folders not
    owned by the user are reported and left out of the tree.
    NOTE(review): the module docstring says such folders "might be moved",
    but an item that is not in the tree is never acted upon -- confirm intent.

    If a listing request fails, the error is printed and listing of this
    folder stops, keeping whatever was gathered so far.
    """
    folder_id = this.source['id']
    indent = ' ' * pad
    print('{}Gathering item listings for {}...'.format(indent, folder_id))
    query = '\'{}\' in parents'.format(folder_id)
    fields = ('nextPageToken, files(id, name, mimeType, ownedByMe, kind, '
              'md5Checksum, modifiedTime)')
    page_token = None
    while True:
        # Only the request itself raises HttpError here (recursive calls
        # handle their own), so the try covers just that call.
        try:
            listing = service.files().list(
                q=query, pageToken=page_token, fields=fields).execute()
        except apiclient.errors.HttpError as e:
            print('An error occurred: {}'.format(e))
            break
        for item in listing.get('files', []):
            name = item['name']
            if ignore and name == ignore:
                print(u'{}Ignore: {}'.format(indent, name))
                continue
            sub = DriveItem(source=item, parent=this, owned_by_me=am_i_owner(item))
            if item['mimeType'] == 'application/vnd.google-apps.folder':
                print(u'{}Tree: {}'.format(indent, name))
                if not sub.owned_by_me:
                    print(u'{}Skip not owned: {}'.format(indent, name))
                    continue
                build_tree(service, sub, ignore=None, pad=pad+4)
            if item['kind'] == 'drive#file':
                print(u'{}File: {} ({})'.format(indent, name, item['id']))
            this.children.append(sub)
        page_token = listing.get('nextPageToken')
        if not page_token:
            break
def load_trees(usuario):
    """ Read both tree structures for that user from Google.

    Returns (root, onedrive_trees):
      - root is the DriveItem tree of the user's root folder, built while
        ignoring direct children named 'Documents';
      - onedrive_trees is a list with one DriveItem tree per folder named
        'Documents' that sits directly under the root and is owned by
        the user.
    """
    service = get_drive_service(usuario)
    root_item = service.files().get(fileId='root').execute()
    root = DriveItem(source=root_item, parent=None, owned_by_me=True)
    build_tree(service, root, ignore='Documents')
    # Only folders placed directly under the root qualify: an unrestricted
    # name search would also return nested folders (already part of the root
    # tree) or plain files that merely happen to be called 'Documents'.
    query = ("name = 'Documents' and 'root' in parents"
             " and mimeType = 'application/vnd.google-apps.folder'")
    fields = ('nextPageToken, files(id, name, mimeType, ownedByMe, kind, '
              'md5Checksum, modifiedTime)')
    onedrive_trees = []
    page_token = None
    while True:
        # Follow nextPageToken so no result page is silently dropped.
        search = service.files().list(
            q=query, pageToken=page_token, fields=fields).execute()
        for onedrive_item in search.get('files', []):
            if not onedrive_item['ownedByMe']:
                continue
            onedrive_root = DriveItem(source=onedrive_item, parent=None, owned_by_me=True)
            build_tree(service, onedrive_root)
            onedrive_trees.append(onedrive_root)
        page_token = search.get('nextPageToken')
        if not page_token:
            break
    return root, onedrive_trees
def load_trees_from_shelve(dbname):
    """ Load the trees from our cache.

    dbname is the cache file name as built by the caller (e.g. 'user.db').
    Returns the (root, onedrive_trees) tuple stored under the 'root' and
    'onedrive' keys; raises KeyError if either key is missing.

    NOTE: shelve data is pickled; only open cache files this script wrote.
    """
    # save_trees() opens the shelve under the name *without* the trailing
    # '.db'. The very same name must be used here, otherwise the dbm layer
    # may look for a different file.
    shelf_name = dbname[:-3] if dbname.endswith('.db') else dbname
    db = shelve.open(shelf_name, flag='r')
    try:
        return db['root'], db['onedrive']
    finally:
        db.close()
def save_trees(dbname, root, onedrive_trees):
    """ Save the trees to our cache.

    dbname is the cache file name (e.g. 'user.db'); `root` is stored under
    the 'root' key and `onedrive_trees` under the 'onedrive' key.
    """
    print("Saving tree data on '%s'" % dbname)
    # The shelve is opened without the trailing '.db' -- presumably because
    # the dbm backend in use appends that suffix itself (TODO confirm).
    # Names that do not end in '.db' are used untouched instead of being
    # blindly shortened by three characters.
    shelf_name = dbname[:-3] if dbname.endswith('.db') else dbname
    db = shelve.open(shelf_name)
    try:
        db['root'] = root
        db['onedrive'] = onedrive_trees
    finally:
        # Close (and flush) even if pickling one of the trees fails.
        db.close()
def _display_name(item):
    """ Return the item's name in a form that is safe to format and print. """
    name = item.source['name']
    if str is bytes:
        # Python 2: formatting non-ASCII unicode into a byte string raises,
        # so hand out UTF-8 bytes there.
        return name.encode('utf-8', errors='replace')
    return name


def compare_trees(left, right, path):
    """ Compare the two trees merging left to right.

    Both children lists are walked in name order, like a sorted merge, and
    an action is stored on the visited items:
        >: Reparent the item (new_parent is set to the id of `right`)
        x: Trash the item
        -: Do nothing
        E: Something wrong... the default action.
    Folders with the same name on both sides are compared recursively.
    `path` is only used as a prefix for the printed report.
    Nothing is changed on Google Drive here; see apply_actions().
    """
    folder_mime = 'application/vnd.google-apps.folder'
    by_name = lambda item: item.source['name']
    left_iter = iter(sorted(left.children, key=by_name))
    right_iter = iter(sorted(right.children, key=by_name))
    right_id = right.source['id']
    print("{:<15}----- Comparing '{}' with '{}' -----------".format(
        path, _display_name(left), _display_name(right)))
    left_continue = right_continue = True
    row_format = "{:<15}{:>10} {} {} {:>10}"
    lval = rval = ltitle = rtitle = None
    while True:
        if lval is not None:
            # Report the outcome of the previous round.
            print(row_format.format(path, ltitle, lval.action,
                                    getattr(rval, 'action', None), rtitle))
        if left_continue:
            lval = next(left_iter, None)
            if lval is None:
                break
            lval.action = 'E'
            ltitle = _display_name(lval)
            left_continue = False
        if right_continue:
            rval = next(right_iter, None)
            if rval is None:
                # Nothing left on the right to check against, so now all is
                # to be moved over. The destination must be recorded, or the
                # move would have nowhere to go.
                lval.action = '>'
                lval.new_parent = right_id
                rtitle = 'NONE'
                left_continue = True
                right_continue = True
                continue
            rval.action = 'E'
            rtitle = _display_name(rval)
            right_continue = False
        if lval.source['id'] == rval.source['id']:
            # EXACTLY the same item on both sides.
            lval.action = '-'
            rval.action = '-'
            left_continue = right_continue = True
            continue
        if ltitle > rtitle:
            # What is on the right is not on the left, but we don't care.
            right_continue = True
            rval.action = '-'
            continue
        right_is_folder = rval.source['mimeType'] == folder_mime
        if lval.source['mimeType'] == folder_mime:
            if right_is_folder and ltitle == rtitle:
                # Folders of the same name: no action on the folders
                # themselves, check inside. The right folder is marked '-'
                # as well because apply_actions() only descends into '-'
                # folders; left at 'E', decisions made inside would never
                # be applied.
                lval.action = '-'
                rval.action = '-'
                compare_trees(lval, rval, path+ltitle+'/')
                left_continue = right_continue = True
            else:
                # Left and right are unrelated, so we keep ALL of left.
                lval.action = '>'
                lval.new_parent = right_id
                left_continue = True
            continue
        if ltitle != rtitle:
            # Left file sorts before the right item: it has no counterpart.
            lval.action = '>'
            lval.new_parent = right_id
            left_continue = True
            continue
        # From here on: same titles and left is not a folder.
        left_md5 = lval.source.get('md5Checksum')
        right_md5 = rval.source.get('md5Checksum')
        if right_is_folder:
            # Very strange (left not a folder, right a folder, same name...)
            lval.action = '>'
            lval.new_parent = right_id
            rval.action = '-'
        elif 'application/vnd.google-apps.' not in rval.source['mimeType'] \
                and left_md5 is not None and left_md5 == right_md5:
            # Files are identical!! (A missing checksum, as on Google-native
            # documents, falls through to the date comparison.)
            lval.action = 'x'
            rval.action = '-'
        else:
            lmodtime = dateutil.parser.parse(lval.source['modifiedTime'])
            rmodtime = dateutil.parser.parse(rval.source['modifiedTime'])
            if lmodtime == rmodtime:
                # No same md5 but same dates? keep both...
                lval.action = '>'
                lval.new_parent = right_id
                rval.action = '-'
            elif rmodtime.date() < BREAKUP_DATE:
                # Right is older than breakup, so it is safe to keep the Left guy.
                lval.action = '>'
                lval.new_parent = right_id
                rval.action = 'x'
            else:
                # Better be safe... keep both.
                lval.action = '>'
                lval.new_parent = right_id
                rval.action = '-'
        left_continue = right_continue = True
    print('{:<15} --------- end compare_trees({}, {})'.format(
        path, _display_name(left), _display_name(right)))
def apply_actions(service, tree, path):
    """ Apply the actions from the tree.

    Walks the direct children of `tree` and acts on each child's action:
        >: Reparent the item (remove `tree` as parent, add item.new_parent)
        x: Trash the item
        -: Do nothing to the item itself, but descend into its children
        E: Something wrong... the default action; the item is left alone.
    service is a service from get_drive_service(); `path` is only used in
    log messages. The logged counters cover this level only.
    """
    logger.info("%s ----- Apply '%s' -----------", path, tree.source['name'])
    deletes = moves = 0
    for item in tree.children:
        logger.info("%s: %s/%s", item.action, path, item.source['name'])
        itemid = item.source['id']
        if item.action == '-':
            # Not touching the item itself; if it has children, see them.
            if item.children:
                apply_actions(service, item, path+'/'+item.source['name'])
        elif item.action == '>':
            if item.new_parent is None:
                # Without a destination the request would only *remove* the
                # current parent; refuse rather than orphan the item.
                logger.error("No new parent recorded for %s/%s, not moving it",
                             path, item.source['name'])
                continue
            # Change parents, remove tree.source['id'] and add item.new_parent.
            service.files().update(fileId=itemid,
                                   addParents=item.new_parent,
                                   removeParents=tree.source['id'],
                                   fields='id, parents').execute()
            moves += 1
        elif item.action == 'x':
            service.files().update(fileId=itemid,
                                   body={'trashed': True}).execute()
            deletes += 1
        # Any other action (the default 'E') is deliberately left untouched.
    logger.info("%s ----- END Apply deletes=%d moves=%d '%s' -----------",
                path, deletes, moves, tree.source['name'])
if __name__ == '__main__':
    # Usage: <script> user@domain
    # Builds (or reloads from cache) both trees, prints the comparison and,
    # only after an explicit 'y', applies the resulting actions.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <user-email>'.format(sys.argv[0]))
    USER = sys.argv[1]
    DBNAME = '%s.db' % USER.split('@')[0]
    # NOTE(review): the cache is detected by this exact file name, which
    # assumes the dbm backend behind shelve stores it as '<name>.db' --
    # confirm on the platform in use.
    if os.path.exists(DBNAME):
        print("Loading from Cache '%s'" % DBNAME)
        root, onedrive_trees = load_trees_from_shelve(DBNAME)
    else:
        print("Loading from Google into '%s'" % DBNAME)
        root, onedrive_trees = load_trees(USER)
        save_trees(DBNAME, root, onedrive_trees)
    for tree in onedrive_trees:
        compare_trees(tree, root, '/')
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3
    cont = ask('Apply? (y/n)')
    if cont == 'y':
        svc = get_drive_service(USER)
        for tree in onedrive_trees:
            print("#### BEGIN APPLY ACTIONS ONEDRIVE TREE")
            apply_actions(svc, tree, '/')
        print("#### BEGIN APPLY ACTIONS GOOGLE DRIVE TREE")
        apply_actions(svc, root, '/')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment