#!/usr/bin/env python
# coding: utf-8
"""Scrape PyCon ZA 2014 items from the Internet Archive into steve JSON files.

Searches archive.org for items with the subject ``pyconza2014``, converts each
item's metadata into a video dict and hands the results to
``steve.util.save_json_files``.
"""

from __future__ import unicode_literals

from collections import defaultdict
from steve.util import (
    #get_from_config,
    get_project_config,
    save_json_files,
    #load_json_files,
)
import json

import internetarchive


def files_by_format(item):
    """Group the files of *item* by their ``format`` attribute.

    Returns a ``defaultdict(list)`` mapping each format name to the list of
    file objects yielded by ``item.iter_files()`` with that format, in
    iteration order.
    """
    grouped = defaultdict(list)
    for file_obj in item.iter_files():
        grouped[file_obj.format].append(file_obj)
    return grouped


def get_format_url(files_lookup, fmt):
    """Return the URL of the first file stored under *fmt*.

    *files_lookup* is a mapping as produced by ``files_by_format``.  An empty
    string is returned when no file of that format is present.
    """
    # .get() rather than indexing, so probing a defaultdict does not insert
    # an empty entry for the missing format.
    candidates = files_lookup.get(fmt)
    if not candidates:
        return ''
    return candidates[0].url


def subject2tags(metadata):
    """Turn the ``subject`` metadata field into a list of tag strings.

    The field may be a single ``;``-separated string or a list of such
    strings.  Surrounding whitespace is stripped and empty tags (for example
    from a trailing ``;``) are dropped.  A missing or empty field yields
    ``[]``.
    """
    subject = metadata.get('subject')
    if not subject:
        return []

    # Normalise to a list so that both shapes go through the same splitting;
    # calling .split() directly on a list would raise AttributeError.
    if isinstance(subject, (list, tuple)):
        entries = subject
    else:
        entries = [subject]

    tags = []
    for entry in entries:
        for piece in entry.split(';'):
            tag = piece.strip()
            if tag:
                tags.append(tag)
    return tags


def creator2speakers(metadata):
    """Return the ``creator`` metadata field as a flat list of speakers.

    A single creator becomes a one-element list, a list of creators is
    returned as a new list, and a missing or empty field yields ``[]``.
    """
    creator = metadata.get('creator')
    if not creator:
        return []
    if isinstance(creator, (list, tuple)):
        # Copy instead of wrapping, otherwise the result would be nested.
        return list(creator)
    return [creator]


def language2language(metadata):
    """Return the ``language`` metadata field, or ``''`` when it is absent."""
    # need to lookup 3 letter codes
    return metadata.get('language', '')


def item2source_url(item):
    """Build the archive.org details-page URL for *item*."""
    # NOTE(review): the format string adds no ':' itself, so item.protocol
    # presumably already ends with one (e.g. 'https:') -- confirm.
    return '{}//archive.org/details/{}'.format(item.protocol, item.identifier)


def item2video(item, category, language):
    """Convert an Internet Archive *item* into a video dict.

    *category* and *language* are copied into the result unchanged.  Returns
    an empty dict when ``item.exists`` is false.

    Raises ``KeyError`` if the item's metadata has no ``title``; every other
    metadata field falls back to an empty value.
    """
    if not item.exists:
        return {}

    md = item.metadata
    file_lookup = files_by_format(item)
    description = md.get('description', '')

    video = {}
    video['category'] = category
    # NOTE(review): the meaning of state 2 is defined by the consumer of
    # these JSON files, not by this script -- confirm.
    video['state'] = 2
    video['title'] = md['title']
    # The item description is used for both the long and the short text.
    video['description'] = description
    video['summary'] = description
    video['tags'] = subject2tags(md)
    video['speakers'] = creator2speakers(md)
    video['language'] = language
    video['copyright_text'] = md.get('licenseurl', '')
    video['recorded'] = md.get('date', '')
    video['whiteboard'] = 'ia scrape'
    video['source_url'] = item2source_url(item)

    video['thumbnail_url'] = get_format_url(file_lookup, 'Thumbnail')
    video['video_ogv_url'] = get_format_url(file_lookup, 'Ogg Video')
    video['video_ogv_download_only'] = False
    video['video_mp4_url'] = get_format_url(file_lookup, 'MPEG4')
    video['video_mp4_download_only'] = False
    # webm and flv URLs are never looked up; they are always left empty.
    video['video_webm_download_only'] = False
    video['video_webm_url'] = ''
    video['video_flv_download_only'] = False
    video['video_flv_url'] = ''
    return video


def main():
    """Search for the PyCon ZA 2014 items and save one JSON entry per item."""
    cfg = get_project_config()

    search = internetarchive.search_items('subject:pyconza2014')
    identifiers = [result['identifier'] for result in search]

    videos = []
    for identifier in identifiers:
        item = internetarchive.Item(identifier)
        video = item2video(item, 'PyCon ZA 2014', 'English')
        # item2video returns an empty dict only for items that do not exist.
        if video:
            videos.append(('json/{}.json'.format(identifier), video))

    save_json_files(cfg, videos)


if __name__ == "__main__":
    main()