@@ -0,0 +1,213 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
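"""Convert a HAR 1.2 capture (.har) into a Mozilla Archive Format (.maff)
file, as read by Firefox's Mozilla Archive Format add-on.

Usage: har_maff.py capture.har
"""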
from __future__ import print_function, unicode_literals
from bs4 import BeautifulSoup
import calendar
import hashlib
import logging
import json
import os
import sys
import re
import time
import urlparse
import zipfile
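# A MAFF file is a ZIP archive holding one directory per saved page, each
# containing the page's files plus an index.rdf of MAF metadata. This
# template is filled in with the capture's URL, title, and time below.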
INDEX_RDF = '''<?xml version="1.0" encoding="UTF-8"?>
<RDF:RDF xmlns:MAF="http://maf.mozdev.org/metadata/rdf#"
xmlns:NC="http://home.netscape.com/NC-rdf#"
xmlns:RDF="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<RDF:Description RDF:about="urn:root">
<MAF:originalurl RDF:resource="%(url)s"/>
<MAF:title RDF:resource="%(title)s"/>
<MAF:archivetime RDF:resource="%(time)s"/>
<MAF:indexfilename RDF:resource="index.html"/>
<MAF:charset RDF:resource="UTF-8"/>
</RDF:Description>
</RDF:RDF>
'''


def main(argv):
    log = logging.getLogger('har_maff')
    logging.basicConfig(level=logging.INFO)
    if len(argv) < 2:
        log.error('Usage: %s capture.har', argv[0])
        return 2
    filename = argv[1]
    maff_fn = os.path.join(os.path.dirname(filename),
                           re.sub(r'\.har$', '.maff', os.path.basename(filename)))
    print('Saving to %s' % maff_fn)
    maff_fh = open(maff_fn, 'wb')
    zf = zipfile.ZipFile(maff_fh, mode='w')
    with open(filename, 'rb') as fh:
        har_body = fh.read()
    har = json.loads(har_body, encoding='utf-8')
    har_log = har['log']
    har_version = har_log['version']
    if '1.2' != har_version:
        log.error('I only support version 1.2, not %s' % har_version)
        return 1
    har_pages = har_log['pages']
    if not har_pages:
        log.error('Har has no "pages", that is fatal')
        return 1
    page0 = har_pages[0]
    ## this is in full ISO8601, including millis which
    ## python 2.7 does not support
    started_dt = re.sub(r'\.\d+Z$', 'Z', page0['startedDateTime'])
    save_time = time.strptime(started_dt, '%Y-%m-%dT%H:%M:%SZ')
    rdf_time = time.strftime('%a, %d %b %Y %H:%M:%S -0000', save_time)
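    # MAFF stores each page in its own top-level directory inside the ZIP;
    # name ours after the capture's UTC epoch seconds (calendar.timegm,
    # because the struct_time parsed above is UTC, not local time)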
    out_dir = '%s' % int(calendar.timegm(save_time))
    start_page = page0['title']  # yup, "title"
    entries = har_log['entries']
    page_title = None
    #: :type: dict[unicode, unicode]
    mime_types = {}
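    # First pass: map every request URL to its response MIME type, so the
    # writing pass below can pick file extensions for the saved resources.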
    for en in entries:
        req = en['request']
        req_url = req['url']
        resp = en['response']
        contents = resp['content']
        #: :type: unicode
        media_type = contents['mimeType']
        mime_types[req_url] = media_type
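    # Second pass: write each GET response into the archive, rewriting the
    # index page's stylesheet/script/image links to point at the local copies.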
    for en in entries:
        req = en['request']
        req_method = req['method']
        req_url = req['url']
        resp = en['response']
        contents = resp['content']
        media_type = mime_types[req_url]
        if 'GET' != req_method:
            log.warning('Skipping non-GET url: %s "%s"' % (req_method, req_url))
            continue
        if start_page == req_url:
            out_fn = 'index.html'
        else:
            out_fn = hashlib.md5(req_url.encode('utf-8')).hexdigest()
        if 'image/gif' in media_type:
            out_fn = '%s.gif' % out_fn
        elif 'image/jpeg' in media_type:
            out_fn = '%s.jpeg' % out_fn
        elif 'image/png' in media_type:
            out_fn = '%s.png' % out_fn
        elif '/javascript' in media_type:
            out_fn = '%s.js' % out_fn
        elif 'text/css' in media_type:
            out_fn = '%s.css' % out_fn
        if 'text' not in contents:
            continue
        #: :type: unicode
        text_content = contents['text']
        if start_page == req_url:
            soup = BeautifulSoup(text_content)
            page_title = soup.select('title')[0].text

            def make_re(linky):
                #: :type: unicode
                safe_link = re.escape(linky)
                # BS does not allow us to know if the href contained the "&" or not
                # so here we update the regex to permit either
                link_re = re.compile(safe_link.replace(r'\&', r'\&(?:amp;)?'))
                return link_re
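            # e.g. make_re('a.css?x=1&y=2') matches both that URL and its
            # HTML-escaped form 'a.css?x=1&amp;y=2' as it appears in raw markup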

            def update_entries_to_req_url(to_url):
                """
                Finds the request URL in the HAR, independent of port number,
                and if it differs from the provided :py:param:`to_url`,
                then I will update the enclosing ``entries`` list in place.

                Turns out, HAR does not store the **accurate** URL.
                For example, ``https://example.com:443/``
                is stored in the har as ``https://example.com/``

                :param unicode to_url: the URL used in the document
                :return: the URL as stored in the HAR
                :rtype: unicode | None
                """
                result = None
                urlp = urlparse.urlparse(to_url)
                ## this is, after all, the whole problem here
                # noinspection PyProtectedMember
                urlp = urlp._replace(netloc=re.sub(r':\d+', '', urlp.netloc))
                # iterate over a snapshot of the keys; the loop body may add
                # entries to mime_types, which would break live dict iteration
                for url2 in list(mime_types):
                    url2p = urlparse.urlparse(url2)
                    # noinspection PyProtectedMember
                    url2p = url2p._replace(netloc=re.sub(r':\d+', '', url2p.netloc))
                    if urlp == url2p:
                        log.debug('matched "%s" and "%s" ...', to_url, url2)
                        result = url2
                        if url2 != to_url:
                            for en2 in entries:
                                if url2 == en2['request']['url']:
                                    mime_types[to_url] = mime_types[url2]
                                    en2['request']['url'] = to_url
                                    log.warning('Replaced "%s" with "%s" because HAR was wrong',
                                                url2, to_url)
                                    break
                return result
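            # e.g. an <img src="https://example.com:443/a.png"> matches the HAR
            # entry recorded as "https://example.com/a.png"; the entry is then
            # rewritten to the document's form, so the md5-derived filename it
            # gets in the archive agrees with the rewritten src below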
            for css in soup.select('link[rel=stylesheet]'):
                css_href = css.attrs.get('href')
                update_entries_to_req_url(css_href)
                css_href2 = '%s.css' % hashlib.md5(css_href.encode('utf-8')).hexdigest()
                log.debug('replacing css href %s => %s', css_href, css_href2)
                text_content = make_re(css_href).sub(css_href2, text_content)
            for js in soup.select('script[src]'):
                js_href = js.attrs.get('src')
                update_entries_to_req_url(js_href)
                js_href2 = '%s.js' % hashlib.md5(js_href.encode('utf-8')).hexdigest()
                log.debug('replacing js src %s => %s', js_href, js_href2)
                text_content = make_re(js_href).sub(js_href2, text_content)
            for img in soup.select('img[src]'):
                img_href = img.attrs.get('src')
                # we need the HAR url in order to look up the URL
                # in the mime-types dict
                har_url = update_entries_to_req_url(img_href)
                ## turns out, the .har doesn't capture *every* <img>
                if not har_url:
                    log.debug('Skipping non-HAR img.src "%s"', img_href)
                    continue
                img_mt = mime_types.get(har_url)
                if 'image/png' in img_mt:
                    img_ext = 'png'
                elif 'image/jpeg' in img_mt:
                    img_ext = 'jpeg'
                elif 'image/gif' in img_mt:
                    img_ext = 'gif'
                else:
                    log.error('Unrecognized img media type: %s for %s', img_mt, img_href)
                    img_ext = ''
                img_href2 = '%s.%s' % (hashlib.md5(img_href.encode('utf-8')).hexdigest(), img_ext)
                log.debug('replacing img src %s => %s', img_href, img_href2)
                text_content = make_re(img_href).sub(img_href2, text_content)
        # HAR 1.2 marks base64-encoded bodies with content["encoding"] ==
        # "base64"; keep the original compression-based heuristic as a fallback
        compress = contents.get('compression', -1)
        if 'base64' == contents.get('encoding') or 36 == compress or 0 == compress:
            the_bytes = text_content.decode('base64')
        else:
            the_bytes = text_content.encode('utf-8')
        log.debug('URL:"%s" => "%s"' % (req_url, out_fn))
        zf.writestr(os.path.join(out_dir, out_fn), the_bytes)
    rdf = INDEX_RDF % {
        # a bare '&' is not legal inside an XML attribute value; escape it
        'url': re.sub(re.escape('&'), '&amp;', start_page),
        'title': page_title,
        'time': rdf_time,
    }
    zf.writestr(os.path.join(out_dir, 'index.rdf'), rdf.encode('utf-8'))
    zf.close()
    maff_fh.close()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))