Created
May 14, 2015 15:50
-
-
Save joshcartme/5f13fc525189b5d22112 to your computer and use it in GitHub Desktop.
Updated Mezzanine import_wordpress.py to handle pub_date conversion error
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function
from __future__ import unicode_literals
from future.builtins import int

from collections import defaultdict
from datetime import datetime, timedelta
from optparse import make_option
import re
from time import mktime, timezone
from xml.dom.minidom import parse

from django.core.management.base import CommandError
from django.utils.html import linebreaks

from theme.management.base import BaseImporterCommand
class Command(BaseImporterCommand):
    """
    Implements a Wordpress importer. Takes a file path or a URL for the
    Wordpress Extended RSS file.
    """

    option_list = BaseImporterCommand.option_list + (
        make_option("-u", "--url", dest="url", help="URL to import file"),
    )

    def get_text(self, xml, name, nodetype):
        """
        Gets the element's text value from the XML object provided.

        ``xml`` is a minidom node, ``name`` is the suffix of the
        ``wp:comment_<name>`` descendant element to read, and ``nodetype``
        is the DOM node type (e.g. ``TEXT_NODE``) whose data is joined
        together. Raises ``IndexError`` if no such element exists.
        """
        element = xml.getElementsByTagName("wp:comment_" + name)[0]
        return "".join(node.data for node in element.childNodes
                       if node.nodeType == nodetype)

    def _entry_pub_date(self, entry):
        """
        Returns the datetime for a feed entry, or ``None`` when the entry
        carries no usable date.

        The published date is preferred and the updated date is only
        looked up when the published one is missing or unparsed. The
        time struct is converted with ``mktime`` and then shifted by
        ``time.timezone``, exactly as the previous inline code did.
        """
        parsed = getattr(entry, "published_parsed", None)
        if not parsed:
            parsed = getattr(entry, "updated_parsed", None)
        if not parsed:
            # The date was absent or could not be parsed by feedparser;
            # feeding None to mktime would raise TypeError.
            return None
        try:
            pub_date = datetime.fromtimestamp(mktime(parsed))
        except (OverflowError, ValueError, OSError):
            # Out-of-range dates cannot be represented as a timestamp.
            return None
        return pub_date - timedelta(seconds=timezone)

    def handle_import(self, options):
        """
        Gets the posts from either the provided URL or the path if it
        is local.

        Entries whose ``wp_post_type`` is ``"blog"`` are imported as
        posts together with their comments; entries of type ``"page"``
        are imported as pages. Other entry types are ignored. Raises
        ``CommandError`` when no URL was given or feedparser is missing.
        """
        url = options.get("url")
        if url is None:
            raise CommandError("Usage is import_wordpress %s" % self.args)
        try:
            import feedparser
        except ImportError:
            raise CommandError("Could not import the feedparser library.")
        feed = feedparser.parse(url)

        # We use the minidom parser as well because feedparser won't
        # interpret WXR comments correctly and ends up munging them.
        # xml.dom.minidom is used simply to pull the comments when we
        # get to them.
        xml = parse(url)
        xmlitems = xml.getElementsByTagName("item")

        for (i, entry) in enumerate(feed["entries"]):
            # Progress output: index of the entry being processed.
            print(i)
            # Get a pointer to the right position in the minidom as well.
            xmlitem = xmlitems[i]
            content = linebreaks(self.wp_caption(entry.content[0]["value"]))

            # NOTE(review): pub_date may now be None when the feed has no
            # parseable date. add_post is assumed to accept that, as
            # Mezzanine's own BaseImporterCommand does -- confirm for the
            # theme.management.base copy.
            pub_date = self._entry_pub_date(entry)

            # Tags and categories are all under "tags" marked with a scheme.
            terms = defaultdict(set)
            for item in getattr(entry, "tags", []):
                terms[item.scheme].add(item.term)

            post_type = entry["wp_post_type"]
            if post_type == "blog":
                print("blog post")
                post = self.add_post(title=entry.title, content=content,
                                     pub_date=pub_date, tags=terms["tag"],
                                     categories=terms["category"],
                                     old_url=entry.id, author=entry.author)

                # Get the comments from the xml doc. Comment values use
                # their own names so the post's url/pub_date are not
                # clobbered.
                for c in xmlitem.getElementsByTagName("wp:comment"):
                    name = self.get_text(c, "author", c.CDATA_SECTION_NODE)
                    email = self.get_text(c, "author_email", c.TEXT_NODE)
                    website = self.get_text(c, "author_url", c.TEXT_NODE)
                    body = self.get_text(c, "content", c.CDATA_SECTION_NODE)
                    date_text = self.get_text(c, "date_gmt", c.TEXT_NODE)
                    fmt = "%Y-%m-%d %H:%M:%S"
                    comment_date = datetime.strptime(date_text, fmt)
                    comment_date -= timedelta(seconds=timezone)
                    self.add_comment(post=post, name=name, email=email,
                                     body=body, website=website,
                                     pub_date=comment_date)

            elif post_type == "page":
                self.add_page(title=entry.title, content=content,
                              tags=terms["tag"], old_id=entry.wp_post_id,
                              old_parent_id=entry.wp_post_parent)

    def wp_caption(self, post):
        """
        Filters a Wordpress Post for Image Captions and renders to
        match HTML.

        Each ``[caption ...]...[/caption]`` shortcode in ``post`` is
        replaced by a ``<div>`` built from its ``id``, ``align`` and
        ``width`` attributes (width is padded by 10px), wrapping the
        original inner markup followed by the caption text in a
        ``<p class="wp-caption-text">``. Returns the rewritten string.
        """
        for match in re.finditer(r"\[caption (.*?)\](.*?)\[/caption\]", post):
            meta = '<div '
            caption = ''
            for imatch in re.finditer(r'(\w+)="(.*?)"', match.group(1)):
                key, value = imatch.group(1), imatch.group(2)
                if key == 'id':
                    meta += 'id="%s" ' % value
                elif key == 'align':
                    meta += 'class="wp-caption %s" ' % value
                elif key == 'width':
                    width = int(value) + 10
                    meta += 'style="width: %spx;" ' % width
                elif key == 'caption':
                    caption = value
            parts = (match.group(2), caption)
            meta += '>%s<p class="wp-caption-text">%s</p></div>' % parts
            post = post.replace(match.group(0), meta)
        return post
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment