Convert WordPress posts to reStructedText30. Jun '13
WordPress has become pretty bloated so I thought what the hell, I could just serve my blogposts from my own web framework. I had WordPress export my blogposts to JSON and I used following regular expression spaghetti to reformat all the posts to reStructuredText:
import re
import codecs
from cjson import decode
"""
Convert WordPress exported posts JSON to separate reStructuredText files
"""
fh = open("wp_posts.json")
buf = fh.read().decode("utf-8")
posts = decode(buf[98:]) # Skip headers
for post in posts:
if post["post_type"] != "post" or post["post_status"] != "publish":
continue
created = post["post_date_gmt"]
date = created[:10]
title = post["post_title"]
author = post["post_author"]
# Sanitize filename
filename = "%s-%s.rst" % (date, title.replace("/", "_").replace(" ", "_").replace(".", "_").lower())
# Replace DOS newlines
content = post["post_content"].replace("\r\n", "\n").replace("\r", "")
# Replace HTML entities
content = content.replace(" ", " ").replace(">", ">").replace("<", "<").replace("∧", "&")
content = content.replace("\\'", "'")
# Replace bold and italic tags
content = re.sub(r"<strong>(.*?)</strong>", "**\\1**", content)
content = re.sub(r"<em>(.*?)</em>", "*\\1*", content)
content = re.sub(r"<b>(.*?)</b>", "**\\1**", content)
content = re.sub(r"<i>(.*?)</i>", "*\\1*", content)
# Replace links and images
content = re.sub(r"<a href=\"(.+?)\"><img .*?src=\"(.+?)\".*?/></a>", "\n.. image:: \\2\n", content)
content = re.sub(r"<a href=\"(.+?)\">(.+?)</a>", "`\\2 <\\1>`_ ", content)
content = re.sub(r"<img .*?src=\"(.+?)\".*?>", "\n.. image: \\1\n", content)
# Replace headings
def underline(m):
fragment, = m.groups()
return "%s\n%s\n" % (fragment, "-" * len(fragment))
content = re.sub(r"<h[123456]>(.*?)</h[123456]>", underline, content, flags=re.DOTALL)
# Replace <code> tags and indent code
def indent_code_block(m):
code_block, = m.groups()
return ":\n\n.. code:: bash\n\n " + code_block.replace("\n", "\n ").strip() + "\n\n"
content = re.sub(r"</code>\s*<code>", "\n", content)
content = re.sub(r":?\n?\s*<code>(.*?)</code>\s*", indent_code_block, content, flags=re.DOTALL)
content = re.sub(r":?\n?\s*<pre>(.*?)</pre>\s*", indent_code_block, content, flags=re.DOTALL)
# Replace lists
def replace_list_items(m):
chunk, = m.groups()
return "\n%s\n" % re.sub("\s*<li>\s*(.*?)\s*</li>\s*", "* \\1\n", chunk)
content = re.sub("<ul>(.*?)</ul>", replace_list_items, content, flags=re.DOTALL)
fh = codecs.open("%s" % filename, "w", encoding="utf-8")
fh.write(".. title: %s\n.. date: %s\n.. author: %s\n\n" % (title, created, author))
fh.write(content)
fh.write("\n")
fh.close()