I wanted to convert my old Blogger posts to Markdown files for GitHub Pages. I went to the Blogger settings and exported my blog. The result was an XML file. The XML file from Google is not easy to parse, and as far as I know there is no documentation of its structure.
After some trial and error, I came up with this script, which parses the XML file and writes its contents out as a set of Markdown files:
import xml.dom.minidom
from xml.dom import Node
import re
# Flat list of (parent element name, text) pairs in document order.
# handle_node() appends to it and parse_xml() reads it back.
list_text = []


def handle_node(feeds):
    """Collect text from a DOM subtree into the module-level ``list_text``.

    For every node in *feeds*, each of its children is inspected: a text
    child is recorded as a ``(parent element name, text)`` pair, and the
    walk then recurses into that child's own children.

    Note that each call consumes two levels of the tree (the nodes in
    *feeds* and then their children), so text is only recorded at every
    other depth.  Starting from ``document.childNodes``, text inside a
    grandchild of the root element (e.g. ``feed/entry/id``) is captured,
    whereas text inside a direct child of the root (``feed/id``) or one
    level further down (``feed/entry/author/name``) is not.  parse_xml()
    relies on exactly this selection, so it is kept as is.

    Args:
        feeds: An iterable of DOM nodes, e.g. ``document.childNodes``.
    """
    for parent in feeds:
        for child in parent.childNodes:
            if child.nodeType == Node.TEXT_NODE:
                list_text.append((child.parentNode.nodeName, child.data))
            # Every node type is descended into the same way; minidom gives
            # childless nodes (text, comments, ...) an empty childNodes list,
            # which ends the recursion.
            handle_node(child.childNodes)
# Jekyll/Octopress-style YAML front matter written at the top of every post.
TMPL = """---
layout: post
title: %(title)s
date: %(date)s
comments: true
categories:
meta:
---
"""

# Entries whose id matches one of these patterns are skipped by gen_post().
# The patterns look for "settings" / "layout" ids, i.e. blog configuration
# rather than posts.
_SKIPPED_ID_PATTERNS = (
    re.compile(r".*tag:blogger.com.*blog-\d*.settings.*"),
    re.compile(r".*tag:blogger.com.*blog-\d*.layout.*"),
)


def _yaml_quote(text):
    """Return *text* as a double-quoted YAML scalar."""
    return '"%s"' % text.replace("\\", "\\\\").replace('"', '\\"')


def gen_post(list_of_post):
    """Write one ``<date>-<id>.markdown`` file per post.

    Files are created in the current working directory and consist of the
    ``TMPL`` front matter followed by the post's raw content.

    Args:
        list_of_post: Iterable of dicts as produced by parse_xml().  ``id``
            is required; ``published`` is needed to name the file (entries
            without it are reported and skipped); ``title`` and ``content``
            default to the empty string.

    Raises:
        OSError: If an output file cannot be created.
    """
    for post in list_of_post:
        post_id = post['id']
        if any(pattern.match(post_id) for pattern in _SKIPPED_ID_PATTERNS):
            continue  # do not process

        published = post.get('published')
        if not published:
            print("skipping %s : no published date" % post_id)
            continue
        # Assumes a timestamp shaped like "2013-06-26T10:15:00.000-07:00":
        # characters 0-9 are the date and 11-15 the HH:MM time.
        pdd = published[0:10]
        pdt = published[11:16]

        # Untitled posts have no text node and therefore no 'title' key.
        title = post.get('title', '')
        fn = post_id.split("-")[-1]
        header = TMPL % {
            # Quoted so that titles containing ':' or '"' stay valid YAML.
            'title': _yaml_quote(title),
            'date': pdd + " " + pdt,
            'id': post_id,
        }

        # Explicit UTF-8 so non-ASCII posts do not depend on the locale.
        with open(pdd + "-" + fn + ".markdown", "w", encoding="utf-8") as f:
            f.write(header)
            try:
                f.write(post.get('content', ''))
            except (OSError, UnicodeError):
                # Best effort: keep the header and move on to the next post.
                print("error with %s , %s " % (title, fn))
def parse_xml(path='blog-06-26-2013.xml'):
    """Parse a Blogger export file into a list of per-entry dicts.

    The text collected by handle_node() in the module-level ``list_text``
    is scanned in document order.  An ``id`` whose value differs from the
    previous one starts a new entry; the recognised fields seen until the
    next new id are stored on that entry.

    Args:
        path: Path of the exported XML file.  Defaults to the file name the
            script was originally written for; pass another path instead
            of editing the source.

    Returns:
        A list of dicts, one per entry, with whichever of the keys ``id``,
        ``updated``, ``name``, ``title``, ``email``, ``content`` and
        ``published`` were found for that entry.
    """
    # Read bytes so the XML parser honours the encoding declared in the
    # file instead of whatever the locale's default text encoding is.
    with open(path, 'rb') as ifile:
        dom = xml.dom.minidom.parseString(ifile.read())
    handle_node(dom.childNodes)

    wanted = ('id', 'updated', 'name', 'title', 'email', 'content', 'published')
    list_post = []
    last_id = None
    tmp = {}
    for k, v in list_text:
        if k == 'id':
            if last_id and last_id != v:
                list_post.append(tmp)
                tmp = {}
            last_id = v
        if k in wanted:
            tmp[k] = v
    # The loop only flushes an entry when the *next* id shows up, so the
    # final entry has to be appended here or it would be lost.
    if 'id' in tmp:
        list_post.append(tmp)
    return list_post
def main():
    """Convert the exported blog: parse the XML, then write the post files."""
    gen_post(parse_xml())


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
It is not very polished. For example, it will not capture the tags of a blog post. But it is good enough for my purpose.