1 [[!meta title="ikiwiki-wordpress-import"]]
3 I modified the script a bit so categories and tags would actually show up in the output file.
11 Wordpress-to-Ikiwiki import tool
14 Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>
16 This program is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or
19 (at your option) any later version.
21 This program is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
26 You should have received a copy of the GNU General Public License
27 along with this program. If not, see <http://www.gnu.org/licenses/>.
29 Usage: run --help as an argument with this script.
32 I added some extra bits to include the \[[!tag foo]] stuff in the post,
33 as it wasn't before, at all. I'll diff the versions out so you can see
42 from BeautifulSoup import BeautifulSoup
44 import codecs, htmlentitydefs
46 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
47 % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
49 def main(name, email, subdir, branch='master'):
50 soup = BeautifulSoup(sys.stdin.read())
52 # Regular expression to match stub in URL.
53 stub_pattern = re.compile(r'.*\/(.+)\/$')
55 for x in soup.findAll('item'):
57 if x.find('wp:status').string != 'publish': continue
59 match = stub_pattern.match(x.guid.string)
61 stub = match.groups()[0]
63 # Fall back to our own stubs
64 stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
66 commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
67 timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
69 content = '\[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
70 content += x.find('content:encoded').string.replace('\r\n', '\n')
72 # categories = x.findAll('category')
73 # categories = x.findAll({'category':True}, attrs={'domain':re.compile(('category|tag'))})
74 # categories = x.findAll({'category':True}, domain=["category", "tag"])
75 # categories = x.findAll({'category':True}, nicename=True)
77 We do it differently here because we have duplicates otherwise.
79 <category><![CDATA[Health]]></category>
80 <category domain="category" nicename="health"><![CDATA[Health]]></category>
82 If we do what the original did, we end up with all tags and cats doubled.
83 Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
84 I'd much rather have the value of 'nicename', and tried, but my
85 python skillz are extremely limited....
87 categories = x.findAll('category', nicename=True)
90 for cat in categories:
91 # remove 'tags/' because we have a 'tagbase' set.
92 # your choice: 'tag', or 'taglink'
93 # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
94 content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
95 # print >>sys.stderr, cat.string.replace(' ', '-')
97 # moved this thing down
98 data = content.encode('ascii', 'html_replace')
99 print "commit refs/heads/%s" % branch
100 print "committer %s <%s> %d +0000" % (name, email, timestamp)
101 print "data %d" % len(commit_msg)
103 print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
104 print "data %d" % len(data)
107 if __name__ == "__main__":
108 if len(sys.argv) not in (4, 5):
109 print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
116 I have another version of the script, which reuses the `timestamp` it already computes for each post and inserts it as a \[[!meta date="foodate"]] directive. I'm posting it here just in case I happen to be doing something to the httpd.
118 (Hopefully I've escaped everything properly; if I missed something, check the source.)
122 #!/usr/bin/env python
126 Wordpress-to-Ikiwiki import tool
129 Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>
131 This program is free software: you can redistribute it and/or modify
132 it under the terms of the GNU General Public License as published by
133 the Free Software Foundation, either version 3 of the License, or
134 (at your option) any later version.
136 This program is distributed in the hope that it will be useful,
137 but WITHOUT ANY WARRANTY; without even the implied warranty of
138 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
139 GNU General Public License for more details.
141 You should have received a copy of the GNU General Public License
142 along with this program. If not, see <http://www.gnu.org/licenses/>.
144 Usage: run --help as an argument with this script.
147 I added some extra bits to include the \[[!tag foo]] stuff in the post,
148 as it wasn't before, at all. I'll diff the versions out so you can see
157 from datetime import datetime
158 from BeautifulSoup import BeautifulSoup
160 import codecs, htmlentitydefs
162 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
163 % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
165 def main(name, email, subdir, branch='master'):
166 soup = BeautifulSoup(sys.stdin.read())
168 # Regular expression to match stub in URL.
169 stub_pattern = re.compile(r'.*\/(.+)\/$')
171 for x in soup.findAll('item'):
173 if x.find('wp:status').string != 'publish': continue
175 match = stub_pattern.match(x.guid.string)
177 stub = match.groups()[0]
179 # Fall back to our own stubs
180 stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
182 commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
183 timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
184 content = '\[[!meta title="%s"]]\n' % (x.title.string.replace('"', r'\"'))
185 content += "\[[!meta date=\"%s\"]]\n" % datetime.fromtimestamp(timestamp)
186 content += x.find('content:encoded').string.replace('\r\n', '\n')
189 We do it differently here because we have duplicates otherwise.
191 <category><![CDATA[Health]]></category>
192 <category domain="category" nicename="health"><![CDATA[Health]]></category>
194 If we do what the original did, we end up with all tags and cats doubled.
195 Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
196 I'd much rather have the value of 'nicename', and tried, but my
197 python skillz are extremely limited....
199 categories = x.findAll('category', nicename=True)
202 for cat in categories:
203 # remove 'tags/' because we have a 'tagbase' set.
204 # your choice: 'tag', or 'taglink'
205 # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
206 content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
207 # this is just debugging, and for fun
208 # print >>sys.stderr, cat.string.replace(' ', '-')
210 # moved this thing down
211 data = content.encode('ascii', 'html_replace')
212 print "commit refs/heads/%s" % branch
213 print "committer %s <%s> %d +0000" % (name, email, timestamp)
214 print "data %d" % len(commit_msg)
216 print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
217 print "data %d" % len(data)
220 if __name__ == "__main__":
221 if len(sys.argv) not in (4, 5):
222 print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])