14 changes: 7 additions & 7 deletions wikiconvert.py
@@ -38,11 +38,8 @@ def convert_file(proj_id, src_path, dst_dir):
     for i, line in enumerate(lines):
         if line.startswith("#"):
             meta_lines.append(line)
-        else:
-            if not line.strip():
-                body_lines = lines[i+1:]
-            else:
-                body_lines = lines[i:]
+        elif line.strip():
+            body_lines = lines[i:]
             break
     meta = {}
     for line in meta_lines:
@@ -58,6 +55,9 @@ def convert_file(proj_id, src_path, dst_dir):
     text = re.compile(r'^{{{+ *\n', re.M).sub(r"```\n", text)
     text = re.compile(r'^}}}+ *(\n|$)', re.M).sub(r"```\n", text)

+    # TODO: Add support for `backtick` code quotes
+    text = re.sub(r'{{{(.*?)}}}', r'`\1`', text)
+
     # Headings.
     text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
     text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
@@ -130,11 +130,11 @@ def sub_link(m):
 #---- internal support stuff

 def _indent(text):
-    return '    ' + '\n    '.join(text.splitlines(False))
+    return '\n    ' + '\n    '.join(text.splitlines(False))

 def _gh_page_name_from_gc_page_name(gc):
     """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
-    gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    gh = re.sub(r'([A-Za-z]+)_?', r'-\1', gc)[1:]
     return gh


185 changes: 185 additions & 0 deletions wikiconvert_creole.py
@@ -0,0 +1,185 @@
#!/usr/bin/env python

"""
Usage:
    python googlecode2github/wikiconvert_creole.py PROJID SRCDIR DSTDIR

where "PROJID" is the github project id, e.g. "trentm/python-markdown2",
"SRCDIR" is a Google Code project wiki Subversion working copy dir and
"DSTDIR" is the git clone dir of the git project's wiki.
"""

__version__ = "1.0.0"

import re
import sys
from os.path import *
from glob import glob
from pprint import pprint
import codecs
from hashlib import md5


def log(s):
    sys.stderr.write(s+"\n")

def convert_dir(proj_id, src_dir, dst_dir):
    if isfile(src_dir):
        convert_file(proj_id, src_dir, dst_dir)
    else:
        for f in glob(join(src_dir, "*.wiki")):
            convert_file(proj_id, f, dst_dir)

def convert_file(proj_id, src_path, dst_dir):
    src = codecs.open(src_path, 'r', 'utf-8').read()
    meta_lines = []
    body_lines = []
    lines = src.splitlines(False)
    for i, line in enumerate(lines):
        if line.startswith("#"):
            meta_lines.append(line)
        else:
            assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line)
            # TODO: is it actually mandatory that a blank line separate meta text from body text?
            body_lines = lines[i+1:]
            break
    meta = {}
    for line in meta_lines:
        k, v = line[1:].split(None, 1)
        meta[k] = v
    text = '\n'.join(body_lines)
    s_from_hash = {}

    # Pull out pre-blocks so we can restore them unmunged
    def sub_block(match, indent=True):
        pre = match.group(1)
        hash = md5(pre.encode('utf8')).hexdigest()
        # Creole uses braces, not indentation for code blocks
        s_from_hash[hash] = "{{{"+pre+"}}}" if indent else pre
        return hash

    def sub_pre_block(match):
        return sub_block(match, indent=True)

    text = re.compile(r'^{{{(.*?)}}}', re.M|re.S).sub(sub_pre_block, text)
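    # Illustration (hypothetical input): a block such as "{{{\nprint 'hi'\n}}}" is
    # stashed here under its md5 digest so the regexes below leave it alone, then
    # swapped back in verbatim by the "Restore hashed-out blocks." step.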

    # Pull out `backtick` code quotes
    #def sub_code(match)
    #    return sub_block(match,indent=False)
    text = re.compile(r'`(.*?)`', re.M|re.S).sub(r'{{{\1}}}', text)  # monospace literal for Creole

    # Headings - No conversion needed for Creole.

    # Tables
    def sub_table_creole(m):
        rows = []
        for line in m.group(0).splitlines(False):
            if not line.strip():
                continue
            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
        lines = []
        # Assume first row is a header (or should we assume the reverse?)
        if rows:
            lines.append('|='+'|='.join(rows[0])+'|')
        for row in rows[1:]:
            lines.append('|'+'|'.join(row)+'|')
        return '\n\n' + '\n'.join(lines)
    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text)
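    # For illustration (hypothetical input): a Google Code table like
    #   || Name || Value ||
    #   || foo || 1 ||
    # comes out as "|=Name|=Value|" for the header row and "|foo|1|" below it.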

    # Lists (doesn't handle nested lists - flattens structure).
    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)$', re.M).sub(r'{^} \1', text)  # temp marker to avoid bold processing
    text = re.compile(r'^[ \t]+#[ \t]+(.*?)$', re.M).sub(r'# \1', text)

    # Italics, bold - same for both Markdown & Creole
    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)

    # Swap our temporary bullet marker back out
    text = text.replace('{^}', '*')
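    # For example (hypothetical input): "*important*" becomes "**important**" and
    # "_note_" becomes "*note*".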

    # wiki links.
    def sub_wikilink(m):
        gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ')
        if m.group(2):
            s = "[[%s|%s]]" % (m.group(2), gh_page_name)
        else:
            s = "[[%s]]" % gh_page_name
        hash = md5(s.encode('utf8')).hexdigest()
        s_from_hash[hash] = s
        return hash
    text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text)

    # Links
    def sub_link(m):
        # s = "[%s](%s)" % (m.group(2), m.group(1))  # Markdown
        s = "[[%s|%s]]" % (m.group(2), m.group(1))  # Creole
        hash = md5(s.encode('utf8')).hexdigest()
        s_from_hash[hash] = s
        return hash
    # NOTE: this only matches http & ftp links currently
    text = re.compile(r'(?<!\[)\[((?:http|ftp):[^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
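    # For example (hypothetical input): "[http://example.com Example site]"
    # becomes the Creole link "[[Example site|http://example.com]]".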

    # Auto-linking "issue \d+"
    # TODO: Construct Google Code -> Github issue lookup map
    text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
        r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
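    # For example (hypothetical input), with proj_id "OpenRefine/OpenRefine":
    # "issue 42" becomes "[issue 42](https://github.com/OpenRefine/OpenRefine/issues#issue/42)".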

    # Restore hashed-out blocks.
    for hash, s in s_from_hash.items():
        if text == text.replace(hash, s):
            print 'Failed to replace %s with %s' % (hash, s)
        text = text.replace(hash, s)

    base = splitext(basename(src_path))[0]

    # Prepend warning block for manual review
    text = '----\n**NOTE**: This page was automatically converted from the ' \
        + ('[[old page|http://code.google.com/p/google-refine/wiki/%s]] ' % base) \
        + ' at Google Code and has not been manually reviewed. Please compare it to ' \
        + ('[[the original|http://code.google.com/p/google-refine/wiki/%s]] ' % base) \
        + ', correct any errors or omissions, then remove this warning block.\n----\n\n\n' \
        + text

    # Prepend summary
    if "summary" in meta:
        text = ("//%s//\n\n" % meta["summary"]) + text

    # Project specific replacements for naming, mailing lists, & code
    text = text.replace('Google Refine', 'OpenRefine')
    text = text.replace(
        '://groups.google.com/group/google-refine',
        '://groups.google.com/group/openrefine')
    text = text.replace(
        'code.google.com/p/google-refine/source/browse/trunk/',
        'github.com/OpenRefine/OpenRefine/blob/master/')

    gh_page_name = _gh_page_name_from_gc_page_name(base)
    dst_path = join(dst_dir, gh_page_name+".creole")
    if not exists(dst_path) or codecs.open(dst_path, 'r', 'utf-8').read() != text:
        codecs.open(dst_path, 'w', 'utf-8').write(text)
        log("wrote '%s'" % dst_path)


#---- internal support stuff

def _indent(text):
    return '    ' + '\n    '.join(text.splitlines(False))

def _gh_page_name_from_gc_page_name(gc):
    """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
    # Note: We can't handle both FetchingURLsFromWebServices & GRELFunctions
    if re.match(r'[A-Z]+[a-z]{2,}', gc):
        gh = re.sub(r'([A-Z]+[a-z]+)', r'-\1', gc)[1:]
    else:
        gh = gc
    return gh
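
# For illustration: "FetchingURLsFromWebServices" comes out as
# "Fetching-URLs-From-Web-Services", while a name with a long all-caps prefix
# such as "GRELFunctions" comes out unchanged.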

#---- mainline

if __name__ == '__main__':
    # Hard-coded paths used for local testing; left commented out so the
    # command-line arguments below take effect.
    #convert_dir("OpenRefine/OpenRefine", "c:/users/tfmorris/tmp/grefine-wiki", "c:/users/tfmorris/tmp/orefine-wiki")
    if len(sys.argv) != 4:
        print __doc__
        sys.exit(1)
    convert_dir(sys.argv[1], sys.argv[2], sys.argv[3])