projectgus · cdruee · Jan 2, 2015 · Jan 2, 2015 · Jan 8, 2015 · Jan 8, 2015
diff --git a/dokuwiki.py b/dokuwiki.py
@@ -33,12 +33,28 @@ def __init__(self, rootpath):
         for subdir in [ self.meta, self.attic, self.pages]:
             ensure_directory_exists(subdir)
 
-    def write_pages(self, pages):
+    def write_pages(self, pages, namespaces):
         """
         Given 'pages' as a list of mediawiki pages with revisions attached, export them to dokuwiki pages
         """
         for page in pages:
-            self._convert_page(page)
+            # Look up the page's numeric namespace ID ('ns') in 'namespaces', a list of
+            # dicts holding the ID under 'id' and the namespace name under '*'
+            ns = page['ns']
+            if ns == 0:
+                # ID 0 maps to the empty namespace name
+                namespace = ''
+            else:
+                namespace = None
+                for n in namespaces:
+                    if ns == n['id']:
+                        namespace = n['*']
+            if namespace is None:
+                raise RuntimeError("Page '%s' has unknown namespace ID: %i" %
+                                   (page['title'], ns))
+
+            self._convert_page(page, namespace)
+
         self._aggregate_changes(self.meta, "_dokuwiki.changes")
 
     def write_images(self, images, file_namespace, http_user=None, http_pass=None):
@@ -74,21 +90,33 @@ def write_images(self, images, file_namespace, http_user=None, http_pass=None):
         # aggregate all the new changes to the media_meta/_media.changes file
         self._aggregate_changes(os.path.join(self.data, "media_meta"), "_media.changes")
 
-    def _convert_page(self, page):
+    def _convert_page(self, page, namespace):
         """ Convert the supplied mediawiki page to a Dokuwiki page """
-        print("Converting %d revisions of page '%s'..." %
-              (len(page["revisions"]), page['title']))
+        # 'namespace' is the mediawiki namespace name of the page ('' for none).
+        # Remove the leading namespace specifier from the title, if any; the name
+        # is escaped because it is literal text, not a regular expression
+        page_title = re.sub('^' + re.escape(namespace), '', page['title'])
+
         # Sanitise the mediawiki pagename to something matching the dokuwiki pagename convention
-        full_title = make_dokuwiki_pagename(page['title'])
+        pagename = re.sub(r'_*[/:]+_*', '_', make_dokuwiki_pagename(page_title))
+        if namespace != '':
+            # sanitise each namespace level on its own; '/' separates levels just like ':'
+            namesparr = namespace.replace("/", ":").split(':')
+            page_ns = ":".join(make_dokuwiki_pagename(n) for n in namesparr)
+        else:
+            page_ns = ''
+        full_title = ":".join((page_ns, pagename))
 
         # Mediawiki pagenames can contain namespace :s, convert these to dokuwiki / paths on the filesystem (becoming : namespaces in dokuwiki)
-        subdir, pagename = os.path.split(full_title.replace(':','/'))
+        subdir = page_ns.replace(":", "/")
         pagedir = os.path.join(self.pages, subdir)
         metadir = os.path.join(self.meta, subdir)
         atticdir = os.path.join(self.attic, subdir)
         for d in pagedir, metadir, atticdir:
-            ensure_directory_exists(d)
-
+            ensure_directory_exists(d)
+        print("Converting page '%s' to '%s' (%d revisions) ..." %
+              (page['title'], pagename, len(page["revisions"])))
+
         # Walk through the list of revisions
         revisions = list(reversed(page["revisions"])) # order as oldest first
         for revision in revisions:
@@ -108,7 +136,7 @@ def _convert_page(self, page):
             # create gzipped attic revision
             atticname = "%s.%s.txt.gz" % (pagename, timestamp)
             atticpath = os.path.join(atticdir, atticname)
-            with gzip.open(atticpath, "wb") as f:
+            with gzip.open(atticpath.encode("utf-8"), "wb") as f:
                 f.write(content.encode("utf-8"))
             os.utime(atticpath, (timestamp,timestamp))
             # append entry to page's 'changes' metadata index
@@ -192,7 +220,7 @@ def make_dokuwiki_pagename(mediawiki_name):
     Any namespacing that is in the form of a / is replaced with a :
     """
     result = mediawiki_name.replace(" ","_")
-    return names.clean_id(camel_to_underscore(result)).replace("/",":")
+    return names.clean_id(camel_to_underscore(result))
 
 def camel_to_underscore(camelcase):
     """

diff --git a/mediawiki.py b/mediawiki.py
@@ -37,6 +37,7 @@ def get_all_pages(self):
         query = {'list' : 'allpages'}
         print("Getting list of pages...")
         pages = self._query(query, [ 'allpages' ])
+
         print("Query page revisions...")
         for page in pages:
             page["revisions"] = self._get_revisions(page)
@@ -96,6 +97,17 @@ def _query(self, args, path_to_result):
             except KeyError:
                 return result
 
+    def get_all_namespaces(self):
+        """
+        Return the wiki's namespaces from a siteinfo query as a list of dictionaries,
+        each containing the keys 'id' (numeric), '*' (name), 'canonical' (canonical
+        name) and 'subpages' (non-empty = may contain subpages)
+        """
+        siteinfo_query = { 'action' : 'query', 'meta' : 'siteinfo', 'siprop' : 'namespaces|namespacealiases' }
+        response = self.mw.call(siteinfo_query)
+        namespace_table = response['query']['namespaces']
+        return namespace_table.values()
+
     def get_file_namespaces(self):
         """
         Return a tuple. First entry is the name used by default for the file namespace (which dokuwiki will also use.)

diff --git a/names.py b/names.py
@@ -4,7 +4,7 @@
 Copyright (C) 2014 Angus Gratton
 Licensed under New BSD License as described in the file LICENSE.
 """
-import re, os.path, unicodedata
+import re, os.path, unicodedata, urllib
 
 def clean_id(name):
     """
@@ -13,6 +13,9 @@ def clean_id(name):
     Ignores both slashes and colons as valid namespace choices (to convert slashes to colons,
     call make_dokuwiki_pagename)
     """
+    # decode URL-encoded characters
+    name=urllib.unquote(name)
+
     main,ext = os.path.splitext(name)
 
     # remove accents
@@ -23,9 +26,12 @@ def clean_id(name):
         no_accent = main # name was plaintext to begin with
 
     # recombine without any other characters
-    result = (re.sub(r'[^\w/:]+', '_', no_accent) + ext).lower()
+#    result = (re.sub(r'[^\w/:]+', '_', no_accent) + ext).lower()
+    result = (re.sub(r'[^\w]+', '_', no_accent) + ext).lower()
     while "__" in result:
         result = result.replace("__", "_") # this is a hack, unsure why regex doesn't catch it
+    # remove leading and trailing underscores
+    result=re.sub('^_|_$', '', result)
     return result
 
 def clean_user(name):

diff --git a/wikicontent.py b/wikicontent.py
@@ -23,10 +23,11 @@ def set_file_namespaces(canonical_alias, aliases):
     canonical_alias is the single namespace that dokuwiki will use (default File:)
     aliases is a list of alternative namespace names that will be converted to the canonical alias
     """
+    print("match localised namespaces for files/images %s <- %s"%(canonical_alias,aliases))
     global mw_file_namespace_aliases
     global dw_file_namespace
     dw_file_namespace = canonical_alias + ":"
-    mw_file_namespace_aliases = re.compile("^(%s):" % "|".join(aliases), re.IGNORECASE)
+    mw_file_namespace_aliases = re.compile("^(%s):" % "|".join([canonical_alias]+aliases), re.IGNORECASE)
 
 def is_file_namespace(target):
     """
@@ -97,7 +98,8 @@ def convert(section, trailing_newline):
     elif section.tagname == "@section":
         level = section.level
         heading = convert(section.children.pop(0), trailing_newline)
-        heading_boundary = "="*(6-level)
+        # dokuwiki's highest heading level is six ='s, so level N gets (7-N) ='s
+        heading_boundary = "="*(7-level)
         result = "\n%s %s %s\n" % (heading_boundary, heading, heading_boundary)
     else:
         print("Unknown tagname %s" % section.tagname)
@@ -131,10 +133,12 @@ def convert(url, trailing_newline):
 
 @visitor.when(URL)
 def convert(url, trailing_newline):
+    print(' ... converting URL %s'%url.caption)
     return url.caption
 
 @visitor.when(ImageLink)
 def convert(link, trailing_newline):
+    print(' ... converting %s'%link.target)
     suffix = ""
     if link.width is not None:
         if link.height is None:
@@ -150,11 +154,12 @@ def convert(link, trailing_newline):
     prealign = " " if link.align in [ "center", "right" ] else ""
     postalign = " " if link.align in [ "center", "left" ] else ""
     target = canonicalise_file_namespace(link.target)
-    target = convert_internal_link(target)
+    target = ":".join(convert_internal_link(tg) for tg in target.split(":"))
     return "{{%s%s%s%s}}" % (prealign, target, suffix, postalign)
 
 @visitor.when(ArticleLink)
 def convert(link, trailing_newline):
+    print(' ... converting %s'%link.target)
     text = convert_children(link).strip(" ")
     pagename = convert_internal_link(link.target)
     if len(text):
@@ -164,21 +169,31 @@ def convert(link, trailing_newline):
 
 @visitor.when(CategoryLink)
 def convert(link, trailing_newline):
+    print(' ... converting %s'%link.target)
     # Category functionality can be implemented with plugin:tag, but not used here
     return ""
 
 @visitor.when(NamespaceLink)
 def convert(link, trailing_newline):
-    if is_file_namespace(link.target): # is a link to a file or image
-        filename = dokuwiki.make_dokuwiki_pagename(canonicalise_file_namespace(link.target))
-        caption = convert_children(link).strip()
+    """ File/image namespace links become dokuwiki {{...}} media links; any other namespace link is reduced to its converted children """
+    print(' ... converting %s'%link.target)
+    target = re.sub(r'^:','',link.target)
+    if is_file_namespace(target): # is a link to a file or image
+        target = canonicalise_file_namespace(target)
+        # an undetected file link still carries its caption after a '|': separate it
+        if '|' in target:
+            target,caption = target.split('|', 1)
+        else:
+            caption = convert_children(link).strip()
+        print('     ... is a file link to %s' % convert_internal_link(re.sub(r'.*[:/]','',target)))
+        target = ":".join(convert_internal_link(tg) for tg in target.split(":"))
         if len(caption) > 0:
-            return "{{%s%s}}" % (filename, caption)
+            return "{{%s|%s}}" % (target, caption)
         else:
-            return "{{%s}}" % filename
-
-    print("WARNING: Ignoring namespace link to " + link.target)
-    return convert_children(link)
+            return "{{%s}}" % (target)
+    else:
+        print("WARNING: Ignoring namespace link to " + link.target)
+        return convert_children(link)
 
 
 @visitor.when(ItemList)

diff --git a/wikicontent_tests.py b/wikicontent_tests.py
@@ -15,11 +15,15 @@
 
 """
 from __future__ import print_function, unicode_literals, absolute_import, division
-import sys, os, codecs, inspect, traceback
+import sys, os, codecs, inspect, traceback, difflib, unicodedata
 from pprint import pprint
 import wikicontent, yamdwe
 
-DELIMITER="*"*40
+DELIMITER="@"*40
+
+def prep_difflines(content):
+    """ Split content into newline-terminated lines, the "readlines" style input difflib takes """
+    return [ line+"\n" for line in content.split("\n") ]
 
 def run_test(testdir):
     """
@@ -59,12 +63,10 @@ def run_test(testdir):
     print("Input Mediawiki:")
     print(mw)
     print(DELIMITER)
-    print("Expected Output:")
-    print(DELIMITER)
-    print(dw)
-    print(DELIMITER)
-    print("Actual Output:")
-    print(converted)
+
+    diff = difflib.unified_diff(prep_difflines(dw), prep_difflines(converted), fromfile='Expected Dokuwiki', tofile='Actual Dokuwiki', lineterm="\n")
+    sys.stdout.writelines(diff)
+    sys.stdout.write("\n")
     print(DELIMITER)
     return False
 

diff --git a/yamdwe.py b/yamdwe.py
@@ -41,7 +41,9 @@ def main():
     canonical_file, aliases = importer.get_file_namespaces()
     wikicontent.set_file_namespaces(canonical_file, aliases)
 
+
     # Read all pages and page revisions
+    namespaces=importer.get_all_namespaces()
     pages = importer.get_all_pages()
     print("Found %d pages to export..." % len(pages))
 
@@ -58,7 +60,7 @@ def main():
             page["revisions"].insert(0, latest)
 
     # Export pages to Dokuwiki format
-    exporter.write_pages(pages)
+    exporter.write_pages(pages,namespaces)
 
     # Bring over images
     images = importer.get_all_images()