diff --git a/dokuwiki.py b/dokuwiki.py
index b253147..80a55b4 100644
--- a/dokuwiki.py
+++ b/dokuwiki.py
@@ -33,12 +33,28 @@ def __init__(self, rootpath):
         for subdir in [ self.meta, self.attic, self.pages]:
             ensure_directory_exists(subdir)
 
-    def write_pages(self, pages):
+    def write_pages(self, pages, namespaces):
         """
         Given 'pages' as a list of mediawiki pages with revisions attached, export them to dokuwiki pages
         """
         for page in pages:
-            self._convert_page(page)
+            # Look up the name of the namespace matching the page's
+            # numeric 'ns' attribute. Namespace 0 is the unnamed main
+            # namespace; any other ID must appear in 'namespaces'
+            # (a list of dicts with at least the keys 'id' and '*').
+            namespace = None
+            ns = page['ns']
+            if ns == 0:
+                namespace = ''
+            else:
+                for n in namespaces:
+                    if ns == n['id']:
+                        namespace = n['*']
+            if namespace is None:
+                raise RuntimeError("Page '%s' has unknown namespace ID: %i" % (page['title'], ns))
+
+            self._convert_page(page, namespace)
+
         self._aggregate_changes(self.meta, "_dokuwiki.changes")
 
     def write_images(self, images, file_namespace, http_user=None, http_pass=None):
@@ -74,21 +90,33 @@ def write_images(self, images, file_namespace, http_user=None, http_pass=None):
         # aggregate all the new changes to the media_meta/_media.changes file
         self._aggregate_changes(os.path.join(self.data, "media_meta"), "_media.changes")
 
-    def _convert_page(self, page):
+    def _convert_page(self, page, namespace):
         """ Convert the supplied mediawiki page to a Dokuwiki page """
-        print("Converting %d revisions of page '%s'..." %
-              (len(page["revisions"]), page['title']))
+        # 'namespace' is the mediawiki namespace name of the page ('' for the
+        # main namespace); it becomes the dokuwiki namespace / subdirectory.
+        # remove leading namespace specifier, if any (escaped: it is a name, not a pattern)
+        page_title = re.sub('^' + re.escape(namespace), '', page['title'])
+
         # Sanitise the mediawiki pagename to something matching the dokuwiki pagename convention
-        full_title = make_dokuwiki_pagename(page['title'])
+        pagename = re.sub(r'_*[/:]+_*', '_', make_dokuwiki_pagename(page_title))
+        if namespace != '':
+            namesparr = namespace.replace("/", ":").split(':')
+            namesparr = [make_dokuwiki_pagename(n) for n in namesparr]
+            page_ns = ":".join(namesparr)
+        else:
+            page_ns = ''
+        full_title = ":".join(part for part in (page_ns, pagename) if part)
 
         # Mediawiki pagenames can contain namespace :s, convert these to dokuwiki / paths on the filesystem (becoming : namespaces in dokuwiki)
-        subdir, pagename = os.path.split(full_title.replace(':','/'))
+        subdir = page_ns.replace(":", "/")
         pagedir = os.path.join(self.pages, subdir)
         metadir = os.path.join(self.meta, subdir)
         atticdir = os.path.join(self.attic, subdir)
         for d in pagedir, metadir, atticdir:
             ensure_directory_exists(d)
-
+        print("Converting page '%s' to '%s' (%d revisions) ..." %
+              (page['title'], pagename, len(page["revisions"])))
+
         # Walk through the list of revisions
         revisions = list(reversed(page["revisions"])) # order as oldest first
         for revision in revisions:
@@ -108,7 +136,7 @@ def _convert_page(self, page):
             # create gzipped attic revision
             atticname = "%s.%s.txt.gz" % (pagename, timestamp)
             atticpath = os.path.join(atticdir, atticname)
-            with gzip.open(atticpath, "wb") as f:
+            with gzip.open(atticpath.encode("utf-8"), "wb") as f:
                 f.write(content.encode("utf-8"))
             os.utime(atticpath, (timestamp,timestamp))
             # append entry to page's 'changes' metadata index
@@ -192,7 +220,7 @@ def make_dokuwiki_pagename(mediawiki_name):
-    Any namespacing that is in the form of a / is replaced with a :
+    Any / or : in the name ends up as _ (see names.clean_id); namespaces are built by the caller
     """
     result = mediawiki_name.replace(" ","_")
-    return names.clean_id(camel_to_underscore(result)).replace("/",":")
+    return names.clean_id(camel_to_underscore(result))
 
 def camel_to_underscore(camelcase):
     """
diff --git a/mediawiki.py b/mediawiki.py
index 9790a43..eaec076 100644
--- a/mediawiki.py
+++ b/mediawiki.py
@@ -37,6 +37,7 @@ def get_all_pages(self):
         query = {'list' : 'allpages'}
         print("Getting list of pages...")
         pages = self._query(query, [ 'allpages' ])
+        print("Query page revisions...")
         for page in pages:
             page["revisions"] = self._get_revisions(page)
 
@@ -96,6 +97,17 @@ def _query(self, args, path_to_result):
         except KeyError:
             return result
 
+    def get_all_namespaces(self):
+        """
+        Return a list of namespace dictionaries as reported by the wiki's
+        siteinfo query. Keys used by the exporter are '*' (name) and 'id'
+        (numeric); 'subpages' and 'canonical' may also be present.
+        """
+        query = { 'action' : 'query', 'meta' : 'siteinfo', 'siprop' : 'namespaces|namespacealiases' }
+        result = self.mw.call(query)['query']
+        # materialise the dict view so callers really get a list
+        return list(result['namespaces'].values())
+
     def get_file_namespaces(self):
         """
         Return a tuple. First entry is the name used by default for the file namespace (which dokuwiki will also use.)
diff --git a/names.py b/names.py
index 038012a..26efee6 100644
--- a/names.py
+++ b/names.py
@@ -4,7 +4,7 @@
 Copyright (C) 2014 Angus Gratton
 Licensed under New BSD License as described in the file LICENSE.
 """
-import re, os.path, unicodedata
+import re, os.path, unicodedata, urllib
 
 def clean_id(name):
     """
@@ -13,6 +13,9 @@ def clean_id(name):
-    Ignores both slashes and colons as valid namespace choices (to convert slashes to colons,
-    call make_dokuwiki_pagename)
+    Slashes and colons are not kept as namespace separators: like any other
+    non-word character they are replaced with underscores.
     """
+    # decode URL-encoded characters (e.g. %20) before cleaning
+    name = urllib.unquote(name)
+
     main,ext = os.path.splitext(name)
 
     # remove accents
@@ -23,9 +26,12 @@ def clean_id(name):
         no_accent = main # name was plaintext to begin with
 
     # recombine without any other characters
-    result = (re.sub(r'[^\w/:]+', '_', no_accent) + ext).lower()
+    # ('/' and ':' are replaced as well; callers assemble namespaces themselves)
+    result = (re.sub(r'\W+', '_', no_accent) + ext).lower()
     while "__" in result:
         result = result.replace("__", "_") # this is a hack, unsure why regex doesn't catch it
+    # remove leading and trailing underscores
+    result = result.strip('_')
     return result
 
 def clean_user(name):
diff --git a/wikicontent.py b/wikicontent.py
index 1296824..435d944 100644
--- a/wikicontent.py
+++ b/wikicontent.py
@@ -23,10 +23,11 @@ def set_file_namespaces(canonical_alias, aliases):
     canonical_alias is the single namespace that dokuwiki will use (default File:)
     aliases is a list of alternative namespace names that will be converted to the canonical alias
     """
+    print("match localised namespaces for files/images %s <- %s"%(canonical_alias,aliases))
     global mw_file_namespace_aliases
     global dw_file_namespace
     dw_file_namespace = canonical_alias + ":"
-    mw_file_namespace_aliases = re.compile("^(%s):" % "|".join(aliases), re.IGNORECASE)
+    mw_file_namespace_aliases = re.compile("^(%s):" % "|".join(re.escape(a) for a in [canonical_alias]+list(aliases)), re.IGNORECASE)
 
 def is_file_namespace(target):
     """
@@ -97,7 +98,8 @@ def convert(section, trailing_newline):
     elif section.tagname == "@section":
         level = section.level
         heading = convert(section.children.pop(0), trailing_newline)
-        heading_boundary = "="*(6-level)
+        # dokuwiki's top-level heading uses six ='s, so level 1 maps to 7-1 = 6
+        heading_boundary = "="*(7-level)
         result = "\n%s %s %s\n" % (heading_boundary, heading, heading_boundary)
     else:
         print("Unknown tagname %s" % section.tagname)
@@ -131,10 +133,12 @@ def convert(url, trailing_newline):
 
 @visitor.when(URL)
 def convert(url, trailing_newline):
+    print(' ... converting URL %s'%url.caption)
     return url.caption
 
 @visitor.when(ImageLink)
 def convert(link, trailing_newline):
+    print(' ... converting %s'%link.target)
     suffix = ""
     if link.width is not None:
         if link.height is None:
@@ -150,11 +154,12 @@ def convert(link, trailing_newline):
     prealign = " " if link.align in [ "center", "right" ] else ""
     postalign = " " if link.align in [ "center", "left" ] else ""
     target = canonicalise_file_namespace(link.target)
-    target = convert_internal_link(target)
+    target = ":".join(convert_internal_link(tg) for tg in target.split(":"))
     return "{{%s%s%s%s}}" % (prealign, target, suffix, postalign)
 
 @visitor.when(ArticleLink)
 def convert(link, trailing_newline):
+    print(' ... converting %s'%link.target)
     text = convert_children(link).strip(" ")
     pagename = convert_internal_link(link.target)
     if len(text):
@@ -164,21 +169,31 @@ def convert(link, trailing_newline):
 
 @visitor.when(CategoryLink)
 def convert(link, trailing_newline):
+    print(' ... converting %s'%link.target)
     # Category functionality can be implemented with plugin:tag, but not used here
     return ""
 
 @visitor.when(NamespaceLink)
 def convert(link, trailing_newline):
-    if is_file_namespace(link.target): # is a link to a file or image
-        filename = dokuwiki.make_dokuwiki_pagename(canonicalise_file_namespace(link.target))
-        caption = convert_children(link).strip()
+    print(' ... converting %s'%link.target)
+    target = re.sub(r'^:', '', link.target)
+    if is_file_namespace(target): # is a link to a file or image
+        target = canonicalise_file_namespace(target)
+        # a file link the parser did not detect still carries its caption after a '|': separate it
+        if '|' in target:
+            target, caption = target.split('|', 1)
+        else:
+            caption = convert_children(link).strip()
+        filename = convert_internal_link(re.sub(r'.*[:/]', '', target))
+        target = ":".join(convert_internal_link(tg) for tg in target.split(":"))
+        print(' ... is a file link to %s'%filename)
         if len(caption) > 0:
-            return "{{%s%s}}" % (filename, caption)
+            return "{{%s|%s}}" % (target, caption)
         else:
-            return "{{%s}}" % filename
-
-    print("WARNING: Ignoring namespace link to " + link.target)
-    return convert_children(link)
+            return "{{%s}}" % (target)
+    else:
+        print("WARNING: Ignoring namespace link to " + link.target)
+        return convert_children(link)
 
 
 @visitor.when(ItemList)
diff --git a/wikicontent_tests.py b/wikicontent_tests.py
index c8eb1eb..40b6dcf 100755
--- a/wikicontent_tests.py
+++ b/wikicontent_tests.py
@@ -15,11 +15,15 @@
 """
 
 from __future__ import print_function, unicode_literals, absolute_import, division
-import sys, os, codecs, inspect, traceback
+import sys, os, codecs, inspect, traceback, difflib, unicodedata
 from pprint import pprint
 import wikicontent, yamdwe
 
-DELIMITER="*"*40
+DELIMITER="@"*40
+
+def prep_difflines(content):
+    """ difflib takes input in this "readlines" compatible format """
+    return [ x+"\n" for x in content.split("\n") ]
 
 def run_test(testdir):
     """
@@ -59,12 +63,10 @@ def run_test(testdir):
         print("Input Mediawiki:")
         print(mw)
         print(DELIMITER)
-        print("Expected Output:")
-        print(DELIMITER)
-        print(dw)
-        print(DELIMITER)
-        print("Actual Output:")
-        print(converted)
+
+        diff = difflib.unified_diff(prep_difflines(dw), prep_difflines(converted), fromfile='Expected Dokuwiki', tofile='Actual Dokuwiki', lineterm="\n")
+        sys.stdout.writelines(diff)
+        sys.stdout.write("\n")
         print(DELIMITER)
         return False
 
diff --git a/yamdwe.py b/yamdwe.py
index 1f2502f..87c9dc4 100755
--- a/yamdwe.py
+++ b/yamdwe.py
@@ -41,7 +41,8 @@ def main():
     canonical_file, aliases = importer.get_file_namespaces()
     wikicontent.set_file_namespaces(canonical_file, aliases)
 
     # Read all pages and page revisions
+    namespaces = importer.get_all_namespaces()
     pages = importer.get_all_pages()
     print("Found %d pages to export..." % len(pages))
 
@@ -58,7 +59,7 @@ def main():
         page["revisions"].insert(0, latest)
 
     # Export pages to Dokuwiki format
-    exporter.write_pages(pages)
+    exporter.write_pages(pages, namespaces)
 
     # Bring over images
     images = importer.get_all_images()