From 2cce3383d50bdc966995b4531f03ada7d5551ab5 Mon Sep 17 00:00:00 2001 From: Sebastian Witowski Date: Mon, 21 Nov 2016 17:46:30 +0100 Subject: [PATCH 1/2] Add files needed for Inspire harvester --- modules/bibconvert/etc/Makefile.am | 3 +- modules/bibconvert/etc/oaiinspire2marcxml.xsl | 134 ++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 modules/bibconvert/etc/oaiinspire2marcxml.xsl diff --git a/modules/bibconvert/etc/Makefile.am b/modules/bibconvert/etc/Makefile.am index d72ec96543..1617525915 100644 --- a/modules/bibconvert/etc/Makefile.am +++ b/modules/bibconvert/etc/Makefile.am @@ -22,7 +22,8 @@ kb_DATA = entdec-to-latin1.kb entdec-to-utf8.kb \ xsldir = $(sysconfdir)/bibconvert/config xsl_DATA = oaidc2marcxml.xsl oaimarc2marcxml.xsl oaiarxiv2marcxml.xsl \ - oaidmf2marcxml.xsl authorlist2marcxml.xsl crossref2marcxml.xsl bibtex2marcxml.cfg + oaidmf2marcxml.xsl oaiinspire2marcxml.xsl authorlist2marcxml.xsl \ + crossref2marcxml.xsl bibtex2marcxml.cfg EXTRA_DIST = $(kb_DATA) $(xsl_DATA) diff --git a/modules/bibconvert/etc/oaiinspire2marcxml.xsl b/modules/bibconvert/etc/oaiinspire2marcxml.xsl new file mode 100644 index 0000000000..56306c673a --- /dev/null +++ b/modules/bibconvert/etc/oaiinspire2marcxml.xsl @@ -0,0 +1,134 @@ + + + + + + + + + + + + + + + + + Inspire + + + + + + + + false + + + + DELETED + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Inspire + + + + + + + + false + + + + + + + + + From 08736812f3f472946ae57f5cfce13ba24f0ec7f1 Mon Sep 17 00:00:00 2001 From: Sebastian Witowski Date: Fri, 2 Dec 2016 14:51:26 +0100 Subject: [PATCH 2/2] oaiharvest: fix for checking the resumption token * When the harvested XML is huge (tested with 50MB) and we search for the resumption token, Python will basically freeze (I stopped waiting after 12 hours). However, since the resumption token should be in the headers, we can only search the first 10 000 characters of the XML. 
--- modules/oaiharvest/lib/oai_harvest_getter.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/oaiharvest/lib/oai_harvest_getter.py b/modules/oaiharvest/lib/oai_harvest_getter.py index e7d902bb9e..a24dffbefe 100644 --- a/modules/oaiharvest/lib/oai_harvest_getter.py +++ b/modules/oaiharvest/lib/oai_harvest_getter.py @@ -120,8 +120,14 @@ def OAI_Session(server, script, http_param_dict , method="POST", output="", # FIXME We should NOT use regular expressions to parse XML. This works # for the time being to escape namespaces. + # Regexp for big files (for example from Inspire) will be extremely + # slow, but we know that the resumption token should be located at the + # beginning of the file, so we can search only in the first 10 000 + # characters + # rt_obj = re.search('<.*resumptionToken.*>(.*)', + # harvested_data, re.DOTALL) rt_obj = re.search('<.*resumptionToken.*>(.*)', - harvested_data, re.DOTALL) + harvested_data[0:10000], re.DOTALL) if rt_obj is not None and rt_obj.group(1) != "": http_param_dict = http_param_resume(http_param_dict, rt_obj.group(1)) i = i + 1