From 1ca09974d2c37e617da3d0cdf222b75af86fc888 Mon Sep 17 00:00:00 2001 From: Chris Hobden Date: Tue, 3 Jul 2018 17:51:21 +0100 Subject: [PATCH 1/6] add encoding parameter to parse --- lib/marcjs.js | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/marcjs.js b/lib/marcjs.js index d4352f3..eb1e2bf 100644 --- a/lib/marcjs.js +++ b/lib/marcjs.js @@ -75,10 +75,10 @@ class MARC { * @param {string} type - The type of format to parse: iso2709, marcxml, mij. * @return a MARC record. */ - static parse(raw, type) { + static parse(raw, type, encoding) { let parse = MARC.parser[type.toLowerCase()]; if (parse) { - return parse(raw); + return parse(raw, encoding); } else { throw new Error('Unknown MARC format: ' + type); } @@ -253,7 +253,7 @@ class Transform extends TransformStream { } - + class Marcxml extends Duplex { constructor(stream) { @@ -293,7 +293,7 @@ class Marcxml extends Duplex { var raw = this.buffer.substr(0, pos+9); this.buffer = this.buffer.substr(pos+10); this.count++; - this.push(Marcxml.parse(raw)); + this.push(Marcxml.parse(raw)); } } @@ -397,7 +397,7 @@ class MiJ extends Duplex { let raw = this.buffer.substr(0, pos+4); this.buffer = this.buffer.substr(pos+5); this.count++; - let record = MiJ.parse(raw); + let record = MiJ.parse(raw); this.push(record); } }); @@ -543,17 +543,18 @@ class Iso2709 extends Duplex { } -Iso2709.parse = function(data) { +Iso2709.parse = function(data, encoding) { + if(!encoding)encoding='utf8'; var record = new MARC(); - record.leader = data.toString('utf8', 0, 24); - var directory_len = parseInt(data.toString('utf8', 12, 17), 10) - 25, + record.leader = data.toString(encoding, 0, 24); + var directory_len = parseInt(data.toString(encoding, 12, 17), 10) - 25, number_of_tag = directory_len / 12; record.fields = []; for (var i = 0; i < number_of_tag; i++) { var off = 24 + i * 12; - var tag = data.toString('utf8', off, off+3); - var len = parseInt(data.toString('utf8', off+3, off+7), 0) - 1; 
- var pos = parseInt(data.toString('utf8', off+7, off+12), 0) + 25 + directory_len; + var tag = data.toString(encoding, off, off+3); + var len = parseInt(data.toString(encoding, off+3, off+7), 0) - 1; + var pos = parseInt(data.toString(encoding, off+7, off+12), 0) + 25 + directory_len; var value = data.toString('utf-8', pos, pos+len); var parts = [ tag ]; if ( parseInt(tag, 10) < 10 ) { From 667204ac4c13a678c6a425b84554ec3bbcebf002 Mon Sep 17 00:00:00 2001 From: Chris Hobden Date: Tue, 3 Jul 2018 18:15:31 +0100 Subject: [PATCH 2/6] ok, really add an encoding option this time --- lib/marcjs.js | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/marcjs.js b/lib/marcjs.js index eb1e2bf..32ae363 100644 --- a/lib/marcjs.js +++ b/lib/marcjs.js @@ -46,9 +46,10 @@ class MARC { * Get a Writable/Readable Stream based on a Node.js stream * @param {Stream} stream - The stream on which read/write * @param {string} type - The type of stream: iso2709, marcxml, text, json, mij + * @param {string} encoding - The text encoding of stream: utf8, latin1 etc. utf8 is the default * @return {Stream} */ - static stream(stream, type) { + static stream(stream, type, encoding) { type = type.toLowerCase(); const recordStream = type === 'iso2709' ? Iso2709 : @@ -57,7 +58,7 @@ class MARC { type === 'mij' ? MiJ : type === 'text' ? Text : null; if (recordStream) { - return new recordStream(stream); + return new recordStream(stream, encoding); } else { throw new Error('Unknown MARC Stream: ' + type); } @@ -73,6 +74,7 @@ class MARC { * Parse and returns a MARC record. * @param {string} raw - The raw MARC record. * @param {string} type - The type of format to parse: iso2709, marcxml, mij. + * @param {string} encoding - The text encoding to parse: utf8, latin1 etc. utf8 is the default * @return a MARC record. 
*/ static parse(raw, type, encoding) { @@ -464,9 +466,9 @@ MiJ.parse = function(data) { class Iso2709 extends Duplex { - constructor(stream) { + constructor(stream, encoding) { super({ objectMode: true }); - + this.encoding = encoding || 'utf8'; this.prevData, // Le buffer précédent du stream en cours de lecture this.prevStart = -1; // La position de ce qu'il reste à lire this.stream = stream; @@ -517,7 +519,7 @@ class Iso2709 extends Duplex { raw = new Buffer(pos-start +1); data.copy(raw, 0, start, pos); } - records.push(Iso2709.parse(raw)); + records.push(Iso2709.parse(raw, this.encoding)); pos++; start = pos; } @@ -544,7 +546,6 @@ class Iso2709 extends Duplex { Iso2709.parse = function(data, encoding) { - if(!encoding)encoding='utf8'; var record = new MARC(); record.leader = data.toString(encoding, 0, 24); var directory_len = parseInt(data.toString(encoding, 12, 17), 10) - 25, From 9b082db00b2f0c8aee3ec8195b7a3032d551de4b Mon Sep 17 00:00:00 2001 From: Chris Hobden Date: Tue, 3 Jul 2018 18:20:14 +0100 Subject: [PATCH 3/6] missed a spot --- lib/marcjs.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/marcjs.js b/lib/marcjs.js index 32ae363..4ae576c 100644 --- a/lib/marcjs.js +++ b/lib/marcjs.js @@ -556,7 +556,7 @@ Iso2709.parse = function(data, encoding) { var tag = data.toString(encoding, off, off+3); var len = parseInt(data.toString(encoding, off+3, off+7), 0) - 1; var pos = parseInt(data.toString(encoding, off+7, off+12), 0) + 25 + directory_len; - var value = data.toString('utf-8', pos, pos+len); + var value = data.toString(encoding, pos, pos+len); var parts = [ tag ]; if ( parseInt(tag, 10) < 10 ) { parts.push(value); From c84ba83d83fffd5c73f30ec39b27a75bc1d6748a Mon Sep 17 00:00:00 2001 From: Chris Hobden Date: Thu, 5 Aug 2021 12:10:40 +0100 Subject: [PATCH 4/6] add support for changing string encoding format --- lib/Iso2709.js | 20 ++++++++++---------- lib/Marc.js | 10 ++++++---- 2 files changed, 16 insertions(+), 14 deletions(-) diff 
--git a/lib/Iso2709.js b/lib/Iso2709.js index 567a0dd..6f0f5a1 100644 --- a/lib/Iso2709.js +++ b/lib/Iso2709.js @@ -4,9 +4,9 @@ const Record = require('./Record'); // eslint-disable-next-line no-unused-vars class Iso2709 extends Duplex { - constructor(stream) { + constructor(stream, encoding) { super({ objectMode: true }); - + this.encoding = encoding || 'utf8'; // this.prevData; Le buffer précédent du stream en cours de lecture this.prevStart = -1; // La position de ce qu'il reste à lire this.stream = stream; @@ -54,7 +54,7 @@ class Iso2709 extends Duplex { raw = Buffer.alloc(pos - start + 1); data.copy(raw, 0, start, pos); } - records.push(Iso2709.parse(raw)); + records.push(Iso2709.parse(raw, this.encoding)); pos += 1; start = pos; } @@ -77,18 +77,18 @@ class Iso2709 extends Duplex { callback(null); } - static parse(data) { + static parse(data, encoding) { const record = new Record(); - record.leader = data.toString('utf8', 0, 24); - const directoryLen = parseInt(data.toString('utf8', 12, 17), 10) - 25; + record.leader = data.toString(encoding, 0, 24); + const directoryLen = parseInt(data.toString(encoding, 12, 17), 10) - 25; const numberOfTag = directoryLen / 12; record.fields = []; for (let i = 0; i < numberOfTag; i += 1) { const off = 24 + i * 12; - const tag = data.toString('utf8', off, off + 3); - const len = parseInt(data.toString('utf8', off + 3, off + 7), 10) - 1; - const pos = parseInt(data.toString('utf8', off + 7, off + 12), 10) + 25 + directoryLen; - let value = data.toString('utf-8', pos, pos + len); + const tag = data.toString(encoding, off, off + 3); + const len = parseInt(data.toString(encoding, off + 3, off + 7), 10) - 1; + const pos = parseInt(data.toString(encoding, off + 7, off + 12), 10) + 25 + directoryLen; + let value = data.toString(encoding, pos, pos + len); const parts = [tag]; if (parseInt(tag, 10) < '010') { parts.push(value); diff --git a/lib/Marc.js b/lib/Marc.js index c12c40a..244ad35 100644 --- a/lib/Marc.js +++ b/lib/Marc.js @@ 
-28,12 +28,13 @@ const Marc = { * Parse and returns a MARC record. * @param {string} raw - The raw MARC record. * @param {string} type - The type of format to parse: iso2709, marcxml, mij. + * @param {string} encoding - The text encoding of stream: utf8, latin1 etc. utf8 is the default * @return a MARC record. */ - parse: (raw, type) => { + parse: (raw, type, encoding) => { const parse = Marc.parser[type.toLowerCase()]; if (parse) { - return parse(raw); + return parse(raw, encoding); } throw new Error(`Unknown MARC format: ${type}`); }, @@ -50,12 +51,13 @@ const Marc = { * Get a Writable/Readable Stream based on a Node.js stream * @param {Stream} stream - The stream on which read/write * @param {string} type - The type of stream: iso2709, marcxml, text, json, mij + * @param {string} encoding - The text encoding of stream: utf8, latin1 etc. utf8 is the default * @return {Stream} */ - stream: (stream, type) => { + stream: (stream, type, encoding) => { switch (type.toLocaleLowerCase()) { case 'iso2709': - return new Iso2709(stream); + return new Iso2709(stream, encoding); case 'marcxml': return new Marcxml(stream); case 'mij': From 3fd3a80d2897018b6ebbc954804acf74bc831d70 Mon Sep 17 00:00:00 2001 From: Joshua Eke Date: Wed, 17 Jul 2024 16:55:15 +0100 Subject: [PATCH 5/6] Update marc parser library to ignore self closing XML tags --- lib/Marcxml.js | 12 ++ package-lock.json | 2 +- package.json | 2 +- test/test-xml.js | 277 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 291 insertions(+), 2 deletions(-) create mode 100644 test/test-xml.js diff --git a/lib/Marcxml.js b/lib/Marcxml.js index f178b8a..f62adfd 100644 --- a/lib/Marcxml.js +++ b/lib/Marcxml.js @@ -96,6 +96,8 @@ class Marcxml extends Duplex { let values; let ind1; let ind2; + let tagEnd; + let isSelfClosingTag; while (true) { end += 1; start = xml.indexOf('<', end); @@ -110,6 +112,16 @@ class Marcxml extends Duplex { value = he.decode(value); values = [tag, value]; } else { + if(begin === '', 
start); + isSelfClosingTag = ("/" === xml[tagEnd -1]); + if(isSelfClosingTag) { + //if self closing tag then skip over it and continue + end = tagEnd; + continue; + } + } end = xml.indexOf(' + + + 00000nam a2200000 a 4500 + 012301230123 + 0123012301230123012.0 + 0123012s0123 0123 0123 0123 1 0123 + + 012301230123 + + + 0123012301230123 (0123.) + + + 012301230123 (0123.) + + + 0123012301230123 (0123.) + + + 0123012301230123 + (01230123) + + + (0123)012301230123 + + + (0123)012301230123 + + + ABC + def + ghi + JKL + MNO + PQR + STU + VWX + YZA + + + AB12.CD45 + Xy 1234 + + + [Sci] + 12 + + + SF SERIES L + + + Smith, John, + author. + + + The Galactic Adventures of Space Cat / + John Smith. + + + Mars : + Space Press, + 2023. + + + 200 p. : + col. ill. ; + 24 cm. + datafield tag="490" ind1="0" ind2=" "> + Random Title; + [1] + + + RANDOM DESCRIPTION TEXT HERE. + + + Ages 8-12. + + + Harry Potter + (Fictitious character) + Juvenile fiction. + + + Wizards + Juvenile fiction. + + + Magic + Juvenile fiction. + + + Friendship + Juvenile fiction. + + + Schools + Juvenile fiction. + + + Fantasy + Juvenile fiction. + + + +`; + +let res = Marcxml.parse(xml_selfclosing); +let data = JSON.parse(JSON.stringify(res)); +console.log(data); + +const xml_notselfclosing = ` + + + + 00001aaa a2200001Ki 4500 + 1000001 + 1231231231230 + 1234567891m223467891nyua j 6 000 1 aaa d + + 1000002 + v.1 : hardcover) + + + 1000003 + (v.1 paperback) + + + 1000004 + v.1 : hardcover) + + + 1000005 + v.2 : hardcover) + + + 1000006 + v.2 : hardcover) + + + 1000007 + v.2 : paperback) + + + 1000008 + v.3 : hardcover) + + + 1000009 + v.3 : hardcover) + + + 1000010 + (v.4 ; hardcover) + + + 1000011 + (v.4 ; hardcover) + + + (OCoLC)1000012 + (OCoLC)1000013 + + + (OCoLC)ocn1000014 + + + AA1 + eng + rda + AA1 + AA2 + AA3 + AA4 + AA5 + AA6 + AA7 + + + 123.4/567 + 12 + + + J GRAPHIC ABC + + + Author, A. A., + author. + + + Random title / + written by A. A. Author ; illustrations by B. B. Illustrator. 
+ + + At head of title: + XYZ PUBLISHERS + + + City : + Publisher Inc., + [1234]- + + + volumes : + color illustrations ; + 22 cm + + + text + txt + rdacontent + + + still image + sti + rdacontent + + + unmediated + n + rdamedia + + + volume + nc + rdacarrier + + + Volume 1. + Title of volume 1 -- + Volume 2. + Title of volume 2 -- + Volume 3. + Title of volume 3 -- + v.4. + Title of volume 4 -- + v.5. + Title of volume 5 -- + + + "Description text here."--Back cover of Volume 1. + + + Topic1 + Comic books, strips, etc. + + + Topic2 + Comic books, strips, etc. + + + Topic3 + Comic books, strips, etc. + + + Genre1. + lcgft + + + Illustrator, B. B., + illustrator. + + + XYZ Publishers, Inc. + + + +`; + +const res2 = Marcxml.parse(xml_notselfclosing); +console.log(res2); \ No newline at end of file From 0c955e3e6e4f9fe341c2b9887ad583ccd6021ee5 Mon Sep 17 00:00:00 2001 From: joshuaeke Date: Thu, 18 Jul 2024 11:25:27 +0100 Subject: [PATCH 6/6] Update test/test-xml.js --- test/test-xml.js | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/test-xml.js b/test/test-xml.js index 4c90459..3d744b6 100644 --- a/test/test-xml.js +++ b/test/test-xml.js @@ -1,7 +1,3 @@ -/* eslint-disable no-undef */ -// eslint-disable-next-line no-unused-vars -//const should = require('should'); -//const fs = require('fs'); const Marcxml = require('../lib/Marcxml'); const xml_selfclosing = `