Replace the external `wtf-8` dependency with a vendored UTF-8 codec.

  * lib/browser.js, lib/index.js: require './utf8' instead of 'wtf-8' and
    pass { strict: false } to utf8.encode() and utf8.decode().
  * lib/utf8.js (new): utf8.js v2.1.2 (per its banner) whose encode/decode
    take an options object. `strict` defaults to true; in strict mode a lone
    surrogate (U+D800..U+DFFF) throws, with `strict: false` it is replaced
    by U+FFFD, both when encoding and when decoding a 3-byte sequence.
  * package.json: drop the "wtf-8" dependency.
  * test/parser.js: add an encode/decode round-trip test for a string that
    contains lone surrogates.

NOTE(review): leading whitespace in this patch is normalized to two-space
indentation. If a context line fails to match the target tree, apply with
whitespace ignored (e.g. `git apply --ignore-whitespace`) -- confirm against
the tree before committing.

diff --git a/lib/browser.js b/lib/browser.js
index 652dc0c..cd3403c 100644
--- a/lib/browser.js
+++ b/lib/browser.js
@@ -6,7 +6,7 @@ var keys = require('./keys');
 var hasBinary = require('has-binary');
 var sliceBuffer = require('arraybuffer.slice');
 var after = require('after');
-var utf8 = require('wtf-8');
+var utf8 = require('./utf8');
 
 var base64encoder;
 if (global && global.ArrayBuffer) {
@@ -117,7 +117,7 @@ exports.encodePacket = function (packet, supportsBinary, utf8encode, callback) {
 
   // data fragment is optional
   if (undefined !== packet.data) {
-    encoded += utf8encode ? utf8.encode(String(packet.data)) : String(packet.data);
+    encoded += utf8encode ? utf8.encode(String(packet.data), { strict: false }) : String(packet.data);
   }
 
   return callback('' + encoded);
@@ -261,7 +261,7 @@ exports.decodePacket = function (data, binaryType, utf8decode) {
 
 function tryDecode(data) {
   try {
-    data = utf8.decode(data);
+    data = utf8.decode(data, { strict: false });
   } catch (e) {
     return false;
   }
diff --git a/lib/index.js b/lib/index.js
index 716b1d5..06e20e6 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -2,7 +2,7 @@
  * Module dependencies.
  */
 
-var utf8 = require('wtf-8');
+var utf8 = require('./utf8');
 var after = require('after');
 var keys = require('./keys');
 
@@ -72,7 +72,7 @@ exports.encodePacket = function (packet, supportsBinary, utf8encode, callback) {
 
   // data fragment is optional
   if (undefined !== packet.data) {
-    encoded += utf8encode ? utf8.encode(String(packet.data)) : String(packet.data);
+    encoded += utf8encode ? utf8.encode(String(packet.data), { strict: false }) : String(packet.data);
   }
 
   return callback('' + encoded);
@@ -164,7 +164,7 @@ exports.decodePacket = function (data, binaryType, utf8decode) {
 
 function tryDecode(data) {
   try {
-    data = utf8.decode(data);
+    data = utf8.decode(data, { strict: false });
   } catch (e) {
     return false;
   }
diff --git a/lib/utf8.js b/lib/utf8.js
new file mode 100644
index 0000000..83e2dd7
--- /dev/null
+++ b/lib/utf8.js
@@ -0,0 +1,255 @@
+/*! https://mths.be/utf8js v2.1.2 by @mathias */
+;(function(root) {
+
+  // Detect free variables `exports`
+  var freeExports = typeof exports == 'object' && exports;
+
+  // Detect free variable `module`
+  var freeModule = typeof module == 'object' && module &&
+    module.exports == freeExports && module;
+
+  // Detect free variable `global`, from Node.js or Browserified code,
+  // and use it as `root`
+  var freeGlobal = typeof global == 'object' && global;
+  if (freeGlobal.global === freeGlobal || freeGlobal.window === freeGlobal) {
+    root = freeGlobal;
+  }
+
+  /*--------------------------------------------------------------------------*/
+
+  var stringFromCharCode = String.fromCharCode;
+
+  // Taken from https://mths.be/punycode
+  function ucs2decode(string) {
+    var output = [];
+    var counter = 0;
+    var length = string.length;
+    var value;
+    var extra;
+    while (counter < length) {
+      value = string.charCodeAt(counter++);
+      if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
+        // high surrogate, and there is a next character
+        extra = string.charCodeAt(counter++);
+        if ((extra & 0xFC00) == 0xDC00) { // low surrogate
+          output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
+        } else {
+          // unmatched surrogate; only append this code unit, in case the next
+          // code unit is the high surrogate of a surrogate pair
+          output.push(value);
+          counter--;
+        }
+      } else {
+        output.push(value);
+      }
+    }
+    return output;
+  }
+
+  // Taken from https://mths.be/punycode
+  function ucs2encode(array) {
+    var length = array.length;
+    var index = -1;
+    var value;
+    var output = '';
+    while (++index < length) {
+      value = array[index];
+      if (value > 0xFFFF) {
+        value -= 0x10000;
+        output += stringFromCharCode(value >>> 10 & 0x3FF | 0xD800);
+        value = 0xDC00 | value & 0x3FF;
+      }
+      output += stringFromCharCode(value);
+    }
+    return output;
+  }
+
+  function checkScalarValue(codePoint, strict) {
+    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
+      if (strict) {
+        throw Error(
+          'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
+          ' is not a scalar value'
+        );
+      }
+      return false;
+    }
+    return true;
+  }
+  /*--------------------------------------------------------------------------*/
+
+  function createByte(codePoint, shift) {
+    return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80);
+  }
+
+  function encodeCodePoint(codePoint, strict) {
+    if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence
+      return stringFromCharCode(codePoint);
+    }
+    var symbol = '';
+    if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence
+      symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0);
+    }
+    else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence
+      if (!checkScalarValue(codePoint, strict)) {
+        codePoint = 0xFFFD;
+      }
+      symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0);
+      symbol += createByte(codePoint, 6);
+    }
+    else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence
+      symbol = stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0);
+      symbol += createByte(codePoint, 12);
+      symbol += createByte(codePoint, 6);
+    }
+    symbol += stringFromCharCode((codePoint & 0x3F) | 0x80);
+    return symbol;
+  }
+
+  function utf8encode(string, opts) {
+    opts = opts || {};
+    var strict = false !== opts.strict;
+
+    var codePoints = ucs2decode(string);
+    var length = codePoints.length;
+    var index = -1;
+    var codePoint;
+    var byteString = '';
+    while (++index < length) {
+      codePoint = codePoints[index];
+      byteString += encodeCodePoint(codePoint, strict);
+    }
+    return byteString;
+  }
+
+  /*--------------------------------------------------------------------------*/
+
+  function readContinuationByte() {
+    if (byteIndex >= byteCount) {
+      throw Error('Invalid byte index');
+    }
+
+    var continuationByte = byteArray[byteIndex] & 0xFF;
+    byteIndex++;
+
+    if ((continuationByte & 0xC0) == 0x80) {
+      return continuationByte & 0x3F;
+    }
+
+    // If we end up here, it’s not a continuation byte
+    throw Error('Invalid continuation byte');
+  }
+
+  function decodeSymbol(strict) {
+    var byte1;
+    var byte2;
+    var byte3;
+    var byte4;
+    var codePoint;
+
+    if (byteIndex > byteCount) {
+      throw Error('Invalid byte index');
+    }
+
+    if (byteIndex == byteCount) {
+      return false;
+    }
+
+    // Read first byte
+    byte1 = byteArray[byteIndex] & 0xFF;
+    byteIndex++;
+
+    // 1-byte sequence (no continuation bytes)
+    if ((byte1 & 0x80) == 0) {
+      return byte1;
+    }
+
+    // 2-byte sequence
+    if ((byte1 & 0xE0) == 0xC0) {
+      byte2 = readContinuationByte();
+      codePoint = ((byte1 & 0x1F) << 6) | byte2;
+      if (codePoint >= 0x80) {
+        return codePoint;
+      } else {
+        throw Error('Invalid continuation byte');
+      }
+    }
+
+    // 3-byte sequence (may include unpaired surrogates)
+    if ((byte1 & 0xF0) == 0xE0) {
+      byte2 = readContinuationByte();
+      byte3 = readContinuationByte();
+      codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
+      if (codePoint >= 0x0800) {
+        return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD;
+      } else {
+        throw Error('Invalid continuation byte');
+      }
+    }
+
+    // 4-byte sequence
+    if ((byte1 & 0xF8) == 0xF0) {
+      byte2 = readContinuationByte();
+      byte3 = readContinuationByte();
+      byte4 = readContinuationByte();
+      codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) |
+        (byte3 << 0x06) | byte4;
+      if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
+        return codePoint;
+      }
+    }
+
+    throw Error('Invalid UTF-8 detected');
+  }
+
+  var byteArray;
+  var byteCount;
+  var byteIndex;
+  function utf8decode(byteString, opts) {
+    opts = opts || {};
+    var strict = false !== opts.strict;
+
+    byteArray = ucs2decode(byteString);
+    byteCount = byteArray.length;
+    byteIndex = 0;
+    var codePoints = [];
+    var tmp;
+    while ((tmp = decodeSymbol(strict)) !== false) {
+      codePoints.push(tmp);
+    }
+    return ucs2encode(codePoints);
+  }
+
+  /*--------------------------------------------------------------------------*/
+
+  var utf8 = {
+    'version': '2.1.2',
+    'encode': utf8encode,
+    'decode': utf8decode
+  };
+
+  // Some AMD build optimizers, like r.js, check for specific condition patterns
+  // like the following:
+  if (
+    typeof define == 'function' &&
+    typeof define.amd == 'object' &&
+    define.amd
+  ) {
+    define(function() {
+      return utf8;
+    });
+  } else if (freeExports && !freeExports.nodeType) {
+    if (freeModule) { // in Node.js or RingoJS v0.8.0+
+      freeModule.exports = utf8;
+    } else { // in Narwhal or RingoJS v0.7.0-
+      var object = {};
+      var hasOwnProperty = object.hasOwnProperty;
+      for (var key in utf8) {
+        hasOwnProperty.call(utf8, key) && (freeExports[key] = utf8[key]);
+      }
+    }
+  } else { // in Rhino or a web browser
+    root.utf8 = utf8;
+  }
+
+}(this));
diff --git a/package.json b/package.json
index 4a50f99..676afd4 100644
--- a/package.json
+++ b/package.json
@@ -15,8 +15,7 @@
     "arraybuffer.slice": "0.0.6",
     "base64-arraybuffer": "0.1.5",
     "blob": "0.0.4",
-    "has-binary": "0.1.7",
-    "wtf-8": "1.0.0"
+    "has-binary": "0.1.7"
   },
   "scripts": {
     "test": "make test"
diff --git a/test/parser.js b/test/parser.js
index 9a19394..452a242 100644
--- a/test/parser.js
+++ b/test/parser.js
@@ -124,6 +124,14 @@ module.exports = function(parser) {
           expect(data).to.match(/^[0-9]$/);
         });
       });
+
+      it('should encode a string message with lone surrogates replaced by U+FFFD', function(done) {
+        var data = '\uDC00\uD834\uDF06\uDC00 \uD800\uD835\uDF07\uD800';
+        encode({ type: 'message', data: data }, null, true, function(encoded) {
+          expect(decode(encoded, null, true)).to.eql({ type: 'message', data: '\uFFFD\uD834\uDF06\uFFFD \uFFFD\uD835\uDF07\uFFFD' });
+          done();
+        });
+      });
     });
 
     describe('decoding error handing', function () {