diff --git a/TestUniHax/Form1.cs b/TestUniHax/Form1.cs index 289872a..cde99e2 100644 --- a/TestUniHax/Form1.cs +++ b/TestUniHax/Form1.cs @@ -55,7 +55,7 @@ public FormUniMapTest() comboBoxTransformations.DisplayMember = "Transform"; // Populate unichar properties - string[] aProps = { Fuzzer.uBOM, Fuzzer.uMVS, Fuzzer.uReservedCodePoint, Fuzzer.uRLO, Fuzzer.uDEAD, Fuzzer.uDAAD, Fuzzer.uPrivate, Fuzzer.uNotACharacter }; + string[] aProps = { HostileCodePoint.uBOM, HostileCodePoint.uMVS, HostileCodePoint.uReservedCodePoint, HostileCodePoint.uRLO, HostileCodePoint.uDEAD, HostileCodePoint.uDAAD, HostileCodePoint.uPrivate, HostileCodePoint.uNotACharacter }; string sProps = String.Join("\r\n", aProps); textBoxUnicharProps.Text = sProps; } diff --git a/UniHax/Fuzzer.cs b/UniHax/CodePointFuzzer.cs similarity index 71% rename from UniHax/Fuzzer.cs rename to UniHax/CodePointFuzzer.cs index 4679adc..48d8443 100644 --- a/UniHax/Fuzzer.cs +++ b/UniHax/CodePointFuzzer.cs @@ -1,16 +1,17 @@ - + // Copyright (c) 2011 by Christopher Weber - +// Portions Copyright (c) 2017 by Robert Mooney + // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: - + // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. - + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -18,29 +19,23 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. - + // Authors: // Christopher Weber (chris@lookout.net) +// Robert Mooney (rjmooney@gmail.com) using System; -using System.IO; +using System.Collections.Generic; +using System.Text; namespace UniHax { /// - /// The Fuzzer has cases for some of the oddball manifestations of Unicode that can trip up software including: - /// - /// - non-character, reserved, and private use area code points - /// - special meaning characters such as the BOM and RLO - /// - ill-formed byte sequences - /// - a half-surrogate code point - /// - /// + /// An enumeration-style container of hostile code points. /// - public class Fuzzer + public static class HostileCodePoint { - /// /// The Byte Order Mark U+FEFF is a special character defining the byte order and endianess /// of text data. @@ -136,6 +131,52 @@ public class Fuzzer /// public static readonly string u1D160 = char.ConvertFromUtf32(0x1D160); + /// + /// This is a collection of the code points defined above. + /// + /// Remember to update this when adding new code points. + private static readonly string[] _values = new string[] + { + HostileCodePoint.uBOM, + HostileCodePoint.uRLO, + HostileCodePoint.uMVS, + HostileCodePoint.uWordJoiner, + HostileCodePoint.uReservedCodePoint, + HostileCodePoint.uNotACharacter, + HostileCodePoint.uUnassigned, + HostileCodePoint.uDEAD, + HostileCodePoint.uDAAD, + HostileCodePoint.uPrivate, + HostileCodePoint.uFullwidthSolidus, + HostileCodePoint.uBoldEight, + HostileCodePoint.uIdnaSs, + HostileCodePoint.uFDFA, + HostileCodePoint.u0390, + HostileCodePoint.u1F82, + HostileCodePoint.uFB2C, + HostileCodePoint.u1D160 + }; + + /// + /// Retrieves an array of the values of the constants in the HostileCodePoint enumeration. + /// + public static string[] GetValues() + { + return _values; + } + } + + /// + /// The Fuzzer has cases for some of the oddball manifestations of Unicode that can trip up software including: + /// + /// - non-character, reserved, and private use area code points + /// - special meaning characters such as the BOM and RLO + /// - ill-formed byte sequences + /// - a half-surrogate code point + /// + public class CodePointFuzzer + { + #region Public Methods /// /// Gets the requested byte representation of the current Unicode character codepoint /// @@ -159,7 +200,6 @@ public byte[] GetCharacterBytes(string encoding, string character) } return enc.GetBytes(character); - } /// @@ -171,7 +211,7 @@ public byte[] GetCharacterBytes(string encoding, string character) public byte[] GetCharacterBytesMalformed(string encoding, string character) { System.Text.Encoding enc; - + if (encoding == "utf-16le") { enc = new System.Text.UnicodeEncoding(); @@ -209,7 +249,7 @@ public byte[] GetCharacterBytesMalformed(string encoding, string character) public string GetBom() { - return Fuzzer.uBOM; + return HostileCodePoint.uBOM; } /// @@ -220,8 +260,43 @@ public string GetBom() /// A raw byte array because .NET will not allow illegal code points in the System.String class. public byte[] OutOfRangeCodePointAsUtf32BE() { - byte[] bytes = {0x00, 0x1F, 0xFF, 0xFF}; + byte[] bytes = { 0x00, 0x1F, 0xFF, 0xFF }; return bytes; } + #endregion + + #region Static Methods + /// + /// Perform hostile code point substitution on each character in the specified string. + /// + /// The string on which to perform the substitution + /// The next string in the sequence of strings with the next character replaced with a hostile code point + public static IEnumerable Substitute(string source) + { + var lastCodePointLength = 0; + var target = new StringBuilder(source); + for (int n = 0; n < source.Length; ++n) + { + foreach (var codepoint in HostileCodePoint.GetValues()) + { + if (n > 0 && lastCodePointLength > 0) + { + // Remove the last hostile code point replacement and re-insert the original character + target.Remove(n - 1, lastCodePointLength); + target.Insert(n - 1, source[n - 1]); + } + + // Replace the current character of the source with the current code point string + target.Remove(n, 1); + target.Insert(n, codepoint); + + // Store the length of the code point for the next iteration, when it is removed + lastCodePointLength = codepoint.Length; + + yield return target.ToString(); + } + } + } + #endregion } -} +} \ No newline at end of file diff --git a/UniHax/Exceptions.cs b/UniHax/Exceptions.cs index 8bb7e98..9bce962 100644 --- a/UniHax/Exceptions.cs +++ b/UniHax/Exceptions.cs @@ -43,7 +43,6 @@ public override string Message get { return String.Format("Bestfit mapping error:{0}", messageDetails); - return base.Message; } } } diff --git a/UniHax/UniHax.csproj b/UniHax/UniHax.csproj index b3c515e..2dfcf97 100644 --- a/UniHax/UniHax.csproj +++ b/UniHax/UniHax.csproj @@ -40,8 +40,8 @@ + - diff --git a/UniHax/UnicodeChar.cs b/UniHax/UnicodeChar.cs index 6cd6e1c..bd5e096 100644 --- a/UniHax/UnicodeChar.cs +++ b/UniHax/UnicodeChar.cs @@ -76,11 +76,11 @@ public string ConvertCharacterToString(char character) { i = Convert.ToInt32(CodePoint.Trim(), 16); // 0x00 to 0x10ffff } - catch (FormatException e) + catch (FormatException) { i = 0; } - catch(Exception) + catch (Exception) { throw; } @@ -107,11 +107,11 @@ public string ConvertCodePointToString(string codepoint) { i = Convert.ToInt32(codepoint.Trim(), 16); // 0x00 to 0x10ffff } - catch (FormatException e) + catch (FormatException) { i = 0; } - catch(ArgumentOutOfRangeException e) + catch (ArgumentOutOfRangeException) { i = 0; }