From 8c86dff082d792d5234a4cb9fabffc579c894323 Mon Sep 17 00:00:00 2001 From: Nelson Chen Date: Thu, 1 Mar 2018 20:43:56 -0700 Subject: [PATCH] Support CP437 Database is slightly modified from Unicode's file. --- README.md | 1 + src/all.rs | 2 + src/index/gen_index.py | 1 + src/index/singlebyte/cp437.rs | 112 ++++++++++++++++++++ src/index/singlebyte/index-cp437.txt | 148 +++++++++++++++++++++++++++ src/index/singlebyte/lib.rs | 3 + 6 files changed, 267 insertions(+) create mode 100644 src/index/singlebyte/cp437.rs create mode 100644 src/index/singlebyte/index-cp437.txt diff --git a/README.md b/README.md index a058c5be..db30d24d 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,7 @@ Encoding covers all encodings specified by WHATWG Encoding Standard and some mor * Encodings that were originally specified by WHATWG Encoding Standard: * HZ * ISO 8859-1 (distinct from Windows code page 1252) +* Code page 437 (`cp437`) Parenthesized names refer to the encoding's primary name assigned by WHATWG Encoding Standard. diff --git a/src/all.rs b/src/all.rs index dc34bdf6..d856a12f 100644 --- a/src/all.rs +++ b/src/all.rs @@ -39,6 +39,7 @@ macro_rules! singlebyte( unique!(var=ERROR, mod=codec::error, val=ErrorEncoding); unique!(var=ASCII, mod=codec::ascii, val=ASCIIEncoding); singlebyte!(var=ARMSCII_8, mod=index::armscii_8, name="armscii-8"); +singlebyte!(var=CP437, mod=index::cp437, name="cp437"); singlebyte!(var=IBM866, mod=index::ibm866, name|whatwg="ibm866"); singlebyte!(var=ISO_8859_1, mod=codec::singlebyte::iso_8859_1, name="iso-8859-1"); singlebyte!(var=ISO_8859_2, mod=index::iso_8859_2, name|whatwg="iso-8859-2"); @@ -96,6 +97,7 @@ pub fn encodings() -> &'static [EncodingRef] { const ENCODINGS: &'static [EncodingRef] = &[ ERROR, ASCII, + CP437, IBM866, ISO_8859_1, ISO_8859_2, diff --git a/src/index/gen_index.py b/src/index/gen_index.py index 5bd847d9..7e0b2231 100644 --- a/src/index/gen_index.py +++ b/src/index/gen_index.py @@ -907,6 +907,7 @@ def generate_multi_byte_range_lbound_index(opts, crate, name): INDICES = [ ('singlebyte/armscii-8', generate_single_byte_index), + ('singlebyte/cp437', generate_single_byte_index), ('singlebyte/ibm866', generate_single_byte_index), ('singlebyte/iso-8859-2', generate_single_byte_index), diff --git a/src/index/singlebyte/cp437.rs b/src/index/singlebyte/cp437.rs new file mode 100644 index 00000000..08b7c738 --- /dev/null +++ b/src/index/singlebyte/cp437.rs @@ -0,0 +1,112 @@ +// AUTOGENERATED FROM index-cp437.txt, ORIGINAL COMMENT FOLLOWS: +// +// Source URL: ftp://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT +// Keys have been shifted to be more like WHATWG's indexes. +// +// Name: cp437_DOSLatinUS to Unicode table +// Unicode version: 2.0 +// Table version: 2.00 +// Table format: Format A +// Date: 04/24/96 +// Contact: Shawn.Steele@microsoft.com +// +// General notes: none +// +// Format: Three tab-separated columns +// Column #1 is the cp437_DOSLatinUS code (in hex) +// Column #2 is the Unicode (in hex as 0xXXXX) +// Column #3 is the Unicode name (follows a comment sign, '#') +// +// The entries are in cp437_DOSLatinUS order +// + +#[allow(dead_code)] const X: u16 = 0xffff; + +const FORWARD_TABLE: &'static [u16] = &[ + 199, 252, 233, 226, 228, 224, 229, 231, 234, 235, 232, 239, 238, 236, 196, + 197, 201, 230, 198, 244, 246, 242, 251, 249, 255, 214, 220, 162, 163, 165, + 8359, 402, 225, 237, 243, 250, 241, 209, 170, 186, 191, 8976, 172, 189, + 188, 161, 171, 187, 9617, 9618, 9619, 9474, 9508, 9569, 9570, 9558, 9557, + 9571, 9553, 9559, 9565, 9564, 9563, 9488, 9492, 9524, 9516, 9500, 9472, + 9532, 9566, 9567, 9562, 9556, 9577, 9574, 9568, 9552, 9580, 9575, 9576, + 9572, 9573, 9561, 9560, 9554, 9555, 9579, 9578, 9496, 9484, 9608, 9604, + 9612, 9616, 9600, 945, 223, 915, 960, 931, 963, 181, 964, 934, 920, 937, + 948, 8734, 966, 949, 8745, 8801, 177, 8805, 8804, 8992, 8993, 247, 8776, + 176, 8729, 183, 8730, 8319, 178, 9632, 160, +]; // 128 entries + +/// Returns the index code point for pointer `code` in this index. +#[inline] +pub fn forward(code: u8) -> u16 { + FORWARD_TABLE[(code - 0x80) as usize] +} + +#[cfg(not(feature = "no-optimized-legacy-encoding"))] +const BACKWARD_TABLE_LOWER: &'static [u8] = &[ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 196, 0, 179, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 218, 0, 0, 0, 191, 0, 0, 0, 192, 0, 0, 0, 217, 0, 0, 0, 195, 0, 0, 0, + 0, 0, 0, 0, 180, 0, 0, 0, 0, 0, 0, 0, 194, 0, 0, 0, 0, 0, 0, 0, 193, 0, 0, + 0, 0, 0, 0, 0, 197, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 169, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 244, 245, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 249, + 251, 0, 0, 0, 236, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 239, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 205, 186, 213, 214, 201, 184, + 183, 187, 212, 211, 200, 190, 189, 188, 198, 199, 204, 181, 182, 185, 209, + 210, 203, 207, 208, 202, 216, 215, 206, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 247, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 240, 0, 0, 243, 242, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 226, 0, 0, 0, 0, 233, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 228, 0, 0, 232, 0, 0, 234, 0, 0, 0, 0, 0, 0, 0, + 224, 0, 0, 235, 238, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 142, 143, 146, 128, 0, + 144, 0, 0, 0, 0, 0, 0, 0, 165, 0, 0, 0, 0, 153, 0, 0, 0, 0, 0, 154, 0, 0, + 225, 133, 160, 131, 0, 132, 134, 145, 135, 138, 130, 136, 137, 141, 161, + 140, 139, 0, 164, 149, 162, 147, 0, 148, 246, 0, 151, 163, 150, 129, 0, 0, + 152, 223, 0, 0, 0, 220, 0, 0, 0, 219, 0, 0, 0, 221, 0, 0, 0, 222, 176, 177, + 178, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 173, + 155, 156, 0, 157, 0, 0, 0, 0, 166, 174, 170, 0, 0, 0, 248, 241, 253, 0, 0, + 230, 0, 250, 0, 0, 167, 175, 172, 171, 0, 168, 227, 0, 0, 229, 231, 0, 237, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252, +]; // 678 entries + +#[cfg(not(feature = "no-optimized-legacy-encoding"))] +const BACKWARD_TABLE_UPPER: &'static [u16] = &[ + 0, 0, 543, 446, 0, 0, 125, 0, 0, 0, 0, 0, 0, 0, 386, 607, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 614, 150, 0, 0, 0, 0, 0, 237, 341, 0, 0, 198, 0, 0, 0, + 0, 0, 0, 0, 64, 285, 510, +]; // 151 entries + +/// Returns the index pointer for code point `code` in this index. +#[inline] +#[cfg(not(feature = "no-optimized-legacy-encoding"))] +pub fn backward(code: u32) -> u8 { + let offset = (code >> 6) as usize; + let offset = if offset < 151 {BACKWARD_TABLE_UPPER[offset] as usize} else {0}; + BACKWARD_TABLE_LOWER[offset + ((code & 63) as usize)] +} + +/// Returns the index pointer for code point `code` in this index. +#[cfg(feature = "no-optimized-legacy-encoding")] +pub fn backward(code: u32) -> u8 { + if code > 9632 || ((0x70003u32 >> (code >> 9)) & 1) == 0 { return 0; } + let code = code as u16; + for i in 0..0x80 { + if FORWARD_TABLE[i as usize] == code { return 0x80 + i; } + } + 0 +} + +#[cfg(test)] +single_byte_tests! { +} diff --git a/src/index/singlebyte/index-cp437.txt b/src/index/singlebyte/index-cp437.txt new file mode 100644 index 00000000..33a718bc --- /dev/null +++ b/src/index/singlebyte/index-cp437.txt @@ -0,0 +1,148 @@ +# Source URL: ftp://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT +# Keys have been shifted to be more like WHATWG's indexes. +# +# Name: cp437_DOSLatinUS to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Contact: Shawn.Steele@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp437_DOSLatinUS code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp437_DOSLatinUS order +# + +0x00 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x01 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x02 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x03 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x04 0x00e4 #LATIN SMALL LETTER A WITH DIAERESIS +0x05 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x06 0x00e5 #LATIN SMALL LETTER A WITH RING ABOVE +0x07 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x08 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x09 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x0a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x0b 0x00ef #LATIN SMALL LETTER I WITH DIAERESIS +0x0c 0x00ee #LATIN SMALL LETTER I WITH CIRCUMFLEX +0x0d 0x00ec #LATIN SMALL LETTER I WITH GRAVE +0x0e 0x00c4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0x0f 0x00c5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0x10 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x11 0x00e6 #LATIN SMALL LIGATURE AE +0x12 0x00c6 #LATIN CAPITAL LIGATURE AE +0x13 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x14 0x00f6 #LATIN SMALL LETTER O WITH DIAERESIS +0x15 0x00f2 #LATIN SMALL LETTER O WITH GRAVE +0x16 0x00fb #LATIN SMALL LETTER U WITH CIRCUMFLEX +0x17 0x00f9 #LATIN SMALL LETTER U WITH GRAVE +0x18 0x00ff #LATIN SMALL LETTER Y WITH DIAERESIS +0x19 0x00d6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0x1a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x1b 0x00a2 #CENT SIGN +0x1c 0x00a3 #POUND SIGN +0x1d 0x00a5 #YEN SIGN +0x1e 0x20a7 #PESETA SIGN +0x1f 0x0192 #LATIN SMALL LETTER F WITH HOOK +0x20 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0x21 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0x22 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0x23 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0x24 0x00f1 #LATIN SMALL LETTER N WITH TILDE +0x25 0x00d1 #LATIN CAPITAL LETTER N WITH TILDE +0x26 0x00aa #FEMININE ORDINAL INDICATOR +0x27 0x00ba #MASCULINE ORDINAL INDICATOR +0x28 0x00bf #INVERTED QUESTION MARK +0x29 0x2310 #REVERSED NOT SIGN +0x2a 0x00ac #NOT SIGN +0x2b 0x00bd #VULGAR FRACTION ONE HALF +0x2c 0x00bc #VULGAR FRACTION ONE QUARTER +0x2d 0x00a1 #INVERTED EXCLAMATION MARK +0x2e 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0x2f 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0x30 0x2591 #LIGHT SHADE +0x31 0x2592 #MEDIUM SHADE +0x32 0x2593 #DARK SHADE +0x33 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0x34 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0x35 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0x36 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0x37 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0x38 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0x39 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0x3a 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0x3b 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0x3c 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0x3d 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0x3e 0x255b #BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0x3f 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0x40 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0x41 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0x42 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0x43 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0x44 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0x45 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0x46 0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0x47 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0x48 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0x49 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0x4a 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0x4b 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0x4c 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0x4d 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0x4e 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0x4f 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0x50 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0x51 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0x52 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0x53 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0x54 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0x55 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0x56 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0x57 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0x58 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0x59 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0x5a 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0x5b 0x2588 #FULL BLOCK +0x5c 0x2584 #LOWER HALF BLOCK +0x5d 0x258c #LEFT HALF BLOCK +0x5e 0x2590 #RIGHT HALF BLOCK +0x5f 0x2580 #UPPER HALF BLOCK +0x60 0x03b1 #GREEK SMALL LETTER ALPHA +0x61 0x00df #LATIN SMALL LETTER SHARP S +0x62 0x0393 #GREEK CAPITAL LETTER GAMMA +0x63 0x03c0 #GREEK SMALL LETTER PI +0x64 0x03a3 #GREEK CAPITAL LETTER SIGMA +0x65 0x03c3 #GREEK SMALL LETTER SIGMA +0x66 0x00b5 #MICRO SIGN +0x67 0x03c4 #GREEK SMALL LETTER TAU +0x68 0x03a6 #GREEK CAPITAL LETTER PHI +0x69 0x0398 #GREEK CAPITAL LETTER THETA +0x6a 0x03a9 #GREEK CAPITAL LETTER OMEGA +0x6b 0x03b4 #GREEK SMALL LETTER DELTA +0x6c 0x221e #INFINITY +0x6d 0x03c6 #GREEK SMALL LETTER PHI +0x6e 0x03b5 #GREEK SMALL LETTER EPSILON +0x6f 0x2229 #INTERSECTION +0x70 0x2261 #IDENTICAL TO +0x71 0x00b1 #PLUS-MINUS SIGN +0x72 0x2265 #GREATER-THAN OR EQUAL TO +0x73 0x2264 #LESS-THAN OR EQUAL TO +0x74 0x2320 #TOP HALF INTEGRAL +0x75 0x2321 #BOTTOM HALF INTEGRAL +0x76 0x00f7 #DIVISION SIGN +0x77 0x2248 #ALMOST EQUAL TO +0x78 0x00b0 #DEGREE SIGN +0x79 0x2219 #BULLET OPERATOR +0x7a 0x00b7 #MIDDLE DOT +0x7b 0x221a #SQUARE ROOT +0x7c 0x207f #SUPERSCRIPT LATIN SMALL LETTER N +0x7d 0x00b2 #SUPERSCRIPT TWO +0x7e 0x25a0 #BLACK SQUARE +0x7f 0x00a0 #NO-BREAK SPACE \ No newline at end of file diff --git a/src/index/singlebyte/lib.rs b/src/index/singlebyte/lib.rs index 41fb223c..9332694c 100644 --- a/src/index/singlebyte/lib.rs +++ b/src/index/singlebyte/lib.rs @@ -16,6 +16,9 @@ extern crate encoding_index_tests; /// ARMSCII-8 pub mod armscii_8; +/// CP437 +pub mod cp437; + /// IBM code page 866. pub mod ibm866;