[6a3a178] | 1 | "use strict";
|
---|
| 2 |
|
---|
| 3 | // Description of supported double byte encodings and aliases.
|
---|
| 4 | // Tables are not require()-d until they are needed to speed up library load.
|
---|
| 5 | // require()-s are direct to support Browserify.
|
---|
| 6 |
|
---|
| 7 | module.exports = {
|
---|
| 8 |
|
---|
| 9 | // == Japanese/ShiftJIS ====================================================
|
---|
| 10 | // All japanese encodings are based on JIS X set of standards:
|
---|
| 11 | // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF.
|
---|
| 12 | // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
|
---|
| 13 | // Has several variations in 1978, 1983, 1990 and 1997.
|
---|
| 14 | // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead.
|
---|
| 15 | // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233.
|
---|
| 16 | // 2 planes, first is superset of 0208, second - revised 0212.
|
---|
| 17 | // Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx)
|
---|
| 18 |
|
---|
| 19 | // Byte encodings are:
|
---|
| 20 | // * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte
|
---|
| 21 | // encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC.
|
---|
| 22 | // Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI.
|
---|
| 23 | // * EUC-JP: Up to 3 bytes per character. Used mostly on *nixes.
|
---|
| 24 | // 0x00-0x7F - lower part of 0201
|
---|
| 25 | // 0x8E, 0xA1-0xDF - upper part of 0201
|
---|
| 26 | // (0xA1-0xFE)x2 - 0208 plane (94x94).
|
---|
| 27 | // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94).
|
---|
| 28 | // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon.
|
---|
| 29 | // Used as-is in ISO2022 family.
|
---|
| 30 | // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
|
---|
| 31 | // 0201-1976 Roman, 0208-1978, 0208-1983.
|
---|
| 32 | // * ISO2022-JP-1: Adds esc seq for 0212-1990.
|
---|
| 33 | // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7.
|
---|
| 34 | // * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2.
|
---|
| 35 | // * ISO2022-JP-2004: Adds 0213-2004 Plane 1.
|
---|
| 36 | //
|
---|
| 37 | // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes.
|
---|
| 38 | //
|
---|
| 39 | // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html
|
---|
| 40 |
|
---|
| 41 | 'shiftjis': {
|
---|
| 42 | type: '_dbcs',
|
---|
| 43 | table: function() { return require('./tables/shiftjis.json') },
|
---|
| 44 | encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
|
---|
| 45 | encodeSkipVals: [{from: 0xED40, to: 0xF940}],
|
---|
| 46 | },
|
---|
| 47 | 'csshiftjis': 'shiftjis',
|
---|
| 48 | 'mskanji': 'shiftjis',
|
---|
| 49 | 'sjis': 'shiftjis',
|
---|
| 50 | 'windows31j': 'shiftjis',
|
---|
| 51 | 'ms31j': 'shiftjis',
|
---|
| 52 | 'xsjis': 'shiftjis',
|
---|
| 53 | 'windows932': 'shiftjis',
|
---|
| 54 | 'ms932': 'shiftjis',
|
---|
| 55 | '932': 'shiftjis',
|
---|
| 56 | 'cp932': 'shiftjis',
|
---|
| 57 |
|
---|
| 58 | 'eucjp': {
|
---|
| 59 | type: '_dbcs',
|
---|
| 60 | table: function() { return require('./tables/eucjp.json') },
|
---|
| 61 | encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
|
---|
| 62 | },
|
---|
| 63 |
|
---|
| 64 | // TODO: KDDI extension to Shift_JIS
|
---|
| 65 | // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes.
|
---|
| 66 | // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars.
|
---|
| 67 |
|
---|
| 68 |
|
---|
| 69 | // == Chinese/GBK ==========================================================
|
---|
| 70 | // http://en.wikipedia.org/wiki/GBK
|
---|
| 71 | // We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder
|
---|
| 72 |
|
---|
| 73 | // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936
|
---|
| 74 | 'gb2312': 'cp936',
|
---|
| 75 | 'gb231280': 'cp936',
|
---|
| 76 | 'gb23121980': 'cp936',
|
---|
| 77 | 'csgb2312': 'cp936',
|
---|
| 78 | 'csiso58gb231280': 'cp936',
|
---|
| 79 | 'euccn': 'cp936',
|
---|
| 80 |
|
---|
| 81 | // Microsoft's CP936 is a subset and approximation of GBK.
|
---|
| 82 | 'windows936': 'cp936',
|
---|
| 83 | 'ms936': 'cp936',
|
---|
| 84 | '936': 'cp936',
|
---|
| 85 | 'cp936': {
|
---|
| 86 | type: '_dbcs',
|
---|
| 87 | table: function() { return require('./tables/cp936.json') },
|
---|
| 88 | },
|
---|
| 89 |
|
---|
| 90 | // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other.
|
---|
| 91 | 'gbk': {
|
---|
| 92 | type: '_dbcs',
|
---|
| 93 | table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
|
---|
| 94 | },
|
---|
| 95 | 'xgbk': 'gbk',
|
---|
| 96 | 'isoir58': 'gbk',
|
---|
| 97 |
|
---|
| 98 | // GB18030 is an algorithmic extension of GBK.
|
---|
| 99 | // Main source: https://www.w3.org/TR/encoding/#gbk-encoder
|
---|
| 100 | // http://icu-project.org/docs/papers/gb18030.html
|
---|
| 101 | // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
|
---|
| 102 | // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
|
---|
| 103 | 'gb18030': {
|
---|
| 104 | type: '_dbcs',
|
---|
| 105 | table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
|
---|
| 106 | gb18030: function() { return require('./tables/gb18030-ranges.json') },
|
---|
| 107 | encodeSkipVals: [0x80],
|
---|
| 108 | encodeAdd: {'€': 0xA2E3},
|
---|
| 109 | },
|
---|
| 110 |
|
---|
| 111 | 'chinese': 'gb18030',
|
---|
| 112 |
|
---|
| 113 |
|
---|
| 114 | // == Korean ===============================================================
|
---|
| 115 | // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same.
|
---|
| 116 | 'windows949': 'cp949',
|
---|
| 117 | 'ms949': 'cp949',
|
---|
| 118 | '949': 'cp949',
|
---|
| 119 | 'cp949': {
|
---|
| 120 | type: '_dbcs',
|
---|
| 121 | table: function() { return require('./tables/cp949.json') },
|
---|
| 122 | },
|
---|
| 123 |
|
---|
| 124 | 'cseuckr': 'cp949',
|
---|
| 125 | 'csksc56011987': 'cp949',
|
---|
| 126 | 'euckr': 'cp949',
|
---|
| 127 | 'isoir149': 'cp949',
|
---|
| 128 | 'korean': 'cp949',
|
---|
| 129 | 'ksc56011987': 'cp949',
|
---|
| 130 | 'ksc56011989': 'cp949',
|
---|
| 131 | 'ksc5601': 'cp949',
|
---|
| 132 |
|
---|
| 133 |
|
---|
| 134 | // == Big5/Taiwan/Hong Kong ================================================
|
---|
| 135 | // There are lots of tables for Big5 and cp950. Please see the following links for history:
|
---|
| 136 | // http://moztw.org/docs/big5/ http://www.haible.de/bruno/charsets/conversion-tables/Big5.html
|
---|
| 137 | // Variations, in roughly number of defined chars:
|
---|
| 138 | // * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
|
---|
| 139 | // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/
|
---|
| 140 | // * Big5-2003 (Taiwan standard) almost superset of cp950.
|
---|
| 141 | // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers.
|
---|
| 142 | // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
|
---|
| 143 | // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years.
|
---|
| 144 | // Plus, it has 4 combining sequences.
|
---|
| 145 | // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299
|
---|
| 146 | // because big5-hkscs is the only encoding to include astral characters in non-algorithmic way.
|
---|
| 147 | // Implementations are not consistent within browsers; sometimes labeled as just big5.
|
---|
| 148 | // MS Internet Explorer switches from big5 to big5-hkscs when a patch applied.
|
---|
| 149 | // Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31
|
---|
| 150 | // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s.
|
---|
| 151 | // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt
|
---|
| 152 | // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt
|
---|
| 153 | //
|
---|
| 154 | // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder
|
---|
| 155 | // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.
|
---|
| 156 |
|
---|
| 157 | 'windows950': 'cp950',
|
---|
| 158 | 'ms950': 'cp950',
|
---|
| 159 | '950': 'cp950',
|
---|
| 160 | 'cp950': {
|
---|
| 161 | type: '_dbcs',
|
---|
| 162 | table: function() { return require('./tables/cp950.json') },
|
---|
| 163 | },
|
---|
| 164 |
|
---|
| 165 | // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus.
|
---|
| 166 | 'big5': 'big5hkscs',
|
---|
| 167 | 'big5hkscs': {
|
---|
| 168 | type: '_dbcs',
|
---|
| 169 | table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) },
|
---|
| 170 | encodeSkipVals: [0xa2cc],
|
---|
| 171 | },
|
---|
| 172 |
|
---|
| 173 | 'cnbig5': 'big5hkscs',
|
---|
| 174 | 'csbig5': 'big5hkscs',
|
---|
| 175 | 'xxbig5': 'big5hkscs',
|
---|
| 176 | };
|
---|