source: trip-planner-front/node_modules/encoding/node_modules/iconv-lite/encodings/dbcs-data.js@ 6a3a178

Last change on this file since 6a3a178 was 6a3a178, checked in by Ema <ema_spirova@…>, 3 years ago

initial commit

  • Property mode set to 100644
File size: 9.2 KB
Line 
1"use strict";
2
3// Description of supported double byte encodings and aliases.
4// Tables are not require()-d until they are needed to speed up library load.
5// require()-s are direct to support Browserify.
6
7module.exports = {
8
9 // == Japanese/ShiftJIS ====================================================
10 // All japanese encodings are based on JIS X set of standards:
11 // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF.
12 // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
13 // Has several variations in 1978, 1983, 1990 and 1997.
14 // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead.
15 // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233.
16 // 2 planes, first is superset of 0208, second - revised 0212.
17 // Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx)
18
19 // Byte encodings are:
20 // * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte
21 // encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC.
22 // Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI.
23 // * EUC-JP: Up to 3 bytes per character. Used mostly on *nixes.
24 // 0x00-0x7F - lower part of 0201
25 // 0x8E, 0xA1-0xDF - upper part of 0201
26 // (0xA1-0xFE)x2 - 0208 plane (94x94).
27 // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94).
28 // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon.
29 // Used as-is in ISO2022 family.
30 // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
31 // 0201-1976 Roman, 0208-1978, 0208-1983.
32 // * ISO2022-JP-1: Adds esc seq for 0212-1990.
33 // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7.
34 // * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2.
35 // * ISO2022-JP-2004: Adds 0213-2004 Plane 1.
36 //
37 // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes.
38 //
39 // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html
40
41 'shiftjis': {
42 type: '_dbcs',
43 table: function() { return require('./tables/shiftjis.json') },
44 encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
45 encodeSkipVals: [{from: 0xED40, to: 0xF940}],
46 },
47 'csshiftjis': 'shiftjis',
48 'mskanji': 'shiftjis',
49 'sjis': 'shiftjis',
50 'windows31j': 'shiftjis',
51 'ms31j': 'shiftjis',
52 'xsjis': 'shiftjis',
53 'windows932': 'shiftjis',
54 'ms932': 'shiftjis',
55 '932': 'shiftjis',
56 'cp932': 'shiftjis',
57
58 'eucjp': {
59 type: '_dbcs',
60 table: function() { return require('./tables/eucjp.json') },
61 encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
62 },
63
64 // TODO: KDDI extension to Shift_JIS
65 // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes.
66 // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars.
67
68
69 // == Chinese/GBK ==========================================================
70 // http://en.wikipedia.org/wiki/GBK
71 // We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder
72
73 // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936
74 'gb2312': 'cp936',
75 'gb231280': 'cp936',
76 'gb23121980': 'cp936',
77 'csgb2312': 'cp936',
78 'csiso58gb231280': 'cp936',
79 'euccn': 'cp936',
80
81 // Microsoft's CP936 is a subset and approximation of GBK.
82 'windows936': 'cp936',
83 'ms936': 'cp936',
84 '936': 'cp936',
85 'cp936': {
86 type: '_dbcs',
87 table: function() { return require('./tables/cp936.json') },
88 },
89
90 // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other.
91 'gbk': {
92 type: '_dbcs',
93 table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
94 },
95 'xgbk': 'gbk',
96 'isoir58': 'gbk',
97
98 // GB18030 is an algorithmic extension of GBK.
99 // Main source: https://www.w3.org/TR/encoding/#gbk-encoder
100 // http://icu-project.org/docs/papers/gb18030.html
101 // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
102 // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
103 'gb18030': {
104 type: '_dbcs',
105 table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
106 gb18030: function() { return require('./tables/gb18030-ranges.json') },
107 encodeSkipVals: [0x80],
108 encodeAdd: {'€': 0xA2E3},
109 },
110
111 'chinese': 'gb18030',
112
113
114 // == Korean ===============================================================
115 // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same.
116 'windows949': 'cp949',
117 'ms949': 'cp949',
118 '949': 'cp949',
119 'cp949': {
120 type: '_dbcs',
121 table: function() { return require('./tables/cp949.json') },
122 },
123
124 'cseuckr': 'cp949',
125 'csksc56011987': 'cp949',
126 'euckr': 'cp949',
127 'isoir149': 'cp949',
128 'korean': 'cp949',
129 'ksc56011987': 'cp949',
130 'ksc56011989': 'cp949',
131 'ksc5601': 'cp949',
132
133
134 // == Big5/Taiwan/Hong Kong ================================================
135 // There are lots of tables for Big5 and cp950. Please see the following links for history:
136 // http://moztw.org/docs/big5/ http://www.haible.de/bruno/charsets/conversion-tables/Big5.html
137 // Variations, in roughly number of defined chars:
138 // * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
139 // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/
140 // * Big5-2003 (Taiwan standard) almost superset of cp950.
141 // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers.
142 // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
143 // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years.
144 // Plus, it has 4 combining sequences.
145 // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299
146 // because big5-hkscs is the only encoding to include astral characters in non-algorithmic way.
147 // Implementations are not consistent within browsers; sometimes labeled as just big5.
148 // MS Internet Explorer switches from big5 to big5-hkscs when a patch applied.
149 // Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31
150 // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s.
151 // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt
152 // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt
153 //
154 // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder
155 // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.
156
157 'windows950': 'cp950',
158 'ms950': 'cp950',
159 '950': 'cp950',
160 'cp950': {
161 type: '_dbcs',
162 table: function() { return require('./tables/cp950.json') },
163 },
164
165 // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus.
166 'big5': 'big5hkscs',
167 'big5hkscs': {
168 type: '_dbcs',
169 table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) },
170 encodeSkipVals: [
171 // Although Encoding Standard says we should avoid encoding to HKSCS area (See Step 1 of
172 // https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU.
173 // But if a single unicode point can be encoded both as HKSCS and regular Big5, we prefer the latter.
174 0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe,
175 0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca,
176 0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62,
177 0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef,
178 0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed,
179
180 // Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345
181 0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce,
182 ],
183 },
184
185 'cnbig5': 'big5hkscs',
186 'csbig5': 'big5hkscs',
187 'xxbig5': 'big5hkscs',
188};
Note: See TracBrowser for help on using the repository browser.