source: trip-planner-front/node_modules/chardet/encoding/mbcs.js@ 1ad8e64

Last change on this file since 1ad8e64 was 6a3a178, checked in by Ema <ema_spirova@…>, 3 years ago

initial commit

  • Property mode set to 100644
File size: 16.6 KB
Line 
1var util = require('util'),
2 Match = require ('../match');
3
/**
 * Binary search over an array sorted in ascending order (iterative).
 *
 * @param {Array} arr Array to search; must be sorted ascending.
 * @param {*} searchValue Value to look for, compared using `<` and `>`.
 * @return {number} Index of an element that is neither less than nor greater
 *     than searchValue, or -1 when no such element exists.
 */
function binarySearch(arr, searchValue) {
  var lo = 0;
  var hi = arr.length - 1;

  while (lo <= hi) {
    // The unsigned shift halves the sum and already yields an integer.
    var mid = (lo + hi) >>> 1;
    var probe = arr[mid];

    if (searchValue > probe) {
      lo = mid + 1;
    } else if (searchValue < probe) {
      hi = mid - 1;
    } else {
      return mid;
    }
  }

  return -1;
}
29
// Iteration state for walking the raw input one character at a time.
//
// Each recognizer for a specific mbcs encoding exposes its characters through
// a nextChar() function that fills in an IteratedChar with the next character
// of the input. Characters are not converted to Unicode; charValue holds the
// raw bytes of the character concatenated into a single integer.
//
// For Asian charsets, use the raw input rather than the input that has been
// stripped of markup. Detection only considers multi-byte chars, effectively
// stripping markup anyway, and double byte chars do occur in markup too.
//
function IteratedChar() {

  this.charValue = 0; // 1-4 bytes from the raw input data
  this.index = 0;
  this.nextIndex = 0;
  this.error = false;
  this.done = false;

  // Put the iterator back at the start of the input and clear all flags.
  this.reset = function() {
    this.done = false;
    this.error = false;
    this.nextIndex = 0;
    this.index = -1;
    this.charValue = 0;
  };

  // Return the next input byte as a value in 0..255 and advance nextIndex.
  // At end of input, set `done` and return -1 instead.
  this.nextByte = function(det) {
    if (this.nextIndex < det.fRawLength) {
      var position = this.nextIndex++;
      return det.fRawInput[position] & 0x00ff;
    }
    this.done = true;
    return -1;
  };
}
66
67
68
/**
 * Asian double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 */

function mbcs() {}

/**
 * Test the match of this charset with the input text data
 * which is obtained via the CharsetDetector object.
 *
 * The recognizer's own nextChar() is used to walk the input; an optional
 * sorted `commonChars` array on the recognizer enables the
 * frequency-of-occurrence scoring.
 *
 * @param det The CharsetDetector, which contains the input text
 *            to be checked for being in this charset.
 * @return A new Match(det, this, confidence) with confidence in 1-100,
 *         or null when the confidence is 0.
 */
mbcs.prototype.match = function(det) {

  var doubleByteCharCount = 0,
      commonCharCount = 0,
      badCharCount = 0,
      totalCharCount = 0,
      confidence = 0;

  var iter = new IteratedChar();

  detectBlock: {
    for (iter.reset(); this.nextChar(iter, det);) {
      totalCharCount++;
      if (iter.error) {
        badCharCount++;
      } else {
        // Reinterpret the packed bytes as an UNSIGNED 32-bit value.
        // `& 0xFFFFFFFF` would produce a signed int in JavaScript, so a
        // 4-byte character whose lead byte is >= 0x80 would turn negative
        // and be mistaken for a single-byte character.
        var cv = iter.charValue >>> 0;

        if (cv > 0xff) {
          doubleByteCharCount++;
          if (this.commonChars != null) {
            // NOTE: This assumes that there are no 4-byte common chars.
            if (binarySearch(this.commonChars, cv) >= 0) {
              commonCharCount++;
            }
          }
        }
      }
      if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
        // Bail out early if the byte data is not matching the encoding scheme.
        // confidence is still 0 here, so the result is "no match".
        break detectBlock;
      }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0) {
      // Not many multi-byte chars.
      if (doubleByteCharCount == 0 && totalCharCount < 10) {
        // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
        // We don't have enough data to have any confidence.
        // Statistical analysis of single byte non-ASCII characters would probably help here.
        confidence = 0;
      } else {
        // ASCII or ISO file? It's probably not our encoding,
        // but is not incompatible with our encoding, so don't give it a zero.
        confidence = 10;
      }
      break detectBlock;
    }

    //
    // No match if there are too many characters that don't fit the encoding scheme.
    // (should we have zero tolerance for these?)
    //
    if (doubleByteCharCount < 20 * badCharCount) {
      confidence = 0;
      break detectBlock;
    }

    if (this.commonChars == null) {
      // We have no statistics on frequently occurring characters.
      // Assess confidence purely on having a reasonable number of
      // multi-byte characters (the more the better).
      confidence = 30 + doubleByteCharCount - 20 * badCharCount;
      if (confidence > 100) {
        confidence = 100;
      }
    } else {
      //
      // Frequency of occurrence statistics exist.
      //
      // doubleByteCharCount is at least 11 here (see the checks above),
      // so the logarithm below is strictly positive.
      var maxVal = Math.log(doubleByteCharCount / 4);
      var scaleFactor = 90.0 / maxVal;
      confidence = Math.floor(Math.log(commonCharCount + 1) * scaleFactor + 10);
      confidence = Math.min(confidence, 100);
    }
  } // end of detectBlock:

  return confidence == 0 ? null : new Match(det, this, confidence);
};

/**
 * Get the next character (however many bytes it is) from the input data
 * Subclasses for specific charset encodings must implement this function
 * to get characters according to the rules of their encoding scheme.
 *
 * This function is not a method of class iteratedChar only because
 * that would require a lot of extra derived classes, which is awkward.
 * @param iter The iteratedChar 'struct' into which the returned char is placed.
 * @param det The charset detector, which is needed to get at the input byte data
 *            being iterated over.
 * @return True if a character was returned, false at end of input.
 *         This base implementation is an empty placeholder and returns undefined.
 */

mbcs.prototype.nextChar = function(iter, det) {};
187
188
189
/**
 * Shift-JIS charset recognizer.
 *
 * Provides the charset name, the language code, a sorted table of
 * frequently occurring characters, and the nextChar() implementation
 * that splits the raw input into one- and two-byte characters.
 */
module.exports.sjis = function() {
  var recognizer = this;

  recognizer.name = function() {
    return 'Shift-JIS';
  };

  recognizer.language = function() {
    return 'ja';
  };

  // TODO: This set of data comes from the character frequency-
  // of-occurrence analysis tool. The data needs to be moved
  // into a resource and loaded from there.
  recognizer.commonChars = [
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
  ];

  // Read the next character into `iter`.
  // Returns false when the input ends (including in the middle of a
  // two-byte character), true otherwise; iter.error flags a bad trail byte.
  recognizer.nextChar = function(iter, det) {
    iter.index = iter.nextIndex;
    iter.error = false;

    var lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) {
      return false;
    }

    // Lead bytes 0x00-0x7f and 0xa1-0xdf stand alone as one-byte characters.
    var standsAlone = lead <= 0x7f || (lead > 0xa0 && lead <= 0xdf);
    if (standsAlone) {
      return true;
    }

    var trail = iter.nextByte(det);
    if (trail < 0) {
      return false;
    }

    iter.charValue = (lead << 8) | trail;

    var inLowerRange = trail >= 0x40 && trail <= 0x7f;
    var inUpperRange = trail >= 0x80 && trail <= 0xff;
    if (!inLowerRange && !inUpperRange) {
      // Illegal second byte value.
      iter.error = true;
    }
    return true;
  };
};
util.inherits(module.exports.sjis, mbcs);
238
239
240
/**
 * Big5 charset recognizer.
 *
 * Provides the charset name, the language code, a sorted table of
 * frequently occurring characters, and the nextChar() implementation
 * that splits the raw input into one- and two-byte characters.
 */
module.exports.big5 = function() {
  var recognizer = this;

  recognizer.name = function() {
    return 'Big5';
  };

  recognizer.language = function() {
    return 'zh';
  };

  // TODO: This set of data comes from the character frequency-
  // of-occurrence analysis tool. The data needs to be moved
  // into a resource and loaded from there.
  recognizer.commonChars = [
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
  ];

  // Read the next character into `iter`.
  // Returns false when the input ends (including in the middle of a
  // two-byte character), true otherwise; iter.error flags a bad trail byte.
  recognizer.nextChar = function(iter, det) {
    iter.index = iter.nextIndex;
    iter.error = false;

    var lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) {
      return false;
    }

    // single byte character.
    var standsAlone = lead <= 0x7f || lead == 0xff;
    if (standsAlone) {
      return true;
    }

    var trail = iter.nextByte(det);
    if (trail < 0) {
      return false;
    }

    iter.charValue = (iter.charValue << 8) | trail;

    var badTrail = trail < 0x40 || trail == 0x7f || trail == 0xff;
    if (badTrail) {
      iter.error = true;
    }
    return true;
  };
};
util.inherits(module.exports.big5, mbcs);
293
294
295
/**
 * Shared nextChar() implementation for the EUC based recognizers.
 *
 * Reads the next character from the input according to the EUC encoding
 * scheme and stores it in `iter`. The character 'value' is simply the raw
 * bytes that make up the character packed into an int.
 *
 * @param iter The IteratedChar that receives index, charValue and error.
 * @param det The charset detector holding the raw input bytes.
 * @return True if a character was returned, false once `iter.done` is set
 *         (end of input, possibly in the middle of a character).
 */
function eucNextChar(iter, det) {
  iter.index = iter.nextIndex;
  iter.error = false;

  var lead = iter.nextByte(det);
  iter.charValue = lead;

  if (lead < 0) {
    // Ran off the end of the input data
    iter.done = true;
  } else if (lead > 0x8d) {
    // Anything up to 0x8d is a single byte char; the rest needs more bytes.
    var second = iter.nextByte(det);
    iter.charValue = (iter.charValue << 8) | second;

    // 0xa1-0xfe: ordinary two byte char.
    // 0x8e: Code Set 2. In EUC-JP the total char size is 2 bytes; in EUC-TW
    //       it is 4 bytes. We don't know which we've got, so treat it like
    //       EUC-JP. If the data really was EUC-TW, the following two bytes
    //       will look like a well formed 2 byte char.
    var isTwoByteLead = (lead >= 0xA1 && lead <= 0xfe) || lead == 0x8e;

    if (isTwoByteLead) {
      if (second < 0xa1) {
        iter.error = true;
      }
    } else if (lead == 0x8f) {
      // Code set 3.
      // Three byte total char size, two bytes of actual char value.
      var third = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | third;
      if (third < 0xa1) {
        iter.error = true;
      }
    }
  }

  return iter.done == false;
}
356
357
358
/**
 * The charset recognizer for EUC-JP.
 *
 * Provides the charset name, the language code and a sorted table of
 * frequently occurring characters; character iteration is delegated to
 * the shared eucNextChar() function.
 */
module.exports.euc_jp = function() {
  var recognizer = this;

  recognizer.name = function() {
    return 'EUC-JP';
  };

  recognizer.language = function() {
    return 'ja';
  };

  // TODO: This set of data comes from the character frequency-
  // of-occurrence analysis tool. The data needs to be moved
  // into a resource and loaded from there.
  recognizer.commonChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
  ];

  recognizer.nextChar = eucNextChar;
};
util.inherits(module.exports.euc_jp, mbcs);
390
391
392
/**
 * The charset recognizer for EUC-KR.
 *
 * Provides the charset name, the language code and a sorted table of
 * frequently occurring characters; character iteration is delegated to
 * the shared eucNextChar() function.
 */
module.exports.euc_kr = function() {
  var recognizer = this;

  recognizer.name = function() {
    return 'EUC-KR';
  };

  recognizer.language = function() {
    return 'ko';
  };

  // TODO: This set of data comes from the character frequency-
  // of-occurrence analysis tool. The data needs to be moved
  // into a resource and loaded from there.
  recognizer.commonChars = [
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
  ];

  recognizer.nextChar = eucNextChar;
};
util.inherits(module.exports.euc_kr, mbcs);
424
425
426
/**
 * GB-18030 recognizer. Uses simplified Chinese statistics.
 *
 * Provides the charset name, the language code, the nextChar()
 * implementation for one-, two- and four-byte characters, and a sorted
 * table of frequently occurring characters.
 */
module.exports.gb_18030 = function() {
  this.name = function() {
    return 'GB18030';
  };
  this.language = function() {
    return 'zh';
  };

  /*
   * Get the next character value for the GB18030 encoding.
   * Character 'value' is simply the raw bytes that make up the character
   * packed into an int.
   *
   * @param iter The IteratedChar that receives index, charValue and error.
   * @param det The charset detector holding the raw input bytes.
   * @return True if a character was returned, false once iter.done is set.
   */
  this.nextChar = function(iter, det) {
    iter.index = iter.nextIndex;
    iter.error = false;
    var firstByte = 0;
    var secondByte = 0;
    var thirdByte = 0;
    var fourthByte = 0;
    buildChar: {
      firstByte = iter.charValue = iter.nextByte(det);
      if (firstByte < 0) {
        // Ran off the end of the input data
        iter.done = true;
        break buildChar;
      }
      if (firstByte <= 0x80) {
        // single byte char
        break buildChar;
      }
      secondByte = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | secondByte;
      // NOTE(review): a lead byte of 0xFF falls through here without being
      // flagged as an error; kept as in the original - confirm if intended.
      if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char: trail byte is 0x40-0x7E or 0x80-0xFE.
        // (The upper range previously started at decimal 80, which let the
        // invalid trail byte 0x7F through.)
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
          break buildChar;
        }
        // Four byte char: digit, 0x81-0xFE, digit after the lead byte.
        if (secondByte >= 0x30 && secondByte <= 0x39) {
          thirdByte = iter.nextByte(det);
          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
            fourthByte = iter.nextByte(det);
            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
              iter.charValue = (iter.charValue << 16) | (thirdByte << 8) | fourthByte;
              break buildChar;
            }
          }
        }
        // Neither a valid two byte nor a valid four byte sequence.
        iter.error = true;
        break buildChar;
      }
    }
    return iter.done == false;
  };

  // TODO: This set of data comes from the character frequency-
  // of-occurrence analysis tool. The data needs to be moved
  // into a resource and loaded from there.
  this.commonChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
  ];
};
util.inherits(module.exports.gb_18030, mbcs);
Note: See TracBrowser for help on using the repository browser.