Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: trip-planner-front/node_modules/chardet/encoding/mbcs.js

Last change on this file was 6a3a178, checked in by Ema <ema_spirova@…>, 3 years ago
initial commit
Property mode set to `100644`
File size: 16.6 KB

Line
1	var util = require('util'),
2	Match = require ('../match');
3
/**
 * Binary search implementation (iterative).
 *
 * @param {Array} arr Array to search; expected to be sorted in ascending order.
 * @param {*} searchValue Value to look for, compared with `<` and `>`.
 * @return {number} Index of an element that is neither less than nor greater
 *                  than searchValue, or -1 when there is none.
 */
function binarySearch(arr, searchValue) {
  var low = 0;
  var high = arr.length - 1;

  while (low <= high) {
    // The unsigned shift halves the sum and truncates it to an integer.
    var middle = (low + high) >>> 1;
    var candidate = arr[middle];

    if (searchValue > candidate) {
      low = middle + 1;
    } else if (searchValue < candidate) {
      high = middle - 1;
    } else {
      return middle;
    }
  }

  return -1;
}
29
// 'Character' iterated character class.
//
// A recognizer for a specific mbcs encoding exposes its 'characters' through a
// nextChar() function that fills an IteratedChar instance with the next char
// taken from the input. Characters are NOT converted to Unicode; charValue is
// the raw bytes from the codepage data concatenated into an int.
//
// For Asian charsets, use the raw input rather than the input that has been
// stripped of markup. Detection only considers multi-byte chars, effectively
// stripping markup anyway, and double byte chars do occur in markup too.
//
function IteratedChar() {
  this.charValue = 0; // 1-4 bytes from the raw input data
  this.index = 0;
  this.nextIndex = 0;
  this.error = false;
  this.done = false;

  // Rewind to the start of the input and clear the error/done flags.
  this.reset = function() {
    this.done = false;
    this.error = false;
    this.nextIndex = 0;
    this.index = -1;
    this.charValue = 0;
  };

  // Return the next raw byte (0-255) of det.fRawInput and advance nextIndex;
  // once nextIndex reaches det.fRawLength, set done and return -1.
  this.nextByte = function(det) {
    if (this.nextIndex < det.fRawLength) {
      var raw = det.fRawInput[this.nextIndex];
      this.nextIndex += 1;
      return raw & 0x00ff;
    }
    this.done = true;
    return -1;
  };
}
66
67
68
/**
 * Asian double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 *
 * Concrete recognizers supply nextChar() and may supply a sorted
 * `commonChars` array of raw character values.
 */
function mbcs() {}

/**
 * Test the match of this charset with the input text data
 * which is obtained via the CharsetDetector object.
 *
 * @param det The CharsetDetector, which contains the input text
 *            to be checked for being in this charset.
 * @return A Match built from (det, this, confidence) with a confidence of
 *         at most 100, or null when the confidence is zero.
 */
mbcs.prototype.match = function(det) {
  var doubleByteCharCount = 0;
  var commonCharCount = 0;
  var badCharCount = 0;
  var totalCharCount = 0;
  var confidence;

  var iter = new IteratedChar();

  for (iter.reset(); this.nextChar(iter, det);) {
    totalCharCount++;
    if (iter.error) {
      badCharCount++;
    } else {
      // Reinterpret the packed bytes as an unsigned 32-bit value. A plain
      // `& 0xFFFFFFFF` stays signed in JavaScript, so a four-byte character
      // whose lead byte has its high bit set would come out negative and be
      // mistaken for a single-byte character.
      var cv = iter.charValue >>> 0;

      if (cv > 0xff) {
        doubleByteCharCount++;
        // NOTE: This assumes that there are no 4-byte common chars.
        if (this.commonChars != null && binarySearch(this.commonChars, cv) >= 0) {
          commonCharCount++;
        }
      }
    }
    if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
      // Bail out early if the byte data is not matching the encoding scheme.
      return null;
    }
  }

  if (doubleByteCharCount <= 10 && badCharCount === 0) {
    // Not many multi-byte chars.
    if (doubleByteCharCount === 0 && totalCharCount < 10) {
      // There weren't any multibyte sequences, and there was a low density of
      // non-ASCII single bytes. We don't have enough data to have any confidence.
      // Statistical analysis of single byte non-ASCII characters would probably
      // help here.
      return null;
    }
    // ASCII or ISO file? It's probably not our encoding,
    // but is not incompatible with our encoding, so don't give it a zero.
    return new Match(det, this, 10);
  }

  //
  // No match if there are too many characters that don't fit the encoding scheme.
  // (should we have zero tolerance for these?)
  //
  if (doubleByteCharCount < 20 * badCharCount) {
    return null;
  }

  if (this.commonChars == null) {
    // We have no statistics on frequently occurring characters.
    // Assess confidence purely on having a reasonable number of
    // multi-byte characters (the more the better).
    confidence = Math.min(30 + doubleByteCharCount - 20 * badCharCount, 100);
  } else {
    //
    // Frequency of occurrence statistics exist.
    //
    var maxVal = Math.log(doubleByteCharCount / 4);
    var scaleFactor = 90.0 / maxVal;
    confidence = Math.floor(Math.log(commonCharCount + 1) * scaleFactor + 10);
    confidence = Math.min(confidence, 100);
  }

  return confidence === 0 ? null : new Match(det, this, confidence);
};

/**
 * Get the next character (however many bytes it is) from the input data
 * Subclasses for specific charset encodings must implement this function
 * to get characters according to the rules of their encoding scheme.
 *
 * This function is not a method of class iteratedChar only because
 * that would require a lot of extra derived classes, which is awkward.
 * @param iter The iteratedChar 'struct' into which the returned char is placed.
 * @param det The charset detector, which is needed to get at the input byte data
 *            being iterated over.
 * @return True if a character was returned, false at end of input.
 */
mbcs.prototype.nextChar = function(iter, det) {};
187
188
189
/**
 * Shift-JIS charset recognizer.
 */
module.exports.sjis = function() {
  this.name = function() {
    return 'Shift-JIS';
  };

  this.language = function() {
    return 'ja';
  };

  // TODO: This set of data comes from the character frequency-
  //       of-occurrence analysis tool. The data needs to be moved
  //       into a resource and loaded from there.
  this.commonChars = [
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
  ];

  // Read the next one- or two-byte character into iter.
  // Returns false once the input is exhausted, true otherwise.
  this.nextChar = function(iter, det) {
    iter.index = iter.nextIndex;
    iter.error = false;

    var lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) {
      return false;
    }

    // These lead bytes stand alone as single-byte characters.
    var isSingleByte = lead <= 0x7f || (lead > 0xa0 && lead <= 0xdf);
    if (isSingleByte) {
      return true;
    }

    var trail = iter.nextByte(det);
    if (trail < 0) {
      return false;
    }

    iter.charValue = (lead << 8) | trail;

    var inLowRange = trail >= 0x40 && trail <= 0x7f;
    var inHighRange = trail >= 0x80 && trail <= 0xff;
    if (!inLowRange && !inHighRange) {
      // Illegal second byte value.
      iter.error = true;
    }
    return true;
  };
};
util.inherits(module.exports.sjis, mbcs);
238
239
240
/**
 * Big5 charset recognizer.
 */
module.exports.big5 = function() {
  this.name = function() {
    return 'Big5';
  };

  this.language = function() {
    return 'zh';
  };

  // TODO: This set of data comes from the character frequency-
  //       of-occurrence analysis tool. The data needs to be moved
  //       into a resource and loaded from there.
  this.commonChars = [
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
  ];

  // Read the next one- or two-byte character into iter.
  // Returns false once the input is exhausted, true otherwise.
  this.nextChar = function(iter, det) {
    iter.index = iter.nextIndex;
    iter.error = false;

    var lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) {
      return false;
    }

    // single byte character.
    if (lead <= 0x7f || lead == 0xff) {
      return true;
    }

    var trail = iter.nextByte(det);
    if (trail < 0) {
      return false;
    }

    iter.charValue = (iter.charValue << 8) | trail;

    var isBadTrail = trail < 0x40 || trail == 0x7f || trail == 0xff;
    if (isBadTrail) {
      iter.error = true;
    }
    return true;
  };
};
util.inherits(module.exports.big5, mbcs);
293
294
295
/**
 * EUC charset recognizers share this function for getting the next character
 * according to the EUC encoding scheme; the EUC-JP and EUC-KR recognizers in
 * this file use it as their nextChar().
 *
 * Character 'value' is simply the raw bytes that make up the character
 * packed into an int.
 *
 * @param iter The iterated-char object that receives the character.
 * @param det The charset detector holding the raw input bytes.
 * @return True if a character was produced, false once the input ran out.
 */
function eucNextChar(iter, det) {
  iter.index = iter.nextIndex;
  iter.error = false;

  var lead = iter.nextByte(det);
  iter.charValue = lead;

  if (lead < 0) {
    // Ran off the end of the input data
    iter.done = true;
    return false;
  }

  // Anything up to 0x8d is a single byte char and needs no further bytes.
  if (lead > 0x8d) {
    var trail = iter.nextByte(det);
    iter.charValue = (iter.charValue << 8) | trail;

    // 0xa1-0xfe starts an ordinary two byte char.
    // 0x8e is Code Set 2: in EUC-JP the total char size is 2 bytes, in EUC-TW
    // it is 4 bytes. We don't know which we've got, so treat it like EUC-JP;
    // if the data really was EUC-TW, the following two bytes will look like
    // a well formed 2 byte char.
    var hasTwoByteLead = (lead >= 0xa1 && lead <= 0xfe) || lead == 0x8e;

    if (hasTwoByteLead) {
      if (trail < 0xa1) {
        iter.error = true;
      }
    } else if (lead == 0x8f) {
      // Code set 3.
      // Three byte total char size, two bytes of actual char value.
      var third = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | third;
      if (third < 0xa1) {
        iter.error = true;
      }
    }
  }

  return iter.done === false;
}
356
357
358
/**
 * The charset recognizer for EUC-JP. A singleton instance of this class
 * is created and kept by the public CharsetDetector class.
 */
module.exports.euc_jp = function() {
  this.name = function() {
    return 'EUC-JP';
  };

  this.language = function() {
    return 'ja';
  };

  // TODO: This set of data comes from the character frequency-
  //       of-occurrence analysis tool. The data needs to be moved
  //       into a resource and loaded from there.
  var frequentChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
  ];
  this.commonChars = frequentChars;

  // Characters are read with the shared EUC byte-sequence rules.
  this.nextChar = eucNextChar;
};
util.inherits(module.exports.euc_jp, mbcs);
390
391
392
/**
 * The charset recognizer for EUC-KR. A singleton instance of this class
 * is created and kept by the public CharsetDetector class.
 */
module.exports.euc_kr = function() {
  this.name = function() {
    return 'EUC-KR';
  };

  this.language = function() {
    return 'ko';
  };

  // TODO: This set of data comes from the character frequency-
  //       of-occurrence analysis tool. The data needs to be moved
  //       into a resource and loaded from there.
  var frequentChars = [
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
  ];
  this.commonChars = frequentChars;

  // Characters are read with the shared EUC byte-sequence rules.
  this.nextChar = eucNextChar;
};
util.inherits(module.exports.euc_kr, mbcs);
424
425
426
/**
 * GB-18030 recognizer. Uses simplified Chinese statistics.
 */
module.exports.gb_18030 = function() {
  this.name = function() {
    return 'GB18030';
  };

  this.language = function() {
    return 'zh';
  };

  /*
   * Get the next character value for GB18030 input.
   * Character 'value' is simply the raw bytes that make up the character
   * packed into an int. For a four byte char whose lead byte is 0x81 or above
   * the packed value is negative as a signed 32-bit int; coerce it with
   * `>>> 0` before comparing magnitudes.
   *
   * Returns true if a character was produced, false once the input ran out.
   */
  this.nextChar = function(iter, det) {
    iter.index = iter.nextIndex;
    iter.error = false;

    var firstByte = iter.charValue = iter.nextByte(det);
    if (firstByte < 0) {
      // Ran off the end of the input data
      iter.done = true;
      return false;
    }

    // Anything up to 0x80 is a single byte char and needs no further bytes.
    if (firstByte > 0x80) {
      var secondByte = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | secondByte;

      if (firstByte >= 0x81 && firstByte <= 0xfe) {
        // Two byte char: trail byte is 0x40-0x7e or 0x80-0xfe. The upper range
        // must start at hex 0x80; a decimal 80 here would wrongly accept 0x7f.
        var isTwoByteTrail = (secondByte >= 0x40 && secondByte <= 0x7e) ||
                             (secondByte >= 0x80 && secondByte <= 0xfe);

        if (!isTwoByteTrail) {
          // Four byte char: digit, 0x81-0xfe, digit after the lead byte.
          var isFourByte = false;
          if (secondByte >= 0x30 && secondByte <= 0x39) {
            var thirdByte = iter.nextByte(det);
            if (thirdByte >= 0x81 && thirdByte <= 0xfe) {
              var fourthByte = iter.nextByte(det);
              if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                iter.charValue = (iter.charValue << 16) | (thirdByte << 8) | fourthByte;
                isFourByte = true;
              }
            }
          }
          if (!isFourByte) {
            // Neither a legal two byte nor a legal four byte sequence.
            iter.error = true;
          }
        }
      }
    }

    return iter.done === false;
  };

  // TODO: This set of data comes from the character frequency-
  //       of-occurrence analysis tool. The data needs to be moved
  //       into a resource and loaded from there.
  this.commonChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
  ];
};
util.inherits(module.exports.gb_18030, mbcs);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: