Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trip-planner-front/node_modules/chardet/encoding/mbcs.js@ e29cc2e

Last change on this file since e29cc2e was 6a3a178, checked in by Ema <ema_spirova@…>, 3 years ago
initial commit
Property mode set to `100644`
File size: 16.6 KB

Rev	Line
[6a3a178]	1	var util = require('util'),
	2	Match = require ('../match');
	3
	4	/**
	5	* Binary search implementation (recursive)
	6	*/
	7	function binarySearch(arr, searchValue) {
	8	function find(arr, searchValue, left, right) {
	9	if (right < left)
	10	return -1;
	11
	12	/*
	13	int mid = mid = (left + right) / 2;
	14	There is a bug in the above line;
	15	Joshua Bloch suggests the following replacement:
	16	*/
	17	var mid = Math.floor((left + right) >>> 1);
	18	if (searchValue > arr[mid])
	19	return find(arr, searchValue, mid + 1, right);
	20
	21	if (searchValue < arr[mid])
	22	return find(arr, searchValue, left, mid - 1);
	23
	24	return mid;
	25	};
	26
	27	return find(arr, searchValue, 0, arr.length - 1);
	28	};
	29
	30	// 'Character' iterated character class.
	31	// Recognizers for specific mbcs encodings make their 'characters' available
	32	// by providing a nextChar() function that fills in an instance of iteratedChar
	33	// with the next char from the input.
	34	// The returned characters are not converted to Unicode, but remain as the raw
	35	// bytes (concatenated into an int) from the codepage data.
	36	//
	37	// For Asian charsets, use the raw input rather than the input that has been
	38	// stripped of markup. Detection only considers multi-byte chars, effectively
	39	// stripping markup anyway, and double byte chars do occur in markup too.
	40	//
	41	function IteratedChar() {
	42
	43	this.charValue = 0; // 1-4 bytes from the raw input data
	44	this.index = 0;
	45	this.nextIndex = 0;
	46	this.error = false;
	47	this.done = false;
	48
	49	this.reset = function() {
	50	this.charValue = 0;
	51	this.index = -1;
	52	this.nextIndex = 0;
	53	this.error = false;
	54	this.done = false;
	55	};
	56
	57	this.nextByte = function(det) {
	58	if (this.nextIndex >= det.fRawLength) {
	59	this.done = true;
	60	return -1;
	61	}
	62	var byteValue = det.fRawInput[this.nextIndex++] & 0x00ff;
	63	return byteValue;
	64	};
	65	};
	66
	67
	68
	69	/**
	70	* Asian double or multi-byte - charsets.
	71	* Match is determined mostly by the input data adhering to the
	72	* encoding scheme for the charset, and, optionally,
	73	* frequency-of-occurence of characters.
	74	*/
	75
	76	function mbcs() {};
	77
	78	/**
	79	* Test the match of this charset with the input text data
	80	* which is obtained via the CharsetDetector object.
	81	*
	82	* @param det The CharsetDetector, which contains the input text
	83	* to be checked for being in this charset.
	84	* @return Two values packed into one int (Damn java, anyhow)
	85	* bits 0-7: the match confidence, ranging from 0-100
	86	* bits 8-15: The match reason, an enum-like value.
	87	*/
	88	mbcs.prototype.match = function(det) {
	89
	90	var singleByteCharCount = 0, //TODO Do we really need this?
	91	doubleByteCharCount = 0,
	92	commonCharCount = 0,
	93	badCharCount = 0,
	94	totalCharCount = 0,
	95	confidence = 0;
	96
	97	var iter = new IteratedChar();
	98
	99	detectBlock: {
	100	for (iter.reset(); this.nextChar(iter, det);) {
	101	totalCharCount++;
	102	if (iter.error) {
	103	badCharCount++;
	104	} else {
	105	var cv = iter.charValue & 0xFFFFFFFF;
	106
	107	if (cv <= 0xff) {
	108	singleByteCharCount++;
	109	} else {
	110	doubleByteCharCount++;
	111	if (this.commonChars != null) {
	112	// NOTE: This assumes that there are no 4-byte common chars.
	113	if (binarySearch(this.commonChars, cv) >= 0) {
	114	commonCharCount++;
	115	}
	116	}
	117	}
	118	}
	119	if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
	120	// console.log('its here!')
	121	// Bail out early if the byte data is not matching the encoding scheme.
	122	break detectBlock;
	123	}
	124	}
	125
	126	if (doubleByteCharCount <= 10 && badCharCount== 0) {
	127	// Not many multi-byte chars.
	128	if (doubleByteCharCount == 0 && totalCharCount < 10) {
	129	// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
	130	// We don't have enough data to have any confidence.
	131	// Statistical analysis of single byte non-ASCII charcters would probably help here.
	132	confidence = 0;
	133	}
	134	else {
	135	// ASCII or ISO file? It's probably not our encoding,
	136	// but is not incompatible with our encoding, so don't give it a zero.
	137	confidence = 10;
	138	}
	139	break detectBlock;
	140	}
	141
	142	//
	143	// No match if there are too many characters that don't fit the encoding scheme.
	144	// (should we have zero tolerance for these?)
	145	//
	146	if (doubleByteCharCount < 20 * badCharCount) {
	147	confidence = 0;
	148	break detectBlock;
	149	}
	150
	151	if (this.commonChars == null) {
	152	// We have no statistics on frequently occuring characters.
	153	// Assess confidence purely on having a reasonable number of
	154	// multi-byte characters (the more the better
	155	confidence = 30 + doubleByteCharCount - 20 * badCharCount;
	156	if (confidence > 100) {
	157	confidence = 100;
	158	}
	159	} else {
	160	//
	161	// Frequency of occurence statistics exist.
	162	//
	163	var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);
	164	var scaleFactor = 90.0 / maxVal;
	165	confidence = Math.floor(Math.log(commonCharCount + 1) * scaleFactor + 10);
	166	confidence = Math.min(confidence, 100);
	167	}
	168	} // end of detectBlock:
	169
	170	return confidence == 0 ? null : new Match(det, this, confidence);
	171	};
	172
	173	/**
	174	* Get the next character (however many bytes it is) from the input data
	175	* Subclasses for specific charset encodings must implement this function
	176	* to get characters according to the rules of their encoding scheme.
	177	*
	178	* This function is not a method of class iteratedChar only because
	179	* that would require a lot of extra derived classes, which is awkward.
	180	* @param it The iteratedChar 'struct' into which the returned char is placed.
	181	* @param det The charset detector, which is needed to get at the input byte data
	182	* being iterated over.
	183	* @return True if a character was returned, false at end of input.
	184	*/
	185
	186	mbcs.prototype.nextChar = function(iter, det) {};
	187
	188
	189
	190	/**
	191	* Shift-JIS charset recognizer.
	192	*/
	193	module.exports.sjis = function() {
	194	this.name = function() {
	195	return 'Shift-JIS';
	196	};
	197	this.language = function() {
	198	return 'ja';
	199	};
	200
	201	// TODO: This set of data comes from the character frequency-
	202	// of-occurence analysis tool. The data needs to be moved
	203	// into a resource and loaded from there.
	204	this.commonChars = [
	205	0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
	206	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
	207	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
	208	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
	209	0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
	210	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
	211	];
	212
	213	this.nextChar = function(iter, det) {
	214	iter.index = iter.nextIndex;
	215	iter.error = false;
	216
	217	var firstByte;
	218	firstByte = iter.charValue = iter.nextByte(det);
	219	if (firstByte < 0)
	220	return false;
	221
	222	if (firstByte <= 0x7f \|\| (firstByte > 0xa0 && firstByte <= 0xdf))
	223	return true;
	224
	225	var secondByte = iter.nextByte(det);
	226	if (secondByte < 0)
	227	return false;
	228
	229	iter.charValue = (firstByte << 8) \| secondByte;
	230	if (! ((secondByte >= 0x40 && secondByte <= 0x7f) \|\| (secondByte >= 0x80 && secondByte <= 0xff))) {
	231	// Illegal second byte value.
	232	iter.error = true;
	233	}
	234	return true;
	235	};
	236	};
	237	util.inherits(module.exports.sjis, mbcs);
	238
	239
	240
	241	/**
	242	* Big5 charset recognizer.
	243	*/
	244	module.exports.big5 = function() {
	245	this.name = function() {
	246	return 'Big5';
	247	};
	248	this.language = function() {
	249	return 'zh';
	250	};
	251	// TODO: This set of data comes from the character frequency-
	252	// of-occurence analysis tool. The data needs to be moved
	253	// into a resource and loaded from there.
	254	this.commonChars = [
	255	0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
	256	0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
	257	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
	258	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
	259	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
	260	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
	261	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
	262	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
	263	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
	264	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
	265	];
	266	this.nextChar = function(iter, det) {
	267	iter.index = iter.nextIndex;
	268	iter.error = false;
	269
	270	var firstByte = iter.charValue = iter.nextByte(det);
	271
	272	if (firstByte < 0)
	273	return false;
	274
	275	// single byte character.
	276	if (firstByte <= 0x7f \|\| firstByte == 0xff)
	277	return true;
	278
	279	var secondByte = iter.nextByte(det);
	280
	281	if (secondByte < 0)
	282	return false;
	283
	284	iter.charValue = (iter.charValue << 8) \| secondByte;
	285
	286	if (secondByte < 0x40 \|\| secondByte == 0x7f \|\| secondByte == 0xff)
	287	iter.error = true;
	288
	289	return true;
	290	};
	291	};
	292	util.inherits(module.exports.big5, mbcs);
	293
	294
	295
	296	/**
	297	* EUC charset recognizers. One abstract class that provides the common function
	298	* for getting the next character according to the EUC encoding scheme,
	299	* and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
	300	*
	301	* Get the next character value for EUC based encodings.
	302	* Character 'value' is simply the raw bytes that make up the character
	303	* packed into an int.
	304	*/
	305	function eucNextChar(iter, det) {
	306	iter.index = iter.nextIndex;
	307	iter.error = false;
	308	var firstByte = 0;
	309	var secondByte = 0;
	310	var thirdByte = 0;
	311	//int fourthByte = 0;
	312	buildChar: {
	313	firstByte = iter.charValue = iter.nextByte(det);
	314	if (firstByte < 0) {
	315	// Ran off the end of the input data
	316	iter.done = true;
	317	break buildChar;
	318	}
	319	if (firstByte <= 0x8d) {
	320	// single byte char
	321	break buildChar;
	322	}
	323	secondByte = iter.nextByte(det);
	324	iter.charValue = (iter.charValue << 8) \| secondByte;
	325	if (firstByte >= 0xA1 && firstByte <= 0xfe) {
	326	// Two byte Char
	327	if (secondByte < 0xa1) {
	328	iter.error = true;
	329	}
	330	break buildChar;
	331	}
	332	if (firstByte == 0x8e) {
	333	// Code Set 2.
	334	// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
	335	// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
	336	// We don't know which we've got.
	337	// Treat it like EUC-JP. If the data really was EUC-TW, the following two
	338	// bytes will look like a well formed 2 byte char.
	339	if (secondByte < 0xa1) {
	340	iter.error = true;
	341	}
	342	break buildChar;
	343	}
	344	if (firstByte == 0x8f) {
	345	// Code set 3.
	346	// Three byte total char size, two bytes of actual char value.
	347	thirdByte = iter.nextByte(det);
	348	iter.charValue = (iter.charValue << 8) \| thirdByte;
	349	if (thirdByte < 0xa1) {
	350	iter.error = true;
	351	}
	352	}
	353	}
	354	return iter.done == false;
	355	};
	356
	357
	358
	359	/**
	360	* The charset recognize for EUC-JP. A singleton instance of this class
	361	* is created and kept by the public CharsetDetector class
	362	*/
	363	module.exports.euc_jp = function() {
	364	this.name = function() {
	365	return 'EUC-JP';
	366	};
	367	this.language = function() {
	368	return 'ja';
	369	};
	370
	371	// TODO: This set of data comes from the character frequency-
	372	// of-occurence analysis tool. The data needs to be moved
	373	// into a resource and loaded from there.
	374	this.commonChars = [
	375	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
	376	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
	377	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
	378	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
	379	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
	380	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
	381	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
	382	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
	383	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
	384	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
	385	];
	386
	387	this.nextChar = eucNextChar;
	388	};
	389	util.inherits(module.exports.euc_jp, mbcs);
	390
	391
	392
	393	/**
	394	* The charset recognize for EUC-KR. A singleton instance of this class
	395	* is created and kept by the public CharsetDetector class
	396	*/
	397	module.exports.euc_kr = function() {
	398	this.name = function() {
	399	return 'EUC-KR';
	400	};
	401	this.language = function() {
	402	return 'ko';
	403	};
	404
	405	// TODO: This set of data comes from the character frequency-
	406	// of-occurence analysis tool. The data needs to be moved
	407	// into a resource and loaded from there.
	408	this.commonChars = [
	409	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
	410	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
	411	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
	412	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
	413	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
	414	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
	415	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
	416	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
	417	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
	418	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
	419	];
	420
	421	this.nextChar = eucNextChar;
	422	};
	423	util.inherits(module.exports.euc_kr, mbcs);
	424
	425
	426
	427	/**
	428	* GB-18030 recognizer. Uses simplified Chinese statistics.
	429	*/
	430	module.exports.gb_18030 = function() {
	431	this.name = function() {
	432	return 'GB18030';
	433	};
	434	this.language = function() {
	435	return 'zh';
	436	};
	437
	438	/*
	439	* Get the next character value for EUC based encodings.
	440	* Character 'value' is simply the raw bytes that make up the character
	441	* packed into an int.
	442	*/
	443	this.nextChar = function(iter, det) {
	444	iter.index = iter.nextIndex;
	445	iter.error = false;
	446	var firstByte = 0;
	447	var secondByte = 0;
	448	var thirdByte = 0;
	449	var fourthByte = 0;
	450	buildChar: {
	451	firstByte = iter.charValue = iter.nextByte(det);
	452	if (firstByte < 0) {
	453	// Ran off the end of the input data
	454	iter.done = true;
	455	break buildChar;
	456	}
	457	if (firstByte <= 0x80) {
	458	// single byte char
	459	break buildChar;
	460	}
	461	secondByte = iter.nextByte(det);
	462	iter.charValue = (iter.charValue << 8) \| secondByte;
	463	if (firstByte >= 0x81 && firstByte <= 0xFE) {
	464	// Two byte Char
	465	if ((secondByte >= 0x40 && secondByte <= 0x7E) \|\| (secondByte >=80 && secondByte <= 0xFE)) {
	466	break buildChar;
	467	}
	468	// Four byte char
	469	if (secondByte >= 0x30 && secondByte <= 0x39) {
	470	thirdByte = iter.nextByte(det);
	471	if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
	472	fourthByte = iter.nextByte(det);
	473	if (fourthByte >= 0x30 && fourthByte <= 0x39) {
	474	iter.charValue = (iter.charValue << 16) \| (thirdByte << 8) \| fourthByte;
	475	break buildChar;
	476	}
	477	}
	478	}
	479	iter.error = true;
	480	break buildChar;
	481	}
	482	}
	483	return iter.done == false;
	484	};
	485
	486	// TODO: This set of data comes from the character frequency-
	487	// of-occurence analysis tool. The data needs to be moved
	488	// into a resource and loaded from there.
	489	this.commonChars = [
	490	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
	491	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
	492	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
	493	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
	494	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
	495	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
	496	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
	497	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
	498	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
	499	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
	500	];
	501	};
	502	util.inherits(module.exports.gb_18030, mbcs);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: