1 | var util = require('util'),
|
---|
2 | Match = require ('../match');
|
---|
3 |
|
---|
4 |
|
---|
/**
 * Superclass for the individual detectors for each of the detectable
 * members of the ISO 2022 family of encodings.
 *
 * `match` reads `this.escapeSequences`, so an instance must carry that
 * property (an array of byte-value arrays) before `match` is called on
 * input that contains an ESC byte.
 */
function ISO_2022() {}

/**
 * Finds the first entry of `sequences` that occurs in `text` at `offset`.
 *
 * The byte at index 0 of each entry is not compared: the caller only asks
 * once it has already seen ESC (0x1b) at `offset`.
 *
 * @param text the byte buffer being scanned
 * @param textLen the number of bytes of `text` to consider
 * @param offset index of the ESC byte in `text`
 * @param sequences candidate escape sequences (arrays of byte values)
 * @return length of the matching sequence, or 0 when none matches
 */
function iso2022MatchedEscapeLength(text, textLen, offset, sequences) {
  for (var s = 0; s < sequences.length; s++) {
    var seq = sequences[s];

    // An empty entry can never be a real escape sequence; treating it as a
    // match would stop the scan from advancing. Entries that would run past
    // the end of the input cannot match either.
    if (seq.length === 0 || (textLen - offset) < seq.length)
      continue;

    var k = 1;
    while (k < seq.length && seq[k] === text[offset + k])
      k++;

    if (k === seq.length)
      return seq.length;
  }

  return 0;
}

/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 * Counts up the number of legal and unrecognized escape sequences in
 * the sample of text, and computes a score based on the total number &
 * the proportion that fit the encoding.
 *
 * @param det detector state; `det.fInputBytes` is the byte buffer to
 *            analyse and `det.fInputLen` the number of bytes in it
 * @return a Match built from (det, this, quality) when quality is above 0,
 *         otherwise null. Quality is at most 100 and is not rounded, so it
 *         may be fractional.
 */
ISO_2022.prototype.match = function(det) {
  var text = det.fInputBytes;
  var textLen = det.fInputLen;
  var sequences = this.escapeSequences;

  var hits = 0;
  var misses = 0;
  var shifts = 0;

  var i = 0;
  while (i < textLen) {
    var current = text[i];

    if (current === 0x1b) {
      var matchedLen = iso2022MatchedEscapeLength(text, textLen, i, sequences);
      if (matchedLen > 0) {
        hits++;
        // Resume scanning right after the recognized sequence.
        i += matchedLen;
        continue;
      }
      misses++;
    } else if (current === 0x0e || current === 0x0f) {
      // Shift in/out
      shifts++;
    }

    i++;
  }

  if (hits === 0)
    return null;

  //
  // Initial quality is based on relative proportion of recognized vs.
  // unrecognized escape sequences.
  //   All good: quality = 100;
  //   half or less good: quality = 0;
  //   linear in between.
  var quality = (100 * hits - 100 * misses) / (hits + misses);

  // Back off quality if there were too few escape sequences seen.
  // Include shifts in this computation, so that KR does not get penalized
  // for having only a single Escape sequence, but many shifts.
  if (hits + shifts < 5)
    quality -= (5 - (hits + shifts)) * 10;

  return quality <= 0 ? null : new Match(det, this, quality);
};
|
---|
87 |
|
---|
/**
 * Detector for ISO-2022-JP.
 *
 * Every instance receives its own `name()` accessor and its own
 * escape-sequence table, which the inherited ISO_2022 `match` reads
 * through `this.escapeSequences`.
 */
module.exports.ISO_2022_JP = function() {
  var ESC = 0x1b;

  this.name = function() {
    return 'ISO-2022-JP';
  };

  this.escapeSequences = [
    [ESC, 0x24, 0x28, 0x43], // KS X 1001:1992
    [ESC, 0x24, 0x28, 0x44], // JIS X 212-1990
    [ESC, 0x24, 0x40],       // JIS C 6226-1978
    [ESC, 0x24, 0x41],       // GB 2312-80
    [ESC, 0x24, 0x42],       // JIS X 208-1983
    [ESC, 0x26, 0x40],       // JIS X 208 1990, 1997
    [ESC, 0x28, 0x42],       // ASCII
    [ESC, 0x28, 0x48],       // JIS-Roman
    [ESC, 0x28, 0x49],       // Half-width katakana
    [ESC, 0x28, 0x4a],       // JIS-Roman
    [ESC, 0x2e, 0x41],       // ISO 8859-1
    [ESC, 0x2e, 0x46]        // ISO 8859-7
  ];
};
// Link the prototype chain so instances pick up ISO_2022.prototype.match.
util.inherits(module.exports.ISO_2022_JP, ISO_2022);
|
---|
108 |
|
---|
109 |
|
---|
110 |
|
---|
/**
 * Detector for ISO-2022-KR.
 *
 * Every instance receives its own `name()` accessor and a one-entry
 * escape-sequence table, which the inherited ISO_2022 `match` reads
 * through `this.escapeSequences`.
 */
module.exports.ISO_2022_KR = function() {
  var ESC = 0x1b;

  this.name = function() {
    return 'ISO-2022-KR';
  };

  // The only escape sequence this detector recognizes.
  this.escapeSequences = [
    [ESC, 0x24, 0x29, 0x43]
  ];
};
// Link the prototype chain so instances pick up ISO_2022.prototype.match.
util.inherits(module.exports.ISO_2022_KR, ISO_2022);
|
---|
120 |
|
---|
121 |
|
---|
122 |
|
---|
/**
 * Detector for ISO-2022-CN.
 *
 * Every instance receives its own `name()` accessor and its own
 * escape-sequence table, which the inherited ISO_2022 `match` reads
 * through `this.escapeSequences`.
 */
module.exports.ISO_2022_CN = function() {
  var ESC = 0x1b;

  this.name = function() {
    return 'ISO-2022-CN';
  };

  this.escapeSequences = [
    [ESC, 0x24, 0x29, 0x41], // GB 2312-80
    [ESC, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
    [ESC, 0x24, 0x2A, 0x48], // CNS 11643-1992 Plane 2
    [ESC, 0x24, 0x29, 0x45], // ISO-IR-165
    [ESC, 0x24, 0x2B, 0x49], // CNS 11643-1992 Plane 3
    [ESC, 0x24, 0x2B, 0x4A], // CNS 11643-1992 Plane 4
    [ESC, 0x24, 0x2B, 0x4B], // CNS 11643-1992 Plane 5
    [ESC, 0x24, 0x2B, 0x4C], // CNS 11643-1992 Plane 6
    [ESC, 0x24, 0x2B, 0x4D], // CNS 11643-1992 Plane 7
    [ESC, 0x4e],             // SS2
    [ESC, 0x4f]              // SS3
  ];
};
// Link the prototype chain so instances pick up ISO_2022.prototype.match.
util.inherits(module.exports.ISO_2022_CN, ISO_2022);
|
---|