[6a3a178] | 1 | var util = require('util'),
|
---|
| 2 | Match = require ('../match');
|
---|
| 3 |
|
---|
| 4 |
|
---|
/**
 * This is a superclass for the individual detectors for
 * each of the detectable members of the ISO 2022 family
 * of encodings.
 *
 * `match` reads `this.escapeSequences`, an array of byte arrays, which the
 * concrete detectors assign in their constructors.
 */
function ISO_2022() {}

/**
 * Finds the first escape sequence that occurs in the input at a given offset.
 *
 * The first byte of each sequence is not compared: the caller only invokes
 * this helper at an offset where it has already seen an ESC byte.
 *
 * @param text the bytes being analysed (indexable, yielding numbers)
 * @param textLen the number of leading bytes of `text` that may be examined
 * @param pos the offset of the ESC byte
 * @param sequences the candidate escape sequences, tried in order
 * @return the length of the first sequence that fits and matches, or 0 if
 *         none does
 */
function matchedEscapeLength(text, textLen, pos, sequences) {
  for (var n = 0; n < sequences.length; n++) {
    var seq = sequences[n];

    // A sequence that would run past the end of the sample cannot match.
    if (textLen - pos < seq.length) {
      continue;
    }

    var k = 1;
    while (k < seq.length && seq[k] === text[pos + k]) {
      k++;
    }

    if (k >= seq.length) {
      return seq.length;
    }
  }

  return 0;
}

/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 * Counts up the number of legal and unrecognized escape sequences in
 * the sample of text, and computes a score based on the total number and
 * the proportion that fit the encoding.
 *
 * @param det the detector input; `det.fInputBytes` holds the bytes to analyse
 *            and `det.fInputLen` the number of bytes to look at
 * @return a Match whose quality is greater than 0 and at most 100 (the value
 *         is not rounded, so it may be fractional), or null when no
 *         recognized escape sequence was seen or the quality is not positive
 */
ISO_2022.prototype.match = function(det) {
  var text = det.fInputBytes;
  var textLen = det.fInputLen;

  var hits = 0;    // ESC bytes that start a recognized sequence
  var misses = 0;  // ESC bytes that start nothing we recognize
  var shifts = 0;  // Shift-out (0x0e) / shift-in (0x0f) bytes

  for (var i = 0; i < textLen; i++) {
    var b = text[i];

    if (b === 0x1b) {
      var len = matchedEscapeLength(text, textLen, i, this.escapeSequences);

      if (len > 0) {
        hits++;
        // Skip the remainder of the sequence; the loop's i++ steps past
        // its last byte.
        i += len - 1;
      } else {
        misses++;
      }
    } else if (b === 0x0e || b === 0x0f) {
      shifts++;
    }
  }

  if (hits === 0) {
    return null;
  }

  //
  // Initial quality is based on relative proportion of recognized vs.
  // unrecognized escape sequences.
  //   All good:          quality = 100;
  //   half or less good: quality <= 0;
  //   linear in between.
  var quality = (100 * hits - 100 * misses) / (hits + misses);

  // Back off quality if there were too few escape sequences seen.
  // Include shifts in this computation, so that KR does not get penalized
  // for having only a single escape sequence, but many shifts.
  if (hits + shifts < 5) {
    quality -= (5 - (hits + shifts)) * 10;
  }

  return quality <= 0 ? null : new Match(det, this, quality);
};
|
---|
| 87 |
|
---|
/**
 * Detector for ISO-2022-JP.
 *
 * Each instance carries its own `name()` accessor and its own table of
 * escape sequences; the inherited `match` reads `this.escapeSequences`.
 */
module.exports.ISO_2022_JP = function() {
  // Escape sequences counted as hits for this detector.
  var recognized = [
    [ 0x1b, 0x24, 0x28, 0x43 ],  // KS X 1001:1992
    [ 0x1b, 0x24, 0x28, 0x44 ],  // JIS X 212-1990
    [ 0x1b, 0x24, 0x40 ],        // JIS C 6226-1978
    [ 0x1b, 0x24, 0x41 ],        // GB 2312-80
    [ 0x1b, 0x24, 0x42 ],        // JIS X 208-1983
    [ 0x1b, 0x26, 0x40 ],        // JIS X 208 1990, 1997
    [ 0x1b, 0x28, 0x42 ],        // ASCII
    [ 0x1b, 0x28, 0x48 ],        // JIS-Roman
    [ 0x1b, 0x28, 0x49 ],        // Half-width katakana
    [ 0x1b, 0x28, 0x4a ],        // JIS-Roman
    [ 0x1b, 0x2e, 0x41 ],        // ISO 8859-1
    [ 0x1b, 0x2e, 0x46 ]         // ISO 8859-7
  ];

  this.name = function() {
    return 'ISO-2022-JP';
  };
  this.escapeSequences = recognized;
};
util.inherits(module.exports.ISO_2022_JP, ISO_2022);
|
---|
| 108 |
|
---|
| 109 |
|
---|
| 110 |
|
---|
/**
 * Detector for ISO-2022-KR.
 *
 * Each instance carries its own `name()` accessor and its own table of
 * escape sequences; the inherited `match` reads `this.escapeSequences`.
 */
module.exports.ISO_2022_KR = function() {
  // The single escape sequence counted as a hit for this detector.
  var recognized = [
    [ 0x1b, 0x24, 0x29, 0x43 ]
  ];

  this.name = function() {
    return 'ISO-2022-KR';
  };
  this.escapeSequences = recognized;
};
util.inherits(module.exports.ISO_2022_KR, ISO_2022);
|
---|
| 120 |
|
---|
| 121 |
|
---|
| 122 |
|
---|
/**
 * Detector for ISO-2022-CN.
 *
 * Each instance carries its own `name()` accessor and its own table of
 * escape sequences; the inherited `match` reads `this.escapeSequences`.
 */
module.exports.ISO_2022_CN = function() {
  // Escape sequences counted as hits for this detector.
  var recognized = [
    [ 0x1b, 0x24, 0x29, 0x41 ],  // GB 2312-80
    [ 0x1b, 0x24, 0x29, 0x47 ],  // CNS 11643-1992 Plane 1
    [ 0x1b, 0x24, 0x2a, 0x48 ],  // CNS 11643-1992 Plane 2
    [ 0x1b, 0x24, 0x29, 0x45 ],  // ISO-IR-165
    [ 0x1b, 0x24, 0x2b, 0x49 ],  // CNS 11643-1992 Plane 3
    [ 0x1b, 0x24, 0x2b, 0x4a ],  // CNS 11643-1992 Plane 4
    [ 0x1b, 0x24, 0x2b, 0x4b ],  // CNS 11643-1992 Plane 5
    [ 0x1b, 0x24, 0x2b, 0x4c ],  // CNS 11643-1992 Plane 6
    [ 0x1b, 0x24, 0x2b, 0x4d ],  // CNS 11643-1992 Plane 7
    [ 0x1b, 0x4e ],              // SS2
    [ 0x1b, 0x4f ]               // SS3
  ];

  this.name = function() {
    return 'ISO-2022-CN';
  };
  this.escapeSequences = recognized;
};
util.inherits(module.exports.ISO_2022_CN, ISO_2022);
|
---|