[d24f17c] | 1 | import { __extends } from "tslib";
|
---|
| 2 | import { Matcher } from './matcher';
|
---|
| 3 | import { alphaNumericCharsStr, alphaNumericAndMarksCharsStr, getDomainNameStr } from '../regex-lib';
|
---|
| 4 | import { tldRegex } from './tld-regex';
|
---|
| 5 | import { UrlMatch } from '../match/url-match';
|
---|
| 6 | import { UrlMatchValidator } from './url-match-validator';
|
---|
| 7 | // RegExp objects which are shared by all instances of UrlMatcher. These are
|
---|
| 8 | // here to avoid re-instantiating the RegExp objects if `Autolinker.link()` is
|
---|
| 9 | // called multiple times, thus instantiating UrlMatcher and its RegExp
|
---|
| 10 | // objects each time (which is very expensive - see https://github.com/gregjacobs/Autolinker.js/issues/314).
|
---|
| 11 | // See descriptions of the properties where they are used for details about them
|
---|
| 12 | // prettier-ignore
|
---|
var matcherRegex = (function () {
    // Scheme prefix, in the form "http://" or "mailto:". Two negative
    // lookaheads guard it:
    //  - it must not be directly followed by another "scheme://", so that
    //    the "link:" in 'link:http://www.google.com' is not taken as the scheme;
    //  - it must not be followed by digits, so that 'google.com:8000' is not
    //    read as if 'google.com' were a scheme followed by a port number.
    var schemePattern = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/.source;
    // Literal 'www.' prefix.
    var wwwPattern = /(?:www\.)/.source;
    // Characters accepted anywhere inside a path / query string / hash anchor.
    var pathChars = alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()|\'$*\\[\\]{}';
    // Optional path, query string and hash anchor. The suffix may contain
    // "?!:,.;^" in the middle but may not end with any of them.
    // See http://blog.codinghorror.com/the-problem-with-urls/
    var urlSuffixPattern = new RegExp('[/?#](?:[' + pathChars + '?!:,.;^\u2713]*[' + pathChars + '\u2713])?').source;
    // Alternative 1: scheme-prefixed URL.
    var schemeAlternative = '(' + schemePattern + getDomainNameStr(2) + ')';
    // Alternative 2: 'www.'-prefixed URL, optionally protocol-relative ('//').
    var wwwAlternative = '(' + '(//)?' + wwwPattern + getDomainNameStr(6) + ')';
    // Alternative 3: domain ending in a known TLD, optionally
    // protocol-relative. The TLD must not be directly followed by a dash or
    // an alphanumeric character.
    var tldAlternative = '(' + '(//)?' + getDomainNameStr(10) + '\\.' + tldRegex.source + '(?![-' + alphaNumericCharsStr + '])' + ')';
    var pattern = '(?:' + [schemeAlternative, wwwAlternative, tldAlternative].join('|') + ')' +
        '(?::[0-9]+)?' + // optional port number
        '(?:' + urlSuffixPattern + ')?'; // optional path, query string, and/or hash anchor
    return new RegExp(pattern, 'gi');
})();
|
---|
// Matches a single character out of `alphaNumericAndMarksCharsStr`. Shared by
// all UrlMatcher instances, which use it to test the character that precedes a
// protocol-relative ('//') match.
var wordCharRegExp = new RegExp('['.concat(alphaNumericAndMarksCharsStr, ']'));
|
---|
| 44 | /**
|
---|
| 45 | * @class Autolinker.matcher.Url
|
---|
| 46 | * @extends Autolinker.matcher.Matcher
|
---|
| 47 | *
|
---|
| 48 | * Matcher to find URL matches in an input string.
|
---|
| 49 | *
|
---|
| 50 | * See this class's superclass ({@link Autolinker.matcher.Matcher}) for more details.
|
---|
| 51 | */
|
---|
var UrlMatcher = /** @class */ (function (_super) {
    __extends(UrlMatcher, _super);
    /**
     * Schemes looked for inside an over-matched scheme match (for example
     * `nowhitespacehttp://www.test.com`), in order of precedence.
     */
    var commonSchemes = ['http://', 'https://'];
    /**
     * Cached regular expression used by `matchHasInvalidCharAfterTld()`. It is
     * compiled on first use and then shared by every instance, so the (large)
     * character class is not re-compiled for every URL match.
     */
    var hostAndTldRegex = null;
    /**
     * Returns the shared regular expression which captures an optional
     * '//' prefix (with at most one character before it), followed by the
     * domain name and TLD at the start of a string.
     *
     * The expression has no `g` flag and therefore carries no `lastIndex`
     * state between calls.
     *
     * @return {RegExp}
     */
    function getHostAndTldRegex() {
        if (hostAndTldRegex === null) {
            hostAndTldRegex = new RegExp('^((.?//)?[-.' + alphaNumericAndMarksCharsStr + ']*[-' + alphaNumericAndMarksCharsStr + ']\\.[-' + alphaNumericAndMarksCharsStr + ']+)');
        }
        return hostAndTldRegex;
    }
    /**
     * Finds the first entry of `commonSchemes` contained in the scheme match.
     *
     * A plain loop is used instead of `Array.prototype.find`, which does not
     * exist in ES5 engines.
     *
     * @param {String} [schemeUrlMatch] The scheme-prefixed match, if any.
     * @return {String/undefined} The common scheme that was found, or
     *   `undefined` if there is no scheme match or it contains none of them.
     */
    function findCommonScheme(schemeUrlMatch) {
        if (!schemeUrlMatch) {
            return undefined;
        }
        for (var i = 0, len = commonSchemes.length; i < len; i++) {
            if (schemeUrlMatch.indexOf(commonSchemes[i]) !== -1) {
                return commonSchemes[i];
            }
        }
        return undefined;
    }
    /**
     * @method constructor
     * @param {Object} cfg The configuration properties for the Match instance,
     *   specified in an Object (map).
     */
    function UrlMatcher(cfg) {
        var _this = _super.call(this, cfg) || this;
        /**
         * @cfg {Object} stripPrefix (required)
         *
         * The Object form of {@link Autolinker#cfg-stripPrefix}.
         */
        _this.stripPrefix = {
            scheme: true,
            www: true,
        }; // default value just to get the above doc comment in the ES5 output and documentation generator
        /**
         * @cfg {Boolean} stripTrailingSlash (required)
         * @inheritdoc Autolinker#stripTrailingSlash
         */
        _this.stripTrailingSlash = true; // default value just to get the above doc comment in the ES5 output and documentation generator
        /**
         * @cfg {Boolean} decodePercentEncoding (required)
         * @inheritdoc Autolinker#decodePercentEncoding
         */
        _this.decodePercentEncoding = true; // default value just to get the above doc comment in the ES5 output and documentation generator
        /**
         * @protected
         * @property {RegExp} matcherRegex
         *
         * The regular expression to match URLs with an optional scheme, port
         * number, path, query string, and hash anchor.
         *
         * Example matches:
         *
         *     http://google.com
         *     www.google.com
         *     google.com/path/to/file?q1=1&q2=2#myAnchor
         *
         * `parseMatches()` reads the following capturing groups (by index
         * into the match array):
         *
         * - 1: a scheme-prefixed URL (i.e. 'http://google.com'). This is used
         *   to match scheme URLs with just a single word, such as
         *   'http://localhost', where we won't double check that the domain
         *   name has at least one dot ('.') in it.
         * - 4: a 'www.'-prefixed URL. This is only matched if the 'www.' text
         *   was not prefixed by a scheme (i.e.: not prefixed by 'http://',
         *   'ftp:', etc.)
         * - 5: the protocol-relative '//' of a 'www.'-prefixed URL, if present.
         *   The character before the '//' is needed in order to determine
         *   whether it is a valid match or the '//' was in a string we don't
         *   want to auto-link.
         * - 8: a URL with a known TLD (top level domain), when neither a
         *   scheme nor a 'www.' prefix was matched (currently not read).
         * - 9: the protocol-relative '//' of a known-TLD URL, if present.
         *
         * The RegExp object is shared by all instances and has the `g` flag,
         * so its `lastIndex` is state that is shared as well.
         */
        _this.matcherRegex = matcherRegex;
        /**
         * A regular expression to use to check the character before a protocol-relative
         * URL match. We don't want to match a protocol-relative URL if it is part
         * of another word.
         *
         * For example, we want to match something like "Go to: //google.com",
         * but we don't want to match something like "abc//google.com"
         *
         * This regular expression is used to test the character before the '//'.
         *
         * @protected
         * @type {RegExp} wordCharRegExp
         */
        _this.wordCharRegExp = wordCharRegExp;
        _this.stripPrefix = cfg.stripPrefix;
        _this.stripTrailingSlash = cfg.stripTrailingSlash;
        _this.decodePercentEncoding = cfg.decodePercentEncoding;
        return _this;
    }
    /**
     * Scans `text` for URLs and returns one UrlMatch per accepted match, in
     * the order in which they appear in the text.
     *
     * @param {String} text The text to scan.
     * @return {UrlMatch[]} The URL matches that were found.
     */
    UrlMatcher.prototype.parseMatches = function (text) {
        var regex = this.matcherRegex, stripPrefix = this.stripPrefix, stripTrailingSlash = this.stripTrailingSlash, decodePercentEncoding = this.decodePercentEncoding, tagBuilder = this.tagBuilder, matches = [], match;
        // The RegExp is shared by every instance and has the `g` flag. Always
        // start from the beginning of `text`, even if a previous scan was
        // interrupted (e.g. by an exception) and left `lastIndex` non-zero.
        regex.lastIndex = 0;
        while ((match = regex.exec(text)) !== null) {
            var matchStr = match[0], schemeUrlMatch = match[1], wwwUrlMatch = match[4], 
            // match[8] (the TLD url match) is not needed at the moment
            protocolRelativeMatch = match[5] || match[9], offset = match.index, prevChar = text.charAt(offset - 1);
            if (!UrlMatchValidator.isValid(matchStr, schemeUrlMatch)) {
                continue;
            }
            // If the match is preceded by an '@' character, then it is either
            // an email address or a username. Skip these types of matches.
            if (offset > 0 && prevChar === '@') {
                continue;
            }
            // If it's a protocol-relative '//' match, but the character before the '//'
            // was a word character (i.e. a letter/number), then we found the '//' in the
            // middle of another word (such as "asdf//asdf.com"). In this case, skip the
            // match.
            if (offset > 0 && protocolRelativeMatch && this.wordCharRegExp.test(prevChar)) {
                continue;
            }
            // If the URL ends with a question mark, don't include the question
            // mark as part of the URL. We'll assume the question mark was the
            // end of a sentence, such as: "Going to google.com?"
            if (matchStr.charAt(matchStr.length - 1) === '?') {
                matchStr = matchStr.slice(0, -1);
            }
            // Handle a closing parenthesis, square bracket or curly bracket at
            // the end of the match, and exclude it if there is not a matching
            // opening one in the match itself.
            if (this.matchHasUnbalancedClosingParen(matchStr)) {
                matchStr = matchStr.slice(0, -1); // remove the trailing ")", "]" or "}"
            }
            else {
                // Handle an invalid character after the TLD
                var pos = this.matchHasInvalidCharAfterTld(matchStr, schemeUrlMatch);
                if (pos > -1) {
                    matchStr = matchStr.slice(0, pos); // remove the trailing invalid chars
                }
            }
            // The autolinker accepts many characters in a url's scheme (like `fake://test.com`).
            // However, in cases where a URL is missing whitespace before an obvious link,
            // (for example: `nowhitespacehttp://www.test.com`), we only want the match to start
            // at the http:// part. We will check if the match contains a common scheme and then
            // shift the match to start from there.
            var commonScheme = findCommonScheme(schemeUrlMatch);
            if (commonScheme) {
                var schemeStart = matchStr.indexOf(commonScheme);
                matchStr = matchStr.slice(schemeStart);
                schemeUrlMatch = schemeUrlMatch.slice(schemeStart);
                offset = offset + schemeStart;
            }
            var urlMatchType = schemeUrlMatch ? 'scheme' : wwwUrlMatch ? 'www' : 'tld';
            matches.push(new UrlMatch({
                tagBuilder: tagBuilder,
                matchedText: matchStr,
                offset: offset,
                urlMatchType: urlMatchType,
                url: matchStr,
                protocolUrlMatch: !!schemeUrlMatch,
                protocolRelativeMatch: !!protocolRelativeMatch,
                stripPrefix: stripPrefix,
                stripTrailingSlash: stripTrailingSlash,
                decodePercentEncoding: decodePercentEncoding,
            }));
        }
        return matches;
    };
    /**
     * Determines if a match found has an unmatched closing parenthesis,
     * square bracket or curly bracket. If so, the caller removes the symbol
     * from the match.
     *
     * A match may have an extra closing parenthesis at the end of the match
     * because the regular expression must include parenthesis for URLs such as
     * "wikipedia.com/something_(disambiguation)", which should be auto-linked.
     *
     * However, an extra parenthesis *will* be included when the URL itself is
     * wrapped in parenthesis, such as in the case of:
     *     "(wikipedia.com/something_(disambiguation))"
     * In this case, the last closing parenthesis should *not* be part of the
     * URL itself, and this method will return `true`.
     *
     * For square brackets in URLs such as in PHP arrays, the same behavior as
     * parenthesis discussed above should happen:
     *     "[http://www.example.com/foo.php?bar[]=1&bar[]=2&bar[]=3]"
     * The closing square bracket should not be part of the URL itself, and this
     * method will return `true`. Curly brackets are handled the same way.
     *
     * @protected
     * @param {String} matchStr The full match string from the {@link #matcherRegex}.
     * @return {Boolean} `true` if there is an unbalanced closing parenthesis,
     *   square bracket or curly bracket at the end of the `matchStr`, `false`
     *   otherwise.
     */
    UrlMatcher.prototype.matchHasUnbalancedClosingParen = function (matchStr) {
        var endChar = matchStr.charAt(matchStr.length - 1);
        var startChar;
        if (endChar === ')') {
            startChar = '(';
        }
        else if (endChar === ']') {
            startChar = '[';
        }
        else if (endChar === '}') {
            startChar = '{';
        }
        else {
            return false; // not a closing parenthesis, square bracket or curly bracket
        }
        // Count the braces that are still open just before the last character
        // (which we have already determined to be either ')', ']' or '}').
        // A closing brace without a preceding opening one never drives the
        // count below zero.
        var numOpenBraces = 0;
        for (var i = 0, len = matchStr.length - 1; i < len; i++) {
            var char = matchStr.charAt(i);
            if (char === startChar) {
                numOpenBraces++;
            }
            else if (char === endChar) {
                numOpenBraces = Math.max(numOpenBraces - 1, 0);
            }
        }
        // If no brace is left open, then the last character closes nothing
        // and the match has *unbalanced* braces because of it. Example of
        // unbalanced braces from the regex match:
        //     "http://example.com?a[]=1]"
        return numOpenBraces === 0;
    };
    /**
     * Determine if there's an invalid character after the TLD in a URL. Valid
     * characters after TLD are ':/?#'. For scheme matched URLs, the check
     * starts at the first ':' of the match.
     *
     * @protected
     * @param {String} urlMatch The matched URL, if there was one. Will be an
     *   empty string if the match is not a URL match.
     * @param {String} schemeUrlMatch The match URL string for a scheme
     *   match. Ex: 'http://yahoo.com'. This is used to match something like
     *   'http://localhost', where we won't double check that the domain name
     *   has at least one '.' in it.
     * @return {Number} the position where the invalid character was found. If
     *   no such character was found, returns -1
     */
    UrlMatcher.prototype.matchHasInvalidCharAfterTld = function (urlMatch, schemeUrlMatch) {
        if (!urlMatch) {
            return -1;
        }
        var offset = 0;
        if (schemeUrlMatch) {
            // Skip the scheme name; the remainder starts at its ':'
            offset = urlMatch.indexOf(':');
            urlMatch = urlMatch.slice(offset);
        }
        var res = getHostAndTldRegex().exec(urlMatch);
        if (res === null) {
            return -1;
        }
        offset += res[1].length;
        urlMatch = urlMatch.slice(res[1].length);
        // Anything other than a domain character or one of ':/?#' directly
        // after the domain + TLD is invalid.
        if (/^[^-.A-Za-z0-9:\/?#]/.test(urlMatch)) {
            return offset;
        }
        return -1;
    };
    return UrlMatcher;
}(Matcher));
|
---|
| 319 | export { UrlMatcher };
|
---|
| 320 | //# sourceMappingURL=url-matcher.js.map |
---|