source: node_modules/autolinker/dist/commonjs/matcher/url-matcher.js

main
Last change on this file was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 15 months ago

Initial commit

  • Property mode set to 100644
File size: 15.3 KB
Line 
1"use strict";
2Object.defineProperty(exports, "__esModule", { value: true });
3exports.UrlMatcher = void 0;
4var tslib_1 = require("tslib");
5var matcher_1 = require("./matcher");
6var regex_lib_1 = require("../regex-lib");
7var tld_regex_1 = require("./tld-regex");
8var url_match_1 = require("../match/url-match");
9var url_match_validator_1 = require("./url-match-validator");
// Module-level RegExp objects shared by every UrlMatcher instance. They are
// compiled once, when this module is loaded, rather than each time
// `Autolinker.link()` instantiates a UrlMatcher, because building them is
// very expensive - see https://github.com/gregjacobs/Autolinker.js/issues/314.
// The properties that expose them on UrlMatcher describe how they are used.
// prettier-ignore
var matcherRegex = (function () {
    var wordChars = regex_lib_1.alphaNumericAndMarksCharsStr;
    // Scheme in the form "http://" or "mailto:". The first negative lookahead
    // refuses a scheme that is immediately followed by another "scheme://"
    // (so "link:" in 'link:http://www.google.com' is not taken as the scheme);
    // the second refuses a "scheme" followed by digits, so that 'google.com:8000'
    // is not read as if 'google.com' were a scheme with a trailing port.
    var schemePattern = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/.source;
    // Literal 'www.' prefix.
    var wwwPattern = /(?:www\.)/.source;
    // Optional path, query string and hash anchor. The final character class
    // omits "?!:,.;^" so a match does not end in trailing punctuation.
    // http://blog.codinghorror.com/the-problem-with-urls/
    var suffixPattern = new RegExp('[/?#](?:[' + wordChars + '\\-+&@#/%=~_()|\'$*\\[\\]{}?!:,.;^\u2713]*[' + wordChars + '\\-+&@#/%=~_()|\'$*\\[\\]{}\u2713])?').source;
    // Alternative 1: scheme-prefixed URL.
    var schemeAlternative = '(' + schemePattern + (0, regex_lib_1.getDomainNameStr)(2) + ')';
    // Alternative 2: 'www.'-prefixed URL, optionally protocol-relative ('//').
    var wwwAlternative = '(' + '(//)?' + wwwPattern + (0, regex_lib_1.getDomainNameStr)(6) + ')';
    // Alternative 3: domain ending in a known TLD, optionally protocol-relative.
    // The trailing lookahead rejects a TLD that runs on into more word characters.
    var tldAlternative = '(' + '(//)?' + (0, regex_lib_1.getDomainNameStr)(10) + '\\.' + tld_regex_1.tldRegex.source + '(?![-' + regex_lib_1.alphaNumericCharsStr + '])' + ')';
    return new RegExp('(?:' + schemeAlternative + '|' + wwwAlternative + '|' + tldAlternative + ')' +
        '(?::[0-9]+)?' + // optional port number
        '(?:' + suffixPattern + ')?', // optional path, query string, and/or hash anchor
    'gi');
})();
// Matches a single letter, number or mark character.
var wordCharRegExp = new RegExp('[' + regex_lib_1.alphaNumericAndMarksCharsStr + ']');
47/**
48 * @class Autolinker.matcher.Url
49 * @extends Autolinker.matcher.Matcher
50 *
51 * Matcher to find URL matches in an input string.
52 *
53 * See this class's superclass ({@link Autolinker.matcher.Matcher}) for more details.
54 */
55var UrlMatcher = /** @class */ (function (_super) {
56 (0, tslib_1.__extends)(UrlMatcher, _super);
/**
 * @method constructor
 * @param {Object} cfg The configuration properties for the Match instance,
 *   specified in an Object (map). `stripPrefix`, `stripTrailingSlash` and
 *   `decodePercentEncoding` are copied from it as-is.
 */
function UrlMatcher(cfg) {
    var self = _super.call(this, cfg) || this;
    // The three literal values assigned below are placeholders that exist only
    // so the doc comments survive into the ES5 output and the documentation
    // generator; each is overwritten from `cfg` at the end of the constructor.
    /**
     * @cfg {Object} stripPrefix (required)
     *
     * The Object form of {@link Autolinker#cfg-stripPrefix}.
     */
    self.stripPrefix = { scheme: true, www: true };
    /**
     * @cfg {Boolean} stripTrailingSlash (required)
     * @inheritdoc Autolinker#stripTrailingSlash
     */
    self.stripTrailingSlash = true;
    /**
     * @cfg {Boolean} decodePercentEncoding (required)
     * @inheritdoc Autolinker#decodePercentEncoding
     */
    self.decodePercentEncoding = true;
    /**
     * @protected
     * @property {RegExp} matcherRegex
     *
     * The regular expression to match URLs with an optional scheme, port
     * number, path, query string, and hash anchor. The same module-level
     * RegExp object is assigned to every instance.
     *
     * Example matches:
     *
     *     http://google.com
     *     www.google.com
     *     google.com/path/to/file?q1=1&q2=2#myAnchor
     *
     * Capturing groups, as they are read by {@link #parseMatches}:
     *
     * - Group 1: a scheme-prefixed URL (i.e. 'http://google.com'). This is
     *   used to match scheme URLs with just a single word, such as
     *   'http://localhost', where we won't double check that the domain
     *   name has at least one dot ('.') in it.
     * - Group 4: a 'www.'-prefixed URL. Only matched when the 'www.' text
     *   was not prefixed by a scheme (i.e. not 'http://', 'ftp:', etc.)
     * - Group 5: the protocol-relative '//' in front of a 'www.'-prefixed
     *   URL, empty when the match is not protocol-relative. The character
     *   before the '//' is needed to decide whether the match is valid or
     *   the '//' sits inside text that should not be auto-linked.
     * - Group 8: a URL recognized by its known TLD (top level domain),
     *   when neither a scheme nor a 'www.' prefix matched (currently not
     *   read by parseMatches).
     * - Group 9: the protocol-relative '//' in front of a known-TLD URL.
     *   See group 5 for more info.
     *
     * NOTE(review): groups 2-3, 6-7 and 10-11 are presumably consumed by the
     * domain-name sub-patterns (the regex is built with getDomainNameStr(2),
     * (6) and (10)) - confirm against regex-lib.
     */
    self.matcherRegex = matcherRegex;
    /**
     * A regular expression to use to check the character before a
     * protocol-relative URL match. We don't want to match a
     * protocol-relative URL if it is part of another word.
     *
     * For example, we want to match something like "Go to: //google.com",
     * but we don't want to match something like "abc//google.com"
     *
     * This regular expression is used to test the character before the '//'.
     *
     * @protected
     * @type {RegExp} wordCharRegExp
     */
    self.wordCharRegExp = wordCharRegExp;
    // Real configuration values, taken verbatim from the caller's cfg.
    self.stripPrefix = cfg.stripPrefix;
    self.stripTrailingSlash = cfg.stripTrailingSlash;
    self.decodePercentEncoding = cfg.decodePercentEncoding;
    return self;
}
/**
 * Scans `text` with {@link #matcherRegex} and builds a UrlMatch for every
 * hit that survives the validity checks below.
 *
 * @inheritdoc
 * @param {String} text The text to search for URLs.
 * @return {Array} The UrlMatch instances, in the order they were found.
 */
UrlMatcher.prototype.parseMatches = function (text) {
    var matcherRegex = this.matcherRegex;
    var stripPrefix = this.stripPrefix;
    var stripTrailingSlash = this.stripTrailingSlash;
    var decodePercentEncoding = this.decodePercentEncoding;
    var tagBuilder = this.tagBuilder;
    var commonSchemes = ['http://', 'https://'];
    var matches = [];
    var match;
    // The regex carries the 'g' flag and is shared between instances, so
    // `exec` resumes from whatever `lastIndex` was left behind. Always start
    // from the beginning of `text`; otherwise a scan that was interrupted
    // (e.g. by an exception thrown while processing a match) would make the
    // next call silently skip the start of its input.
    matcherRegex.lastIndex = 0;
    while ((match = matcherRegex.exec(text)) !== null) {
        var matchStr = match[0];
        var schemeUrlMatch = match[1];
        var wwwUrlMatch = match[4];
        // match[8] would be the TLD URL match -- not needed at the moment
        // Protocol-relative '//' of either a 'www.' match or a TLD match
        var protocolRelativeMatch = match[5] || match[9];
        var offset = match.index;
        var prevChar = text.charAt(offset - 1); // '' when offset is 0
        if (!url_match_validator_1.UrlMatchValidator.isValid(matchStr, schemeUrlMatch)) {
            continue;
        }
        // If the match is preceded by an '@' character, then it is either
        // an email address or a username. Skip these types of matches.
        if (offset > 0 && prevChar === '@') {
            continue;
        }
        // If it's a protocol-relative '//' match, but the character before
        // the '//' was a word character (i.e. a letter/number), then we
        // found the '//' in the middle of another word (such as
        // "asdf//asdf.com"). In this case, skip the match.
        if (offset > 0 && protocolRelativeMatch && this.wordCharRegExp.test(prevChar)) {
            continue;
        }
        // If the URL ends with a question mark, don't include the question
        // mark as part of the URL. We'll assume the question mark was the
        // end of a sentence, such as: "Going to google.com?"
        if (matchStr.charAt(matchStr.length - 1) === '?') {
            matchStr = matchStr.slice(0, -1);
        }
        // Handle a closing parenthesis, square bracket or curly bracket at
        // the end of the match, and exclude it if there is no matching
        // opening symbol in the match itself.
        if (this.matchHasUnbalancedClosingParen(matchStr)) {
            matchStr = matchStr.slice(0, -1); // remove the trailing closer
        }
        else {
            // Handle an invalid character after the TLD
            var pos = this.matchHasInvalidCharAfterTld(matchStr, schemeUrlMatch);
            if (pos > -1) {
                matchStr = matchStr.slice(0, pos); // remove the trailing invalid chars
            }
        }
        // The autolinker accepts many characters in a url's scheme (like
        // `fake://test.com`). However, in cases where a URL is missing
        // whitespace before an obvious link (for example:
        // `nowhitespacehttp://www.test.com`), we only want the match to
        // start at the http:// part. Check whether the scheme match contains
        // a common scheme; the first entry of `commonSchemes` found wins.
        var foundCommonScheme = null;
        if (schemeUrlMatch) {
            for (var i = 0; i < commonSchemes.length; i++) {
                if (schemeUrlMatch.indexOf(commonSchemes[i]) !== -1) {
                    foundCommonScheme = commonSchemes[i];
                    break;
                }
            }
        }
        if (foundCommonScheme) {
            // Shift the match so that it starts at the common scheme
            var indexOfSchemeStart = matchStr.indexOf(foundCommonScheme);
            matchStr = matchStr.slice(indexOfSchemeStart);
            schemeUrlMatch = schemeUrlMatch.slice(indexOfSchemeStart);
            offset = offset + indexOfSchemeStart;
        }
        var urlMatchType = schemeUrlMatch ? 'scheme' : wwwUrlMatch ? 'www' : 'tld';
        matches.push(new url_match_1.UrlMatch({
            tagBuilder: tagBuilder,
            matchedText: matchStr,
            offset: offset,
            urlMatchType: urlMatchType,
            url: matchStr,
            protocolUrlMatch: !!schemeUrlMatch,
            protocolRelativeMatch: !!protocolRelativeMatch,
            stripPrefix: stripPrefix,
            stripTrailingSlash: stripTrailingSlash,
            decodePercentEncoding: decodePercentEncoding,
        }));
    }
    return matches;
};
/**
 * Tells whether the last character of a match is a closing parenthesis,
 * square bracket or curly bracket that has no opening partner earlier in
 * the match. The caller uses a `true` result to drop that last character
 * from the URL.
 *
 * The matcher regex has to allow these symbols because they are legal
 * inside URLs such as "wikipedia.com/something_(disambiguation)", which
 * should be auto-linked in full. When the URL itself is wrapped, as in
 * "(wikipedia.com/something_(disambiguation))", the regex also swallows
 * the wrapper's closing symbol; that final symbol is not part of the URL
 * and this method returns `true` for it.
 *
 * The same applies to square brackets, e.g. PHP array parameters:
 * "[http://www.example.com/foo.php?bar[]=1&bar[]=2&bar[]=3]"
 *
 * @protected
 * @param {String} matchStr The full match string from the {@link #matcherRegex}.
 * @return {Boolean} `true` if `matchStr` ends with an unbalanced closing
 *   parenthesis, square bracket or curly bracket, `false` otherwise.
 */
UrlMatcher.prototype.matchHasUnbalancedClosingParen = function (matchStr) {
    var openerFor = { ')': '(', ']': '[', '}': '{' };
    var lastIndex = matchStr.length - 1;
    var closer = matchStr.charAt(lastIndex);
    var opener = openerFor[closer];
    if (!opener) {
        return false; // the match does not end with a closing symbol
    }
    // Walk everything before the final character, tracking how many openers
    // are still waiting for a closer. A closer with nothing open is ignored,
    // so the count never drops below zero.
    var pendingOpeners = 0;
    for (var idx = 0; idx < lastIndex; idx++) {
        var current = matchStr.charAt(idx);
        if (current === opener) {
            pendingOpeners += 1;
        }
        else if (current === closer && pendingOpeners > 0) {
            pendingOpeners -= 1;
        }
    }
    // Nothing left open means the final closer has no partner, e.g.
    // "http://example.com?a[]=1]"
    return pendingOpeners === 0;
};
/**
 * Determine if there's an invalid character right after the TLD in a URL.
 * A letter, digit, '-', '.', ':', '/', '?' or '#' directly after the TLD is
 * accepted; any other character is reported as invalid. For a scheme match,
 * the scheme name itself is skipped: the check starts at the first ':'.
 *
 * @protected
 * @param {String} urlMatch The matched URL, if there was one. Will be an
 *   empty string if the match is not a URL match.
 * @param {String} schemeUrlMatch The match URL string for a scheme
 *   match. Ex: 'http://yahoo.com'. This is used to match something like
 *   'http://localhost', where we won't double check that the domain name
 *   has at least one '.' in it.
 * @return {Number} the position (within the original `urlMatch`) where the
 *   invalid character was found. If no such character was found, returns -1
 */
UrlMatcher.prototype.matchHasInvalidCharAfterTld = (function () {
    // Compiled on first use and then reused. The character-class string it
    // is built from is large, and re-creating the RegExp on every call is
    // exactly the per-call cost the shared module-level regexes exist to
    // avoid. The regex has no 'g'/'y' flag, so `exec` keeps no state
    // between calls and sharing it is safe.
    var domainThroughTldRegex = null;
    return function (urlMatch, schemeUrlMatch) {
        if (!urlMatch) {
            return -1;
        }
        var offset = 0;
        if (schemeUrlMatch) {
            // Skip the scheme name; what remains starts with ':' (e.g. "://host.tld")
            offset = urlMatch.indexOf(':');
            urlMatch = urlMatch.slice(offset);
        }
        if (domainThroughTldRegex === null) {
            var wordChars = regex_lib_1.alphaNumericAndMarksCharsStr;
            // Optional "//" (with at most one character in front of it), then
            // domain labels up to and including the last ".label"
            // prettier-ignore
            domainThroughTldRegex = new RegExp('^((.?//)?[-.' + wordChars + ']*[-' + wordChars + ']\\.[-' + wordChars + ']+)');
        }
        var res = domainThroughTldRegex.exec(urlMatch);
        if (res === null) {
            return -1; // no "domain.tld" shape found, e.g. 'http://localhost'
        }
        offset += res[1].length;
        var remainder = urlMatch.slice(res[1].length);
        return /^[^-.A-Za-z0-9:\/?#]/.test(remainder) ? offset : -1;
    };
})();
320 return UrlMatcher;
321}(matcher_1.Matcher));
322exports.UrlMatcher = UrlMatcher;
323//# sourceMappingURL=url-matcher.js.map
Note: See TracBrowser for help on using the repository browser.