source: node_modules/autolinker/dist/es2015/matcher/url-matcher.js@ d24f17c

main
Last change on this file since d24f17c was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 15 months ago

Initial commit

  • Property mode set to 100644
File size: 15.0 KB
Line 
1import { __extends } from "tslib";
2import { Matcher } from './matcher';
3import { alphaNumericCharsStr, alphaNumericAndMarksCharsStr, getDomainNameStr } from '../regex-lib';
4import { tldRegex } from './tld-regex';
5import { UrlMatch } from '../match/url-match';
6import { UrlMatchValidator } from './url-match-validator';
7// RegExp objects which are shared by all instances of UrlMatcher. These are
8// here to avoid re-instantiating the RegExp objects if `Autolinker.link()` is
9// called multiple times, thus instantiating UrlMatcher and its RegExp
10// objects each time (which is very expensive - see https://github.com/gregjacobs/Autolinker.js/issues/314).
11// See descriptions of the properties where they are used for details about them
12// prettier-ignore
var matcherRegex = (function () {
    // Scheme prefix such as "http://" or "mailto:". The first negative
    // lookahead keeps the "link:" in 'link:http://www.google.com' from being
    // taken as the scheme; the second keeps 'google.com:8000' from being read
    // as a scheme followed by something (i.e. a trailing port number is not a
    // scheme match).
    var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/;
    // Literal 'www.' prefix.
    var wwwRegex = /(?:www\.)/;
    // Optional path, query string, and hash anchor, not ending in any of the
    // following characters: "?!:,.;"
    // http://blog.codinghorror.com/the-problem-with-urls/
    var urlSuffixRegex = new RegExp('[/?#](?:[' + alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()|\'$*\\[\\]{}?!:,.;^\u2713]*[' + alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()|\'$*\\[\\]{}\u2713])?');

    // The three ways a URL may start. Each one is wrapped in its own
    // capturing group; the numeric argument handed to getDomainNameStr() is
    // passed through unchanged from the original code.
    var schemeAlternative = '(' + schemeRegex.source + getDomainNameStr(2) + ')';
    var wwwAlternative = '(' + '(//)?' + wwwRegex.source + getDomainNameStr(6) + ')';
    var tldAlternative = '(' + '(//)?' + getDomainNameStr(10) + '\\.' + tldRegex.source + '(?![-' + alphaNumericCharsStr + '])' + ')';

    var pattern = '(?:' + schemeAlternative + '|' + wwwAlternative + '|' + tldAlternative + ')' +
        '(?::[0-9]+)?' + // optional port number
        '(?:' + urlSuffixRegex.source + ')?'; // optional path, query string, and/or hash anchor

    return new RegExp(pattern, 'gi');
})();
// Matches a single character from the `alphaNumericAndMarksCharsStr` set.
var wordCharRegExp = new RegExp('[' + alphaNumericAndMarksCharsStr + ']');
44/**
45 * @class Autolinker.matcher.Url
46 * @extends Autolinker.matcher.Matcher
47 *
48 * Matcher to find URL matches in an input string.
49 *
50 * See this class's superclass ({@link Autolinker.matcher.Matcher}) for more details.
51 */
52var UrlMatcher = /** @class */ (function (_super) {
53 __extends(UrlMatcher, _super);
/**
 * @method constructor
 * @param {Object} cfg The configuration properties for this matcher,
 * specified in an Object (map). It is forwarded to the superclass
 * constructor, and its `stripPrefix`, `stripTrailingSlash` and
 * `decodePercentEncoding` properties are copied onto the new instance.
 */
function UrlMatcher(cfg) {
    var instance = _super.call(this, cfg) || this;
    // NOTE: the three literal values assigned immediately below are
    // placeholders that exist only so the doc comments reach the ES5 output
    // and the documentation generator. They are unconditionally overwritten
    // from `cfg` at the bottom of this constructor.
    /**
     * @cfg {Object} stripPrefix (required)
     *
     * The Object form of {@link Autolinker#cfg-stripPrefix}.
     */
    instance.stripPrefix = {
        scheme: true,
        www: true,
    };
    /**
     * @cfg {Boolean} stripTrailingSlash (required)
     * @inheritdoc Autolinker#stripTrailingSlash
     */
    instance.stripTrailingSlash = true;
    /**
     * @cfg {Boolean} decodePercentEncoding (required)
     * @inheritdoc Autolinker#decodePercentEncoding
     */
    instance.decodePercentEncoding = true;
    /**
     * @protected
     * @property {RegExp} matcherRegex
     *
     * The regular expression used to find URLs, with an optional scheme,
     * port number, path, query string, and hash anchor.
     *
     * Example matches:
     *
     *     http://google.com
     *     www.google.com
     *     google.com/path/to/file?q1=1&q2=2#myAnchor
     *
     * The capturing groups that are of interest, in order of appearance:
     *
     * 1. A scheme-prefixed URL (i.e. 'http://google.com'). This allows
     *    scheme URLs with just a single word, such as 'http://localhost',
     *    where it is not double checked that the domain name has at least
     *    one dot ('.') in it.
     * 2. A 'www.' prefixed URL. Only matched if the 'www.' text was not
     *    prefixed by a scheme (i.e.: not prefixed by 'http://', 'ftp:',
     *    etc.)
     * 3. The protocol-relative '//' of a 'www.' prefixed URL. Not set if
     *    the match is not protocol-relative. The character before the '//'
     *    must be known in order to decide whether it is a valid match or
     *    the '//' was inside a string that should not be auto-linked.
     * 4. A known TLD (top level domain) URL, when neither a scheme nor a
     *    'www.'-prefixed domain is matched.
     * 5. The protocol-relative '//' of a known TLD URL. Not set if the
     *    match is not protocol-relative. See #3 for more info.
     *
     * These are not consecutive group numbers in the compiled expression:
     * `parseMatches` reads them at indices 1, 4, 5, (8) and 9.
     */
    instance.matcherRegex = matcherRegex;
    /**
     * A regular expression used to check the character in front of a
     * protocol-relative URL match. A protocol-relative URL should not be
     * matched when it is part of another word.
     *
     * For example, "Go to: //google.com" should match, but
     * "abc//google.com" should not.
     *
     * @protected
     * @type {RegExp} wordCharRegExp
     */
    instance.wordCharRegExp = wordCharRegExp;
    // The real values: whatever the caller supplied, taken as-is.
    instance.stripPrefix = cfg.stripPrefix;
    instance.stripTrailingSlash = cfg.stripTrailingSlash;
    instance.decodePercentEncoding = cfg.decodePercentEncoding;
    return instance;
}
/**
 * Scans `text` with {@link #matcherRegex} and builds a `UrlMatch` for every
 * hit that passes the filters applied in the loop body.
 *
 * @inheritdoc
 * @param {String} text The text to scan for URLs.
 * @return {Array} The `UrlMatch` instances created, in the order the
 * regular expression found them.
 */
UrlMatcher.prototype.parseMatches = function (text) {
    var matcherRegex = this.matcherRegex,
        stripPrefix = this.stripPrefix,
        stripTrailingSlash = this.stripTrailingSlash,
        decodePercentEncoding = this.decodePercentEncoding,
        tagBuilder = this.tagBuilder,
        commonSchemes = ['http://', 'https://'],
        matches = [],
        match;
    // The regular expression is a global (`g`) RegExp, so `exec()` resumes
    // from its `lastIndex`. If an earlier scan was cut short (for example by
    // an exception thrown while a match was being processed), `lastIndex`
    // would still point into the middle of that old input and this scan would
    // silently skip the start of `text`. Always begin from the start.
    matcherRegex.lastIndex = 0;
    while ((match = matcherRegex.exec(text)) !== null) {
        var matchStr = match[0],
            schemeUrlMatch = match[1],
            wwwUrlMatch = match[4],
            wwwProtocolRelativeMatch = match[5],
            //tldUrlMatch = match[ 8 ],  -- not needed at the moment
            tldProtocolRelativeMatch = match[9],
            offset = match.index,
            protocolRelativeMatch = wwwProtocolRelativeMatch || tldProtocolRelativeMatch,
            prevChar = text.charAt(offset - 1); // '' when the match is at index 0
        if (!UrlMatchValidator.isValid(matchStr, schemeUrlMatch)) {
            continue;
        }
        // If the match is preceded by an '@' character, then it is either
        // an email address or a username. Skip these types of matches.
        if (offset > 0 && prevChar === '@') {
            continue;
        }
        // If it's a protocol-relative '//' match, but the character before
        // the '//' was a word character (i.e. a letter/number), then the '//'
        // was found in the middle of another word (such as
        // "asdf//asdf.com"). In this case, skip the match.
        if (offset > 0 && protocolRelativeMatch && this.wordCharRegExp.test(prevChar)) {
            continue;
        }
        // If the URL ends with a question mark, don't include the question
        // mark as part of the URL. Assume the question mark was the end of a
        // sentence, such as: "Going to google.com?"
        if (matchStr.charAt(matchStr.length - 1) === '?') {
            matchStr = matchStr.slice(0, matchStr.length - 1);
        }
        // Handle a closing parenthesis or bracket at the end of the match,
        // and exclude it if there is not a matching opening character in the
        // match itself.
        if (this.matchHasUnbalancedClosingParen(matchStr)) {
            matchStr = matchStr.slice(0, matchStr.length - 1); // remove the trailing ")", "]" or "}"
        }
        else {
            // Handle an invalid character after the TLD
            var pos = this.matchHasInvalidCharAfterTld(matchStr, schemeUrlMatch);
            if (pos > -1) {
                matchStr = matchStr.slice(0, pos); // remove the trailing invalid chars
            }
        }
        // The autolinker accepts many characters in a url's scheme (like
        // `fake://test.com`). However, in cases where a URL is missing
        // whitespace before an obvious link (for example:
        // `nowhitespacehttp://www.test.com`), the match should only start at
        // the http:// part. Check whether the scheme portion contains a
        // common scheme and, if so, shift the match to start from there.
        var foundCommonScheme = null;
        if (schemeUrlMatch) {
            for (var i = 0; i < commonSchemes.length; i++) {
                if (schemeUrlMatch.indexOf(commonSchemes[i]) !== -1) {
                    foundCommonScheme = commonSchemes[i];
                    break;
                }
            }
        }
        if (foundCommonScheme) {
            // Drop everything in front of the common scheme from both
            // strings and move the reported offset forward by the same
            // amount.
            var indexOfSchemeStart = matchStr.indexOf(foundCommonScheme);
            matchStr = matchStr.slice(indexOfSchemeStart);
            schemeUrlMatch = schemeUrlMatch.slice(indexOfSchemeStart);
            offset = offset + indexOfSchemeStart;
        }
        var urlMatchType = schemeUrlMatch ? 'scheme' : wwwUrlMatch ? 'www' : 'tld',
            protocolUrlMatch = !!schemeUrlMatch;
        matches.push(new UrlMatch({
            tagBuilder: tagBuilder,
            matchedText: matchStr,
            offset: offset,
            urlMatchType: urlMatchType,
            url: matchStr,
            protocolUrlMatch: protocolUrlMatch,
            protocolRelativeMatch: !!protocolRelativeMatch,
            stripPrefix: stripPrefix,
            stripTrailingSlash: stripTrailingSlash,
            decodePercentEncoding: decodePercentEncoding,
        }));
    }
    return matches;
};
/**
 * Reports whether the final character of a match is a closing parenthesis,
 * square bracket or curly brace that has no partner earlier in the match.
 *
 * The URL regular expression has to accept these characters so that URLs
 * such as "wikipedia.com/something_(disambiguation)" are linked in full.
 * The side effect is that a URL which is itself wrapped, as in
 *     "(wikipedia.com/something_(disambiguation))"
 * or, with PHP-style array parameters,
 *     "[http://www.example.com/foo.php?bar[]=1&bar[]=2&bar[]=3]"
 * is matched together with the wrapper's closing character. For such a match
 * this method returns `true`, which tells the caller that the last character
 * is not part of the URL.
 *
 * @protected
 * @param {String} matchStr The full match string from the {@link #matcherRegex}.
 * @return {Boolean} `true` if `matchStr` ends in a ')', ']' or '}' that is
 *   not balanced by an opening character before it, `false` otherwise
 *   (including when `matchStr` ends in any other character or is empty).
 */
UrlMatcher.prototype.matchHasUnbalancedClosingParen = function (matchStr) {
    var openerFor = { ')': '(', ']': '[', '}': '{' };
    var lastIndex = matchStr.length - 1;
    var closer = matchStr.charAt(lastIndex);
    var opener = openerFor[closer];
    if (!opener) {
        return false; // does not end in a closing parenthesis / bracket / brace
    }
    // Walk everything in front of the final character and track how many
    // openers are still waiting for a closer. Closers that show up with
    // nothing open are ignored rather than driving the count negative.
    var pendingOpeners = 0;
    for (var pos = 0; pos < lastIndex; pos++) {
        var current = matchStr.charAt(pos);
        if (current === opener) {
            pendingOpeners += 1;
        }
        else if (current === closer && pendingOpeners > 0) {
            pendingOpeners -= 1;
        }
    }
    // Nothing left open means the final closer has no partner, e.g. the last
    // ']' in "http://example.com?a[]=1]".
    return pendingOpeners === 0;
};
/**
 * Determine if there's an invalid character directly after the
 * "domain.tld" portion of a URL. A character counts as invalid there when
 * it is not one of `-`, `.`, an ASCII letter or digit, `:`, `/`, `?` or `#`.
 *
 * When `schemeUrlMatch` is given, everything in front of the first ':' is
 * skipped before the domain is looked for, and the returned position is
 * still relative to the start of the original `urlMatch`.
 *
 * @protected
 * @param {String} urlMatch The matched URL, if there was one. Will be an
 *   empty string if the match is not a URL match.
 * @param {String} schemeUrlMatch The match URL string for a scheme
 *   match. Ex: 'http://yahoo.com'. Falsy when the URL was not matched with
 *   a scheme.
 * @return {Number} the position where the invalid character was found. If
 *   no such character was found (or no "domain.tld" portion could be
 *   located), returns -1
 */
UrlMatcher.prototype.matchHasInvalidCharAfterTld = (function () {
    // Compiled on first use and then reused by every call and every
    // instance: the pattern never changes, and constructing RegExp objects
    // repeatedly is expensive. The expression has no `g`/`y` flag, so it
    // carries no state between calls.
    var domainAndTldRegex = null;
    return function (urlMatch, schemeUrlMatch) {
        if (!urlMatch) {
            return -1;
        }
        var offset = 0;
        var remainder = urlMatch;
        if (schemeUrlMatch) {
            // Skip the scheme name; `remainder` then starts at the ':'.
            offset = urlMatch.indexOf(':');
            remainder = urlMatch.slice(offset);
        }
        if (domainAndTldRegex === null) {
            // prettier-ignore
            domainAndTldRegex = new RegExp("^((.?\/\/)?[-." + alphaNumericAndMarksCharsStr + "]*[-" + alphaNumericAndMarksCharsStr + "]\\.[-" + alphaNumericAndMarksCharsStr + "]+)");
        }
        var res = domainAndTldRegex.exec(remainder);
        if (res === null) {
            return -1;
        }
        // Step over the "[x//]domain.tld" portion and look at the very next
        // character.
        offset += res[1].length;
        remainder = remainder.slice(res[1].length);
        if (/^[^-.A-Za-z0-9:\/?#]/.test(remainder)) {
            return offset;
        }
        return -1;
    };
})();
317 return UrlMatcher;
318}(Matcher));
319export { UrlMatcher };
320//# sourceMappingURL=url-matcher.js.map
Note: See TracBrowser for help on using the repository browser.