Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: node_modules/autolinker/dist/commonjs/matcher/url-matcher.js

main

Last change on this file was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 16 months ago
Initial commit
Property mode set to `100644`
File size: 15.3 KB

Line
1	"use strict";
2	Object.defineProperty(exports, "__esModule", { value: true });
3	exports.UrlMatcher = void 0;
4	var tslib_1 = require("tslib");
5	var matcher_1 = require("./matcher");
6	var regex_lib_1 = require("../regex-lib");
7	var tld_regex_1 = require("./tld-regex");
8	var url_match_1 = require("../match/url-match");
9	var url_match_validator_1 = require("./url-match-validator");
10	// RegExp objects which are shared by all instances of UrlMatcher. These are
11	// here to avoid re-instantiating the RegExp objects if `Autolinker.link()` is
12	// called multiple times, thus instantiating UrlMatcher and its RegExp
13	// objects each time (which is very expensive - see https://github.com/gregjacobs/Autolinker.js/issues/314).
14	// See descriptions of the properties where they are used for details about them
15	// prettier-ignore
var matcherRegex = (function () {
    // Matches a scheme, in a format such as "http://" or "mailto:". It does not
    // match the first part of something like 'link:http://www.google.com'
    // (i.e. it won't match "link:"), and it does not interpret 'google.com:8000'
    // as if 'google.com' were a scheme (a trailing port number is rejected by
    // the `(?!\d+\/?)` lookahead).
    var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/;
    // Matches text starting with 'www.'
    var wwwRegex = /(?:www\.)/;
    // Allow optional path, query string, and hash anchor, not ending in the
    // following characters: "?!:,.;"
    // http://blog.codinghorror.com/the-problem-with-urls/
    //
    // Shape: one of "/?#", then optionally any run (`*`) of characters from
    // the first class followed by exactly one character from the second
    // class. The second class is the first one minus "?!:,.;^", which is what
    // keeps trailing sentence punctuation out of the match. Both classes
    // include a literal '*'.
    var urlSuffixRegex = new RegExp('[/?#](?:[' + regex_lib_1.alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()|\'$*\\[\\]{}?!:,.;^\u2713]*[' + regex_lib_1.alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()|\'$*\\[\\]{}\u2713])?');
    // NOTE(review): the numeric argument passed to getDomainNameStr() looks
    // like the index of the first capturing group that fragment contributes
    // (2, 6, 10) - confirm against ../regex-lib before renumbering anything.
    return new RegExp([
        '(?:',
            '(', // scheme-prefixed URL
                schemeRegex.source,
                (0, regex_lib_1.getDomainNameStr)(2),
            ')',
            '|',
            '(', // 'www.'-prefixed URL
                '(//)?', // optional protocol-relative prefix
                wwwRegex.source,
                (0, regex_lib_1.getDomainNameStr)(6),
            ')',
            '|',
            '(', // URL recognised by a known TLD
                '(//)?', // optional protocol-relative prefix
                (0, regex_lib_1.getDomainNameStr)(10) + '\\.',
                tld_regex_1.tldRegex.source,
                '(?![-' + regex_lib_1.alphaNumericCharsStr + '])', // the TLD must not run on into more word characters
            ')',
        ')',
        '(?::[0-9]+)?', // optional port number
        '(?:' + urlSuffixRegex.source + ')?' // match for path, query string, and/or hash anchor - optional
    ].join(""), 'gi');
})();
// Single-character class built from regex-lib's `alphaNumericAndMarksCharsStr`.
// UrlMatcher uses it to test the character that precedes a protocol-relative
// ('//') match. Created once here so every UrlMatcher instance shares it.
var wordCharRegExp = new RegExp('['.concat(regex_lib_1.alphaNumericAndMarksCharsStr, ']'));
47	/**
48	* @class Autolinker.matcher.Url
49	* @extends Autolinker.matcher.Matcher
50	*
51	* Matcher to find URL matches in an input string.
52	*
53	* See this class's superclass ({@link Autolinker.matcher.Matcher}) for more details.
54	*/
55	var UrlMatcher = /** @class */ (function (_super) {
56	(0, tslib_1.__extends)(UrlMatcher, _super);
/**
 * @method constructor
 * @param {Object} cfg The configuration properties for the Match instance,
 *   specified in an Object (map). `stripPrefix`, `stripTrailingSlash` and
 *   `decodePercentEncoding` are copied from it verbatim (no fallback to the
 *   placeholder defaults assigned below).
 */
function UrlMatcher(cfg) {
    // Use the object returned by the super constructor if it returned one,
    // otherwise the instance being constructed.
    var _this = _super.call(this, cfg) || this;
    /**
     * @cfg {Object} stripPrefix (required)
     *
     * The Object form of {@link Autolinker#cfg-stripPrefix}.
     */
    _this.stripPrefix = {
        scheme: true,
        www: true,
    }; // default value just to get the above doc comment in the ES5 output and documentation generator
    /**
     * @cfg {Boolean} stripTrailingSlash (required)
     * @inheritdoc Autolinker#stripTrailingSlash
     */
    _this.stripTrailingSlash = true; // default value just to get the above doc comment in the ES5 output and documentation generator
    /**
     * @cfg {Boolean} decodePercentEncoding (required)
     * @inheritdoc Autolinker#decodePercentEncoding
     */
    _this.decodePercentEncoding = true; // default value just to get the above doc comment in the ES5 output and documentation generator
    /**
     * @protected
     * @property {RegExp} matcherRegex
     *
     * The regular expression to match URLs with an optional scheme, port
     * number, path, query string, and hash anchor.
     *
     * Example matches:
     *
     *     http://google.com
     *     www.google.com
     *     google.com/path/to/file?q1=1&q2=2#myAnchor
     *
     *
     * This regular expression will have the following capturing groups.
     * The numbers below are ordinal, not RegExp group indices: the
     * domain-name fragments add groups of their own, and `parseMatches`
     * reads indices 1, 4, 5, (8) and 9 for items 1-5 respectively.
     *
     * 1.  Group that matches a scheme-prefixed URL (i.e. 'http://google.com').
     *     This is used to match scheme URLs with just a single word, such as
     *     'http://localhost', where we won't double check that the domain name
     *     has at least one dot ('.') in it.
     * 2.  Group that matches a 'www.' prefixed URL. This is only matched if the
     *     'www.' text was not prefixed by a scheme (i.e.: not prefixed by
     *     'http://', 'ftp:', etc.)
     * 3.  A protocol-relative ('//') match for the case of a 'www.' prefixed
     *     URL. Will be an empty string if it is not a protocol-relative match.
     *     We need to know the character before the '//' in order to determine
     *     if it is a valid match or the // was in a string we don't want to
     *     auto-link.
     * 4.  Group that matches a known TLD (top level domain), when a scheme
     *     or 'www.'-prefixed domain is not matched.
     * 5.  A protocol-relative ('//') match for the case of a known TLD prefixed
     *     URL. Will be an empty string if it is not a protocol-relative match.
     *     See #3 for more info.
     */
    _this.matcherRegex = matcherRegex;
    /**
     * A regular expression to use to check the character before a protocol-relative
     * URL match. We don't want to match a protocol-relative URL if it is part
     * of another word.
     *
     * For example, we want to match something like "Go to: //google.com",
     * but we don't want to match something like "abc//google.com"
     *
     * This regular expression is used to test the character before the '//'.
     *
     * @protected
     * @type {RegExp} wordCharRegExp
     */
    _this.wordCharRegExp = wordCharRegExp;
    // The real values: whatever the caller supplied replaces the placeholders.
    _this.stripPrefix = cfg.stripPrefix;
    _this.stripTrailingSlash = cfg.stripTrailingSlash;
    _this.decodePercentEncoding = cfg.decodePercentEncoding;
    return _this;
}
/**
 * @inheritdoc
 *
 * Runs `this.matcherRegex` over `text` and builds one `UrlMatch` per accepted
 * hit. A hit is dropped when the validator rejects it, when it is directly
 * preceded by '@', or when it is protocol-relative ('//...') and directly
 * preceded by a word character. Accepted hits are trimmed (trailing '?',
 * unbalanced closing bracket, invalid characters after the TLD) and, when the
 * scheme part contains 'http://' or 'https://', shifted to start there.
 *
 * @param {String} text The text to scan.
 * @return {Array} The `UrlMatch` instances found, in order of appearance.
 */
UrlMatcher.prototype.parseMatches = function (text) {
    var matcherRegex = this.matcherRegex;
    var stripPrefix = this.stripPrefix;
    var stripTrailingSlash = this.stripTrailingSlash;
    var decodePercentEncoding = this.decodePercentEncoding;
    var tagBuilder = this.tagBuilder;
    var commonSchemes = ['http://', 'https://'];
    var matches = [];
    var match;
    // The RegExp is global ('g') and shared between instances, so exec() is
    // stateful. Always start from the beginning of `text`: a previous scan
    // that was aborted by an exception would otherwise leave a stale
    // `lastIndex` behind and make this scan silently skip text.
    matcherRegex.lastIndex = 0;
    while ((match = matcherRegex.exec(text)) !== null) {
        var matchStr = match[0];
        var schemeUrlMatch = match[1];
        var wwwUrlMatch = match[4];
        // match[8] (the TLD URL match) is not needed at the moment.
        // match[5] / match[9] hold the '//' of a protocol-relative 'www.' / TLD match.
        var protocolRelativeMatch = match[5] || match[9];
        var offset = match.index;
        var prevChar = text.charAt(offset - 1);
        if (!url_match_validator_1.UrlMatchValidator.isValid(matchStr, schemeUrlMatch)) {
            continue;
        }
        // If the match is preceded by an '@' character, then it is either
        // an email address or a username. Skip these types of matches.
        if (offset > 0 && prevChar === '@') {
            continue;
        }
        // If it's a protocol-relative '//' match, but the character before the '//'
        // was a word character (i.e. a letter/number), then we found the '//' in the
        // middle of another word (such as "asdf//asdf.com"). In this case, skip the
        // match.
        if (offset > 0 && protocolRelativeMatch && this.wordCharRegExp.test(prevChar)) {
            continue;
        }
        // If the URL ends with a question mark, don't include the question
        // mark as part of the URL. We'll assume the question mark was the
        // end of a sentence, such as: "Going to google.com?"
        if (/\?$/.test(matchStr)) {
            matchStr = matchStr.slice(0, -1);
        }
        // Handle a closing parenthesis, square bracket or curly brace at the
        // end of the match, and exclude it if there is not a matching open
        // symbol in the match itself.
        if (this.matchHasUnbalancedClosingParen(matchStr)) {
            matchStr = matchStr.slice(0, -1); // remove the trailing closing symbol
        }
        else {
            // Handle an invalid character after the TLD
            var pos = this.matchHasInvalidCharAfterTld(matchStr, schemeUrlMatch);
            if (pos > -1) {
                matchStr = matchStr.slice(0, pos); // remove the trailing invalid chars
            }
        }
        // The autolinker accepts many characters in a url's scheme (like `fake://test.com`).
        // However, in cases where a URL is missing whitespace before an obvious link,
        // (for example: `nowhitespacehttp://www.test.com`), we only want the match to start
        // at the http:// part. We will check if the match contains a common scheme and then
        // shift the match to start from there.
        var foundCommonScheme = undefined;
        if (schemeUrlMatch) {
            for (var i = 0; i < commonSchemes.length && !foundCommonScheme; i++) {
                if (schemeUrlMatch.indexOf(commonSchemes[i]) !== -1) {
                    foundCommonScheme = commonSchemes[i];
                }
            }
        }
        if (foundCommonScheme) {
            // If we found an overmatched URL, we want to find the index
            // of where the match should start and shift the match to
            // start from the beginning of the common scheme
            var indexOfSchemeStart = matchStr.indexOf(foundCommonScheme);
            matchStr = matchStr.slice(indexOfSchemeStart);
            schemeUrlMatch = schemeUrlMatch.slice(indexOfSchemeStart);
            offset = offset + indexOfSchemeStart;
        }
        var urlMatchType = schemeUrlMatch ? 'scheme' : wwwUrlMatch ? 'www' : 'tld';
        matches.push(new url_match_1.UrlMatch({
            tagBuilder: tagBuilder,
            matchedText: matchStr,
            offset: offset,
            urlMatchType: urlMatchType,
            url: matchStr,
            protocolUrlMatch: !!schemeUrlMatch,
            protocolRelativeMatch: !!protocolRelativeMatch,
            stripPrefix: stripPrefix,
            stripTrailingSlash: stripTrailingSlash,
            decodePercentEncoding: decodePercentEncoding,
        }));
    }
    return matches;
};
/**
 * Tells whether the last character of a match is a closing parenthesis,
 * square bracket or curly brace that has no opening partner earlier in the
 * match.
 *
 * The matcher regex has to accept these symbols so that URLs such as
 * "wikipedia.com/something_(disambiguation)" are linked in full. The price
 * is that a URL which is itself wrapped in brackets, e.g.
 * "(wikipedia.com/something_(disambiguation))", is matched together with
 * the wrapper's closing symbol. For that input this method returns `true`.
 *
 * The same applies to square brackets, as used for PHP arrays:
 * "[http://www.example.com/foo.php?bar[]=1&bar[]=2&bar[]=3]"
 * where the final "]" is reported as unbalanced.
 *
 * @protected
 * @param {String} matchStr The full match string from the {@link #matcherRegex}.
 * @return {Boolean} `true` if `matchStr` ends with an unbalanced ')', ']' or
 *   '}', `false` otherwise.
 */
UrlMatcher.prototype.matchHasUnbalancedClosingParen = function (matchStr) {
    var openerByCloser = { ')': '(', ']': '[', '}': '{' };
    var closer = matchStr.charAt(matchStr.length - 1);
    if (!Object.prototype.hasOwnProperty.call(openerByCloser, closer)) {
        return false; // does not end with a closing symbol at all
    }
    var opener = openerByCloser[closer];
    // Walk everything before the final character, tracking how many openers
    // are still waiting for their closer. A closer with nothing to close is
    // ignored (the count never goes below zero).
    var pendingOpeners = matchStr
        .slice(0, -1)
        .split('')
        .reduce(function (pending, ch) {
            if (ch === opener) {
                return pending + 1;
            }
            if (ch === closer) {
                return pending > 0 ? pending - 1 : 0;
            }
            return pending;
        }, 0);
    // Nothing left open means the final closer has no partner, as in
    // "http://example.com?a[]=1]".
    return pendingOpeners === 0;
};
/**
 * Determine if there's an invalid character after the TLD in a URL. Valid
 * characters after TLD are ':/?#' (a further '-', '.' or ASCII letter/digit is
 * also tolerated by the check). For scheme-matched URLs the scheme name before
 * the first ':' is skipped before looking for the domain.
 *
 * The domain-matching RegExp is built on first use and then reused for every
 * later call, instead of being re-instantiated each time.
 *
 * @protected
 * @param {String} urlMatch The matched URL, if there was one. Will be an
 *   empty string if the match is not a URL match.
 * @param {String} schemeUrlMatch The match URL string for a scheme
 *   match. Ex: 'http://yahoo.com'. This is used to match something like
 *   'http://localhost', where we won't double check that the domain name
 *   has at least one '.' in it.
 * @return {Number} the position where the invalid character was found. If
 *   no such character was found, returns -1
 */
UrlMatcher.prototype.matchHasInvalidCharAfterTld = (function () {
    // Lazily created so that `regex_lib_1` is only read when first needed.
    // The RegExp has no 'g'/'y' flag, so exec() keeps no state between calls
    // and sharing it is safe.
    var domainAndTldRegex = null;
    return function (urlMatch, schemeUrlMatch) {
        if (!urlMatch) {
            return -1;
        }
        var offset = 0;
        if (schemeUrlMatch) {
            // Skip the scheme name; what remains starts at the ':'.
            offset = urlMatch.indexOf(':');
            urlMatch = urlMatch.slice(offset);
        }
        if (domainAndTldRegex === null) {
            // Optional (one char +) '//' prefix, then a dotted name whose last
            // label is the candidate TLD.
            // prettier-ignore
            domainAndTldRegex = new RegExp("^((.?\/\/)?[-." + regex_lib_1.alphaNumericAndMarksCharsStr + "]*[-" + regex_lib_1.alphaNumericAndMarksCharsStr + "]\\.[-" + regex_lib_1.alphaNumericAndMarksCharsStr + "]+)");
        }
        var res = domainAndTldRegex.exec(urlMatch);
        if (res === null) {
            return -1; // no dotted domain found (e.g. 'localhost')
        }
        offset += res[1].length;
        urlMatch = urlMatch.slice(res[1].length);
        // Whatever directly follows the domain must be one of the tolerated
        // characters; otherwise report where the match should be cut.
        if (/^[^-.A-Za-z0-9:\/?#]/.test(urlMatch)) {
            return offset;
        }
        return -1;
    };
})();
320	return UrlMatcher;
321	}(matcher_1.Matcher));
322	exports.UrlMatcher = UrlMatcher;
323	//# sourceMappingURL=url-matcher.js.map

Note: See TracBrowser for help on using the repository browser.

Download in other formats: