Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: node_modules/autolinker/dist/es2015/matcher/url-matcher.js@ d24f17c

main

Last change on this file since d24f17c was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 17 months ago
Initial commit
Property mode set to `100644`
File size: 15.0 KB

Rev	Line
[d24f17c]	1	import { __extends } from "tslib";
	2	import { Matcher } from './matcher';
	3	import { alphaNumericCharsStr, alphaNumericAndMarksCharsStr, getDomainNameStr } from '../regex-lib';
	4	import { tldRegex } from './tld-regex';
	5	import { UrlMatch } from '../match/url-match';
	6	import { UrlMatchValidator } from './url-match-validator';
	7	// RegExp objects which are shared by all instances of UrlMatcher. These are
	8	// here to avoid re-instantiating the RegExp objects if `Autolinker.link()` is
	9	// called multiple times, thus instantiating UrlMatcher and its RegExp
	10	// objects each time (which is very expensive - see https://github.com/gregjacobs/Autolinker.js/issues/314).
	11	// See descriptions of the properties where they are used for details about them
	12	// prettier-ignore
	13	var matcherRegex = (function () {
	14	var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
	15	wwwRegex = /(?:www\.)/, // starting with 'www.'
	16	// Allow optional path, query string, and hash anchor, not ending in the following characters: "?!:,.;"
	17	// http://blog.codinghorror.com/the-problem-with-urls/
	18	urlSuffixRegex = new RegExp('[/?#](?:[' + alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()\|\'$\\[\\]{}?!:,.;^\u2713][' + alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()\|\'$*\\[\\]{}\u2713])?');
	19	return new RegExp([
	20	'(?:',
	21	'(',
	22	schemeRegex.source,
	23	getDomainNameStr(2),
	24	')',
	25	'\|',
	26	'(',
	27	'(//)?',
	28	wwwRegex.source,
	29	getDomainNameStr(6),
	30	')',
	31	'\|',
	32	'(',
	33	'(//)?',
	34	getDomainNameStr(10) + '\\.',
	35	tldRegex.source,
	36	'(?![-' + alphaNumericCharsStr + '])',
	37	')',
	38	')',
	39	'(?::[0-9]+)?',
	40	'(?:' + urlSuffixRegex.source + ')?' // match for path, query string, and/or hash anchor - optional
	41	].join(""), 'gi');
	42	})();
	43	var wordCharRegExp = new RegExp('[' + alphaNumericAndMarksCharsStr + ']');
	44	/**
	45	* @class Autolinker.matcher.Url
	46	* @extends Autolinker.matcher.Matcher
	47	*
	48	* Matcher to find URL matches in an input string.
	49	*
	50	* See this class's superclass ({@link Autolinker.matcher.Matcher}) for more details.
	51	*/
	52	var UrlMatcher = /** @class */ (function (_super) {
	53	__extends(UrlMatcher, _super);
	54	/**
	55	* @method constructor
	56	* @param {Object} cfg The configuration properties for the Match instance,
	57	* specified in an Object (map).
	58	*/
	59	function UrlMatcher(cfg) {
	60	var _this = _super.call(this, cfg) \|\| this;
	61	/**
	62	* @cfg {Object} stripPrefix (required)
	63	*
	64	* The Object form of {@link Autolinker#cfg-stripPrefix}.
	65	*/
	66	_this.stripPrefix = {
	67	scheme: true,
	68	www: true,
	69	}; // default value just to get the above doc comment in the ES5 output and documentation generator
	70	/**
	71	* @cfg {Boolean} stripTrailingSlash (required)
	72	* @inheritdoc Autolinker#stripTrailingSlash
	73	*/
	74	_this.stripTrailingSlash = true; // default value just to get the above doc comment in the ES5 output and documentation generator
	75	/**
	76	* @cfg {Boolean} decodePercentEncoding (required)
	77	* @inheritdoc Autolinker#decodePercentEncoding
	78	*/
	79	_this.decodePercentEncoding = true; // default value just to get the above doc comment in the ES5 output and documentation generator
	80	/**
	81	* @protected
	82	* @property {RegExp} matcherRegex
	83	*
	84	* The regular expression to match URLs with an optional scheme, port
	85	* number, path, query string, and hash anchor.
	86	*
	87	* Example matches:
	88	*
	89	* http://google.com
	90	* www.google.com
	91	* google.com/path/to/file?q1=1&q2=2#myAnchor
	92	*
	93	*
	94	* This regular expression will have the following capturing groups:
	95	*
	96	* 1. Group that matches a scheme-prefixed URL (i.e. 'http://google.com').
	97	* This is used to match scheme URLs with just a single word, such as
	98	* 'http://localhost', where we won't double check that the domain name
	99	* has at least one dot ('.') in it.
	100	* 2. Group that matches a 'www.' prefixed URL. This is only matched if the
	101	* 'www.' text was not prefixed by a scheme (i.e.: not prefixed by
	102	* 'http://', 'ftp:', etc.)
	103	* 3. A protocol-relative ('//') match for the case of a 'www.' prefixed
	104	* URL. Will be an empty string if it is not a protocol-relative match.
	105	* We need to know the character before the '//' in order to determine
	106	* if it is a valid match or the // was in a string we don't want to
	107	* auto-link.
	108	* 4. Group that matches a known TLD (top level domain), when a scheme
	109	* or 'www.'-prefixed domain is not matched.
	110	* 5. A protocol-relative ('//') match for the case of a known TLD prefixed
	111	* URL. Will be an empty string if it is not a protocol-relative match.
	112	* See #3 for more info.
	113	*/
	114	_this.matcherRegex = matcherRegex;
	115	/**
	116	* A regular expression to use to check the character before a protocol-relative
	117	* URL match. We don't want to match a protocol-relative URL if it is part
	118	* of another word.
	119	*
	120	* For example, we want to match something like "Go to: //google.com",
	121	* but we don't want to match something like "abc//google.com"
	122	*
	123	* This regular expression is used to test the character before the '//'.
	124	*
	125	* @protected
	126	* @type {RegExp} wordCharRegExp
	127	*/
	128	_this.wordCharRegExp = wordCharRegExp;
	129	_this.stripPrefix = cfg.stripPrefix;
	130	_this.stripTrailingSlash = cfg.stripTrailingSlash;
	131	_this.decodePercentEncoding = cfg.decodePercentEncoding;
	132	return _this;
	133	}
	134	/**
	135	* @inheritdoc
	136	*/
	137	UrlMatcher.prototype.parseMatches = function (text) {
	138	var matcherRegex = this.matcherRegex, stripPrefix = this.stripPrefix, stripTrailingSlash = this.stripTrailingSlash, decodePercentEncoding = this.decodePercentEncoding, tagBuilder = this.tagBuilder, matches = [], match;
	139	var _loop_1 = function () {
	140	var matchStr = match[0], schemeUrlMatch = match[1], wwwUrlMatch = match[4], wwwProtocolRelativeMatch = match[5],
	141	//tldUrlMatch = match[ 8 ], -- not needed at the moment
	142	tldProtocolRelativeMatch = match[9], offset = match.index, protocolRelativeMatch = wwwProtocolRelativeMatch \|\| tldProtocolRelativeMatch, prevChar = text.charAt(offset - 1);
	143	if (!UrlMatchValidator.isValid(matchStr, schemeUrlMatch)) {
	144	return "continue";
	145	}
	146	// If the match is preceded by an '@' character, then it is either
	147	// an email address or a username. Skip these types of matches.
	148	if (offset > 0 && prevChar === '@') {
	149	return "continue";
	150	}
	151	// If it's a protocol-relative '//' match, but the character before the '//'
	152	// was a word character (i.e. a letter/number), then we found the '//' in the
	153	// middle of another word (such as "asdf//asdf.com"). In this case, skip the
	154	// match.
	155	if (offset > 0 && protocolRelativeMatch && this_1.wordCharRegExp.test(prevChar)) {
	156	return "continue";
	157	}
	158	// If the URL ends with a question mark, don't include the question
	159	// mark as part of the URL. We'll assume the question mark was the
	160	// end of a sentence, such as: "Going to google.com?"
	161	if (/\?$/.test(matchStr)) {
	162	matchStr = matchStr.substr(0, matchStr.length - 1);
	163	}
	164	// Handle a closing parenthesis or square bracket at the end of the
	165	// match, and exclude it if there is not a matching open parenthesis
	166	// or square bracket in the match itself.
	167	if (this_1.matchHasUnbalancedClosingParen(matchStr)) {
	168	matchStr = matchStr.substr(0, matchStr.length - 1); // remove the trailing ")"
	169	}
	170	else {
	171	// Handle an invalid character after the TLD
	172	var pos = this_1.matchHasInvalidCharAfterTld(matchStr, schemeUrlMatch);
	173	if (pos > -1) {
	174	matchStr = matchStr.substr(0, pos); // remove the trailing invalid chars
	175	}
	176	}
	177	// The autolinker accepts many characters in a url's scheme (like `fake://test.com`).
	178	// However, in cases where a URL is missing whitespace before an obvious link,
	179	// (for example: `nowhitespacehttp://www.test.com`), we only want the match to start
	180	// at the http:// part. We will check if the match contains a common scheme and then
	181	// shift the match to start from there.
	182	var foundCommonScheme = ['http://', 'https://'].find(function (commonScheme) { return !!schemeUrlMatch && schemeUrlMatch.indexOf(commonScheme) !== -1; });
	183	if (foundCommonScheme) {
	184	// If we found an overmatched URL, we want to find the index
	185	// of where the match should start and shift the match to
	186	// start from the beginning of the common scheme
	187	var indexOfSchemeStart = matchStr.indexOf(foundCommonScheme);
	188	matchStr = matchStr.substr(indexOfSchemeStart);
	189	schemeUrlMatch = schemeUrlMatch.substr(indexOfSchemeStart);
	190	offset = offset + indexOfSchemeStart;
	191	}
	192	var urlMatchType = schemeUrlMatch
	193	? 'scheme'
	194	: wwwUrlMatch
	195	? 'www'
	196	: 'tld', protocolUrlMatch = !!schemeUrlMatch;
	197	matches.push(new UrlMatch({
	198	tagBuilder: tagBuilder,
	199	matchedText: matchStr,
	200	offset: offset,
	201	urlMatchType: urlMatchType,
	202	url: matchStr,
	203	protocolUrlMatch: protocolUrlMatch,
	204	protocolRelativeMatch: !!protocolRelativeMatch,
	205	stripPrefix: stripPrefix,
	206	stripTrailingSlash: stripTrailingSlash,
	207	decodePercentEncoding: decodePercentEncoding,
	208	}));
	209	};
	210	var this_1 = this;
	211	while ((match = matcherRegex.exec(text)) !== null) {
	212	_loop_1();
	213	}
	214	return matches;
	215	};
	216	/**
	217	* Determines if a match found has an unmatched closing parenthesis,
	218	* square bracket or curly bracket. If so, the symbol will be removed
	219	* from the match itself, and appended after the generated anchor tag.
	220	*
	221	* A match may have an extra closing parenthesis at the end of the match
	222	* because the regular expression must include parenthesis for URLs such as
	223	* "wikipedia.com/something_(disambiguation)", which should be auto-linked.
	224	*
	225	* However, an extra parenthesis will be included when the URL itself is
	226	* wrapped in parenthesis, such as in the case of:
	227	* "(wikipedia.com/something_(disambiguation))"
	228	* In this case, the last closing parenthesis should not be part of the
	229	* URL itself, and this method will return `true`.
	230	*
	231	* For square brackets in URLs such as in PHP arrays, the same behavior as
	232	* parenthesis discussed above should happen:
	233	* "[http://www.example.com/foo.php?bar[]=1&bar[]=2&bar[]=3]"
	234	* The closing square bracket should not be part of the URL itself, and this
	235	* method will return `true`.
	236	*
	237	* @protected
	238	* @param {String} matchStr The full match string from the {@link #matcherRegex}.
	239	* @return {Boolean} `true` if there is an unbalanced closing parenthesis or
	240	* square bracket at the end of the `matchStr`, `false` otherwise.
	241	*/
	242	UrlMatcher.prototype.matchHasUnbalancedClosingParen = function (matchStr) {
	243	var endChar = matchStr.charAt(matchStr.length - 1);
	244	var startChar;
	245	if (endChar === ')') {
	246	startChar = '(';
	247	}
	248	else if (endChar === ']') {
	249	startChar = '[';
	250	}
	251	else if (endChar === '}') {
	252	startChar = '{';
	253	}
	254	else {
	255	return false; // not a close parenthesis or square bracket
	256	}
	257	// Find if there are the same number of open braces as close braces in
	258	// the URL string, minus the last character (which we have already
	259	// determined to be either ')', ']' or '}'
	260	var numOpenBraces = 0;
	261	for (var i = 0, len = matchStr.length - 1; i < len; i++) {
	262	var char = matchStr.charAt(i);
	263	if (char === startChar) {
	264	numOpenBraces++;
	265	}
	266	else if (char === endChar) {
	267	numOpenBraces = Math.max(numOpenBraces - 1, 0);
	268	}
	269	}
	270	// If the number of open braces matches the number of close braces in
	271	// the URL minus the last character, then the match has unbalanced
	272	// braces because of the last character. Example of unbalanced braces
	273	// from the regex match:
	274	// "http://example.com?a[]=1]"
	275	if (numOpenBraces === 0) {
	276	return true;
	277	}
	278	return false;
	279	};
	280	/**
	281	* Determine if there's an invalid character after the TLD in a URL. Valid
	282	* characters after TLD are ':/?#'. Exclude scheme matched URLs from this
	283	* check.
	284	*
	285	* @protected
	286	* @param {String} urlMatch The matched URL, if there was one. Will be an
	287	* empty string if the match is not a URL match.
	288	* @param {String} schemeUrlMatch The match URL string for a scheme
	289	* match. Ex: 'http://yahoo.com'. This is used to match something like
	290	* 'http://localhost', where we won't double check that the domain name
	291	* has at least one '.' in it.
	292	* @return {Number} the position where the invalid character was found. If
	293	* no such character was found, returns -1
	294	*/
	295	UrlMatcher.prototype.matchHasInvalidCharAfterTld = function (urlMatch, schemeUrlMatch) {
	296	if (!urlMatch) {
	297	return -1;
	298	}
	299	var offset = 0;
	300	if (schemeUrlMatch) {
	301	offset = urlMatch.indexOf(':');
	302	urlMatch = urlMatch.slice(offset);
	303	}
	304	// prettier-ignore
	305	var re = new RegExp("^((.?\/\/)?[-." + alphaNumericAndMarksCharsStr + "]*[-" + alphaNumericAndMarksCharsStr + "]\\.[-" + alphaNumericAndMarksCharsStr + "]+)");
	306	var res = re.exec(urlMatch);
	307	if (res === null) {
	308	return -1;
	309	}
	310	offset += res[1].length;
	311	urlMatch = urlMatch.slice(res[1].length);
	312	if (/^[^-.A-Za-z0-9:\/?#]/.test(urlMatch)) {
	313	return offset;
	314	}
	315	return -1;
	316	};
	317	return UrlMatcher;
	318	}(Matcher));
	319	export { UrlMatcher };
	320	//# sourceMappingURL=url-matcher.js.map

Note: See TracBrowser for help on using the repository browser.

Download in other formats: