/**
 * Used to compose unicode character classes.
 *
 * Every `rs*` value is a regex *source string* (not a RegExp). The `*Range`
 * strings are meant to be spliced inside `[...]`, which is why the escapes
 * are double-backslashed: the backslash must survive into the pattern.
 */
var rsAstralRange = '\\ud800-\\udfff',
    rsComboMarksRange = '\\u0300-\\u036f',
    rsComboHalfMarksRange = '\\ufe20-\\ufe2f',
    rsComboSymbolsRange = '\\u20d0-\\u20ff',
    rsComboRange = rsComboMarksRange + rsComboHalfMarksRange + rsComboSymbolsRange,
    rsDingbatRange = '\\u2700-\\u27bf',
    rsLowerRange = 'a-z\\xdf-\\xf6\\xf8-\\xff',
    rsMathOpRange = '\\xac\\xb1\\xd7\\xf7',
    rsNonCharRange = '\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf',
    rsPunctuationRange = '\\u2000-\\u206f',
    rsSpaceRange = ' \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000',
    rsUpperRange = 'A-Z\\xc0-\\xd6\\xd8-\\xde',
    rsVarRange = '\\ufe0e\\ufe0f',
    // Everything that separates words: math operators, ASCII/Latin-1
    // non-letters, general punctuation and whitespace.
    rsBreakRange = rsMathOpRange + rsNonCharRange + rsPunctuationRange + rsSpaceRange;

/** Used to compose unicode capture groups. */
var rsApos = "['\u2019]",
    rsBreak = '[' + rsBreakRange + ']',
    rsCombo = '[' + rsComboRange + ']',
    rsDigits = '\\d+',
    rsDingbat = '[' + rsDingbatRange + ']',
    rsLower = '[' + rsLowerRange + ']',
    // NOTE: `rsDigits` is spliced inside a character class here, so it
    // contributes `\d` plus a literal `+` to the excluded set.
    rsMisc = '[^' + rsAstralRange + rsBreakRange + rsDigits + rsDingbatRange + rsLowerRange + rsUpperRange + ']',
    // Surrogate pairs U+1F3FB..U+1F3FF.
    rsFitz = '\\ud83c[\\udffb-\\udfff]',
    rsModifier = '(?:' + rsCombo + '|' + rsFitz + ')',
    rsNonAstral = '[^' + rsAstralRange + ']',
    // Exactly two consecutive surrogate pairs from U+1F1E6..U+1F1FF.
    rsRegional = '(?:\\ud83c[\\udde6-\\uddff]){2}',
    rsSurrPair = '[\\ud800-\\udbff][\\udc00-\\udfff]',
    rsUpper = '[' + rsUpperRange + ']',
    rsZWJ = '\\u200d';

/** Used to compose unicode regexes. */
var rsMiscLower = '(?:' + rsLower + '|' + rsMisc + ')',
    rsMiscUpper = '(?:' + rsUpper + '|' + rsMisc + ')',
    // Optional trailing contraction such as 'd, 'll, 're (either apostrophe form).
    rsOptContrLower = '(?:' + rsApos + '(?:d|ll|m|re|s|t|ve))?',
    rsOptContrUpper = '(?:' + rsApos + '(?:D|LL|M|RE|S|T|VE))?',
    rsOptMod = rsModifier + '?',
    rsOptVar = '[' + rsVarRange + ']?',
    // Zero or more "ZWJ + symbol" continuations of a joined sequence.
    rsOptJoin = '(?:' + rsZWJ + '(?:' + [rsNonAstral, rsRegional, rsSurrPair].join('|') + ')' + rsOptVar + rsOptMod + ')*',
    // Ordinals: 1st/2nd/3rd, or a digit other than 1-3 followed by "th",
    // optionally preceded by more digits. The lookahead requires a word
    // boundary or a letter of the opposite case (or `_`) to follow.
    rsOrdLower = '\\d*(?:1st|2nd|3rd|(?![123])\\dth)(?=\\b|[A-Z_])',
    rsOrdUpper = '\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])',
    rsSeq = rsOptVar + rsOptMod + rsOptJoin,
    rsEmoji = '(?:' + [rsDingbat, rsRegional, rsSurrPair].join('|') + ')' + rsSeq;

/**
 * Used to match complex or compound words.
 *
 * Alternatives are tried in order, so the more specific word shapes must
 * stay ahead of the more permissive ones.
 */
var reUnicodeWord = RegExp([
  // Optional capital + lowercase run, ending at a break, a capital or the end.
  rsUpper + '?' + rsLower + '+' + rsOptContrLower + '(?=' + [rsBreak, rsUpper, '$'].join('|') + ')',
  // Uppercase/misc run that stops before a capital starting the next word.
  rsMiscUpper + '+' + rsOptContrUpper + '(?=' + [rsBreak, rsUpper + rsMiscLower, '$'].join('|') + ')',
  // Optional capital + lowercase/misc run (no lookahead required).
  rsUpper + '?' + rsMiscLower + '+' + rsOptContrLower,
  // Plain uppercase run.
  rsUpper + '+' + rsOptContrUpper,
  rsOrdUpper,
  rsOrdLower,
  rsDigits,
  rsEmoji
].join('|'), 'g');

/**
 * Splits a Unicode `string` into an array of its words.
 *
 * `string.match` is called directly, so the argument must already be a
 * string; no coercion or null check is performed here.
 *
 * @private
 * @param {string} string The string to inspect.
 * @returns {Array} Returns the words of `string`, or an empty array when
 *  nothing matches.
 */
function unicodeWords(string) {
  return string.match(reUnicodeWord) || [];
}

// CommonJS export: the module's sole public value is the `unicodeWords` function.
module.exports = unicodeWords;