char-class-classranges-merge-transform.js 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. /**
  2. * The MIT License (MIT)
  3. * Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
  4. */
  5. 'use strict';
  6. /**
  7. * A regexp-tree plugin to merge class ranges.
  8. *
  9. * [a-ec] -> [a-e]
  10. * [a-ec-e] -> [a-e]
  11. * [\w\da-f] -> [\w]
  12. * [abcdef] -> [a-f]
  13. */
  14. module.exports = {
  15. _hasIUFlags: false,
  16. init: function init(ast) {
  17. this._hasIUFlags = ast.flags.includes('i') && ast.flags.includes('u');
  18. },
  19. CharacterClass: function CharacterClass(path) {
  20. var node = path.node;
  21. var expressions = node.expressions;
  22. var metas = [];
  23. // Extract metas
  24. expressions.forEach(function (expression) {
  25. if (isMeta(expression)) {
  26. metas.push(expression.value);
  27. }
  28. });
  29. expressions.sort(sortCharClass);
  30. for (var i = 0; i < expressions.length; i++) {
  31. var expression = expressions[i];
  32. if (fitsInMetas(expression, metas, this._hasIUFlags) || combinesWithPrecedingClassRange(expression, expressions[i - 1]) || combinesWithFollowingClassRange(expression, expressions[i + 1])) {
  33. expressions.splice(i, 1);
  34. i--;
  35. } else {
  36. var nbMergedChars = charCombinesWithPrecedingChars(expression, i, expressions);
  37. expressions.splice(i - nbMergedChars + 1, nbMergedChars);
  38. i -= nbMergedChars;
  39. }
  40. }
  41. }
  42. };
  /**
   * Comparator used to sort the expressions of a char class:
   * - nodes are ordered by `getSortValue` first (for a class range this is
   *   the code point of its `from` char)
   * - on equal sort values, a class range comes before a char
   * - two class ranges with the same `from` are ordered by their `to` char
   * - two meta chars, or two control chars, are ordered by `value`
   * - anything else with equal sort values is reported as a tie
   *
   * Ties are reported as exactly 0. The comparator therefore never answers
   * "greater" for both (a, b) and (b, a), and never returns NaN from
   * `Infinity - Infinity`.
   *
   * @param {Object} a - Left Char or ClassRange node
   * @param {Object} b - Right Char or ClassRange node
   * @returns {number} Negative if `a` sorts first, positive if `b` does, 0 on a tie
   */
  function sortCharClass(a, b) {
    var aValue = getSortValue(a);
    var bValue = getSortValue(b);
    if (aValue !== bValue) {
      return aValue - bValue;
    }

    var aIsRange = a.type === 'ClassRange';
    var bIsRange = b.type === 'ClassRange';

    // We want ClassRange before Char
    // [bb-d] -> [b-db]
    if (aIsRange && !bIsRange) {
      return -1;
    }
    if (bIsRange && !aIsRange) {
      return 1;
    }
    if (aIsRange && bIsRange) {
      var aTo = getSortValue(a.to);
      var bTo = getSortValue(b.to);
      // Equal keys may both be Infinity; subtracting them would give NaN.
      return aTo === bTo ? 0 : aTo - bTo;
    }

    if (isMeta(a) && isMeta(b) || isControl(a) && isControl(b)) {
      if (a.value === b.value) {
        return 0;
      }
      return a.value < b.value ? -1 : 1;
    }

    // Same sort value and no further tie-breaker applies.
    return 0;
  }
  /**
   * Computes the numeric key used to order a char class member.
   *
   * - ClassRange: code point of its `from` char
   * - `-` char or `control` kind char: Infinity (sorted last)
   * - `meta` kind char without a numeric code point: -1 (sorted first)
   * - any other char: its code point
   *
   * @param {Object} expression - Char or ClassRange node
   * @returns {number}
   */
  function getSortValue(expression) {
    if (expression.type !== 'Char') {
      // ClassRange
      return expression.from.codePoint;
    }
    if (expression.value === '-' || expression.kind === 'control') {
      return Infinity;
    }
    var isCodelessMeta = expression.kind === 'meta' && isNaN(expression.codePoint);
    return isCodelessMeta ? -1 : expression.codePoint;
  }
  /**
   * Tells whether a node is a meta char.
   *
   * Called with one argument, it accepts only the set \d\w\s\D\W\S.
   * Called with a truthy second argument, it accepts only a meta char whose
   * `value` is exactly that string.
   *
   * @param {Object} expression - Char or ClassRange node
   * @param {?string} value - Optional exact meta value to match, e.g. "\\w"
   * @returns {boolean}
   */
  function isMeta(expression) {
    // The optional second argument is read from `arguments` so that the
    // declared arity of the function stays at one.
    var expectedValue = arguments[1];

    if (expression.type !== 'Char' || expression.kind !== 'meta') {
      return false;
    }
    if (expectedValue) {
      return expression.value === expectedValue;
    }
    return /^\\[dws]$/i.test(expression.value);
  }
  /**
   * Tells whether a node is a Char node of the `control` kind.
   * @param {Object} expression - Char or ClassRange node
   * @returns {boolean}
   */
  function isControl(expression) {
    if (expression.type !== 'Char') {
      return false;
    }
    return expression.kind === 'control';
  }
  /**
   * Tells whether the expression is covered by at least one of the given
   * meta chars.
   * @param {Object} expression - Char or ClassRange node
   * @param {string[]} metas - Array of meta chars, e.g. ["\\w", "\\s"]
   * @param {boolean} hasIUFlags
   * @returns {boolean}
   */
  function fitsInMetas(expression, metas, hasIUFlags) {
    return metas.some(function (meta) {
      return fitsInMeta(expression, meta, hasIUFlags);
    });
  }
  /**
   * Tells whether everything matched by `expression` is also matched by the
   * given meta char, so that `expression` is redundant next to it.
   *
   * A ClassRange fits only if every code point of the range fits. Checking
   * the two endpoints alone is not sufficient because the meta sets are not
   * contiguous: `A` and `z` are both in \w, yet `[A-z]` also contains
   * `[`, `\`, `]`, `^` and a backtick, which are not.
   *
   * @param {Object} expression - Char or ClassRange node
   * @param {string} meta - e.g. "\\w"
   * @param {boolean} hasIUFlags
   * @returns {boolean}
   */
  function fitsInMeta(expression, meta, hasIUFlags) {
    if (expression.type === 'ClassRange') {
      var from = expression.from;
      var to = expression.to;

      // Cheap rejection first: both endpoints must fit.
      if (!fitsInMeta(from, meta, hasIUFlags) || !fitsInMeta(to, meta, hasIUFlags)) {
        return false;
      }

      // Then make sure no code point strictly inside the range falls outside
      // the meta. U+FEFF is the highest code point that fitsInMetaS,
      // fitsInMetaD or fitsInMetaW treat specially; above it, membership is
      // the same as for the already-checked `to` endpoint, so the scan can
      // stop there.
      var lastToProbe = Math.min(to.codePoint - 1, 0xfeff);
      var probe = {type: 'Char', kind: 'simple', value: '', codePoint: NaN};
      for (var codePoint = from.codePoint + 1; codePoint <= lastToProbe; codePoint++) {
        probe.codePoint = codePoint;
        // fitsInMetaW recognises `_` through `value`, so keep it in sync.
        probe.value = String.fromCodePoint(codePoint);
        if (!fitsInMeta(probe, meta, hasIUFlags)) {
          return false;
        }
      }
      return true;
    }

    // Special cases:
    // \S contains \w and \d
    if (meta === '\\S' && (isMeta(expression, '\\w') || isMeta(expression, '\\d'))) {
      return true;
    }
    // \D contains \W and \s
    if (meta === '\\D' && (isMeta(expression, '\\W') || isMeta(expression, '\\s'))) {
      return true;
    }
    // \w contains \d
    if (meta === '\\w' && isMeta(expression, '\\d')) {
      return true;
    }
    // \W contains \s
    if (meta === '\\W' && isMeta(expression, '\\s')) {
      return true;
    }

    // From here on only chars with a numeric code point can be classified.
    if (expression.type !== 'Char' || isNaN(expression.codePoint)) {
      return false;
    }

    if (meta === '\\s') {
      return fitsInMetaS(expression);
    }
    if (meta === '\\S') {
      return !fitsInMetaS(expression);
    }
    if (meta === '\\d') {
      return fitsInMetaD(expression);
    }
    if (meta === '\\D') {
      return !fitsInMetaD(expression);
    }
    if (meta === '\\w') {
      return fitsInMetaW(expression, hasIUFlags);
    }
    if (meta === '\\W') {
      return !fitsInMetaW(expression, hasIUFlags);
    }
    return false;
  }
  /**
   * Tells whether the char's code point is one of the code points listed
   * below as belonging to \s.
   * @param {Object} expression - Char node with codePoint
   * @returns {boolean}
   */
  function fitsInMetaS(expression) {
    var codePoint = expression.codePoint;

    // Contiguous part of Zs
    if (codePoint >= 0x2000 && codePoint <= 0x200a) {
      return true;
    }

    switch (codePoint) {
      case 0x0009: // \t
      case 0x000a: // \n
      case 0x000b: // \v
      case 0x000c: // \f
      case 0x000d: // \r
      case 0x0020: // space
      case 0x00a0: // nbsp
      case 0x1680: // part of Zs
      case 0x2028: // line separator
      case 0x2029: // paragraph separator
      case 0x202f: // part of Zs
      case 0x205f: // part of Zs
      case 0x3000: // part of Zs
      case 0xfeff: // zwnbsp
        return true;
      default:
        return false;
    }
  }
  /**
   * Tells whether the char's code point is an ASCII digit.
   * @param {Object} expression - Char node with codePoint
   * @returns {boolean}
   */
  function fitsInMetaD(expression) {
    var DIGIT_ZERO = 0x30; // 0
    var DIGIT_NINE = 0x39; // 9
    var codePoint = expression.codePoint;
    return codePoint >= DIGIT_ZERO && codePoint <= DIGIT_NINE;
  }
  /**
   * Tells whether the char belongs to \w: an ASCII digit, an ASCII letter or
   * a literal `_`. When `hasIUFlags` is set, U+017F and U+212A are accepted
   * as well.
   * @param {Object} expression - Char node with codePoint
   * @param {boolean} hasIUFlags
   * @returns {boolean}
   */
  function fitsInMetaW(expression, hasIUFlags) {
    if (fitsInMetaD(expression)) {
      return true;
    }
    var codePoint = expression.codePoint;
    // A-Z
    if (codePoint >= 0x41 && codePoint <= 0x5a) {
      return true;
    }
    // a-z
    if (codePoint >= 0x61 && codePoint <= 0x7a) {
      return true;
    }
    if (expression.value === '_') {
      return true;
    }
    return hasIUFlags && (codePoint === 0x017f || codePoint === 0x212a);
  }
  /**
   * Tries to fold `expression` into the node that precedes it, when that
   * node is a class range. The preceding range is widened in place when
   * needed.
   * @param {Object} expression - Char or ClassRange node
   * @param {Object} classRange - Char or ClassRange node
   * @returns {boolean} true when `expression` is now covered by `classRange`
   */
  function combinesWithPrecedingClassRange(expression, classRange) {
    if (!classRange || classRange.type !== 'ClassRange') {
      return false;
    }

    // Already inside the range:
    // [a-gc] -> [a-g]
    // [a-gc-e] -> [a-g]
    if (fitsInClassRange(expression, classRange)) {
      return true;
    }

    // Char immediately after the end of the range:
    // [a-de] -> [a-e]
    // We only want \w chars or char codes to keep readability
    if (isMetaWCharOrCode(expression) && classRange.to.codePoint === expression.codePoint - 1) {
      classRange.to = expression;
      return true;
    }

    if (expression.type !== 'ClassRange') {
      return false;
    }

    // Overlapping or adjacent ranges:
    // [a-db-f] -> [a-f]
    // [b-fa-d] -> [a-f]
    // [a-cd-f] -> [a-f]
    var startsBeforeEnd = expression.from.codePoint <= classRange.to.codePoint + 1;
    var endsAfterStart = expression.to.codePoint >= classRange.from.codePoint - 1;
    if (!startsBeforeEnd || !endsAfterStart) {
      return false;
    }
    if (expression.from.codePoint < classRange.from.codePoint) {
      classRange.from = expression.from;
    }
    if (expression.to.codePoint > classRange.to.codePoint) {
      classRange.to = expression.to;
    }
    return true;
  }
  /**
   * Tries to fold a char into the node that follows it, when that node is a
   * class range starting right after the char. The range's `from` is then
   * replaced by the char.
   * @param {Object} expression - Char or ClassRange node
   * @param {Object} classRange - Char or ClassRange node
   * @returns {boolean} true when `expression` was absorbed by `classRange`
   */
  function combinesWithFollowingClassRange(expression, classRange) {
    if (!classRange || classRange.type !== 'ClassRange') {
      return false;
    }
    // Considering the elements were ordered alphabetically,
    // there is only one case to handle
    // [ab-e] -> [a-e]
    // We only want \w chars or char codes to keep readability
    if (!isMetaWCharOrCode(expression)) {
      return false;
    }
    if (classRange.from.codePoint !== expression.codePoint + 1) {
      return false;
    }
    classRange.from = expression;
    return true;
  }
  /**
   * Tells whether `expression` lies entirely within `classRange`
   * (bounds included). A Char without a numeric code point never fits; a
   * ClassRange fits when both of its endpoints fit.
   * @param {Object} expression - Char or ClassRange node
   * @param {Object} classRange - ClassRange node
   * @returns {boolean}
   */
  function fitsInClassRange(expression, classRange) {
    var isChar = expression.type === 'Char';
    if (isChar && isNaN(expression.codePoint)) {
      return false;
    }
    if (expression.type === 'ClassRange') {
      var fromFits = fitsInClassRange(expression.from, classRange);
      return fromFits && fitsInClassRange(expression.to, classRange);
    }
    var lowerBound = classRange.from.codePoint;
    var upperBound = classRange.to.codePoint;
    return expression.codePoint >= lowerBound && expression.codePoint <= upperBound;
  }
  /**
   * Looks backwards from `index` for a run of chars with consecutive code
   * points ending at `expression`. When at least two preceding chars are part
   * of the run, the first slot of the run is overwritten with a ClassRange
   * going from that first char to `expression`; the caller is expected to
   * remove the remaining slots of the run.
   * @param {Object} expression - Char or ClassRange node
   * @param {Number} index
   * @param {Object[]} expressions - expressions in CharClass
   * @returns {number} - Number of characters combined with expression
   */
  function charCombinesWithPrecedingChars(expression, index, expressions) {
    // We only want \w chars or char codes to keep readability
    if (!isMetaWCharOrCode(expression)) {
      return 0;
    }

    // Move `runStart` back for as long as the previous slot holds the char
    // whose code point is exactly one less than the current slot's.
    var runStart = index;
    while (runStart > 0) {
      var previous = expressions[runStart - 1];
      var extendsRun =
        isMetaWCharOrCode(previous) &&
        previous.codePoint === expressions[runStart].codePoint - 1;
      if (!extendsRun) {
        break;
      }
      runStart--;
    }

    var mergedCount = index - runStart;
    // A run of only two chars is left as is.
    if (mergedCount <= 1) {
      return 0;
    }

    expressions[runStart] = {
      type: 'ClassRange',
      from: expressions[runStart],
      to: expression
    };
    return mergedCount;
  }
  /**
   * Tells whether a node is a Char with a numeric code point that is either
   * a \w char (per fitsInMetaW without the i+u extension) or written as a
   * char code (`unicode`, `hex`, `oct` or `decimal` kind).
   * A falsy `expression` is returned unchanged.
   * @param {?Object} expression - Char or ClassRange node, possibly missing
   * @returns {boolean} (or the falsy input itself)
   */
  function isMetaWCharOrCode(expression) {
    if (!expression) {
      return expression;
    }
    if (expression.type !== 'Char' || isNaN(expression.codePoint)) {
      return false;
    }
    if (fitsInMetaW(expression, false)) {
      return true;
    }
    var codeKinds = ['unicode', 'hex', 'oct', 'decimal'];
    return codeKinds.indexOf(expression.kind) !== -1;
  }