// deterministicGrouping.js

  1. "use strict";
  // Simulations show these probabilities for a single change:
  // 93.1% that one group is invalidated
  // 4.8% that two groups are invalidated
  // 1.1% that 3 groups are invalidated
  // 0.1% that 4 or more groups are invalidated
  //
  // And these for removing/adding 10 lexically adjacent files:
  // 64.5% that one group is invalidated
  // 24.8% that two groups are invalidated
  // 7.8% that 3 groups are invalidated
  // 2.7% that 4 or more groups are invalidated
  //
  // And these for removing/adding 3 random files:
  // 0% that one group is invalidated
  // 3.7% that two groups are invalidated
  // 80.8% that 3 groups are invalidated
  // 12.3% that 4 groups are invalidated
  // 3.2% that 5 or more groups are invalidated
  /**
   * Scores how alike two keys are by comparing them position by position.
   *
   * Only the first `min(a.length, b.length)` positions are compared. Each
   * position contributes between 0 and 10 points: 10 when both UTF-16 code
   * units are equal, one point less for every unit of distance between them,
   * and nothing once they are 10 or more apart.
   *
   * @param {string} a key
   * @param {string} b key
   * @returns {number} the similarity as number (higher means more similar)
   */
  const similarity = (a, b) => {
    const l = Math.min(a.length, b.length);
    let score = 0;
    for (let i = 0; i < l; i++) {
      const distance = Math.abs(a.charCodeAt(i) - b.charCodeAt(i));
      // clamp at zero so far-apart characters never reduce the score
      score += Math.max(0, 10 - distance);
    }
    return score;
  };
  /**
   * Builds a short name from two keys: the prefix both keys share, followed
   * by the first character of `a` at which they differ. When the keys do not
   * differ within the first `min(a.length, b.length)` characters, `a` is
   * returned unchanged.
   *
   * @param {string} a key
   * @param {string} b key
   * @returns {string} the common part and a single char for the difference
   */
  const getName = (a, b) => {
    const l = Math.min(a.length, b.length);
    for (let i = 0; i < l; i++) {
      if (a.charAt(i) !== b.charAt(i)) {
        // keep the shared prefix plus the one differing character of `a`
        return a.slice(0, i + 1);
      }
    }
    return a;
  };
  /**
   * A single item together with the key and size that were computed for it.
   *
   * @template T
   */
  class Node {
    /**
     * @param {T} item item
     * @param {string} key key
     * @param {number} size size
     */
    constructor(item, key, size) {
      this.item = item;
      this.key = key;
      this.size = size;
    }
  }
  /**
   * A run of nodes plus the similarities between neighbouring nodes.
   * `size` is the sum of the node sizes at construction time; `key` starts
   * out unset.
   *
   * @template T
   */
  class Group {
    /**
     * @param {Node<T>[]} nodes nodes
     * @param {number[]} similarities similarities between the nodes (length = nodes.length - 1)
     */
    constructor(nodes, similarities) {
      this.nodes = nodes;
      this.similarities = similarities;
      this.size = nodes.reduce((size, node) => size + node.size, 0);
      /** @type {string | undefined} */
      this.key = undefined;
    }
  }
  /**
   * @template T
   * @typedef {Object} GroupedItems<T>
   * @property {string} key
   * @property {T[]} items
   * @property {number} size
   */
  /**
   * @template T
   * @typedef {Object} Options
   * @property {number} maxSize maximum size of a group
   * @property {number} minSize minimum size of a group (preferred over maximum size)
   * @property {Iterable<T>} items a list of items
   * @property {function(T): number} getSize function to get size of an item
   * @property {function(T): string} getKey function to get the key of an item
   */
  102. /**
  103. * @template T
  104. * @param {Options<T>} options options object
  105. * @returns {GroupedItems<T>[]} grouped items
  106. */
  107. module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
  108. /** @type {Group<T>[]} */
  109. const result = [];
  110. const nodes = Array.from(
  111. items,
  112. item => new Node(item, getKey(item), getSize(item))
  113. );
  114. /** @type {Node<T>[]} */
  115. const initialNodes = [];
  116. // lexically ordering of keys
  117. nodes.sort((a, b) => {
  118. if (a.key < b.key) return -1;
  119. if (a.key > b.key) return 1;
  120. return 0;
  121. });
  122. // return nodes bigger than maxSize directly as group
  123. for (const node of nodes) {
  124. if (node.size >= maxSize) {
  125. result.push(new Group([node], []));
  126. } else {
  127. initialNodes.push(node);
  128. }
  129. }
  130. if (initialNodes.length > 0) {
  131. // calculate similarities between lexically adjacent nodes
  132. /** @type {number[]} */
  133. const similarities = [];
  134. for (let i = 1; i < initialNodes.length; i++) {
  135. const a = initialNodes[i - 1];
  136. const b = initialNodes[i];
  137. similarities.push(similarity(a.key, b.key));
  138. }
  139. const initialGroup = new Group(initialNodes, similarities);
  140. if (initialGroup.size < minSize) {
  141. // We hit an edgecase where the working set is already smaller than minSize
  142. // We merge it with the smallest result node to keep minSize intact
  143. if (result.length > 0) {
  144. const smallestGroup = result.reduce((min, group) =>
  145. min.size > group.size ? group : min
  146. );
  147. for (const node of initialGroup.nodes) smallestGroup.nodes.push(node);
  148. smallestGroup.nodes.sort((a, b) => {
  149. if (a.key < b.key) return -1;
  150. if (a.key > b.key) return 1;
  151. return 0;
  152. });
  153. } else {
  154. // There are no other nodes
  155. // We use all nodes and have to accept that it's smaller than minSize
  156. result.push(initialGroup);
  157. }
  158. } else {
  159. const queue = [initialGroup];
  160. while (queue.length) {
  161. const group = queue.pop();
  162. // only groups bigger than maxSize need to be splitted
  163. if (group.size < maxSize) {
  164. result.push(group);
  165. continue;
  166. }
  167. // find unsplittable area from left and right
  168. // going minSize from left and right
  169. // at least one node need to be included otherwise we get stuck
  170. let left = 0;
  171. let leftSize = 0;
  172. while (leftSize <= minSize) {
  173. leftSize += group.nodes[left].size;
  174. left++;
  175. }
  176. let right = group.nodes.length - 1;
  177. let rightSize = 0;
  178. while (rightSize <= minSize) {
  179. rightSize += group.nodes[right].size;
  180. right--;
  181. }
  182. if (left - 1 > right) {
  183. // can't split group while holding minSize
  184. // because minSize is preferred of maxSize we return
  185. // the group here even while it's too big
  186. // To avoid this make sure maxSize > minSize * 3
  187. result.push(group);
  188. continue;
  189. }
  190. if (left <= right) {
  191. // when there is a area between left and right
  192. // we look for best split point
  193. // we split at the minimum similarity
  194. // here key space is separated the most
  195. let best = left - 1;
  196. let bestSimilarity = group.similarities[best];
  197. for (let i = left; i <= right; i++) {
  198. const similarity = group.similarities[i];
  199. if (similarity < bestSimilarity) {
  200. best = i;
  201. bestSimilarity = similarity;
  202. }
  203. }
  204. left = best + 1;
  205. right = best;
  206. }
  207. // create two new groups for left and right area
  208. // and queue them up
  209. const rightNodes = [group.nodes[right + 1]];
  210. /** @type {number[]} */
  211. const rightSimilaries = [];
  212. for (let i = right + 2; i < group.nodes.length; i++) {
  213. rightSimilaries.push(group.similarities[i - 1]);
  214. rightNodes.push(group.nodes[i]);
  215. }
  216. queue.push(new Group(rightNodes, rightSimilaries));
  217. const leftNodes = [group.nodes[0]];
  218. /** @type {number[]} */
  219. const leftSimilaries = [];
  220. for (let i = 1; i < left; i++) {
  221. leftSimilaries.push(group.similarities[i - 1]);
  222. leftNodes.push(group.nodes[i]);
  223. }
  224. queue.push(new Group(leftNodes, leftSimilaries));
  225. }
  226. }
  227. }
  228. // lexically ordering
  229. result.sort((a, b) => {
  230. if (a.nodes[0].key < b.nodes[0].key) return -1;
  231. if (a.nodes[0].key > b.nodes[0].key) return 1;
  232. return 0;
  233. });
  234. // give every group a name
  235. for (let i = 0; i < result.length; i++) {
  236. const group = result[i];
  237. const first = group.nodes[0];
  238. const last = group.nodes[group.nodes.length - 1];
  239. let name = getName(first.key, last.key);
  240. group.key = name;
  241. }
  242. // return the results
  243. return result.map(group => {
  244. /** @type {GroupedItems} */
  245. return {
  246. key: group.key,
  247. items: group.nodes.map(node => node.item),
  248. size: group.size
  249. };
  250. });
  251. };