html.js 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092
  1. import he from 'he';
  2. import { selectAll, selectOne } from 'css-select';
  3. import Node from './node';
  4. import NodeType from './type';
  5. import TextNode from './text';
  6. import Matcher from '../matcher';
  7. import arr_back from '../back';
  8. import CommentNode from './comment';
  // const { decode } = he;
  /**
   * Decode HTML entities in a string.
   * @param {string} val escaped text
   * @return {string} unescaped text
   */
  function decode(val) {
    const unescaped = he.decode(val);
    // clone string: round-trip the decoded value through JSON so the caller
    // receives a freshly built copy of it.
    const serialized = JSON.stringify(unescaped);
    return JSON.parse(serialized);
  }
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
  // Membership is tested against an element's raw (case-preserved) tag name,
  // so every tag is registered in both its lower-case and upper-case spelling.
  const kBlockElements = new Set([
    'address',
    'article',
    'aside',
    'blockquote',
    'br',
    'details',
    'dialog',
    'dd',
    'div',
    'dl',
    'dt',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'header',
    'hgroup',
    'hr',
    'li',
    'main',
    'nav',
    'ol',
    'p',
    'pre',
    'section',
    'table',
    'td',
    'tr',
    'ul',
  ].flatMap((tag) => [tag, tag.toUpperCase()]));
  /**
   * Minimal DOMTokenList: an insertion-ordered set of tokens (used for class
   * names) that notifies an observer whenever the set has been modified.
   */
  class DOMTokenList {
    /**
     * @param {Iterable<string>} [valuesInit] initial tokens
     * @param {Function} [afterUpdate] called with this list after a mutation
     */
    constructor(valuesInit = [], afterUpdate = (() => null)) {
      this._set = new Set(valuesInit);
      this._afterUpdate = afterUpdate;
    }
    /**
     * Reject tokens that contain whitespace.
     * @param {string} c token to check
     * @throws {Error} when the token contains a whitespace character
     */
    _validate(c) {
      if (/\s/.test(c)) {
        throw new Error(`DOMException in DOMTokenList.add: The token '${c}' contains HTML space characters, which are not valid in tokens.`);
      }
    }
    /**
     * Add one or more tokens. Every token is validated before any of them is
     * stored, so an invalid token leaves the list untouched.
     * @param {...string} tokens tokens to add
     */
    add(...tokens) {
      tokens.forEach((token) => this._validate(token));
      tokens.forEach((token) => this._set.add(token));
      this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
    }
    /**
     * Remove `c1` (if present) and add `c2`.
     * @param {string} c1 token to drop
     * @param {string} c2 token to insert
     */
    replace(c1, c2) {
      this._validate(c2);
      this._set.delete(c1);
      this._set.add(c2);
      this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
    }
    /**
     * Remove one or more tokens. The observer is only notified when at least
     * one token was actually present.
     * @param {...string} tokens tokens to remove
     */
    remove(...tokens) {
      let removed = false;
      tokens.forEach((token) => {
        if (this._set.delete(token)) {
          removed = true;
        }
      });
      if (removed) {
        this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
      }
    }
    /**
     * Remove the token when present, add it otherwise.
     * @param {string} c token to flip
     * @return {boolean} true when the token is in the list after the call
     */
    toggle(c) {
      this._validate(c);
      const present = !this._set.has(c);
      if (present) {
        this._set.add(c);
      }
      else {
        this._set.delete(c);
      }
      this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
      return present;
    }
    /**
     * @param {string} c token to look up
     * @return {boolean} whether the token is in the list
     */
    contains(c) {
      return this._set.has(c);
    }
    /** Number of tokens in the list. */
    get length() {
      return this._set.size;
    }
    /** Iterator over the tokens in insertion order. */
    values() {
      return this._set.values();
    }
    /** Tokens as an array, in insertion order. */
    get value() {
      return Array.from(this._set.values());
    }
    /** Tokens joined by a single space. */
    toString() {
      return Array.from(this._set.values()).join(' ');
    }
  }
  /**
   * Make `parent` the parentNode of every node in `nodes`.
   * @param {Node[]} nodes nodes being attached
   * @param {HTMLElement} parent element that now owns them
   * @return {Node[]} the same array, for chaining
   */
  function reparentNodes(nodes, parent) {
    nodes.forEach((node) => {
      node.parentNode = parent;
    });
    return nodes;
  }
  /**
   * HTMLElement, which contains a set of children.
   *
   * Note: this is a minimalist implementation; only the subset of the DOM
   * Element API defined below is provided.
   * @class HTMLElement
   * @extends {Node}
   */
  export default class HTMLElement extends Node {
    /**
     * Creates an instance of HTMLElement.
     * @param tagName tag name as written in the source (null for a root container)
     * @param keyAttrs id and class attribute
     * @param [rawAttrs] attributes in string
     * @param [parentNode] parent element, forwarded to the Node constructor
     *
     * @memberof HTMLElement
     */
    constructor(tagName, keyAttrs, rawAttrs = '', parentNode) {
      super(parentNode);
      /**
       * Node Type declaration.
       */
      this.nodeType = NodeType.ELEMENT_NODE;
      this.rawTagName = tagName;
      this.rawAttrs = rawAttrs || '';
      this.id = keyAttrs.id || '';
      this.childNodes = [];
      // Leading/trailing whitespace in the class attribute would otherwise
      // yield empty-string tokens.
      const classTokens = keyAttrs.class ? keyAttrs.class.split(/\s+/).filter(Boolean) : [];
      // Any classList mutation is written back to the `class` attribute.
      this.classList = new DOMTokenList(classTokens, (classList) => (this.setAttribute('class', classList.toString()) // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
      ));
      // When no raw attribute string was supplied, synthesize one from the
      // key attributes so that serialization still shows id/class.
      if (!rawAttrs) {
        if (keyAttrs.id) {
          this.rawAttrs = `id="${keyAttrs.id}"`;
        }
        if (keyAttrs.class) {
          const cls = `class="${this.classList.toString()}"`;
          this.rawAttrs = this.rawAttrs ? `${this.rawAttrs} ${cls}` : cls;
        }
      }
    }
    /**
     * Quote attribute values
     * @param attr attribute value
     * @returns {string} quoted value; the bare string `null` for null/undefined
     */
    quoteAttribute(attr) {
      if (attr == null) {
        return "null";
      }
      // Double quotes are the only character that must be escaped inside a
      // double-quoted HTML attribute value.
      return `"${String(attr).replace(/"/g, '&quot;')}"`;
    }
    /**
     * Remove current element
     */
    remove() {
      const parent = this.parentNode;
      if (parent) {
        parent.childNodes = parent.childNodes.filter((child) => this !== child);
      }
    }
    /**
     * Remove Child element from childNodes array
     * @param {HTMLElement} node node to remove
     */
    removeChild(node) {
      this.childNodes = this.childNodes.filter((child) => child !== node);
    }
    /**
     * Exchanges given child with new child
     * @param {HTMLElement} oldNode node to exchange
     * @param {HTMLElement} newNode new node
     */
    exchangeChild(oldNode, newNode) {
      this.childNodes = this.childNodes.map((child) => (child === oldNode ? newNode : child));
    }
    /** Upper-cased tag name (the raw value when there is no tag name). */
    get tagName() {
      return this.rawTagName ? this.rawTagName.toUpperCase() : this.rawTagName;
    }
    /** Lower-cased tag name. */
    get localName() {
      return this.rawTagName.toLowerCase();
    }
    /**
     * Get escaped (as-is) text value of current node and its children.
     * @return {string} text content
     */
    get rawText() {
      return this.childNodes.reduce((pre, cur) => pre + cur.rawText, '');
    }
    get textContent() {
      return this.rawText;
    }
    /** Replace all children with a single text node holding `val`. */
    set textContent(val) {
      this.childNodes = [new TextNode(val, this)];
    }
    /**
     * Get unescaped text value of current node and its children.
     * @return {string} text content
     */
    get text() {
      return decode(this.rawText);
    }
    /**
     * Get structured Text (with '\n' etc.)
     * @return {string} structured text
     */
    get structuredText() {
      let currentBlock = [];
      const blocks = [currentBlock];
      // Start a new output line unless the current one is still empty.
      function breakLine() {
        if (currentBlock.length > 0) {
          blocks.push(currentBlock = []);
        }
      }
      function dfs(node) {
        if (node.nodeType === NodeType.ELEMENT_NODE) {
          const isBlock = kBlockElements.has(node.rawTagName);
          if (isBlock) {
            breakLine();
          }
          node.childNodes.forEach(dfs);
          if (isBlock) {
            breakLine();
          }
        }
        else if (node.nodeType === NodeType.TEXT_NODE) {
          if (node.isWhitespace) {
            // Whitespace node, postponed output
            currentBlock.prependWhitespace = true;
          }
          else {
            let text = node.trimmedText;
            if (currentBlock.prependWhitespace) {
              text = ` ${text}`;
              currentBlock.prependWhitespace = false;
            }
            currentBlock.push(text);
          }
        }
      }
      dfs(this);
      return blocks
        // Normalize each line's whitespace
        .map((block) => block.join('').replace(/\s{2,}/g, ' '))
        .join('\n').replace(/\s+$/, ''); // trimRight;
    }
    /**
     * Serialize the element (tag, raw attributes and children) to HTML.
     * Void elements are written without children or closing tag; an element
     * without a tag name serializes to its children only.
     * @return {string} HTML source
     */
    toString() {
      const tag = this.rawTagName;
      if (!tag) {
        return this.innerHTML;
      }
      const is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag);
      const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
      if (is_void) {
        return `<${tag}${attrs}>`;
      }
      return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
    }
    get innerHTML() {
      return this.childNodes.map((child) => child.toString()).join('');
    }
    /**
     * Replace the children with the parsed `content`; when parsing yields no
     * nodes, the content is kept as a single text node.
     */
    set innerHTML(content) {
      //const r = parse(content, global.options); // TODO global.options ?
      const r = parse(content);
      // Parsed nodes still point at the temporary parse root; adopt them.
      this.childNodes = r.childNodes.length ? reparentNodes(r.childNodes, this) : [new TextNode(content, this)];
    }
    /**
     * Replace the children of this element.
     * @param {string|Node|Node[]} content HTML source, a node, or a list of nodes
     * @param {Object} [options] parse options, used when content is a string
     */
    set_content(content, options = {}) {
      let nodes = content;
      if (content instanceof Node) {
        nodes = [content];
      }
      else if (typeof content == 'string') {
        const r = parse(content, options);
        nodes = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
      }
      if (Array.isArray(nodes)) {
        reparentNodes(nodes, this);
      }
      this.childNodes = nodes;
    }
    /**
     * Replace this element, inside its parent, with the given nodes.
     * Strings are parsed as HTML. Does nothing when the element is detached.
     * @param {...(string|Node)} nodes replacements
     */
    replaceWith(...nodes) {
      const parent = this.parentNode;
      if (!parent) {
        return;
      }
      const idx = parent.childNodes.indexOf(this);
      if (idx === -1) {
        return;
      }
      const content = nodes.map((node) => {
        if (node instanceof Node) {
          return [node];
        }
        else if (typeof node == 'string') {
          // const r = parse(content, global.options); // TODO global.options ?
          const r = parse(node);
          return r.childNodes.length ? r.childNodes : [new TextNode(node, parent)];
        }
        return [];
      }).flat();
      parent.childNodes = [
        ...parent.childNodes.slice(0, idx),
        ...reparentNodes(content, parent),
        ...parent.childNodes.slice(idx + 1),
      ];
    }
    get outerHTML() {
      return this.toString();
    }
    /**
     * Trim element from right (in block) after seeing pattern in a TextNode.
     * @param {RegExp} pattern pattern to find
     * @return {HTMLElement} reference to current node
     */
    trimRight(pattern) {
      for (let i = 0; i < this.childNodes.length; i++) {
        const childNode = this.childNodes[i];
        if (childNode.nodeType === NodeType.ELEMENT_NODE) {
          childNode.trimRight(pattern);
        }
        else {
          const index = childNode.rawText.search(pattern);
          if (index > -1) {
            childNode.rawText = childNode.rawText.slice(0, index);
            // trim all following nodes.
            this.childNodes.length = i + 1;
          }
        }
      }
      return this;
    }
    /**
     * Get DOM structure
     * @return {string} structure
     */
    get structure() {
      const res = [];
      let indention = 0;
      function write(str) {
        res.push(' '.repeat(indention) + str);
      }
      function dfs(node) {
        const idStr = node.id ? (`#${node.id}`) : '';
        const classStr = node.classList.length ? (`.${node.classList.value.join('.')}`) : ''; // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/restrict-template-expressions, @typescript-eslint/no-unsafe-call
        write(`${node.rawTagName}${idStr}${classStr}`);
        indention++;
        node.childNodes.forEach((childNode) => {
          if (childNode.nodeType === NodeType.ELEMENT_NODE) {
            dfs(childNode);
          }
          else if (childNode.nodeType === NodeType.TEXT_NODE) {
            if (!childNode.isWhitespace) {
              write('#text');
            }
          }
        });
        indention--;
      }
      dfs(this);
      return res.join('\n');
    }
    /**
     * Remove whitespaces in this sub tree.
     * @return {HTMLElement} pointer to this
     */
    removeWhitespace() {
      // Compact the array in place: `o` is the next slot to keep.
      let o = 0;
      this.childNodes.forEach((node) => {
        if (node.nodeType === NodeType.TEXT_NODE) {
          if (node.isWhitespace) {
            return;
          }
          node.rawText = node.trimmedText;
        }
        else if (node.nodeType === NodeType.ELEMENT_NODE) {
          node.removeWhitespace();
        }
        this.childNodes[o++] = node;
      });
      this.childNodes.length = o;
      return this;
    }
    /**
     * Query CSS selector to find matching nodes.
     * @param {string} selector Simplified CSS selector
     * @return {HTMLElement[]} matching elements
     */
    querySelectorAll(selector) {
      return selectAll(selector, this, {
        xmlMode: true,
        adapter: Matcher
      });
    }
    /**
     * Query CSS Selector to find matching node.
     * @param {string} selector Simplified CSS selector
     * @return {HTMLElement} matching node
     */
    querySelector(selector) {
      return selectOne(selector, this, {
        xmlMode: true,
        adapter: Matcher
      });
    }
    /**
     * traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
     * @param selector a DOMString containing a selector list
     */
    closest(selector) {
      // Maps every ancestor to the single child that lies on the path down
      // to this element, so the selector engine only ever sees that chain.
      const mapChild = new Map();
      let el = this;
      let old = null;
      function findOne(test, elems) {
        let elem = null;
        for (let i = 0, l = elems.length; i < l && !elem; i++) {
          const el = elems[i];
          if (test(el)) {
            elem = el;
          }
          else {
            const child = mapChild.get(el);
            if (child) {
              elem = findOne(test, [child]);
            }
          }
        }
        return elem;
      }
      while (el) {
        mapChild.set(el, old);
        old = el;
        el = el.parentNode;
      }
      el = this;
      while (el) {
        const e = selectOne(selector, el, {
          xmlMode: true,
          adapter: {
            ...Matcher,
            getChildren(node) {
              const child = mapChild.get(node);
              return child && [child];
            },
            getSiblings(node) {
              return [node];
            },
            findOne,
            findAll() {
              return [];
            }
          }
        });
        if (e) {
          return e;
        }
        el = el.parentNode;
      }
      return null;
    }
    /**
     * Append a child node to childNodes
     * @param {Node} node node to append
     * @return {Node} node appended
     */
    appendChild(node) {
      this.childNodes.push(node);
      node.parentNode = this;
      return node;
    }
    /**
     * Get first child node
     * @return {Node} first child node
     */
    get firstChild() {
      return this.childNodes[0];
    }
    /**
     * Get last child node
     * @return {Node} last child node
     */
    get lastChild() {
      return arr_back(this.childNodes);
    }
    /**
     * Get attributes
     * @access private
     * @return {Object} parsed and unescaped attributes, keyed by lower-cased name (cached)
     */
    get attrs() {
      if (this._attrs) {
        return this._attrs;
      }
      this._attrs = {};
      const attrs = this.rawAttributes;
      for (const key in attrs) {
        const val = attrs[key] || '';
        this._attrs[key.toLowerCase()] = decode(val);
      }
      return this._attrs;
    }
    /**
     * Get unescaped attributes, keyed by name as written in the source.
     * A fresh object is built on every access.
     * @return {Object} parsed and unescaped attributes
     */
    get attributes() {
      const ret_attrs = {};
      const attrs = this.rawAttributes;
      for (const key in attrs) {
        const val = attrs[key] || '';
        ret_attrs[key] = decode(val);
      }
      return ret_attrs;
    }
    /**
     * Get escaped (as-is) attributes
     * @return {Object} parsed attributes (cached); value-less attributes map to null
     */
    get rawAttributes() {
      if (this._rawAttrs) {
        return this._rawAttrs;
      }
      const attrs = {};
      if (this.rawAttrs) {
        const re = /\b([a-z][a-z0-9-_:]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
        let match;
        while ((match = re.exec(this.rawAttrs))) {
          attrs[match[1]] = match[2] || match[3] || match[4] || null;
        }
      }
      this._rawAttrs = attrs;
      return attrs;
    }
    /**
     * Remove an attribute; the name is matched case-insensitively, like
     * hasAttribute/getAttribute.
     * @param {string} key The attribute name
     */
    removeAttribute(key) {
      const lowerKey = key.toLowerCase();
      const attrs = this.rawAttributes;
      Object.keys(attrs).forEach((name) => {
        if (name.toLowerCase() === lowerKey) {
          delete attrs[name];
        }
      });
      // Update this.attrs (its keys are lower-cased)
      if (this._attrs) {
        delete this._attrs[lowerKey];
      }
      // Update rawString
      this.rawAttrs = Object.keys(attrs).map((name) => {
        const val = attrs[name];
        if (val == null) {
          return name;
        }
        return `${name}=${this.quoteAttribute(val)}`;
      }).join(' ');
      // Update this.id
      if (lowerKey === 'id') {
        this.id = '';
      }
    }
    hasAttribute(key) {
      return key.toLowerCase() in this.attrs;
    }
    /**
     * Get an attribute
     * @return {string} value of the attribute
     */
    getAttribute(key) {
      return this.attrs[key.toLowerCase()];
    }
    /**
     * Set an attribute value to the HTMLElement
     * @param {string} key The attribute name
     * @param {string} value The value to set; it is converted with String(), so null / undefined are stored as text and do not remove the attribute
     */
    setAttribute(key, value) {
      if (arguments.length < 2) {
        throw new Error('Failed to execute \'setAttribute\' on \'Element\'');
      }
      const k2 = key.toLowerCase();
      const attrs = this.rawAttributes;
      // Reuse the spelling of an existing attribute that differs only in case.
      for (const k in attrs) {
        if (k.toLowerCase() === k2) {
          key = k;
          break;
        }
      }
      attrs[key] = String(value);
      // update this.attrs
      if (this._attrs) {
        this._attrs[k2] = decode(attrs[key]);
      }
      // Update rawString
      this.rawAttrs = Object.keys(attrs).map((name) => {
        const val = this.quoteAttribute(attrs[name]);
        if (val === 'null' || val === '""') {
          return name;
        }
        return `${name}=${val}`;
      }).join(' ');
      // Update this.id
      if (k2 === 'id') {
        this.id = value;
      }
    }
    /**
     * Replace all the attributes of the HTMLElement by the provided attributes
     * @param {Attributes} attributes the new attribute set
     */
    setAttributes(attributes) {
      // Invalidate current this.attributes
      if (this._attrs) {
        delete this._attrs;
      }
      // Invalidate current this.rawAttributes
      if (this._rawAttrs) {
        delete this._rawAttrs;
      }
      // Update rawString
      this.rawAttrs = Object.keys(attributes).map((name) => {
        const val = attributes[name];
        if (val === 'null' || val === '""') {
          return name;
        }
        return `${name}=${this.quoteAttribute(String(val))}`;
      }).join(' ');
    }
    /**
     * Parse `html` and insert the resulting nodes relative to this element.
     * @param {string} where one of 'beforebegin', 'afterbegin', 'beforeend', 'afterend'
     * @param {string} html HTML source to insert
     */
    insertAdjacentHTML(where, html) {
      if (arguments.length < 2) {
        throw new Error('2 arguments required');
      }
      const p = parse(html);
      if (where === 'afterend') {
        const idx = this.parentNode.childNodes.findIndex((child) => {
          return child === this;
        });
        this.parentNode.childNodes.splice(idx + 1, 0, ...p.childNodes);
        reparentNodes(p.childNodes, this.parentNode);
      }
      else if (where === 'afterbegin') {
        this.childNodes.unshift(...p.childNodes);
        reparentNodes(p.childNodes, this);
      }
      else if (where === 'beforeend') {
        p.childNodes.forEach((n) => {
          this.appendChild(n);
        });
      }
      else if (where === 'beforebegin') {
        const idx = this.parentNode.childNodes.findIndex((child) => {
          return child === this;
        });
        this.parentNode.childNodes.splice(idx, 0, ...p.childNodes);
        reparentNodes(p.childNodes, this.parentNode);
      }
      else {
        throw new Error(`The value provided ('${where}') is not one of 'beforebegin', 'afterbegin', 'beforeend', or 'afterend'`);
      }
    }
    /**
     * Node following this element in its parent's childNodes.
     * @return {Node|null} next node, or null when there is none
     */
    get nextSibling() {
      if (!this.parentNode) {
        return null;
      }
      const children = this.parentNode.childNodes;
      const idx = children.indexOf(this);
      if (idx === -1) {
        return null;
      }
      return children[idx + 1] || null;
    }
    /**
     * First HTMLElement following this element in its parent's childNodes.
     * @return {HTMLElement|null} next element, or null when there is none
     */
    get nextElementSibling() {
      if (!this.parentNode) {
        return null;
      }
      const children = this.parentNode.childNodes;
      const idx = children.indexOf(this);
      if (idx === -1) {
        return null;
      }
      for (let i = idx + 1; i < children.length; i++) {
        if (children[i] instanceof HTMLElement) {
          return children[i];
        }
      }
      return null;
    }
    /** Class names joined by a single space. */
    get classNames() {
      return this.classList.toString();
    }
  }
  // https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
  // Matches either a whole comment or a single tag. For a tag the groups are:
  // (1) "/" of a closing tag, (2) tag name, (3) raw attribute text,
  // (4) "/" of a self-closing tag.
  const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*([^>]*?)(\/?)>/ig;
  // Earlier pattern experiments, kept for reference:
  // <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
  // <([a-z][-.:0-9_a-z]*)\s*\/>
  // <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>
  // <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
  // Picks id/class out of raw attribute text: group 2 is the name, and the
  // value is in group 4 (double-quoted), 5 (single-quoted) or 6 (unquoted).
  const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/ig;
  /**
   * Build a `{ tag: true }` lookup holding every tag in lower and upper case.
   * @param {string[]} tags lower-case tag names
   * @return {Object} lookup table
   */
  const bothCaseFlags = (tags) => Object.fromEntries(
    [...tags, ...tags.map((tag) => tag.toUpperCase())].map((tag) => [tag, true])
  );
  /**
   * Build a two-level lookup from `{ tag: [relatedTags] }`, registering both
   * the outer and the inner tag names in lower and upper case.
   * @param {Object} spec lower-case tag name -> lower-case related tag names
   * @return {Object} lookup table
   */
  const bothCaseTable = (spec) => Object.fromEntries(
    Object.entries(spec).flatMap(([tag, related]) => [
      [tag, bothCaseFlags(related)],
      [tag.toUpperCase(), bothCaseFlags(related)],
    ])
  );
  const kSelfClosingElements = Object.fromEntries([
    'area',
    'base',
    'br',
    'col',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'source',
    'embed',
    'param',
    'track',
    'wbr',
  ].flatMap((tag) => [[tag, true], [tag.toUpperCase(), true]]));
  const kElementsClosedByOpening = bothCaseTable({
    li: ['li'],
    p: ['p', 'div'],
    b: ['div'],
    td: ['td', 'th'],
    th: ['td', 'th'],
    h1: ['h1'],
    h2: ['h2'],
    h3: ['h3'],
    h4: ['h4'],
    h5: ['h5'],
    h6: ['h6'],
  });
  const kElementsClosedByClosing = bothCaseTable({
    li: ['ul', 'ol'],
    a: ['div'],
    b: ['div'],
    i: ['div'],
    p: ['div'],
    td: ['tr', 'table'],
    th: ['tr', 'table'],
  });
  const frameflag = 'documentfragmentcontainer';
  917. /**
  918. * Parses HTML and returns a root element
  919. * Parse a chuck of HTML source.
  920. * @param {string} data html
  921. * @return {HTMLElement} root element
  922. */
  923. export function base_parse(data, options = { lowerCaseTagName: false, comment: false }) {
  924. const elements = options.blockTextElements || {
  925. script: true,
  926. noscript: true,
  927. style: true,
  928. pre: true
  929. };
  930. const element_names = Object.keys(elements);
  931. const kBlockTextElements = element_names.map((it) => {
  932. return new RegExp(it, 'i');
  933. });
  934. const kIgnoreElements = element_names.filter((it) => {
  935. return elements[it];
  936. }).map((it) => {
  937. return new RegExp(it, 'i');
  938. });
  939. function element_should_be_ignore(tag) {
  940. return kIgnoreElements.some((it) => {
  941. return it.test(tag);
  942. });
  943. }
  944. function is_block_text_element(tag) {
  945. return kBlockTextElements.some((it) => {
  946. return it.test(tag);
  947. });
  948. }
  949. const root = new HTMLElement(null, {}, '', null);
  950. let currentParent = root;
  951. const stack = [root];
  952. let lastTextPos = -1;
  953. let match;
  954. // https://github.com/taoqf/node-html-parser/issues/38
  955. data = `<${frameflag}>${data}</${frameflag}>`;
  956. while ((match = kMarkupPattern.exec(data))) {
  957. if (lastTextPos > -1) {
  958. if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
  959. // if has content
  960. const text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
  961. currentParent.appendChild(new TextNode(text, currentParent));
  962. }
  963. }
  964. lastTextPos = kMarkupPattern.lastIndex;
  965. if (match[2] === frameflag) {
  966. continue;
  967. }
  968. if (match[0][1] === '!') {
  969. // this is a comment
  970. if (options.comment) {
  971. // Only keep what is in between <!-- and -->
  972. const text = data.substring(lastTextPos - 3, lastTextPos - match[0].length + 4);
  973. currentParent.appendChild(new CommentNode(text, currentParent));
  974. }
  975. continue;
  976. }
  977. if (options.lowerCaseTagName) {
  978. match[2] = match[2].toLowerCase();
  979. }
  980. if (!match[1]) {
  981. // not </ tags
  982. const attrs = {};
  983. for (let attMatch; (attMatch = kAttributePattern.exec(match[3]));) {
  984. attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
  985. }
  986. const tagName = currentParent.rawTagName;
  987. if (!match[4] && kElementsClosedByOpening[tagName]) {
  988. if (kElementsClosedByOpening[tagName][match[2]]) {
  989. stack.pop();
  990. currentParent = arr_back(stack);
  991. }
  992. }
  993. // ignore container tag we add above
  994. // https://github.com/taoqf/node-html-parser/issues/38
  995. currentParent = currentParent.appendChild(new HTMLElement(match[2], attrs, match[3], null));
  996. stack.push(currentParent);
  997. if (is_block_text_element(match[2])) {
  998. // a little test to find next </script> or </style> ...
  999. const closeMarkup = `</${match[2]}>`;
  1000. const index = (() => {
  1001. if (options.lowerCaseTagName) {
  1002. return data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex);
  1003. }
  1004. return data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
  1005. })();
  1006. if (element_should_be_ignore(match[2])) {
  1007. let text;
  1008. if (index === -1) {
  1009. // there is no matching ending for the text element.
  1010. text = data.substr(kMarkupPattern.lastIndex);
  1011. }
  1012. else {
  1013. text = data.substring(kMarkupPattern.lastIndex, index);
  1014. }
  1015. if (text.length > 0) {
  1016. currentParent.appendChild(new TextNode(text, currentParent));
  1017. }
  1018. }
  1019. if (index === -1) {
  1020. lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
  1021. }
  1022. else {
  1023. lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
  1024. match[1] = 'true';
  1025. }
  1026. }
  1027. }
  1028. if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
  1029. // </ or /> or <br> etc.
  1030. while (true) {
  1031. if (currentParent.rawTagName === match[2]) {
  1032. stack.pop();
  1033. currentParent = arr_back(stack);
  1034. break;
  1035. }
  1036. else {
  1037. const tagName = currentParent.tagName;
  1038. // Trying to close current tag, and move on
  1039. if (kElementsClosedByClosing[tagName]) {
  1040. if (kElementsClosedByClosing[tagName][match[2]]) {
  1041. stack.pop();
  1042. currentParent = arr_back(stack);
  1043. continue;
  1044. }
  1045. }
  1046. // Use aggressive strategy to handle unmatching markups.
  1047. break;
  1048. }
  1049. }
  1050. }
  1051. }
  1052. return stack;
  1053. }
  1054. /**
  1055. * Parses HTML and returns a root element
  1056. * Parse a chuck of HTML source.
  1057. */
  1058. export function parse(data, options = { lowerCaseTagName: false, comment: false }) {
  1059. const stack = base_parse(data, options);
  1060. const [root] = stack;
  1061. while (stack.length > 1) {
  1062. // Handle each error elements.
  1063. const last = stack.pop();
  1064. const oneBefore = arr_back(stack);
  1065. if (last.parentNode && last.parentNode.parentNode) {
  1066. if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
  1067. // Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
  1068. oneBefore.removeChild(last);
  1069. last.childNodes.forEach((child) => {
  1070. oneBefore.parentNode.appendChild(child);
  1071. });
  1072. stack.pop();
  1073. }
  1074. else {
  1075. // Single error <div> <h3> </div> handle: Just removes <h3>
  1076. oneBefore.removeChild(last);
  1077. last.childNodes.forEach((child) => {
  1078. oneBefore.appendChild(child);
  1079. });
  1080. }
  1081. }
  1082. else {
  1083. // If it's final element just skip.
  1084. }
  1085. }
  1086. // response.childNodes.forEach((node) => {
  1087. // if (node instanceof HTMLElement) {
  1088. // node.parentNode = null;
  1089. // }
  1090. // });
  1091. return root;
  1092. }