// html-encoding-sniffer.js

  "use strict";
  // Supplies BOM detection (getBOMEncoding) and label-to-name lookup (labelToName) used below.
  const whatwgEncoding = require("whatwg-encoding");
  3. // https://html.spec.whatwg.org/#encoding-sniffing-algorithm
  4. module.exports = (buffer, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
  5. let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
  6. if (encoding === null && transportLayerEncodingLabel !== undefined) {
  7. encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
  8. }
  9. if (encoding === null) {
  10. encoding = prescanMetaCharset(buffer);
  11. }
  12. if (encoding === null) {
  13. encoding = defaultEncoding;
  14. }
  15. return encoding;
  16. };
  // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
  /**
   * Scans the start of `buffer` for a <meta> element that declares an encoding.
   *
   * At most the first 1024 bytes are examined. Comments, other tags and
   * `<!…>` / `</…>` / `<?…>` constructs are skipped so that their contents are
   * not mistaken for a <meta> tag.
   *
   * @param buffer - Indexable sequence of byte values with a `length`.
   * @returns {string|null} The declared encoding name, or null when no usable
   *   declaration was found inside the scanned window.
   */
  function prescanMetaCharset(buffer) {
    const l = Math.min(buffer.length, 1024);

    for (let i = 0; i < l; i++) {
      // Only "<" can start something interesting.
      if (buffer[i] !== 0x3C) {
        continue;
      }

      const c1 = buffer[i + 1];
      const c2 = buffer[i + 2];
      const c3 = buffer[i + 3];
      const c4 = buffer[i + 4];
      const c5 = buffer[i + 5];

      if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
        // "<!--": skip to the ">" of the first "-->". Starting 4 bytes in lets
        // the two dashes of "<!--" double as the closing dashes ("<!-->").
        i += 4;
        for (; i < l; i++) {
          if (buffer[i] === 0x3E && buffer[i - 1] === 0x2D && buffer[i - 2] === 0x2D) {
            break;
          }
        }
      } else if ((c1 === 0x4D || c1 === 0x6D) &&
                 (c2 === 0x45 || c2 === 0x65) &&
                 (c3 === 0x54 || c3 === 0x74) &&
                 (c4 === 0x41 || c4 === 0x61) &&
                 (isSpaceCharacter(c5) || c5 === 0x2F)) {
        // "<meta" (any letter case) followed by whitespace or "/".
        i += 6;

        const attributeList = new Set(); // names already seen; repeats are ignored
        let gotPragma = false; // saw http-equiv="content-type"
        let needPragma = null; // null: no declaration seen, true: came from content, false: came from charset
        let charset = null;

        let attrRes;
        do {
          attrRes = getAttribute(buffer, i, l);
          if (attrRes.attr && !attributeList.has(attrRes.attr.name)) {
            attributeList.add(attrRes.attr.name);
            if (attrRes.attr.name === "http-equiv") {
              gotPragma = attrRes.attr.value === "content-type";
            } else if (attrRes.attr.name === "content" && !charset) {
              charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
              if (charset !== null) {
                needPragma = true;
              }
            } else if (attrRes.attr.name === "charset") {
              charset = whatwgEncoding.labelToName(attrRes.attr.value);
              needPragma = false;
            }
          }
          i = attrRes.i;
        } while (attrRes.attr);

        // Neither a charset nor a usable content attribute: keep scanning.
        if (needPragma === null) {
          continue;
        }
        // A content-derived encoding only counts together with http-equiv="content-type".
        if (needPragma === true && gotPragma === false) {
          continue;
        }
        // The charset label was not recognised.
        if (charset === null) {
          continue;
        }

        if (charset === "UTF-16LE" || charset === "UTF-16BE") {
          charset = "UTF-8";
        }
        if (charset === "x-user-defined") {
          charset = "windows-1252";
        }
        return charset;
      } else if ((c1 >= 0x41 && c1 <= 0x5A) || (c1 >= 0x61 && c1 <= 0x7A)) {
        // "<" + ASCII letter: some other start tag. Skip the tag name ...
        for (i += 2; i < l; i++) {
          const c = buffer[i];
          if (isSpaceCharacter(c) || c === 0x3E) {
            break;
          }
        }
        // ... then consume its attributes so their contents are not rescanned.
        let attrRes;
        do {
          attrRes = getAttribute(buffer, i, l);
          i = attrRes.i;
        } while (attrRes.attr);
      } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
        // "<!", "</" or "<?": skip to the next ">".
        for (i += 2; i < l; i++) {
          if (buffer[i] === 0x3E) {
            break;
          }
        }
      }
    }

    return null;
  }
  // https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
  /**
   * Converts one byte to a one-character string, mapping ASCII A-Z to a-z.
   */
  function byteToLowercaseChar(byte) {
    return String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);
  }

  /**
   * Reads the next attribute of the tag being scanned.
   *
   * Names and values are returned ASCII-lowercased. Scanning never goes beyond
   * index `l`; an attribute that is cut off by that limit (including a quoted
   * value whose closing quote is missing) is dropped rather than returned.
   *
   * @param buffer - Indexable sequence of byte values.
   * @param {number} i - Index at which to start scanning.
   * @param {number} l - Exclusive upper bound of the scan window.
   * @returns {{attr: {name: string, value: string}, i: number} | {i: number}}
   *   `attr` is present when an attribute was read; `i` is where the caller
   *   should resume. When `attr` is absent the tag has no further attributes
   *   (either ">" was reached, in which case `i` points at it, or the window
   *   ended).
   */
  function getAttribute(buffer, i, l) {
    // Skip whitespace and "/" in front of the attribute.
    while (i < l && (isSpaceCharacter(buffer[i]) || buffer[i] === 0x2F)) {
      i++;
    }
    // Window exhausted, or ">" closes the tag: nothing to return.
    if (i >= l || buffer[i] === 0x3E) {
      return { i };
    }

    let name = "";
    let value = "";
    let c;

    nameLoop: for (; i < l; i++) {
      c = buffer[i];

      // "=" ends the name, unless the name is still empty (then "=" is part of it).
      if (c === 0x3D && name !== "") {
        i++;
        break;
      }

      if (isSpaceCharacter(c)) {
        // Whitespace after the name: an "=" may still follow.
        for (i++; i < l; i++) {
          c = buffer[i];
          if (isSpaceCharacter(c)) {
            continue;
          }
          if (c !== 0x3D) {
            // Something else follows: the attribute has no value.
            return { attr: { name, value }, i };
          }
          i++;
          break nameLoop;
        }
        break;
      }

      // "/" or ">" directly after the name: attribute without value.
      if (c === 0x2F || c === 0x3E) {
        return { attr: { name, value }, i };
      }

      name += byteToLowercaseChar(c);
    }

    // Skip whitespace between "=" and the value.
    while (i < l && isSpaceCharacter(buffer[i])) {
      i++;
    }
    // The window ended before any value byte; do not read buffer[l].
    if (i >= l) {
      return { i };
    }
    c = buffer[i];

    // Quoted value: runs up to the matching " or '.
    if (c === 0x22 || c === 0x27) {
      const quote = c;
      for (i++; i < l; i++) {
        c = buffer[i];
        if (c === quote) {
          i++;
          return { attr: { name, value }, i };
        }
        value += byteToLowercaseChar(c);
      }
      // Closing quote not found inside the window.
      return { i };
    }

    // ">" right after "=": empty value.
    if (c === 0x3E) {
      return { attr: { name, value }, i };
    }

    // Unquoted value: runs up to whitespace or ">".
    value += byteToLowercaseChar(c);
    for (i++; i < l; i++) {
      c = buffer[i];
      if (isSpaceCharacter(c) || c === 0x3E) {
        return { attr: { name, value }, i };
      }
      value += byteToLowercaseChar(c);
    }

    return { i };
  }
  // https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
  /**
   * Pulls the encoding out of a <meta> `content` attribute value such as
   * "text/html; charset=utf-8".
   *
   * The first "charset" (any letter case) that is followed by optional ASCII
   * whitespace and "=" is used. The value after "=" may be wrapped in single
   * or double quotes; otherwise it ends at the first ASCII whitespace or ";".
   *
   * @param {string} string - The attribute value to inspect.
   * @returns {string|null} Whatever whatwgEncoding.labelToName returns for the
   *   extracted label, or null when there is no "charset=" with a value, or a
   *   quoted value is missing its closing quote.
   */
  function extractCharacterEncodingFromMeta(string) {
    let position = 0;

    while (true) {
      const indexOfCharset = string.substring(position).search(/charset/i);
      if (indexOfCharset === -1) {
        return null;
      }

      let subPosition = position + indexOfCharset + "charset".length;

      // Bounds-checked: "charset" may be the very end of the string.
      while (subPosition < string.length && isSpaceCharacter(string.charCodeAt(subPosition))) {
        ++subPosition;
      }

      if (string[subPosition] !== "=") {
        // Not a declaration; resume the search just before the offending character.
        position = subPosition - 1;
        continue;
      }

      ++subPosition;

      while (subPosition < string.length && isSpaceCharacter(string.charCodeAt(subPosition))) {
        ++subPosition;
      }

      position = subPosition;
      break;
    }

    // "charset=" with nothing (or only whitespace) after it.
    if (position >= string.length) {
      return null;
    }

    const first = string[position];
    if (first === "\"" || first === "'") {
      const nextIndex = string.indexOf(first, position + 1);
      if (nextIndex !== -1) {
        return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex));
      }
      // It is an unmatched quotation mark
      return null;
    }

    // A value consisting of a single trailing character is treated as no encoding.
    if (string.length === position + 1) {
      return null;
    }

    const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/[\x09\x0A\x0C\x0D\x20;]/);
    const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
      string.length :
      position + indexOfASCIIWhitespaceOrSemicolon + 1;

    return whatwgEncoding.labelToName(string.substring(position, end));
  }
  /**
   * Tells whether a byte / char code is one of the five ASCII whitespace values
   * this file treats as space: TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) or
   * SPACE (0x20).
   *
   * @param {number} c - Byte value or UTF-16 code unit.
   * @returns {boolean} true only for the five values listed above.
   */
  function isSpaceCharacter(c) {
    return c === 0x09 || c === 0x0A || c === 0x0C || c === 0x0D || c === 0x20;
  }