Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | 21x 21x 21x 68x 68x 5x 5x 63x 63x 63x 2x 2x 61x 61x 21x 30x 30x 30x 30x 30x 30x 34x 34x 15x 15x 19x 19x 19x 1x 1x 18x 18x 1x 1x 17x 17x 1x 1x 16x 16x 30x | // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
/**
* @module Utils/HtmlSanitize
* @description Shared HTML sanitization helpers used across the generation,
* validation, and quality-scoring pipelines.
*/
/*
 * Note on `stripScriptBlocks` (defined further down in this module): it
 * replaces each `<script>…</script>` block in the given HTML string with a
 * single space and returns the resulting HTML.
 *
 * It uses iterative index-based scanning instead of a single-pass regex so that
 * CodeQL does not flag the pattern as an insecure HTML tag filter
 * (`js/bad-tag-filter`).
 */
/**
 * Replace every `<…>` tag in a string with one space, leaving the text
 * between tags untouched.
 *
 * Tags are located with plain `indexOf` scanning rather than a regex, which
 * avoids polynomial backtracking (CodeQL `js/polynomial-redos`).
 *
 * A `<` that is never followed by a `>` is not treated as a tag: it and
 * everything after it are copied through unchanged.
 *
 * @param html - HTML string to strip
 * @returns The text content with tags replaced by spaces
 */
export function stripHtmlTags(html: string): string {
  const pieces: string[] = [];
  let cursor = 0;

  while (cursor < html.length) {
    const tagStart = html.indexOf('<', cursor);
    // With no further '<', tagEnd stays -1 and the tail is flushed below.
    const tagEnd = tagStart === -1 ? -1 : html.indexOf('>', tagStart + 1);

    if (tagEnd === -1) {
      // Either no more tags or an unterminated '<': keep the tail verbatim.
      pieces.push(html.slice(cursor));
      break;
    }

    // Text before the tag, then a single space standing in for the tag.
    pieces.push(html.slice(cursor, tagStart), ' ');
    cursor = tagEnd + 1;
  }

  return pieces.join('');
}
/**
 * Replace every `<script>…</script>` block (opening tag, contents and closing
 * tag) in an HTML string with a single space, using an ASCII
 * case-insensitive linear scan. Intended as a defence against script
 * payloads smuggled into AI-generated article fragments before they are
 * rendered into the final article HTML.
 *
 * Edge cases:
 * - Tags are matched by prefix (`<script` / `</script`), so longer names
 *   such as `<scriptx>` are treated as script tags too.
 * - An opening `<script` with no `>` anywhere after it is left in the
 *   output verbatim, together with the rest of the input.
 * - An opening tag whose closing `</script…>` is missing or itself
 *   unterminated causes the remainder of the input to be dropped and
 *   replaced by a single space.
 *
 * @param html - Raw HTML possibly containing `<script>` blocks.
 * @returns HTML with every `<script>…</script>` block replaced by a space.
 */
export function stripScriptBlocks(html: string): string {
  const OPEN = '<script';
  const CLOSE = '</script';
  // Fold only ASCII A-Z so the folded copy keeps the same length as `html`.
  // `String.prototype.toLowerCase()` can change the length (for example
  // U+0130 lowercases to two code units), which would misalign every index
  // found in the folded copy and let script markup leak into the result.
  const lower = html.replace(/[A-Z]/g, (ch) => ch.toLowerCase());
  let result = '';
  let pos = 0;
  while (pos < html.length) {
    const openIdx = lower.indexOf(OPEN, pos);
    if (openIdx < 0) {
      // No further script tags: copy the tail through unchanged.
      result += html.slice(pos);
      break;
    }
    result += html.slice(pos, openIdx);
    const openEnd = html.indexOf('>', openIdx);
    if (openEnd < 0) {
      // Opening tag is never terminated: keep the remaining text as-is.
      result += html.slice(openIdx);
      break;
    }
    const closeIdx = lower.indexOf(CLOSE, openEnd + 1);
    if (closeIdx < 0) {
      // Unclosed script block: drop everything after the opening tag.
      result += ' ';
      break;
    }
    const closeEnd = html.indexOf('>', closeIdx);
    if (closeEnd < 0) {
      // Closing tag is never terminated: drop the remainder as well.
      result += ' ';
      break;
    }
    result += ' ';
    pos = closeEnd + 1;
  }
  return result;
}
|