All files / src/utils html-sanitize.ts

100% Statements 41/41
100% Branches 12/12
100% Functions 2/2
100% Lines 41/41

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95                                                            21x 21x   21x 68x 68x 5x 5x   63x 63x 63x 2x 2x   61x 61x   21x                         30x 30x 30x 30x 30x   30x 34x 34x 15x 15x   19x 19x 19x 1x 1x   18x 18x 1x 1x   17x 17x 1x 1x   16x 16x   30x    
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Utils/HtmlSanitize
 * @description Shared HTML sanitization helpers used across the generation,
 * validation, and quality-scoring pipelines.
 */
 
/*
 * Note on `stripScriptBlocks` (defined later in this module): it removes all
 * `<script>…</script>` blocks from an HTML string, replacing each with a
 * single space. It takes the HTML string to strip and returns that HTML with
 * the script blocks replaced by spaces.
 *
 * It uses iterative index-based scanning instead of a single-pass regex so
 * that CodeQL does not flag the pattern as an insecure HTML tag filter
 * (`js/bad-tag-filter`).
 */
/**
 * Replace every `<…>` tag in a string with a single space, leaving the text
 * between tags untouched.
 *
 * The input is walked with `indexOf` rather than matched with a regex, which
 * avoids polynomial backtracking (CodeQL `js/polynomial-redos`).
 *
 * A `<` that is never followed by a `>` is not treated as a tag: it and
 * everything after it are copied to the output unchanged.
 *
 * @param html - HTML string to strip
 * @returns The text content with each tag replaced by one space
 */
export function stripHtmlTags(html: string): string {
  const fragments: string[] = [];
  let cursor = 0;

  while (cursor < html.length) {
    const tagStart = html.indexOf('<', cursor);
    // Only look for the closing bracket when an opening one was found.
    const tagEnd = tagStart === -1 ? -1 : html.indexOf('>', tagStart + 1);

    if (tagEnd === -1) {
      // No complete tag remains (either no `<`, or a `<` without a `>`):
      // keep the rest of the input verbatim and stop scanning.
      fragments.push(html.slice(cursor));
      break;
    }

    // Text before the tag, then one space standing in for the tag itself.
    fragments.push(html.slice(cursor, tagStart), ' ');
    cursor = tagEnd + 1;
  }

  return fragments.join('');
}
 
/**
 * Remove every `<script>…</script>` block (and its contents) from an HTML
 * string using a case-insensitive index-based scan, replacing each block with
 * a single space. Defends against XSS payloads smuggled into AI-generated
 * article fragments before the markdown is rendered into the final article
 * HTML.
 *
 * Matching is ASCII case-insensitive. The search copy of the input is
 * produced by lowercasing only `A`–`Z`, so it always has the same length as
 * the input and offsets found in it can be used to slice the original.
 * (`String.prototype.toLowerCase` does not guarantee this: U+0130 lowercases
 * to two code units, which would shift every later offset.)
 *
 * Edge cases:
 * - An opener whose closing tag is missing (or whose closing tag has no `>`)
 *   is treated as running to the end of the input: the remainder is dropped
 *   and replaced by one space.
 * - An opener that itself has no terminating `>` is copied to the output
 *   unchanged together with the rest of the input.
 *
 * @param html - Raw HTML possibly containing `<script>` blocks.
 * @returns HTML with every `<script>…</script>` block replaced by a space.
 */
export function stripScriptBlocks(html: string): string {
  const OPEN = '<script';
  const CLOSE = '</script';
  // Length-preserving ASCII case fold; see the doc comment for why
  // `html.toLowerCase()` must not be used to compute offsets into `html`.
  const lower = html.replace(/[A-Z]/g, (ch) => ch.toLowerCase());
  let result = '';
  let pos = 0;

  while (pos < html.length) {
    const openIdx = lower.indexOf(OPEN, pos);
    if (openIdx < 0) {
      // No further script opener: keep the tail as is.
      result += html.slice(pos);
      break;
    }
    result += html.slice(pos, openIdx);

    const openEnd = html.indexOf('>', openIdx);
    if (openEnd < 0) {
      // NOTE(review): an opener without a closing `>` is preserved verbatim.
      // Confirm this is intended if the result is later concatenated with
      // other markup.
      result += html.slice(openIdx);
      break;
    }

    const closeIdx = lower.indexOf(CLOSE, openEnd + 1);
    if (closeIdx < 0) {
      // Unclosed block: discard everything from the opener onwards.
      result += ' ';
      break;
    }

    const closeEnd = html.indexOf('>', closeIdx);
    if (closeEnd < 0) {
      // Closing tag never terminated: discard the rest as well.
      result += ' ';
      break;
    }

    result += ' ';
    pos = closeEnd + 1;
  }
  return result;
}