utils html-sanitize.ts

100% Statements 41/41
100% Branches 12/12
100% Functions 2/2
100% Lines 41/41
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1023x
1023x
 
1023x
40706x
40706x
229x
229x
 
40477x
40477x
40477x
2x
2x
 
40475x
40475x
 
1023x
 
 
 
 
 
 
 
 
 
 
 
 
30x
30x
30x
30x
30x
 
30x
34x
34x
15x
15x
 
19x
19x
19x
1x
1x
 
18x
18x
1x
1x
 
17x
17x
1x
1x
 
16x
16x
 
30x
 
  // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Utils/HtmlSanitize
 * @description Shared HTML sanitization helpers used across the generation,
 * validation, and quality-scoring pipelines.
 */
 
/*
 * Note on `stripScriptBlocks` (defined later in this module): it removes all
 * `<script>…</script>` blocks from an HTML string, replacing each with a
 * single space.
 *
 * It uses iterative index-based scanning instead of a single-pass regex so
 * that CodeQL does not flag the pattern as an insecure HTML tag filter
 * (`js/bad-tag-filter`).
 */
/**
 * Replace every `<…>` span in a string with a single space, leaving the
 * surrounding text untouched.
 *
 * The input is walked with `indexOf` rather than matched with a regular
 * expression, which keeps the scan free of regex backtracking (CodeQL
 * `js/polynomial-redos`).
 *
 * A `<` that is never followed by a `>` is not treated as a tag: it and
 * everything after it are copied to the output verbatim.
 *
 * @param html - HTML string to strip
 * @returns The input text with each tag replaced by one space
 */
export function stripHtmlTags(html: string): string {
  const chunks: string[] = [];
  let cursor = 0;

  while (cursor < html.length) {
    const tagStart = html.indexOf('<', cursor);
    // Only look for the closing bracket when an opening bracket exists.
    const tagEnd = tagStart === -1 ? -1 : html.indexOf('>', tagStart + 1);

    if (tagEnd === -1) {
      // No complete tag remains: keep the rest of the input as-is.
      chunks.push(html.slice(cursor));
      break;
    }

    // Text before the tag, then one space standing in for the tag itself.
    chunks.push(html.slice(cursor, tagStart), ' ');
    cursor = tagEnd + 1;
  }

  return chunks.join('');
}
 
/**
 * Strip every `<script>…</script>` block (tags and contents) from an HTML
 * string, replacing each block with a single space. Matching is
 * ASCII-case-insensitive and done with a linear `indexOf` scan.
 *
 * Intended as a defence against XSS payloads smuggled into AI-generated
 * article fragments before the markdown is rendered into the final article
 * HTML.
 *
 * Edge cases:
 * - A `<script…>` whose closing `</script…>` is missing or unterminated has
 *   everything from the opening tag to the end of input replaced by one space.
 * - A `<script` with no `>` anywhere after it is copied to the output
 *   unchanged, together with the rest of the input.
 * - The match is a plain prefix match, so tags whose name merely starts with
 *   `script` are treated as script tags too.
 *
 * @param html - Raw HTML possibly containing `<script>` blocks.
 * @returns HTML with every `<script>…</script>` block replaced by a space.
 */
export function stripScriptBlocks(html: string): string {
  const OPEN = '<script';
  const CLOSE = '</script';
  let result = '';
  let pos = 0;

  // Fold only ASCII A–Z. `String.prototype.toLowerCase()` can change the
  // string length (e.g. U+0130 'İ' becomes two code units), which would make
  // offsets found in the folded copy point at the wrong place in `html`.
  // ASCII folding keeps both strings the same length.
  const lower = html.replace(/[A-Z]/g, (ch) => ch.toLowerCase());

  while (pos < html.length) {
    const openIdx = lower.indexOf(OPEN, pos);
    if (openIdx < 0) {
      // No further script tags: keep the remainder.
      result += html.slice(pos);
      break;
    }
    result += html.slice(pos, openIdx);

    const openEnd = html.indexOf('>', openIdx);
    if (openEnd < 0) {
      // NOTE(review): an opening tag that is never closed with '>' is passed
      // through verbatim. Confirm callers never append markup after this
      // output, otherwise a later '>' could complete the tag.
      result += html.slice(openIdx);
      break;
    }

    const closeIdx = lower.indexOf(CLOSE, openEnd + 1);
    if (closeIdx < 0) {
      // Unclosed script block: drop everything from the opening tag onwards.
      result += ' ';
      break;
    }

    const closeEnd = html.indexOf('>', closeIdx);
    if (closeEnd < 0) {
      // Closing tag lacks its '>': treat the block as running to end of input.
      result += ' ';
      break;
    }

    result += ' ';
    pos = closeEnd + 1;
  }
  return result;
}