metadata keyword-filters.ts

93.61% Statements 44/47
85% Branches 34/40
100% Functions 2/2
100% Lines 31/31
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313778x
313778x
2x
2x
4x
2x
2x
 
 
2x
 
 
2x
2x
 
2x
2x
 
2x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341971x
341970x
341970x
341969x
 
 
341969x
314380x
341971x
 
 
313779x
313778x
 
 
313776x
313776x
455x
371x
263x
 
4x
 
 
313321x
 
  // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Aggregator/Metadata/KeywordFilters
 * @description Cross-site keyword catalogue and noise-token filter used
 * by {@link buildSeoKeywords} in `resolve-helpers.ts`.
 *
 * Two responsibilities:
 *
 *   1. **Always-on cross-site keywords** ({@link CROSS_SITE_KEYWORDS})
 *      are prepended to every article's `<meta name="keywords">` list
 *      regardless of language, so search-engine discovery of the
 *      Hack23 civic-tech portfolio (EU Parliament Monitor +
 *      Riksdagsmonitor + CIA) is consistent across all 14 localized
 *      surfaces. The user explicitly requested
 *      `riksdagsmonitor, political intelligence, riksdag, regeringen`
 *      (the sister Swedish-Parliament project) plus EP analogues.
 *
 *   2. **Noise-token rejection** ({@link isNoiseKeywordToken}) drops
 *      the UUID-fragment tokens (`77fc920c`, `3a76`, `9db5`, …) and
 *      synthetic run-id slugs (`propositions-run261-1779431162`) that
 *      the previous keyword extractor leaked into `<head>` when a
 *      brief mentioned its own run id editorially (e.g.
 *      `Analysis run 77fc920c-3a76-4813-9db5-43a7e9acc25e returned
 *      0 classified actors`).
 *
 * Pure leaf module — no imports.
 */
 
/**
 * Cross-site SEO keywords prepended to every article in every
 * language.
 *
 * Order is meaningful: the stronger civic-tech-portfolio terms come
 * first so that they appear ahead of the per-article-type keywords
 * when the 16-entry keyword budget is exceeded. Append new entries in
 * descending priority rather than alphabetically.
 *
 * The array is typed `readonly` so consumers cannot mutate the shared
 * list through this binding (compile-time only; it is not frozen at
 * runtime).
 */
export const CROSS_SITE_KEYWORDS: readonly string[] = [
  'EU Parliament Monitor',
  'European Parliament',
  'European Commission',
  'political intelligence',
  'Riksdagsmonitor',
  'Riksdag',
  'Regeringen',
];
 
/**
 * Lower-case English words spelled entirely with the hex letters
 * `a`–`f` (e.g. `face`, `dead`, `beef`). The noise filter consults
 * this set so that such words are kept even though their shape
 * matches the "looks like a hex token" heuristic.
 *
 * Deliberately tiny to avoid lexicon drift — add a word only when a
 * real keyword is observed being dropped.
 *
 * NOTE(review): the three-letter entries (`fed`, `add`, `dad`, `bad`)
 * are currently never consulted, because `isNoiseKeywordToken`
 * rejects every token shorter than four characters before it reaches
 * the hex check.
 */
const HEX_ALPHABETIC_ALLOWLIST: ReadonlySet<string> = new Set([
  // four-letter words
  'face',
  'fade',
  'dead',
  'beef',
  'cafe',
  'feed',
  'deed',
  // three-letter words
  'fed',
  'add',
  'dad',
  'bad',
]);
 
/**
 * Detect run-id slug chains of the form
 * `<letters>(-<letters>)*-run<digits>(-<digits>)*` — e.g.
 * `propositions-run261-1779431162` or
 * `breaking-news-run17-1234567890`.
 *
 * Implemented as a split-and-scan walker (instead of a single
 * backtracking regex) to satisfy the `security/detect-unsafe-regex`
 * lint rule; each per-segment pattern is a simple anchored character
 * class.
 *
 * Shape rules enforced:
 *
 *   - at least one segment must precede the `run<digits>` segment, so
 *     a token that *starts* with `run<digits>` is not a chain;
 *   - every segment before the first `run<digits>` is `[a-z]+`;
 *   - every segment after it (there may be none) is `\d+`.
 *
 * @param lower - Lower-case candidate token
 * @returns `true` when the token matches the run-id slug shape
 */
function isRunSlugChain(lower: string): boolean {
  const parts = lower.split('-');

  // The first `run<digits>` segment anchors the chain.
  const runIndex = parts.findIndex((part) => /^run\d+$/u.test(part));

  // -1: no run marker at all. 0: nothing precedes the marker, so the
  // mandatory `<letters>` prefix is missing. This also covers tokens
  // that contain no hyphen (a single segment can only sit at index 0).
  if (runIndex <= 0) return false;

  return parts.every((part, index) => {
    if (index === runIndex) return true;
    // Prefix segments are all-letters; suffix segments are all-digits.
    return index < runIndex ? /^[a-z]+$/u.test(part) : /^\d+$/u.test(part);
  });
}
 
/**
 * Decide whether a single keyword token is noise and should be
 * dropped from the keyword list.
 *
 * The token is trimmed and lower-cased, then rejected when it:
 *
 *   - is empty, or shorter than four characters after trimming;
 *   - consists only of digits, or is digit-dominated (≥80 % digit
 *     characters) — runtime epoch suffixes such as `1779431162` and
 *     committee-code-like mashes such as `2024k1234`;
 *   - is a bare `run<digits>` slug (`run261`, `run17`), the per-run
 *     suffix the aggregator stamps onto run ids;
 *   - is a full run-id chain such as
 *     `propositions-run261-1779431162`;
 *   - is spelled solely with `[0-9a-f]` and is either at least eight
 *     characters long or contains a digit — the shape of a UUID
 *     fragment (`77fc920c`, `3a76`, `9db5`).
 *
 * Short all-letter hex-shaped words (`dead`, `face`, `decade`) are
 * kept, as is all ordinary vocabulary: every reject path is
 * intentionally narrow.
 *
 * @param token - Single token candidate
 * @returns `true` when the token should be dropped from keywords
 */
export function isNoiseKeywordToken(token: string): boolean {
  if (!token) return true;

  const candidate = token.trim();
  if (candidate.length < 4) return true;
  const normalized = candidate.toLowerCase();

  // Digit census: strip every non-digit and measure what is left.
  const digitTotal = normalized.replace(/\D/gu, '').length;
  const digitShare = digitTotal / normalized.length;
  const isDigitNoise =
    /^\d+$/u.test(normalized) || (digitTotal > 0 && digitShare >= 0.8);
  if (isDigitNoise) return true;

  // Bare `run<digits>` slugs and `…-run<digits>-<digits>` chains.
  if (/^run\d+$/u.test(normalized) || isRunSlugChain(normalized)) {
    return true;
  }

  // Everything below only concerns hex-shaped tokens.
  if (!/^[0-9a-f]+$/u.test(normalized)) return false;

  // Long, or digit-bearing, hex strings are UUID fragments.
  if (normalized.length >= 8 || digitTotal > 0) return true;

  // Short all-letter hex words: allowlisted common words are kept, and
  // so is anything else of this shape (avoids overfitting).
  if (HEX_ALPHABETIC_ALLOWLIST.has(normalized)) return false;
  return false;
}