Press n or j to go to the next uncovered block, b, p or k for the previous block.
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
/**
* @module Aggregator/Metadata/KeywordFilters
* @description Cross-site keyword catalogue and noise-token filter used
* by {@link buildSeoKeywords} in `resolve-helpers.ts`.
*
* Two responsibilities:
*
* 1. **Always-on cross-site keywords** ({@link CROSS_SITE_KEYWORDS})
* are prepended to every article's `<meta name="keywords">` list
* regardless of language, so search-engine discovery of the
* Hack23 civic-tech portfolio (EU Parliament Monitor +
* Riksdagsmonitor + CIA) is consistent across all 14 localized
* surfaces. The user explicitly requested
* `riksdagsmonitor, political intelligence, riksdag, regeringen`
* (the sister Swedish-Parliament project) plus EP analogues.
*
* 2. **Noise-token rejection** ({@link isNoiseKeywordToken}) drops
* the UUID-fragment tokens (`77fc920c`, `3a76`, `9db5`, …) and
* synthetic run-id slugs (`propositions-run261-1779431162`) that
* the previous keyword extractor leaked into `<head>` when a
* brief mentioned its own run id editorially (e.g.
* `Analysis run 77fc920c-3a76-4813-9db5-43a7e9acc25e returned
* 0 classified actors`).
*
* Pure leaf module — no imports.
*/
/**
 * Keywords that are prepended to the keyword list of every article,
 * independent of the article's language.
 *
 * The sequence is significant: the portfolio-level civic-tech terms are
 * listed first so that they stay ahead of the per-article-type keywords
 * once the 16-entry keyword budget is exceeded.
 */
export const CROSS_SITE_KEYWORDS: ReadonlyArray<string> = [
  // European Parliament project terms.
  'EU Parliament Monitor', 'European Parliament', 'European Commission',
  // Shared portfolio term.
  'political intelligence',
  // Sister Swedish-Parliament project terms.
  'Riksdagsmonitor', 'Riksdag', 'Regeringen',
];
/**
 * Lower-case English words spelled only with the letters `a`-`f`, and
 * which therefore have the same shape as a hexadecimal token
 * (`face`, `dead`, `beef`, ...). The noise filter consults this set so
 * that such words are kept as keywords.
 *
 * The list is deliberately small to avoid lexicon drift.
 *
 * NOTE(review): `isNoiseKeywordToken` rejects every token shorter than
 * four characters before it looks at this set, so the three-letter
 * entries below are currently never matched.
 */
const HEX_ALPHABETIC_ALLOWLIST = new Set<string>([
  // Four-letter words.
  'face', 'fade', 'dead', 'beef', 'cafe', 'feed', 'deed',
  // Three-letter words.
  'fed', 'add', 'dad', 'bad',
]);
/**
 * Detect run-id slug chains of the form
 * `<letters>(-<letters>)*-run<digits>(-<digits>)*` — e.g.
 * `propositions-run261-1779431162`, `breaking-news-run17-1234567890`
 * or `propositions-run261`.
 *
 * Implemented by splitting on `-` and checking each segment with a
 * simple anchored pattern (instead of a single backtracking regex) to
 * satisfy the `security/detect-unsafe-regex` lint rule.
 *
 * The first segment that looks like `run<digits>` is taken as the pivot:
 * - at least one segment must precede it, and every preceding segment
 *   must consist solely of the letters `a`-`z`;
 * - every following segment (there may be none) must consist solely of
 *   digits.
 *
 * The caller is expected to pass an already lower-cased token; upper-case
 * letters in the leading segments do not match.
 *
 * @param lower - Lower-case candidate token
 * @returns `true` when the token matches the run-id slug shape
 */
function isRunSlugChain(lower: string): boolean {
  const segments = lower.split('-');
  if (segments.length < 2) return false;

  // -1 means no `run<digits>` segment at all; 0 means nothing precedes it.
  // Either way the token is not a `<type>-run<digits>…` chain.
  const runIndex = segments.findIndex((segment) => /^run\d+$/u.test(segment));
  if (runIndex <= 0) return false;

  const lettersBeforeRun = segments
    .slice(0, runIndex)
    .every((segment) => /^[a-z]+$/u.test(segment));
  if (!lettersBeforeRun) return false;

  // `every` on an empty tail is `true`, so `<type>-run<digits>` alone matches.
  return segments.slice(runIndex + 1).every((segment) => /^\d+$/u.test(segment));
}
/**
 * Decide whether a single keyword token should be discarded as noise.
 *
 * The token is trimmed and lower-cased, then rejected (`true`) when it:
 *
 * - Is empty, or shorter than 4 characters after trimming.
 * - Is mostly digits (≥80 % digit characters, which includes all-digit
 *   tokens) — runtime epoch suffixes such as `1779431162` and
 *   committee-code-like mashes such as `2024k1234`.
 * - Is a bare `run<digits>` slug (`run261`, `run17`), the per-run suffix
 *   the aggregator stamps onto run ids.
 * - Is a run-id slug chain `<letters>(-<letters>)*-run<digits>(-<digits>)*`
 *   such as `propositions-run261-1779431162` (see `isRunSlugChain`).
 * - Looks like a UUID hex chunk: consists solely of the `[0-9a-f]`
 *   alphabet and either contains at least one digit or is 8 or more
 *   characters long (a real word of that length spelled only with hex
 *   letters is vanishingly rare).
 *
 * Short all-letter hex-shaped words (`dead`, `face`, `beef`, ...) and all
 * other vocabulary return `false`, so the keyword list stays useful —
 * every reject path is intentionally narrow.
 *
 * @param token - Single token candidate
 * @returns `true` when the token should be dropped from keywords
 */
export function isNoiseKeywordToken(token: string): boolean {
  if (!token) return true;
  const trimmed = token.trim();
  if (trimmed.length < 4) return true;
  const lower = trimmed.toLowerCase();

  // Reject pure-digit and digit-dominated tokens.
  if (/^\d+$/u.test(lower)) return true;
  const digitCount = (lower.match(/\d/gu) ?? []).length;
  if (digitCount > 0 && digitCount / lower.length >= 0.8) return true;

  // Reject `run<digits>` slugs and `…-run<digits>-<digits>` chains.
  if (/^run\d+$/u.test(lower)) return true;
  if (isRunSlugChain(lower)) return true;

  // Anything that is not purely hex-shaped is ordinary vocabulary.
  if (!/^[0-9a-f]+$/u.test(lower)) return false;

  // Hex-shaped: long tokens and tokens containing a digit are UUID debris.
  if (lower.length >= 8) return true;
  if (digitCount > 0) return true;

  // Short, all-letter hex-shaped token. Allowlisted English words are kept
  // explicitly; every other such token is kept too (avoids overfitting),
  // so both paths currently return `false`.
  if (HEX_ALPHABETIC_ALLOWLIST.has(lower)) return false;
  return false;
}
|