// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
/**
 * @module Aggregator/Metadata/ResolveScriptUtils
 * @description Script-aware SEO helpers extracted from
 * `per-language-resolver.ts` to keep that module below the 600-line
 * drift-guard cap (see `test/unit/source-file-size.test.js`).
 *
 * This leaf module owns the locale-script probes and the small,
 * pure decision helpers that depend on them:
 *
 * - {@link contentMatchesLocaleScript} — does the copy carry a glyph in
 *   the locale's expected script family?
 * - {@link shouldEnrichDescription} — should the raw description be
 *   pushed through the localized enrichment path? (Gate 4b)
 * - {@link pickResolvedTitle} — pick the `<title>` from the candidate
 *   ladder, skipping summary-derived titles that carry no locale-script
 *   glyph for non-Latin locales (Gate 4a).
 * - {@link pickResolvedTitleCandidate} — gate the H1/document-derived
 *   title candidate.
 * - {@link composeNonLatinDistinctiveTitle} — build a content-differentiated
 *   mixed-script title (localized label + English topic fragment) for
 *   non-Latin locales.
 * - {@link appendRunNumberSuffix} — preserved no-op for backward
 *   compatibility.
 *
 * Pure, leaf module: no I/O. Imports only from the sibling SEO leaf
 * modules (`seo-budgets`, `resolve-helpers`, `title-rejection`,
 * `text-utils`).
 */
import type { LanguageCode } from '../../types/index.js';
import { budgetFor, classifyScript } from './seo-budgets.js';
import {
deriveHeadlineFromSummary,
hasLeakySeoToken,
isUsableResolvedTitle,
pickFirstNonEmpty,
sanitizeTitleCandidate,
} from './resolve-helpers.js';
import { findTitleRejectionReason } from './title-rejection.js';
import { ENRICHMENT_TRIGGER_LENGTH, truncateTitle } from './text-utils.js';
/**
 * Unicode glyph probes used to detect whether resolved SEO copy actually
 * matches the publishing locale's expected script.
 *
 * None of the patterns carries the `g` or `y` flag, so `.test()` keeps no
 * `lastIndex` state and the shared instances are safe to reuse.
 *
 * Within this module, {@link contentMatchesLocaleScript} consumes the CJK
 * and RTL probes and {@link shouldEnrichDescription} consumes the
 * ASCII-only probe.
 */

/**
 * Matches a single CJK-family glyph:
 *
 * - U+3040–U+30FF — Hiragana and Katakana (Japanese)
 * - U+3400–U+9FFF — CJK Unified Ideographs Extension A through the main
 *   CJK Unified Ideographs block (Chinese + Japanese kanji); the span also
 *   includes the Yijing hexagram symbols at U+4DC0–U+4DFF
 * - U+AC00–U+D7AF — Hangul Syllables (Korean)
 */
export const CJK_GLYPH_RE = /[\u3040-\u30FF\u3400-\u9FFF\uAC00-\uD7AF]/u;
/**
 * Matches a single right-to-left glyph: Hebrew (U+0590–U+05FF) or the base
 * Arabic block (U+0600–U+06FF). The Arabic Supplement / Extended blocks and
 * the presentation-form blocks are NOT covered by this range.
 */
export const RTL_GLYPH_RE = /[\u0590-\u05FF\u0600-\u06FF]/u;
/**
 * Matches strings made up exclusively of 7-bit ASCII code points
 * (U+0000–U+007F). The `*` quantifier means the empty string also matches.
 */
// eslint-disable-next-line no-control-regex
export const ASCII_ONLY_RE = /^[\x00-\x7F]*$/u;
/**
 * Report whether `text` carries at least one glyph from the script family
 * that `classifyScript` assigns to `lang`.
 *
 * Latin locales short-circuit to `true` without inspecting `text`. The
 * `'cjk'` family is probed with {@link CJK_GLYPH_RE}; every other family
 * falls through to {@link RTL_GLYPH_RE}.
 *
 * @param text - SEO copy under inspection
 * @param lang - Publishing locale
 * @returns True when `text` contains a glyph of the locale's script family
 */
export function contentMatchesLocaleScript(text: string, lang: LanguageCode): boolean {
  switch (classifyScript(lang)) {
    case 'latin':
      // Latin copy is accepted as-is; no glyph probe is run.
      return true;
    case 'cjk':
      return CJK_GLYPH_RE.test(text);
    default:
      // Any family that is neither Latin nor CJK is probed as RTL.
      return RTL_GLYPH_RE.test(text);
  }
}
/**
 * Backward-compatibility shim that hands the title back untouched.
 *
 * Workflow run identifiers are not allowed in reader-facing article
 * titles, so no suffix is ever appended. The three-argument signature is
 * retained only so existing call sites keep compiling.
 *
 * @param seoTitle - SEO title (returned unchanged)
 * @param _lang - Language code (ignored)
 * @param _runId - Manifest run identifier (ignored)
 * @returns `seoTitle`, exactly as received
 */
export function appendRunNumberSuffix(
  seoTitle: string,
  _lang: LanguageCode,
  _runId: string
): string {
  // Deliberate pass-through: `_lang` and `_runId` are accepted but unused.
  return seoTitle;
}
/**
 * Decide whether the raw description must go through the localized
 * enrichment path (`composeContextualDescription`).
 *
 * Enrichment is requested in two situations:
 *
 * 1. The raw description is shorter than {@link ENRICHMENT_TRIGGER_LENGTH}
 *    (the historical "too short" gate). This applies to every locale.
 * 2. The locale is non-Latin and the raw description is pure ASCII — the
 *    English-fallback failure mode covered by Gate 4b in
 *    `executive-brief-seo-extraction.test.js`, where an English summary
 *    long enough to clear the length trigger would otherwise ship
 *    unlocalized on an ar/he/CJK page.
 *
 * @param rawDescription - Composed description before enrichment
 * @param lang - Publishing locale
 * @returns True when enrichment must run
 */
export function shouldEnrichDescription(rawDescription: string, lang: LanguageCode): boolean {
  const isTooShort = rawDescription.length < ENRICHMENT_TRIGGER_LENGTH;
  if (isTooShort) return true;
  // `'en'` is ruled out before `classifyScript` is consulted at all.
  const isNonLatinLocale = lang !== 'en' && classifyScript(lang) !== 'latin';
  return isNonLatinLocale && ASCII_ONLY_RE.test(rawDescription);
}
/**
 * Pick the SEO `<title>` from the candidate ladder.
 *
 * The ladder handed to `pickFirstNonEmpty` is, in priority order: explicit
 * title, resolved (H1/document) candidate, summary-derived title, the
 * truncated contextual fallback, and finally the raw contextual fallback.
 *
 * The summary-derived rung is blanked unless it is non-empty, passes
 * `isUsableResolvedTitle` with `allowFullSentence: true`, and — for
 * non-Latin locales — carries at least one glyph of the locale's script.
 * That last condition stops an English summary-derived title (e.g.
 * `*Q1 2026 is the master-synthe`) from leaking onto a CJK / RTL page
 * (Gate 4a in `executive-brief-seo-extraction.test.js`).
 *
 * @param lang - Publishing locale
 * @param candidates - Title candidate inputs (in priority order)
 * @param candidates.explicitTitle - Manifest operator override title
 * @param candidates.resolvedTitleCandidate - H1/document-derived title
 * @param candidates.summaryDerivedTitle - Summary-first-sentence title
 * @param candidates.contextualFallback - Final fallback title
 * @returns The entry `pickFirstNonEmpty` selects from the ladder
 */
export function pickResolvedTitle(
  lang: LanguageCode,
  candidates: {
    readonly explicitTitle: string;
    readonly resolvedTitleCandidate: string;
    readonly summaryDerivedTitle: string;
    readonly contextualFallback: string;
  }
): string {
  const { explicitTitle, resolvedTitleCandidate, summaryDerivedTitle, contextualFallback } =
    candidates;
  const scriptFamily = classifyScript(lang);

  // The summary rung stays empty unless every gate below passes.
  let summaryRung = '';
  if (
    summaryDerivedTitle &&
    isUsableResolvedTitle(summaryDerivedTitle, { allowFullSentence: true }) &&
    (scriptFamily === 'latin' || contentMatchesLocaleScript(summaryDerivedTitle, lang))
  ) {
    summaryRung = summaryDerivedTitle;
  }

  return pickFirstNonEmpty([
    explicitTitle,
    resolvedTitleCandidate,
    summaryRung,
    truncateTitle(contextualFallback),
    contextualFallback,
  ]);
}
/**
 * Gate `clippedTitle` as the resolved title candidate, returning it only
 * when no rejection rule fires. Kept separate from `resolveOneLanguage` so
 * that function stays under the SonarJS cognitive-complexity threshold (15).
 *
 * Rejection rules, evaluated in order with short-circuiting:
 *
 * 1. the editorial headline was contaminated, or the title is empty;
 * 2. the title contains a leaky SEO token (`hasLeakySeoToken`);
 * 3. the locale is non-Latin and the title has no locale-script glyph;
 * 4. short titles are not allowed and `isUsableResolvedTitle` rejects it.
 *
 * @param args - Title candidate inputs
 * @param args.clippedTitle - The truncated editorial/manifest title to evaluate
 * @param args.headlineWasContaminated - True when the editorial headline was rejected by sanitize
 * @param args.nonLatinFamily - True for CJK/RTL locales requiring locale-script glyphs
 * @param args.allowShortResolvedTitle - True when the source is a localized brief
 * @param args.lang - Target language code
 * @returns The clipped title when usable, '' otherwise
 */
export function pickResolvedTitleCandidate(args: {
  clippedTitle: string;
  headlineWasContaminated: boolean;
  nonLatinFamily: boolean;
  allowShortResolvedTitle: boolean;
  lang: LanguageCode;
}): string {
  const title = args.clippedTitle;
  const isRejected =
    args.headlineWasContaminated ||
    !title ||
    hasLeakySeoToken(title) ||
    (args.nonLatinFamily && !contentMatchesLocaleScript(title, args.lang)) ||
    (!args.allowShortResolvedTitle && !isUsableResolvedTitle(title));
  return isRejected ? '' : title;
}
/**
 * Minimum number of characters that must remain after the localized base
 * label + separator before a distinctive English topic fragment is worth
 * splicing in. Below this the fragment would be a single clipped word and
 * read as noise rather than a differentiator.
 *
 * Compared against `budget - prefix glyph count` in
 * `composeNonLatinDistinctiveTitle`.
 */
const MIN_DISTINCTIVE_FRAGMENT_ROOM = 12;
/**
 * Minimum length of a distinctive English topic fragment (post-clamp).
 * `buildDistinctiveFragment` measures it in code points, after trailing
 * punctuation has been stripped.
 */
const MIN_DISTINCTIVE_FRAGMENT_CHARS = 6;
/**
 * Leading filler / linking words that, when they open a salvaged English
 * fragment, leave it reading as a mid-sentence clause (`is a T-2 …`)
 * rather than a headline. Stripped from the front of a candidate before it
 * is spliced into a localized title.
 *
 * Entries must be lowercase: `dropLeadingFiller` lower-cases each token
 * before the lookup.
 */
const LEADING_FILLER_WORDS: ReadonlySet<string> = new Set([
  'a',
  'an',
  'and',
  'as',
  'at',
  'but',
  'by',
  'for',
  'from',
  'in',
  'is',
  'its',
  'of',
  'on',
  'or',
  'that',
  'the',
  'to',
  'which',
  'with',
]);
/**
 * Isolate the leading label of a localized template title.
 *
 * Template titles look like `"<localized label>: <qualifier>"`, where the
 * qualifier is an English topic tail (`"Main Committees"`) or an ISO date
 * (`"2026-04-13"`). Everything before the first ASCII colon is returned,
 * trimmed; a title without any ASCII colon is returned whole (trimmed).
 *
 * @param templateTitle - Localized template title
 * @returns The trimmed leading clause; '' when nothing precedes the colon
 */
function extractLocaleBaseLabel(templateTitle: string): string {
  // `split` with a limit of 1 yields only the text before the first colon,
  // or the entire string when there is no colon.
  const [leadingClause = ''] = templateTitle.split(':', 1);
  return leadingClause.trim();
}
/**
 * Strip filler / linking words from the front of a tokenized fragment so
 * it opens on a content word (`is a T-2 …` → `T-2 …`).
 *
 * Matching is case-insensitive against {@link LEADING_FILLER_WORDS}. The
 * scan stops before the final token, so a non-empty input always keeps at
 * least one word even when every token is filler.
 *
 * @param words - Tokenized fragment words
 * @returns A new array starting at the first retained word
 */
function dropLeadingFiller(words: readonly string[]): string[] {
  let firstKept = 0;
  // Never advance past the last token: it is retained unconditionally.
  for (; firstKept < words.length - 1; firstKept += 1) {
    const token = words[firstKept] ?? '';
    if (!LEADING_FILLER_WORDS.has(token.toLowerCase())) break;
  }
  return words.slice(firstKept);
}
/**
 * Salvage a short, reader-facing English topic fragment from a raw English
 * editorial candidate (headline / summary / extended summary).
 *
 * Pipeline:
 *
 * 1. remove parentheticals and collapse the whitespace they leave behind;
 * 2. run the text through `deriveHeadlineFromSummary` and then
 *    `sanitizeTitleCandidate`;
 * 3. drop leading filler words so the fragment opens on a content word;
 * 4. keep whole words while the joined result fits in `room` code points;
 * 5. strip trailing whitespace / punctuation so the spliced title trips
 *    neither the `sentence-fragment` nor the `ellipsis-cut` rejection.
 *
 * @param rawEnglish - Raw English editorial text
 * @param room - Glyph budget available after the localized prefix
 * @returns The fragment, or '' when the input is empty, sanitization yields
 *   nothing, or the result is shorter than
 *   {@link MIN_DISTINCTIVE_FRAGMENT_CHARS} code points
 */
function buildDistinctiveFragment(rawEnglish: string, room: number): string {
  if (!rawEnglish) return '';

  // Parentheticals (`(T-0 = 15 April …)`, `(second motions-track …)`) go
  // first so they can neither leak run tokens nor leave the fragment
  // opening on an unbalanced bracket.
  const withoutParens = rawEnglish.replace(/\([^)]*\)/gu, ' ').replace(/\s{2,}/gu, ' ');
  const headline = sanitizeTitleCandidate(deriveHeadlineFromSummary(withoutParens));
  if (!headline) return '';

  const tokens = dropLeadingFiller(headline.split(/\s+/u).filter(Boolean));

  // Word-level clamp: stop at the first token that would overflow `room`.
  const kept: string[] = [];
  for (const token of tokens) {
    const widened = [...kept, token].join(' ');
    if ([...widened].length > room) break;
    kept.push(token);
  }

  // Trailing punctuation would make the splice read as a sentence or a cut.
  const fragment = kept
    .join(' ')
    .replace(/[\s.,;:!?\u2026\u2013\u2014-]+$/u, '')
    .trim();

  return [...fragment].length < MIN_DISTINCTIVE_FRAGMENT_CHARS ? '' : fragment;
}
/**
 * Compose a content-differentiated `<title>` for a non-Latin locale that
 * would otherwise fall back to the date-suffixed template title.
 *
 * Two same-date / same-articleType runs whose English brief differs but
 * whose localized template title is identical (e.g. two `2026-04-13`
 * `motions` re-runs) collide on the SERP. Run-number / "Edition N"
 * disambiguators are forbidden in titles, so the differentiator must be
 * content-based: we reuse the localized template's leading locale-script
 * clause as the prefix and splice in a short distinctive English topic
 * fragment derived from the run's own English headline / summary. The
 * resulting mixed-script title still carries a locale glyph (Gate 4a) and
 * mirrors the existing `"<localized label>: Main Committees"` precedent.
 *
 * Returns '' (caller keeps the date-suffixed fallback) whenever the locale
 * is Latin, the base label lacks a locale glyph, the per-script title
 * budget leaves no room for a meaningful fragment, or no candidate yields a
 * clean, leak-free, budget-fitting fragment.
 *
 * @param args - Composition inputs
 * @param args.lang - Publishing locale
 * @param args.templateTitle - Localized template title (prefix source)
 * @param args.englishCandidates - Raw English editorial texts (priority order)
 * @returns A distinctive mixed-script title, or '' when none can be built
 */
export function composeNonLatinDistinctiveTitle(args: {
  readonly lang: LanguageCode;
  readonly templateTitle: string;
  readonly englishCandidates: readonly string[];
}): string {
  const { lang, templateTitle, englishCandidates } = args;

  // Latin locales never need a mixed-script title.
  if (classifyScript(lang) === 'latin') return '';

  // The prefix must itself carry a locale-script glyph, otherwise the
  // composed title would be all-English.
  const baseLabel = extractLocaleBaseLabel(templateTitle);
  if (!baseLabel || !contentMatchesLocaleScript(baseLabel, lang)) return '';

  // Budget and room are measured in code points, not UTF-16 units.
  const budget = budgetFor(lang, 'title');
  const prefix = `${baseLabel}: `;
  const room = budget - [...prefix].length;
  if (room < MIN_DISTINCTIVE_FRAGMENT_ROOM) return '';

  // First candidate (in priority order) that survives every gate wins.
  for (const candidate of englishCandidates) {
    const fragment = buildDistinctiveFragment(candidate, room);
    if (!fragment) continue;
    const composed = `${prefix}${fragment}`;
    // Defensive re-checks on the fully spliced title.
    if ([...composed].length > budget) continue;
    if (hasLeakySeoToken(composed)) continue;
    if (findTitleRejectionReason(composed)) continue;
    if (!contentMatchesLocaleScript(composed, lang)) continue;
    return composed;
  }
  return '';
}