Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 | 15x 4219x 3712x 2961x 15x 2395x 2395x 15x 15x 15x 1158x 1158x 7216x 7216x 587x 571x 1789x 1789x 1789x 1158x 1158x 1158x 1158x 1789x 1789x 587x 587x 571x 310x 310x 310x 310x 261x 261x 270x 270x 270x 270x 270x 270x 270x 270x 270x 80x 190x 21x 169x | // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
/**
* @module Aggregator/Metadata/SeoBudgets
* @description Per-script SEO byte budgets and a script-aware clamp.
*
* Background. Google Search Central and Bing Webmaster Guidelines both
* document SERP snippet limits in **pixels**, not characters. Latin
* glyphs render at roughly half the pixel width of CJK glyphs, while
* Arabic/Hebrew letterforms sit between the two. A single `length`
* budget for `<title>` / `<meta description>` will always be wrong for
* at least one of the 14 publishing languages — typically over-truncating
* Latin copy and over-running CJK by a factor of two.
*
* This module provides:
*
* - {@link classifyScript} — three-way `latin | cjk | rtl` family
* classifier driven by the locale code (no glyph inspection — the
* BCP-47 language tag is authoritative because every publishing
* pipeline emits one full output per language).
* - {@link SEO_BUDGETS} — per-surface × per-script byte caps derived
* from the documented platform envelopes (Google ≤580 px title /
* ≤155 char description; Bing slightly more generous; Facebook ≤95
* chars on `og:title`; Twitter ≤70 / ≤200; LinkedIn shares OG).
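* - {@link budgetFor} — typed accessor returning the cap for a
* `(lang, surface)` pair. Locales that {@link classifyScript} does not
* recognise fall back to the Latin column, whose cap is never smaller
* than the CJK or RTL cap of the same row (i.e. the most generous
* budget, not the strictest). Caps are compared against `String.length`.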
* - {@link clampForBudget} — script-aware truncator that prefers
* natural clause boundaries (CJK full-width punctuation, RTL
* sentence punctuation, Latin clause separators) before falling
* back to whitespace breaks. Returns the input verbatim when it
* already fits.
*
* Pure, leaf module. No I/O, no dependencies on other aggregator
* modules beyond the existing `text-utils.ts` clause-boundary
* vocabulary.
*/
import type { LanguageCode } from '../../types/index.js';
import { HEADLINE_CLAUSE_BOUNDARIES } from './text-utils.js';
// ────────────────────────────────────────────────────────────────────────
// Script-family classifier
// ────────────────────────────────────────────────────────────────────────
/**
 * Script family of a publishing locale; selects the column read from
 * {@link SEO_BUDGETS}.
 *
 * - `latin` — every locale that is not classified as `cjk` or `rtl`
 * - `cjk` — Chinese / Japanese / Korean (~2× Latin pixel width per glyph)
 * - `rtl` — Arabic / Hebrew (bidi + ligature handling)
 */
export type ScriptFamily =
  | 'latin'
  | 'cjk'
  | 'rtl';
/**
 * Every {@link ScriptFamily} value, in the fixed order
 * latin → cjk → rtl. Exported so that test matrices and downstream
 * tooling can iterate the columns of {@link SEO_BUDGETS} without
 * repeating the literal list.
 */
export const ALL_SCRIPT_FAMILIES: readonly ScriptFamily[] = [
  'latin',
  'cjk',
  'rtl',
] as const;
/**
 * Classify a locale code into a script family. The result is the
 * column key used to look up a cap in {@link SEO_BUDGETS}.
 *
 * Only the primary language subtag is inspected, case-insensitively,
 * so `zh`, `zh-CN`, `zh_Hant_TW` and `ZH` all classify as `cjk`.
 * Script subtags (e.g. `-Latn`) are not interpreted.
 *
 * @param lang - BCP-47 language tag (bare code or with script/region subtags)
 * @returns `rtl` for Arabic / Hebrew, `cjk` for Japanese / Korean /
 *   Chinese, `latin` for everything else (including unknown or empty input)
 */
export function classifyScript(lang: string): ScriptFamily {
  // BCP-47 tags are case-insensitive and subtags are '-'-separated;
  // '_' is accepted too because POSIX-style locale ids use it.
  const primary = lang.trim().toLowerCase().split(/[-_]/, 1)[0];
  switch (primary) {
    case 'ar':
    case 'he':
      return 'rtl';
    case 'ja':
    case 'ko':
    case 'zh':
      return 'cjk';
    default:
      return 'latin';
  }
}
// ────────────────────────────────────────────────────────────────────────
// Surface catalogue + per-script byte budgets
// ────────────────────────────────────────────────────────────────────────
/**
 * Names of the SEO surfaces that have a row in {@link SEO_BUDGETS}.
 * Each surface is truncated by at least one major search engine or
 * social platform.
 *
 * | Surface              | Where it is emitted / documented limit                  |
 * | -------------------- | ------------------------------------------------------- |
 * | `title`              | HTML `<title>` (Google ≤580 px ≈ 60 Latin / 30 CJK / 55 RTL) |
 * | `metaDescription`    | `<meta name="description">` (Google ≤~155 char)         |
 * | `ogTitle`            | Facebook / LinkedIn `og:title` (~95 Latin)              |
 * | `ogDescription`      | Facebook / LinkedIn `og:description` (~200 Latin)       |
 * | `twitterTitle`       | Twitter card title (≤70 Latin)                          |
 * | `twitterDescription` | Twitter card description (≤200 Latin)                   |
 * | `imageAlt`           | `og:image:alt` / social card alt text (≤125 Latin)      |
 * | `jsonLdHeadline`     | Schema.org `NewsArticle.headline` (Google ≤110)         |
 */
export type SeoSurface =
  | 'title' | 'metaDescription'
  | 'ogTitle' | 'ogDescription'
  | 'twitterTitle' | 'twitterDescription'
  | 'imageAlt'
  | 'jsonLdHeadline';
/**
 * Cap table indexed as `SEO_BUDGETS[surface][scriptFamily]`.
 *
 * The values are length caps: the clamp helpers in this module compare
 * them against `String.length`, not against an encoded byte count.
 * Per the original design notes they follow the narrower of the
 * Google / Bing / Facebook / Twitter documented envelopes, less a
 * ~5 % safety margin so a snippet sitting on the edge of the budget is
 * not truncated again by the rendering platform.
 *
 * `jsonLdHeadline` uses the same value in every column because the
 * Schema.org `NewsArticle.headline` limit (110) is a literal character
 * count that does not depend on script.
 *
 * NOTE(review): `twitterTitle` is the only script-dependent row where
 * `rtl` equals `latin`; every other such row has `rtl = latin − 5`.
 * Confirm that 70 is intentional.
 */
export const SEO_BUDGETS: Readonly<Record<SeoSurface, Readonly<Record<ScriptFamily, number>>>> = {
  // Search-engine result page
  title:              { latin: 60,  cjk: 30,  rtl: 55 },
  metaDescription:    { latin: 155, cjk: 78,  rtl: 150 },
  // Open Graph (Facebook / LinkedIn)
  ogTitle:            { latin: 95,  cjk: 47,  rtl: 90 },
  ogDescription:      { latin: 200, cjk: 100, rtl: 195 },
  // Twitter cards
  twitterTitle:       { latin: 70,  cjk: 35,  rtl: 70 },
  twitterDescription: { latin: 200, cjk: 100, rtl: 195 },
  // Social-card image alt text
  imageAlt:           { latin: 125, cjk: 60,  rtl: 120 },
  // Structured data — script-independent
  jsonLdHeadline:     { latin: 110, cjk: 110, rtl: 110 },
};
/**
 * Look up the cap for one `(lang, surface)` pair.
 *
 * The locale is first mapped to a script family via
 * {@link classifyScript}; that family selects the column of the
 * `surface` row in {@link SEO_BUDGETS}. Locales the classifier does
 * not recognise therefore receive the `latin` value.
 *
 * @param lang - Publishing locale
 * @param surface - SEO surface (see {@link SeoSurface})
 * @returns The cap stored in the table (a positive integer)
 */
export function budgetFor(lang: LanguageCode | string, surface: SeoSurface): number {
  const row = SEO_BUDGETS[surface];
  return row[classifyScript(lang)];
}
// ────────────────────────────────────────────────────────────────────────
// Script-aware truncator
// ────────────────────────────────────────────────────────────────────────
/**
 * CJK clause boundaries — the breakpoints CJK readers expect a snippet
 * to end at. Listed in preferred-break order: a sentence-final mark
 * beats a comma, which beats a semicolon / colon, a dash and finally
 * the middle dot.
 *
 * Both the full-width form and the ASCII form of `!`, `?`, `;` and `:`
 * are listed: CJK copy normally uses the full-width code points
 * (U+FF01, U+FF1F, U+FF1B, U+FF1A), and the ASCII forms alone would
 * never match them. The full-width forms are written as `\u` escapes
 * so they cannot be silently folded to ASCII by an editor or a
 * normalisation step.
 */
const CJK_CLAUSE_BOUNDARIES: readonly string[] = [
  // Sentence-final marks
  '。',
  '\uFF01', // full-width exclamation mark
  '\uFF1F', // full-width question mark
  '!',
  '?',
  // Commas
  '、',
  '\uFF0C', // full-width comma
  // Semicolons and colons
  '\uFF1B', // full-width semicolon
  ';',
  '\uFF1A', // full-width colon
  ':',
  // Dashes (double em dash first so it is cut as one unit), then middle dot
  '——',
  '—',
  '・',
];
/**
 * Clause boundaries for the `rtl` family, in preferred-break order.
 *
 * Arabic has its own question mark (U+061F), comma (U+060C) and
 * semicolon (U+061B); the full stop and exclamation mark are the ASCII
 * characters (Hebrew uses `.` and `,` directly). Every entry includes
 * its adjacent space(s), so a mark only matches when it is followed
 * (or, for the dashes, surrounded) by a space.
 */
const RTL_CLAUSE_BOUNDARIES: readonly string[] = [
  '. ',
  '؟ ', // U+061F Arabic question mark + space
  '! ',
  '، ', // U+060C Arabic comma + space
  '؛ ', // U+061B Arabic semicolon + space
  ' — ', // spaced em dash
  ' – ', // spaced en dash
];
/**
 * Soft-minimum fraction of the budget at which a cut is acceptable.
 *
 * `clampForBudget` multiplies the budget by this ratio and floors the
 * result to get the smallest cut index it will accept. A clause
 * boundary or a whitespace break that sits before that index is
 * ignored and the next fallback is tried, so a snippet is never
 * shipped near-empty just because the input started with a short
 * clause.
 */
const SOFT_MIN_RATIO = 0.55;
/**
 * Strip a trailing run of separator-class characters so a snippet
 * never ends on a dangling separator or ellipsis.
 *
 * Removed at the end of the string: whitespace, `,` `;` `:`, em dash,
 * hyphen, en dash, middle dot, bullet, horizontal ellipsis, and the
 * Arabic comma (U+060C) and Arabic semicolon (U+061B) — the RTL
 * counterparts of `,` and `;`, which would otherwise survive a cut at
 * an Arabic clause boundary.
 *
 * Sentence-final marks (`.`, `!`, `?`, Arabic `؟`) and full-width CJK
 * punctuation are left in place.
 *
 * @param s - Input string to trim
 * @returns `s` without its trailing separator-class characters
 */
function trimTrailingSeparators(s: string): string {
  return s.replace(/[\s,;:—\-–·•…\u060C\u061B]+$/u, '');
}
/**
 * Choose where to cut a candidate window at a clause boundary.
 *
 * The vocabulary is scanned in its declared (preference) order. The
 * first boundary whose last occurrence in `window` starts at or after
 * `softMin` wins, and the cut is placed immediately after that
 * occurrence (so the boundary text itself is included before the cut).
 *
 * @param window - Candidate cut window (`text.slice(0, budget)`)
 * @param boundaries - Boundary vocabulary, in preference order
 * @param softMin - Smallest acceptable start index for a boundary
 * @returns Index just past the chosen boundary, or -1 when none qualifies
 */
function findClauseCut(window: string, boundaries: readonly string[], softMin: number): number {
  const chosen = boundaries.find((mark) => window.lastIndexOf(mark) >= softMin);
  if (chosen === undefined) {
    return -1;
  }
  return window.lastIndexOf(chosen) + chosen.length;
}
/**
 * Truncate `text` to fit the `(lang, surface)` SEO budget.
 *
 * The budget is compared against `String.length` (UTF-16 code units).
 * Strategy, in order:
 *
 * 1. Input that already fits is returned trimmed, with nothing appended.
 * 2. Clause cut — cut after the preferred clause boundary of the
 *    script's vocabulary (CJK / RTL / Latin). The result ends on the
 *    boundary, trailing separators are stripped, and **no** ellipsis is
 *    appended.
 * 3. Whitespace cut (skipped for CJK) — cut at the last ASCII space and
 *    append `…`.
 * 4. Hard cut — cut at `budget - 1` code units and append `…`.
 *
 * Steps 2 and 3 only accept a cut at or beyond the soft minimum
 * (`floor(budget × SOFT_MIN_RATIO)`).
 *
 * @param text - Source text (already plain-text — no Markdown / HTML)
 * @param lang - Publishing locale
 * @param surface - Target SEO surface
 * @returns Clamped text whose `length` is ≤ `budgetFor(lang, surface)`
 */
export function clampForBudget(
  text: string,
  lang: LanguageCode | string,
  surface: SeoSurface
): string {
  const trimmed = text.trim();
  const budget = budgetFor(lang, surface);
  if (trimmed.length <= budget) return trimmed;

  const family = classifyScript(lang);
  const softMin = Math.floor(budget * SOFT_MIN_RATIO);
  // Reserve one code unit for the ellipsis we may append.
  const window = trimmed.slice(0, budget - 1);
  const boundaries =
    family === 'cjk'
      ? CJK_CLAUSE_BOUNDARIES
      : family === 'rtl'
        ? RTL_CLAUSE_BOUNDARIES
        : HEADLINE_CLAUSE_BOUNDARIES;

  const clauseCut = findClauseCut(window, boundaries, softMin);
  if (clauseCut > 0) {
    const cleaned = trimTrailingSeparators(trimmed.slice(0, clauseCut));
    // Stripping separators may shorten the snippet below the soft
    // minimum; in that case fall through to the next strategy.
    if (cleaned.length >= softMin) return cleaned;
  }

  // Whitespace-aware fallback. CJK text often has no ASCII spaces, so
  // skip this step for CJK and fall straight through to the hard cut.
  if (family !== 'cjk') {
    const lastSpace = window.lastIndexOf(' ');
    if (lastSpace >= softMin) {
      const safe = trimTrailingSeparators(window.slice(0, lastSpace));
      return `${safe}…`;
    }
  }

  // Hard cut. `slice` counts UTF-16 code units, so the window can end
  // on the high half of a surrogate pair (emoji, rare CJK ideographs).
  // Drop that orphaned half rather than emit a malformed string.
  const lastUnit = window.charCodeAt(window.length - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  const hardCut = trimTrailingSeparators(endsOnHighSurrogate ? window.slice(0, -1) : window);
  return `${hardCut}…`;
}
/**
 * Optional brand-suffix wiring for {@link clampTitleForSurface}.
 *
 * When `siteTitle` is supplied the function tries to append
 * `separator + siteTitle` to the article title within the budget. If
 * that does not fit it tries `separator + shortSiteTitle`; if that
 * does not fit either, the suffix is *dropped* (better SERP outcome
 * than a truncated headline followed by a clipped brand).
 */
export interface TitleSurfaceOptions {
  /** Full brand suffix, e.g. "EU Parliament Monitor". */
  readonly siteTitle?: string;
  /** Shorter brand used when the full suffix cannot fit, e.g. `"EPM"` for CJK. */
  readonly shortSiteTitle?: string;
  /** Localized glue between title and brand, e.g. `" | "` / `" ・ "` / `" ׀ "`. */
  readonly separator?: string;
}
/**
 * Compose `{title}{separator}{siteTitle}` while honouring the
 * `(lang, surface)` budget.
 *
 * Resolution order:
 *
 * 1. No `siteTitle` supplied — clamp the title on its own.
 * 2. Title + full suffix fits — return it.
 * 3. A `shortSiteTitle` is supplied and title + short suffix fits —
 *    return that.
 * 4. Otherwise drop the brand and clamp the title on its own (better
 *    SERP outcome than a truncated headline followed by a clipped
 *    brand suffix).
 *
 * Lengths are measured with `String.length`. A `shortSiteTitle`
 * without a `siteTitle` is ignored.
 *
 * @param title - Article title (plain text)
 * @param lang - Publishing locale
 * @param surface - Target SEO surface (`title` / `ogTitle` / `twitterTitle`)
 * @param opts - Optional brand suffix wiring
 * @returns Composed title whose `length` is ≤ the budget
 */
export function clampTitleForSurface(
  title: string,
  lang: LanguageCode | string,
  surface: SeoSurface,
  opts: TitleSurfaceOptions = {}
): string {
  const cleanTitle = title.trim();
  const sep = opts.separator ?? '';
  const full = opts.siteTitle ?? '';
  const short = opts.shortSiteTitle ?? '';

  // No brand suffix wiring — just clamp the title in isolation.
  if (!full) return clampForBudget(cleanTitle, lang, surface);

  const budget = budgetFor(lang, surface);
  // Brand candidates in preference order: full name, then short name.
  const brands = short ? [full, short] : [full];
  for (const brand of brands) {
    const composed = `${cleanTitle}${sep}${brand}`;
    if (composed.length <= budget) return composed;
  }

  // Nothing fits: keep the (clamped) title and drop the brand.
  return clampForBudget(cleanTitle, lang, surface);
}
|