All files / src/aggregator/metadata seo-keywords.ts

95.23% Statements 20/21
75% Branches 6/8
100% Functions 5/5
100% Lines 19/19

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101                                                                                        19124x 19124x 19124x     19124x 19124x                 19124x                             19124x   557176x 557176x                     19124x 19124x 19124x 545152x 545152x 545152x 545152x 507405x 507405x   19124x    
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Aggregator/Metadata/SeoKeywords
 * @description `<meta name="keywords">` builder extracted from
 * `resolve-helpers.ts` so that module stays below the 600-line drift-guard
 * cap (see `test/unit/source-file-size.test.js`). Pure leaf module — depends
 * only on localized-keyword tables, the slug humanizer, and the cross-site
 * noise / portfolio filters.
 */
 
import { getLocalizedString } from '../../constants/language-core.js';
import { LOCALIZED_KEYWORDS } from '../../constants/language-articles.js';
import type { LanguageCode } from '../../types/index.js';
import { CROSS_SITE_KEYWORDS, isNoiseKeywordToken } from './keyword-filters.js';
import { humanizeSlug } from './slug.js';
 
/**
 * Assemble the `<meta name="keywords">` list for an article.
 *
 * Candidates are gathered in a fixed priority order — cross-site portfolio
 * keywords, the localized keywords registered for the article type (or a
 * generic English fallback when the type has no entry), the humanized
 * article-type slug, the date, and finally terms extracted from the title
 * and description — then de-duplicated and capped at 16 entries.
 *
 * @param lang - Target language code
 * @param articleType - Article type slug
 * @param date - ISO article date
 * @param runId - Run id; accepted but ignored (kept only so existing
 *                callsites keep compiling — see the note in the body)
 * @param title - Resolved title
 * @param description - Resolved description
 * @returns De-duplicated keywords for `<meta name="keywords">`, at most 16
 */
export function buildSeoKeywords(
  lang: LanguageCode,
  articleType: string,
  date: string,
  runId: string,
  title: string,
  description: string
): readonly string[] {
  // `runId` is deliberately ignored. An earlier version emitted
  // `run <runId>` as a keyword, which leaked opaque tokens such as
  // `run propositions-run261-1779431162` into the meta tag. The
  // parameter stays in the signature purely for callsite compatibility.
  void runId;

  const maxKeywords = 16;
  const keywordTable = getLocalizedString(LOCALIZED_KEYWORDS, lang);
  // Own-property descriptor lookup: only a data property defined directly
  // on the table yields a value, so inherited names never match.
  const descriptor = Object.getOwnPropertyDescriptor(keywordTable, articleType);
  const typeKeywords = descriptor?.value as readonly string[] | undefined;

  // Cross-site portfolio keywords go first so they are guaranteed to
  // survive the 16-entry budget cap applied at the end.
  const pool: string[] = [...CROSS_SITE_KEYWORDS];
  pool.push(
    ...(typeKeywords ?? ['EU Parliament', 'European Parliament', 'political intelligence'])
  );
  pool.push(humanizeSlug(articleType));
  pool.push(date);
  pool.push(...extractKeywordTerms(`${title} ${description}`));

  const unique = dedupeKeywords(pool);
  return unique.slice(0, maxKeywords);
}
 
/**
 * Pull candidate keyword terms out of resolved SEO copy.
 *
 * The text is split on every run of characters that are neither Unicode
 * letters nor Unicode numbers. A token is kept when it is at least four
 * characters long and {@link isNoiseKeywordToken} does not reject it, which
 * keeps internal aggregator identifiers (UUID hex fragments, run-id slugs,
 * digit-dominated noise) out of `<meta name="keywords">`. At most the
 * first 18 surviving tokens are returned, in source order.
 *
 * @param text - Title and description text
 * @returns Candidate terms
 */
function extractKeywordTerms(text: string): string[] {
  const maxTerms = 18;
  const accepted: string[] = [];
  for (const piece of text.split(/[^\p{L}\p{N}]+/u)) {
    const token = piece.trim();
    // Length is checked first so the noise filter only sees tokens that
    // are long enough to be considered at all.
    if (token.length < 4 || isNoiseKeywordToken(token)) {
      continue;
    }
    accepted.push(token);
  }
  return accepted.slice(0, maxTerms);
}
 
/**
 * De-duplicate keywords case-insensitively while preserving original order.
 *
 * Each candidate is trimmed; candidates that are empty after trimming are
 * skipped. The first spelling seen for a given lower-cased key wins and is
 * emitted in its trimmed, original-case form.
 *
 * @param candidates - Raw keyword candidates
 * @returns De-duplicated keyword list
 */
function dedupeKeywords(candidates: readonly string[]): string[] {
  // Lower-cased keys already emitted; drives the case-insensitive check.
  const seen = new Set<string>();
  const out: string[] = [];
  for (const candidate of candidates) {
    const trimmed = candidate.trim();
    // Blank or whitespace-only candidates never become keywords.
    if (!trimmed) continue;
    const key = trimmed.toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);
    out.push(trimmed);
  }
  return out;
}