All files / src/generators/news-indexes backfill-reader-label.ts

100% Statements 19/19
91.66% Branches 11/12
100% Functions 3/3
100% Lines 15/15

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89                                                                      4x 4x 4x 2x                                   12x 12x     12x   10x 10x 10x 328x   7x 5x     3x                             3x    
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Generators/NewsIndexes/BackfillReaderLabel
 * @description Truncated reader-label detection and stripping helpers,
 * extracted from `backfill.ts` to keep source files ≤600 lines.
 */
 
import { getLocalizedString } from '../../constants/languages.js';
import { SEO_CONTEXT_LABELS } from '../../aggregator/metadata/template-fallback.js';
import type { LanguageCode } from '../../types/index.js';
 
/**
 * Remove a trailing **truncated** copy of the localized reader label
 * (`SEO_CONTEXT_LABELS[lang].reader`) from a candidate description.
 *
 * Earlier backfill passes appended the reader label and then clamped the
 * whole buffer to the per-script `metaDescription` budget, hard-cutting
 * the label mid-word (e.g. zh `…政策后果的读` instead of `…政策后果的读者`,
 * ja `…追跡する読`, ko dangling `…추적하는.`). Those mangled fragments were
 * persisted to `<meta description>` and survive a plain prefix/date-label
 * strip, so re-feeding them to the resolver re-emits the broken tail.
 *
 * A trailing copy that matches the label **in full** is left intact — it
 * is a complete, reader-facing clause we want to preserve. Only a partial
 * (truncated) prefix of the label is dropped, together with any terminator
 * that followed it and any separator that preceded it, leaving the clean
 * body for the resolver to re-enrich.
 *
 * @param description - Candidate description (prefix/date-label removed)
 * @param langCode - Article language code
 * @returns The trimmed description, with any truncated trailing reader
 *   label removed
 */
export function stripTruncatedReaderLabel(description: string, langCode: LanguageCode): string {
  const text = description.trim();
  const cut = findTruncatedReaderLabelCut(text, langCode);
  // No partial label (or the label is present in full): keep the text as is.
  if (cut < 0) return text;
  // `cut` is an index into `text` that lies before the partial label and
  // therefore before any trailing terminator, so slicing at `cut` already
  // discards both; no separate terminator strip is needed here.
  return text
    .slice(0, cut)
    // Drop the whitespace / separator (comma, semicolon, colon, dashes,
    // middle dot, ideographic full stop or comma) that joined body and label.
    .replace(/[\s,;:—\-–·。、]+$/u, '')
    .trim();
}
 
/**
 * Locate a trailing **truncated** copy of the localized reader label and
 * return the index at which the description body ends (i.e. where the
 * partial label begins). Returns -1 when no partial label is present or
 * when the label is present in full (a complete clause we keep).
 *
 * Sentence terminators at the very end of `text` are ignored while
 * matching: ASCII `.` `!` `?`, ideographic `。`, fullwidth `！` `？` and `…`.
 * The same terminators are ignored at the end of the label itself, so a
 * label that carries its own closing punctuation is still recognised as
 * "present in full" rather than as a one-character truncation.
 *
 * @param text - Trimmed candidate description
 * @param langCode - Article language code
 * @returns Index into `text` where the partial label starts, or -1 when
 *   none applies
 */
export function findTruncatedReaderLabelCut(text: string, langCode: LanguageCode): number {
  // Require a reasonably long overlap so we never strip on a coincidental
  // short suffix match; real labels are 40+ chars (Latin) / 11+ (CJK).
  const minOverlap = 8;
  // Fullwidth `！` / `？` are written as escapes so they survive any
  // width-normalising tooling the source may pass through.
  const trailingTerminators = /[.。!?\uFF01\uFF1F…]+$/u;

  const labels = getLocalizedString(SEO_CONTEXT_LABELS, langCode);
  const reader = (labels.reader ?? '').trim();
  if (reader.length < minOverlap || text.length < minOverlap) return -1;

  // Tolerate a terminator the resolver/healer appended after the cut.
  const core = text.replace(trailingTerminators, '');
  // Compare against the label without its own closing punctuation, since
  // that punctuation has just been removed from the text as well.
  const label = reader.replace(trailingTerminators, '');

  // Longest overlap first, so a full-label match is seen before any
  // shorter prefix that happens to match too.
  const maxK = Math.min(core.length, label.length);
  for (let k = maxK; k >= minOverlap; k -= 1) {
    if (core.endsWith(label.slice(0, k))) {
      // Full label present at the tail — keep it (not a truncation).
      if (k === label.length) return -1;
      return core.length - k;
    }
  }
  return -1;
}
 
/**
 * Report whether a legacy `<meta description>` body still ends with a
 * **truncated** reader label after its dateline prefix and redundant
 * date-label clause have been stripped.
 *
 * Long, unique legacy descriptions otherwise bypass
 * `shouldBackfillDescription`, so a persisted mid-word cut
 * (e.g. zh `…政策后果的读`, ja `…追跡する読`, ko `…추적하는.`) would stay in place.
 *
 * @param body - Stripped description body (prefix/date-label removed)
 * @param langCode - Article language code
 * @returns True when `findTruncatedReaderLabelCut` finds a cut index
 *   (zero or greater) for the body
 */
export function hasTruncatedReaderLabelInBody(body: string, langCode: LanguageCode): boolean {
  // A non-negative index means a partial label starts somewhere in `body`.
  const cutIndex = findTruncatedReaderLabelCut(body, langCode);
  return cutIndex >= 0;
}