metadata text-utils.ts

98.41% Statements 62/63
97.36% Branches 37/38
100% Functions 25/25
100% Lines 49/49
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31x
31x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139714x
1882785x
 
 
31x
139714x
133421x
132552x
132552x
132550x
1988172x
 
 
 
 
 
 
117585x
 
 
 
 
 
806448x
 
 
 
132544x
 
 
 
121152x
 
 
 
119428x
 
 
 
118939x
 
 
 
118309x
 
 
 
118309x
118309x
118309x
5067338x
5067338x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115598x
4678x
1726x
1008x
1008x
3242x
 
1008x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117584x
10097x
2389x
117584x
117584x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14597x
14597x
2047x
2047x
2047x
1893x
194x
193x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14491x
14491x
14491x
 
 
 
 
 
 
11755x
 
 
 
 
2736x
 
 
 
 
 
 
 
 
 
 
 
 
537557x
 
 
 
 
 
 
 
 
 
 
 
  // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Aggregator/Metadata/TextUtils
 * @description Pure text / Markdown classification + label-stripping
 * helpers used by the metadata resolver chain. Constants live in
 * `text-utils-constants.ts`; byte-budget truncators and sentence-
 * extraction live in `text-truncate.ts`. This file re-exports the
 * full public surface so existing call-sites keep working.
 *
 * Bounded-context rules:
 * - **No upward imports** — pure helpers, no I/O, no globals.
 * - **Deterministic** — same input always produces same output.
 * - **Locale-agnostic** — every helper works on raw Markdown / prose
 *   in any of the 14 publishing languages. Banner-row detection is
 *   driven by structural shape (double-bold + pipe-separator), not by
 *   a hard-coded English vocabulary.
 */
 
export {
  ABBREVIATION_PREFIXES,
  DESCRIPTION_MAX_LENGTH,
  DESCRIPTION_MIN_LENGTH,
  EMOJI_BANNER_CHARS,
  ENRICHMENT_TRIGGER_LENGTH,
  EXTENDED_DESCRIPTION_MAX_LENGTH,
  EXTENDED_DESCRIPTION_MIN_LENGTH,
  HEADLINE_CLAUSE_BOUNDARIES,
  HEADLINE_SOFT_MIN,
  METADATA_LINE_PREFIXES,
  TITLE_MAX_LENGTH,
  TRAILING_PUNCT,
  TRAILING_STOP_WORDS,
} from './text-utils-constants.js';
 
export {
  extractFirstSentence,
  stripTrailingStopWordsAndPunctuation,
  truncateDescription,
  truncateExtendedDescription,
  truncateTitle,
} from './text-truncate.js';
 
import { EMOJI_BANNER_CHARS, METADATA_LINE_PREFIXES } from './text-utils-constants.js';
 
// Line-start markers of Markdown structure rather than prose:
// `#` heading, `>` blockquote, `<` HTML, `|` table row.
const STRUCTURAL_LINE_PREFIXES = ['#', '>', '<', '|'] as const;
// Line-start markers of fenced code blocks (backtick and tilde fences).
const FENCE_LINE_PREFIXES = ['```', '~~~'] as const;
 
// ────────────────────────────────────────────────────────────────────────
// Line-classification helpers
// ────────────────────────────────────────────────────────────────────────
 
/**
 * Return `true` when a line cannot serve as a prose description.
 *
 * An empty line is rejected immediately. Every other line is run through
 * `DESCRIPTION_SKIP_CHECKS`; the line is skipped as soon as any one of
 * those predicates matches. The checks reject Markdown structural lines
 * (headings, blockquotes, tables, HTML), mermaid/chart directives,
 * emoji-banner metadata rows, and the known `Key: value` banners that
 * Stage-B agents emit as artefact preamble.
 *
 * @param line - Trimmed line from the aggregated Markdown source
 * @returns `true` when the line is not prose and should be skipped
 */
export function shouldSkipDescriptionLine(line: string): boolean {
  // Nothing to describe on an empty line; skip without running the checks.
  if (line.length === 0) return true;
  return DESCRIPTION_SKIP_CHECKS.some((check) => check(line));
}
 
/**
 * Predicates consulted by `shouldSkipDescriptionLine`. Each receives the
 * candidate line and returns `true` when that line must not be used as a
 * description; a single match is enough to reject the line.
 */
const DESCRIPTION_SKIP_CHECKS: ReadonlyArray<(line: string) => boolean> = [
  // Markdown structure: heading, blockquote, HTML, table row.
  (text) => startsWithAny(text, STRUCTURAL_LINE_PREFIXES),
  // Lines opening with a `---` or `===` rule.
  (text) => startsWithAny(text, ['---', '===']),
  // Fenced code block delimiters.
  (text) => startsWithAny(text, FENCE_LINE_PREFIXES),
  // `%%`-prefixed diagram directives / comments.
  (text) => text.startsWith('%%'),
  // A leading `title ` keyword (case-insensitive), as used by chart directives.
  (text) => /^title\s/i.test(text),
  // Rows that open with one of the configured banner emoji.
  (text) => EMOJI_BANNER_CHARS.some((glyph) => text.startsWith(glyph)),
  startsWithSeparatorFragment,
  isStructuralListLeader,
  startsWithContinuationConjunction,
  hasTrailingEllipsis,
  isPublishedBanner,
  startsWithMetadataLabel,
  // Lines made up entirely of three or more rule / separator characters.
  (text) => /^[-*_=~.]{3,}$/.test(text),
  isLocalizedBannerRow,
  isPlainPipeBannerRow,
];
 
/**
 * Test whether a line begins with at least one of the given prefixes.
 *
 * @param line - Line to inspect
 * @param prefixes - Candidate prefixes; an empty list never matches
 * @returns `true` when any prefix matches the start of `line`
 */
function startsWithAny(line: string, prefixes: readonly string[]): boolean {
  for (const candidate of prefixes) {
    if (line.startsWith(candidate)) return true;
  }
  return false;
}
 
/**
 * Detect a line that opens with an orphaned separator glyph (colon,
 * semicolon, comma, em dash, en dash or hyphen) followed by whitespace.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line starts with a separator fragment
 */
function startsWithSeparatorFragment(line: string): boolean {
  const leadingSeparator = /^[:;,—–-]\s/u;
  return leadingSeparator.exec(line) !== null;
}
 
/**
 * Detect an ordered-list style leader at the start of a line: either a
 * one- or two-digit number closed by `.`, `)` or `:`, or a single ASCII
 * letter closed by `.` or `)`. Both forms may carry an opening `(` and
 * must be followed by whitespace.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line opens with a list leader
 */
function isStructuralListLeader(line: string): boolean {
  const numberedLeader = /^\(?[0-9]{1,2}[.):]\s/u;
  const letteredLeader = /^\(?[a-z][.)]\s/iu;
  if (numberedLeader.test(line)) return true;
  return letteredLeader.test(line);
}
 
/**
 * Detect a line whose first word is a continuation conjunction or
 * relative pronoun (case-insensitive), which marks it as the tail of a
 * previous sentence rather than a self-contained statement.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line opens with one of the listed words plus whitespace
 */
function startsWithContinuationConjunction(line: string): boolean {
  const continuationOpener = /^(that|which|while|whereas|and|but|for|yet|so|nor|or)\s/iu;
  return continuationOpener.exec(line) !== null;
}
 
/**
 * Detect a line that trails off: it ends with the single-glyph ellipsis
 * `…` or with a run of three or more ASCII dots.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line ends in an ellipsis
 */
function hasTrailingEllipsis(line: string): boolean {
  if (line.endsWith('…')) return true;
  const dotRunAtEnd = /\.{3,}$/u;
  return dotRunAtEnd.test(line);
}
 
/**
 * Detect a `Published YYYY-MM-DD` banner: the word "published"
 * (case-insensitive) at the line start, whitespace, then a
 * four-two-two digit date that ends on a word boundary.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line opens with a publication-date banner
 */
function isPublishedBanner(line: string): boolean {
  const publishedStamp = /^published\s+\d{4}-\d{2}-\d{2}\b/iu;
  return publishedStamp.exec(line) !== null;
}
 
/**
 * Detect a line that opens with one of the `METADATA_LINE_PREFIXES`
 * labels followed by a colon, ignoring case and leading emphasis markers.
 *
 * Leading `*` runs and then leading `_` runs are removed before the
 * comparison, so `**Label:** …` and `_Label: …` are treated like
 * `Label: …`. Four label terminators are recognised: `:`, ` :`, `**:`
 * and `*:` (the last two cover a bold/italic span closed before the colon).
 *
 * @param line - Trimmed source line
 * @returns `true` when the line starts with a known metadata label
 */
function startsWithMetadataLabel(line: string): boolean {
  // `/^\*+/` already consumes every leading asterisk, so no separate
  // `**` pass is needed before stripping leading underscores.
  const lower = line.replace(/^\*+/, '').replace(/^_+/, '').trim().toLowerCase();
  const labelTerminators = [':', ' :', '**:', '*:'];
  return METADATA_LINE_PREFIXES.some((prefix) => {
    const prefixLower = prefix.toLowerCase();
    return labelTerminators.some((terminator) => lower.startsWith(`${prefixLower}${terminator}`));
  });
}
 
/**
 * Detect a plain (non-bold) pipe-delimited banner row of the shape
 * `Tag: Value | Tag: Value | Tag: Value`.
 *
 * The line is split on `|` and must yield at least three cells. A cell
 * counts as labelled when, after trimming, it starts with an ASCII
 * capital, continues with 1–30 letters / marks / digits / hyphens /
 * spaces, then an ASCII or full-width colon, whitespace, and a value.
 * The row is a banner when at least two cells are labelled, so prose
 * containing a single colon (`The Commission's view: …`) is preserved.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line is a plain pipe-banner row
 */
function isPlainPipeBannerRow(line: string): boolean {
  // A line without any `|` splits into a single cell and fails this guard.
  const cells = line.split('|');
  if (cells.length < 3) return false;
  const labelledCell = /^[A-Z][\p{L}\p{M}\p{N}\- ]{1,30}[:：]\s+\S/u;
  const labelledCount = cells.filter((cell) => labelledCell.test(cell.trim())).length;
  return labelledCount >= 2;
}
 
/**
 * Language-agnostic detector for bold metadata banner rows such as
 *   `**Date:** 2026-05-15 | **Type:** Breaking | **Run:** breaking-run-001`
 * and their localized siblings, including CJK briefs that put the
 * full-width colon `：` inside the bold span (`**日付：**`).
 *
 * The check is purely structural, so no per-language vocabulary is
 * needed: the line must start with `**`, contain at least one `|`, and
 * carry two or more bold key markers whose colon (`:` or `：`) sits
 * either just inside the closing `**` or directly after it.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line is a banner row in any locale
 */
function isLocalizedBannerRow(line: string): boolean {
  const opensBoldAndHasPipe = line.startsWith('**') && line.includes('|');
  if (!opensBoldAndHasPipe) return false;
  // Number of non-overlapping occurrences of a global pattern in the line.
  const tally = (shape: RegExp): number => line.match(shape)?.length ?? 0;
  const colonInsideBold = tally(/\*\*[^*]+[:：]\s*\*\*/g);
  const colonAfterBold = tally(/\*\*[^*]+\*\*\s*[:：]/g);
  return colonInsideBold + colonAfterBold >= 2;
}
 
/**
 * Strip a leading all-caps prose label (e.g. `SITUATION:`, `KEY MOTION:`,
 * `BLUF:`, `BOTTOM LINE:`, `TIER-1:`) from a prose line. Such labels are
 * common in BLUF-style editorial writing and would otherwise surface in
 * the SEO description as an all-caps shout.
 *
 * The label is everything before the first `": "`. It is removed only
 * when all of the following hold:
 * - the separator sits at index 2–80;
 * - the label is at least 3 characters, starts with `A`–`Z`, and
 *   otherwise contains only `A`–`Z`, digits, spaces and hyphens;
 * - the trimmed remainder is at least 20 characters long.
 *
 * In every other case the original line is returned unchanged.
 *
 * @param line - Plain prose line (post-{@link stripInlineMarkdown})
 * @returns Trimmed text after the label, or the original line
 */
export function stripLeadingProseLabel(line: string): string {
  const separatorAt = line.indexOf(': ');
  const separatorInRange = separatorAt >= 2 && separatorAt <= 80;
  if (!separatorInRange) return line;

  const candidate = line.slice(0, separatorAt);
  const remainder = line.slice(separatorAt + 2).trim();
  const looksLikeLabel = candidate.length >= 3 && /^[A-Z][A-Z0-9 -]{1,79}$/.test(candidate);
  return looksLikeLabel && remainder.length >= 20 ? remainder : line;
}
 
/**
 * Strip a leading `**Label:**` / `**Label：**` / `**Label**:` prefix from
 * a Markdown BLUF line, in any publishing language. Translated briefs
 * open with patterns such as `**Issue:** …`, `**Fråga:** …`,
 * `**主題:** …`, `**Sujet :** …`; without this stripper the localized
 * label would leak into `<meta description>`.
 *
 * The matcher is structural, not vocabulary-driven. A label is 1–39
 * characters: it starts with a Unicode letter, mark or digit and
 * continues with letters, marks, digits, spaces or hyphens. It must be
 * closed by an ASCII or full-width colon, placed either just inside the
 * closing `**` (then followed by whitespace) or after it (then followed
 * by whitespace).
 *
 * Whether or not a label was found, one leading orphan separator
 * (`:`, `;`, em dash, en dash or hyphen, plus whitespace) is removed
 * from what remains. This covers shapes like `: rest of sentence` left
 * behind by upstream strippers, and doubled separators such as
 * `**Issue**: : …`.
 *
 * @param raw - Raw Markdown line (still carrying `**…**` decorations)
 * @returns Line without the leading bold label and orphan separator;
 * the input unchanged when neither is present
 */
export function stripLeadingBoldLabel(raw: string): string {
  const boldLabelOpener =
    /^\*\*([\p{L}\p{M}\p{N}][\p{L}\p{M}\p{N} -]{0,38})[:：]\*\*\s+|^\*\*([\p{L}\p{M}\p{N}][\p{L}\p{M}\p{N} -]{0,38})\*\*\s*[:：]\s+/u;
  const opener = boldLabelOpener.exec(raw);
  // Drop the matched label (if any), then apply the same orphan-separator
  // clean-up on both paths.
  const afterLabel = opener === null ? raw : raw.slice(opener[0].length);
  return afterLabel.replace(/^[:;—–-]\s+/u, '');
}
 
/**
 * Reduce a Markdown line to plain text suitable for meta-tag content.
 *
 * Each decoration is replaced by its visible text, in this order:
 * images (alt text), links (link text), inline code, `**bold**`,
 * `__bold__`, `*italic*`, `_italic_`, `~~strikethrough~~`. Whitespace
 * runs are then collapsed to a single space and the result is trimmed.
 * Spans never cross a newline and are capped at 500 characters.
 *
 * @param raw - Trimmed Markdown line
 * @returns Plain-text variant
 */
export function stripInlineMarkdown(raw: string): string {
  // Order matters: double-marker emphasis must be unwrapped before the
  // single-marker forms, and images before links.
  const passes: ReadonlyArray<readonly [RegExp, string]> = [
    [/!\[([^\]\n]{0,500})\]\(([^)\n]{0,500})\)/g, '$1'],
    [/\[([^\]\n]{1,500})\]\(([^)\n]{0,500})\)/g, '$1'],
    [/`([^`\n]{1,500})`/g, '$1'],
    [/\*\*([^*\n]{1,500})\*\*/g, '$1'],
    [/__([^_\n]{1,500})__/g, '$1'],
    [/\*([^*\n]{1,500})\*/g, '$1'],
    [/_([^_\n]{1,500})_/g, '$1'],
    [/~~([^~\n]{1,500})~~/g, '$1'],
    [/\s+/g, ' '],
  ];
  let text = raw;
  for (const [decoration, replacement] of passes) {
    text = text.replace(decoration, replacement);
  }
  return text.trim();
}