aggregator lead-extractor.ts

94.04% Statements 79/84
80.39% Branches 41/51
100% Functions 8/8
98.5% Lines 66/67
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6x
 
 
6x
 
 
 
 
 
 
6x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27x
27x
27x
27x
27x
104x
104x
 
27x
674x
674x
32x
32x
 
642x
596x
596x
77x
77x
 
 
 
 
 
77x
 
519x
 
27x
27x
 
 
 
 
 
 
 
 
 
 
 
 
66x
28x
27x
27x
 
27x
27x
 
 
 
 
 
 
 
 
 
 
28x
28x
66x
66x
66x
17x
 
49x
 
11x
 
 
 
 
 
 
 
 
 
 
 
 
27x
27x
336x
74x
22x
22x
 
 
5x
6x
6x
6x
6x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30x
30x
28x
28x
30x
 
1x
1x
1x
30x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25x
54x
54x
23x
23x
23x
23x
 
2x
 
  // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Aggregator/LeadExtractor
 * @description Pure helper that extracts the strongest lead sentence from
 * an Executive Brief artifact.
 *
 * The aggregator's Executive Brief section is rendered first; this module
 * is consumed by {@link buildArticleMeta} (and by metadata fall-backs) to
 * surface a single concise lead — the journalistic "nut graf" — that
 * sharpens both the SEO description and the structured data emitted next
 * to `article.md`.
 *
 * The rules are deterministic:
 *  1. Walk the canonical sources in priority order — `executive-brief.md`,
 *     `extended/executive-brief.md`, then
 *     `intelligence/synthesis-summary.md` — skipping files that do not
 *     exist.
 *  2. Within a source, prefer the first prose paragraph under a heading
 *     that starts with `BLUF`, `Bottom Line Up Front`, `Top Findings`,
 *     `Key Judgments`, `Lead` or `Headline` (compared case-insensitively).
 *  3. Otherwise fall back to the first prose paragraph under any heading
 *     of that same source; only when the source yields nothing is the next
 *     source consulted.
 *  4. Return the first sentence (ending at the first `.`, `!` or `?` that
 *     is followed by whitespace), capped at {@link MAX_LEAD_CHARS}
 *     characters with a trailing ellipsis.
 */
 
import fs from 'fs';
import path from 'path';
import { stripInlineMarkdown } from './article-metadata.js';
 
/**
 * Hard cap on the returned lead length, measured with `String.length`.
 * When a lead is truncated, the trailing ellipsis counts towards this cap.
 */
export const MAX_LEAD_CHARS = 320;
 
/**
 * Default canonical sources, in priority order. Each entry is a path
 * relative to the run directory and is joined onto it before being read.
 */
const DEFAULT_LEAD_SOURCES: readonly string[] = [
  'executive-brief.md',
  'extended/executive-brief.md',
  'intelligence/synthesis-summary.md',
];
 
/**
 * Normalised heading prefixes whose first prose paragraph is preferred as
 * the lead. Entries are lower-case and are compared with `startsWith`
 * against the normalised heading text, so a heading such as
 * `## BLUF — Summary` matches `'bluf'`. The heading level is not checked.
 */
const PREFERRED_HEADINGS: readonly string[] = [
  'bluf',
  'bottom line up front',
  'top findings',
  'key judgments',
  'lead',
  'headline',
];
 
/**
 * One scanned section produced by {@link splitSections}: a heading plus
 * the body lines collected until the next heading (or the end of input).
 */
interface MarkdownSection {
  /**
   * Lower-cased, normalised heading title (`''` for the document preamble).
   * Characters other than letters, digits and whitespace are replaced by
   * spaces, and whitespace runs are collapsed to a single space.
   */
  readonly heading: string;
  /**
   * Body lines as they appear in the source (untrimmed). Heading lines,
   * fence markers and the contents of fenced code blocks are not included.
   */
  readonly lines: readonly string[];
}
 
/**
 * Break a Markdown document into sections keyed by their heading.
 *
 * The first entry is always the preamble (heading `''`), holding whatever
 * precedes the first heading. Lines starting with a triple backtick toggle
 * a fenced block; the fence markers and everything between them are
 * dropped, so `#` lines inside code never open a section.
 *
 * @param markdown - Raw Markdown
 * @returns Sections in document order, each as `{heading, lines}`
 */
function splitSections(markdown: string): MarkdownSection[] {
  const result: MarkdownSection[] = [];
  let currentHeading = '';
  let body: string[] = [];
  let fenced = false;

  markdown.split(/\r?\n/).forEach((text) => {
    if (text.startsWith('```')) {
      fenced = !fenced;
      return;
    }
    if (fenced) return;

    const match = text.match(/^(#{1,6})\s+(.*)$/);
    if (match === null) {
      body.push(text);
      return;
    }

    // A heading closes the section collected so far and opens a new one.
    result.push({ heading: currentHeading, lines: body });
    body = [];
    const title = (match[2] ?? '').trim().toLowerCase();
    const withoutPunctuation = title.replace(/[^\p{L}\p{N}\s]+/gu, ' ');
    currentHeading = withoutPunctuation.replace(/\s+/g, ' ').trim();
  });

  // Emit the trailing section (the preamble itself when there is no heading).
  result.push({ heading: currentHeading, lines: body });
  return result;
}
 
/**
 * Classify one trimmed body line for paragraph extraction. Pure helper.
 *
 * Checks run in order, and the first match wins:
 *  - empty line → `'flush'`
 *  - bullet (`-`, `*` or `+` followed by whitespace) → `'skip'`
 *  - ordered-list item (digits, `.`, whitespace) → `'skip'`
 *  - line starting with `>`, `<`, `[` or `![` (blockquote, HTML, link or
 *    image) → `'flush'`
 *  - bold `**Key:**` metadata line → `'skip'`
 *  - anything else → `'append'`
 *
 * @param trimmed - Trimmed line content
 * @returns `'flush'` when a paragraph break should be honoured,
 *          `'skip'` when the line is irrelevant content (e.g. a bullet),
 *          `'append'` when the line is prose that extends the paragraph
 */
function classifyParagraphLine(trimmed: string): 'flush' | 'skip' | 'append' {
  if (trimmed === '') return 'flush';
  if (/^[-*+]\s+/.test(trimmed)) return 'skip';
  if (/^\d+\.\s+/.test(trimmed)) return 'skip';
  if (/^(>|<|!?\[)/.test(trimmed)) return 'flush';
  // Artifact metadata key-value lines (e.g. "**Run:** …", "** IMF requirement:** …").
  if (/^\*\*\s*[A-Za-z][^*]+:\*\*/.test(trimmed)) return 'skip';
  return 'append';
}
 
/**
 * Collect the first prose paragraph found in a section body.
 *
 * Every line is trimmed and classified with {@link classifyParagraphLine}:
 * prose lines are accumulated, skipped lines are ignored, and a flush line
 * ends the paragraph once at least one prose line has been gathered.
 * Flush lines seen before any prose are ignored.
 *
 * @param lines - Body lines of a single section
 * @returns The accumulated prose joined with single spaces, or `''` when
 *          the section contains no prose line
 */
function firstProseParagraph(lines: readonly string[]): string {
  const collected: string[] = [];
  for (let index = 0; index < lines.length; index += 1) {
    const text = (lines[index] ?? '').trim();
    switch (classifyParagraphLine(text)) {
      case 'append':
        collected.push(text);
        break;
      case 'flush':
        if (collected.length > 0) return collected.join(' ').trim();
        break;
      default:
        // 'skip': neither extends nor terminates the paragraph.
        break;
    }
  }
  // Joining an empty list yields '', which doubles as the "nothing found" result.
  return collected.join(' ').trim();
}
 
/**
 * Extract the strongest lead paragraph from a Markdown body. Pure helper;
 * surfaced for unit testing.
 *
 * Sections are scanned in document order. The first section whose
 * normalised heading starts with one of the preferred headings and that
 * contains a prose paragraph wins. If none does, the first prose paragraph
 * under any heading is used. Text before the first heading (front-matter,
 * banners) is never used by the fallback.
 *
 * @param markdown - Artifact Markdown (front-matter / banners ignored via
 *                   heading-aware scanning rather than full cleaning)
 * @returns First non-empty paragraph under a preferred heading, or the
 *          first non-empty paragraph under any heading, or `''`
 */
export function extractLeadParagraph(markdown: string): string {
  const sections = splitSections(markdown);
  for (const section of sections) {
    const isPreferred = PREFERRED_HEADINGS.some((h) => section.heading.startsWith(h));
    if (!isPreferred) continue;
    const paragraph = firstProseParagraph(section.lines);
    if (paragraph) return paragraph;
  }
  // Fallback: first prose paragraph in any section that follows a heading.
  // The first section is always the preamble, so it is left out.
  for (const section of sections.slice(1)) {
    const paragraph = firstProseParagraph(section.lines);
    if (paragraph) return paragraph;
  }
  return '';
}
 
/**
 * Reduce a paragraph to a plain-text lead: its first sentence, no longer
 * than {@link MAX_LEAD_CHARS}.
 *
 * Inline Markdown is stripped and whitespace runs are collapsed first. The
 * sentence ends at the first `.`, `!` or `?` that is followed by a space;
 * without such a boundary the whole text is the sentence. An over-long
 * sentence is cut to leave room for a single `…`, at the last space when
 * that space lies in the second half of the cap and mid-word otherwise.
 *
 * @param paragraph - Raw paragraph (single line, multiple sentences allowed)
 * @returns Trimmed lead, never longer than {@link MAX_LEAD_CHARS}
 */
export function trimToLeadSentence(paragraph: string): string {
  // Plain text is required for SEO / structured-data consumers.
  const plain = stripInlineMarkdown(paragraph).replace(/\s+/g, ' ').trim();
  if (plain === '') return '';

  const boundary = plain.match(/^(.*?[.!?])\s/);
  const lead = boundary?.[1] ?? plain;
  if (lead.length <= MAX_LEAD_CHARS) return lead;

  // Reserve one character for the ellipsis.
  const head = lead.slice(0, MAX_LEAD_CHARS - 1);
  const breakAt = head.lastIndexOf(' ');
  let kept = head;
  if (breakAt > MAX_LEAD_CHARS / 2) {
    kept = head.slice(0, breakAt);
  }
  return kept.trimEnd() + '…';
}
 
/**
 * Resolve the executive lead by walking the canonical sources in priority
 * order and returning the first non-empty trimmed sentence.
 *
 * Each source is joined onto `runDir`; paths that do not exist are
 * skipped. A source that exists but yields no lead does not stop the
 * search. Errors thrown by `fs.readFileSync` are not caught here.
 *
 * @param runDir - Absolute path to the analysis run directory
 * @param sources - Optional override for the source list
 * @returns Trimmed lead, or `''` when nothing is harvestable
 */
export function extractExecutiveLead(
  runDir: string,
  sources: readonly string[] = DEFAULT_LEAD_SOURCES
): string {
  for (const rel of sources) {
    const abs = path.join(runDir, rel);
    if (!fs.existsSync(abs)) continue;
    const markdown = fs.readFileSync(abs, 'utf8');
    const paragraph = extractLeadParagraph(markdown);
    const lead = trimToLeadSentence(paragraph);
    if (lead.length > 0) return lead;
  }
  return '';
}