All files / src/aggregator lead-extractor.ts

94.04% Statements 79/84
80.39% Branches 41/51
100% Functions 8/8
98.5% Lines 66/67

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211                                                              6x     6x             6x                                                     27x 27x 27x 27x 27x 104x 104x   27x 674x 674x 32x 32x   642x 596x 596x 77x 77x           77x   519x   27x 27x                         66x 28x 27x 27x   27x 27x                     28x 28x 66x 66x 66x 17x   49x   11x                         27x 27x 336x 74x 22x 22x     5x 6x 6x 6x 6x                             30x 30x 28x 28x 30x   1x 1x 1x 30x                             25x 54x 54x 23x 23x 23x 23x   2x    
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Aggregator/LeadExtractor
 * @description Pure helper that extracts the strongest lead sentence from
 * an Executive Brief artifact.
 *
 * The aggregator's Executive Brief section is rendered first; this module
 * is consumed by {@link buildArticleMeta} (and by metadata fall-backs) to
 * surface a single concise lead — the journalistic "nut graf" — that
 * sharpens both the SEO description and the structured data emitted next
 * to `article.md`.
 *
 * The rules are deterministic. Sources are visited in priority order
 * (`executive-brief.md`, then `extended/executive-brief.md`, then
 * `intelligence/synthesis-summary.md`) and the first source that yields a
 * non-empty lead wins. Within each source:
 *  1. Prefer the first non-empty prose paragraph under a heading whose
 *     title starts with a preferred name, e.g. `## BLUF` /
 *     `## Bottom Line Up Front` / `## Top Findings` / `## Key Judgments`.
 *  2. Fall back to the first non-empty prose paragraph under any heading
 *     of that source; text before the first heading is ignored.
 *  3. Return the first sentence (split on `. ` / `! ` / `? `), capped at
 *     {@link MAX_LEAD_CHARS} characters with a trailing ellipsis.
 */
 
import fs from 'fs';
import path from 'path';
import { stripInlineMarkdown } from './article-metadata.js';
 
/**
 * Hard cap on the returned lead length, measured with `String.length`
 * (UTF-16 code units). A truncated lead, including its trailing ellipsis,
 * never exceeds this value.
 */
export const MAX_LEAD_CHARS = 320;
 
/**
 * Default canonical sources, in priority order. Each entry is a path
 * relative to the run directory; {@link extractExecutiveLead} joins it onto
 * `runDir` and uses the first file that yields a non-empty lead.
 */
const DEFAULT_LEAD_SOURCES: readonly string[] = [
  'executive-brief.md',
  'extended/executive-brief.md',
  'intelligence/synthesis-summary.md',
];
 
/**
 * Heading titles whose first paragraph is preferred as the lead.
 *
 * Entries are compared as prefixes against the normalised heading text
 * (lower-cased, punctuation replaced by spaces), so they must themselves be
 * lower-case and punctuation-free. The heading level is not checked: any
 * `#` … `######` heading whose title starts with an entry qualifies.
 */
const PREFERRED_HEADINGS = [
  'bluf',
  'bottom line up front',
  'top findings',
  'key judgments',
  'lead',
  'headline',
];
 
/**
 * One scanned section produced by {@link splitSections}: a heading plus the
 * body lines that follow it up to the next heading.
 */
interface MarkdownSection {
  /** Lower-cased, normalised heading title (`''` for the document preamble). */
  readonly heading: string;
  /** Body lines in document order (no heading lines, no fenced code). */
  readonly lines: readonly string[];
}
 
/**
 * Break a Markdown document into consecutive sections, one per heading.
 *
 * The first section always represents the preamble (everything before the
 * first heading) and carries an empty heading. Lines inside ``` fences, and
 * the fence markers themselves, are dropped so that code is never mistaken
 * for prose. Heading titles are lower-cased, with every run of characters
 * that is neither a letter, a digit nor whitespace replaced by one space.
 *
 * @param markdown - Raw Markdown
 * @returns Ordered list of `{heading, lines}` sections (never empty)
 */
function splitSections(markdown: string): MarkdownSection[] {
  const result: MarkdownSection[] = [];
  let currentTitle = '';
  let body: string[] = [];
  let insideCodeFence = false;

  for (const text of markdown.split(/\r?\n/)) {
    // A fence marker toggles code mode and is itself discarded.
    if (text.startsWith('```')) {
      insideCodeFence = !insideCodeFence;
      continue;
    }
    if (insideCodeFence) continue;

    const match = /^(#{1,6})\s+(.*)$/.exec(text);
    if (match === null) {
      body.push(text);
      continue;
    }

    // A heading closes the section collected so far and opens a new one.
    result.push({ heading: currentTitle, lines: body });
    body = [];
    const rawTitle = (match[2] ?? '').trim().toLowerCase();
    const withoutPunctuation = rawTitle.replace(/[^\p{L}\p{N}\s]+/gu, ' ');
    currentTitle = withoutPunctuation.replace(/\s+/g, ' ').trim();
  }

  // Emit whatever trails the last heading (or the whole preamble).
  result.push({ heading: currentTitle, lines: body });
  return result;
}
 
/**
 * Classify one trimmed body line for paragraph extraction. Pure helper.
 *
 * @param trimmed - Trimmed line content
 * @returns `'flush'` for a paragraph boundary: a blank line, or a line that
 *          starts a blockquote (`>`), HTML (`<`), or a link / image
 *          (`[` / `![`);
 *          `'skip'` for content that is ignored without ending the
 *          paragraph: bullets, ordered-list items, and bold key-value
 *          metadata lines;
 *          `'append'` when the line is prose that extends the paragraph
 */
function classifyParagraphLine(trimmed: string): 'flush' | 'skip' | 'append' {
  if (trimmed === '') return 'flush';
  // Bullets need whitespace after the marker, so "*emphasis*" stays prose.
  if (/^[-*+]\s+/.test(trimmed)) return 'skip';
  if (/^\d+\.\s+/.test(trimmed)) return 'skip';
  if (/^(>|<|!?\[)/.test(trimmed)) return 'flush';
  // Artifact metadata key-value lines (e.g. "**Run:** …", "** IMF requirement:** …").
  if (/^\*\*\s*[A-Za-z][^*]+:\*\*/.test(trimmed)) return 'skip';
  return 'append';
}
 
/**
 * Collect the first run of prose lines in a section body and return it as
 * one space-joined paragraph.
 *
 * Each line is trimmed and classified by {@link classifyParagraphLine}:
 * prose lines are accumulated, skipped lines (bullets, ordered-list items,
 * metadata) are ignored, and a boundary line (blank, blockquote, HTML,
 * link/image) ends the scan once at least one prose line has been seen.
 *
 * @param lines - Body lines of a single section
 * @returns First non-empty paragraph, or `''` when none qualifies
 */
function firstProseParagraph(lines: readonly string[]): string {
  const collected: string[] = [];
  for (const entry of lines) {
    const text = (entry ?? '').trim();
    const verdict = classifyParagraphLine(text);
    if (verdict === 'append') {
      collected.push(text);
      continue;
    }
    // A boundary only matters after prose has started; leading ones are ignored.
    if (verdict === 'flush' && collected.length > 0) break;
  }
  return collected.join(' ').trim();
}
 
/**
 * Extract the strongest lead paragraph from a Markdown body. Pure helper;
 * surfaced for unit testing.
 *
 * Sections are scanned in two passes, both in document order:
 *  1. sections whose normalised heading starts with one of
 *     {@link PREFERRED_HEADINGS};
 *  2. every section that follows a heading.
 * The preamble (text before the first heading) is never used, which is how
 * front-matter and banners are ignored without a full cleaning pass.
 *
 * @param markdown - Artifact Markdown
 * @returns First non-empty prose paragraph under a preferred heading, else
 *          the first non-empty prose paragraph under any heading, else `''`
 */
export function extractLeadParagraph(markdown: string): string {
  const sections = splitSections(markdown);
  for (const section of sections) {
    const isPreferred = PREFERRED_HEADINGS.some((h) => section.heading.startsWith(h));
    if (!isPreferred) continue;
    const paragraph = firstProseParagraph(section.lines);
    if (paragraph) return paragraph;
  }
  // Fallback: first prose paragraph in any section that follows a heading.
  // Index 0 is always the preamble, so it is sliced off.
  for (const section of sections.slice(1)) {
    const paragraph = firstProseParagraph(section.lines);
    if (paragraph) return paragraph;
  }
  return '';
}
 
/**
 * Reduce a paragraph to a plain-text lead: inline Markdown is stripped,
 * whitespace is collapsed, and only the first sentence is kept. A sentence
 * ends at the first `.`, `!` or `?` that is followed by whitespace; when no
 * such boundary exists the whole cleaned paragraph is used.
 *
 * A sentence longer than {@link MAX_LEAD_CHARS} is cut and given a trailing
 * ellipsis. The cut falls on the last space when that space lies beyond the
 * halfway point, otherwise mid-word.
 *
 * @param paragraph - Raw paragraph (single line, multiple sentences allowed)
 * @returns Trimmed lead, never longer than {@link MAX_LEAD_CHARS}
 */
export function trimToLeadSentence(paragraph: string): string {
  // Plain text only: the lead feeds SEO descriptions and structured data.
  const plain = stripInlineMarkdown(paragraph).replace(/\s+/g, ' ').trim();
  if (plain.length === 0) return '';

  const firstSentence = /^(.*?[.!?])\s/.exec(plain)?.[1];
  const lead = firstSentence === undefined ? plain : firstSentence;

  if (lead.length > MAX_LEAD_CHARS) {
    // Reserve one character for the ellipsis, then back up to a word boundary.
    const head = lead.slice(0, MAX_LEAD_CHARS - 1);
    const cut = head.lastIndexOf(' ');
    const kept = cut > MAX_LEAD_CHARS / 2 ? head.slice(0, cut) : head;
    return `${kept.trimEnd()}…`;
  }
  return lead;
}
 
/**
 * Resolve the executive lead by walking the canonical sources in priority
 * order and returning the first non-empty trimmed sentence.
 *
 * Sources that do not exist are skipped, as are sources that exist but
 * yield no usable lead. Files are read synchronously as UTF-8; a path that
 * exists but cannot be read makes `fs.readFileSync` throw, and that error
 * is not caught here.
 *
 * @param runDir - Absolute path to the analysis run directory
 * @param sources - Optional override for the source list (paths relative
 *                  to `runDir`)
 * @returns Trimmed lead, or `''` when nothing is harvestable
 */
export function extractExecutiveLead(
  runDir: string,
  sources: readonly string[] = DEFAULT_LEAD_SOURCES
): string {
  for (const rel of sources) {
    const abs = path.join(runDir, rel);
    if (!fs.existsSync(abs)) continue;
    const markdown = fs.readFileSync(abs, 'utf8');
    const paragraph = extractLeadParagraph(markdown);
    const lead = trimToLeadSentence(paragraph);
    // An empty lead means this source had nothing usable; try the next one.
    if (lead.length > 0) return lead;
  }
  return '';
}