Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | 427x 427x 378x 215x 135x 135x 135x 124x 124x 124x 57x 57x 57x 206x 206x 2x 2x 204x 202x 202x 42x 42x 57x 41x 2873x 2825x 2553x 108x 108x 2006x 108x 2445x 2873x 48x 48x 2825x 108x 108x 108x 2717x 225x 66x 66x 66x 806x 806x 806x 806x 806x 131x 131x 74x 48x 66x 48x 875x 875x 875x 2067x 2067x 2067x 2067x 2067x 94x 94x 57x 34x 875x 34x | // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
/**
* @module Aggregator/Metadata/LedeExtractor
* @description Markdown lede / prose-paragraph walkers extracted from
* `article-metadata.ts`. Provides {@link extractStrongProseLine} (first
* qualifying prose paragraph anywhere in the body) and
* {@link extractLedeAfterHeading} / {@link extractExtendedLedeAfterHeading}
* (prose paragraph that follows a `## 60-Second Read`-style heading
* from the editorial-lede whitelist).
*
* Pure module — depends only on text-utils (for description-shaping
* helpers) and heading-rules (for the editorial-lede whitelist and the
* heading-text normaliser).
*/
import {
DESCRIPTION_MAX_LENGTH,
EXTENDED_DESCRIPTION_MAX_LENGTH,
shouldSkipDescriptionLine,
stripInlineMarkdown,
stripLeadingBoldLabel,
stripLeadingProseLabel,
truncateDescription,
truncateExtendedDescription,
} from './text-utils.js';
import {
EDITORIAL_LEDE_HEADINGS,
isLedeHeadingMatch,
normaliseHeadingText,
} from './heading-rules.js';
/**
 * Internal paragraph-collector state shared by
 * {@link extractStrongProseLine}, {@link extractLedeAfterHeading} and
 * {@link extractExtendedLedeAfterHeading}. Each of them walks Markdown
 * line-by-line and lets {@link collectProseLine} append consecutive
 * prose lines here, stopping on a blank line, a skip-line, or once
 * `byteCount` reaches the caller's cap.
 */
interface ParagraphBuffer {
// Prose lines collected so far, in document order, after label and
// inline-Markdown stripping. Callers join them with a single space.
lines: string[];
// Running size of the joined paragraph. Despite the name this is not a
// byte count: each collected line adds its `String.length` plus one for
// the joining space.
byteCount: number;
}
/**
 * Feed one trimmed Markdown line to the paragraph buffer and tell the
 * calling walker what to do next.
 *
 * - `'continue'`: the line contributes nothing and no paragraph has
 *   started yet (blank line, skip-line, or an opening line shorter
 *   than 40 characters once stripped).
 * - `'break'`: a paragraph is already in progress and this line ends it
 *   (blank line or skip-line).
 * - `'collected'`: the stripped line was appended to `buf`; the caller
 *   is responsible for checking its own length cap afterwards.
 *
 * @param line - Trimmed Markdown line
 * @param buf - In-progress paragraph buffer (mutated only on `'collected'`)
 * @returns Loop control directive for the caller
 */
function collectProseLine(line: string, buf: ParagraphBuffer): 'continue' | 'break' | 'collected' {
  const paragraphStarted = buf.lines.length !== 0;
  // A non-contributing line ends a paragraph that has started and is
  // silently skipped otherwise.
  const onNonProse = paragraphStarted ? 'break' : 'continue';

  if (line === '') return onNonProse;
  if (shouldSkipDescriptionLine(line)) return onNonProse;

  // The leading `**Label:**` opener is removed *before* inline Markdown
  // is stripped, so localized labels such as `**Fråga:**` / `**主題:**` /
  // `**الموضوع:**` disappear structurally instead of leaking into the
  // description as plain text (`"Fråga: …"`). According to the original
  // note, the English `**Issue:**` line never gets this far because the
  // skip-line check above already rejects it.
  const withoutBoldLabel = stripLeadingBoldLabel(line);
  const withoutMarkup = stripInlineMarkdown(withoutBoldLabel);
  const text = stripLeadingProseLabel(withoutMarkup);

  // Only the line that opens a paragraph has to meet the minimum length;
  // continuation lines are accepted whatever their length.
  if (!paragraphStarted && text.length < 40) return 'continue';

  buf.lines.push(text);
  buf.byteCount = buf.byteCount + text.length + 1;
  return 'collected';
}
/**
 * Walk every line of the Markdown source and return the first paragraph
 * accepted by {@link collectProseLine}. Consecutive non-blank prose
 * lines are joined with a single space, so a hard-wrapped lede yields
 * one description-sized paragraph rather than only its first line.
 *
 * Lines inside fenced code blocks (opened and closed by a line starting
 * with three backticks or `~~~`) are ignored. Collection stops at the
 * end of the paragraph or once the buffered size reaches
 * `DESCRIPTION_MAX_LENGTH`; the joined text is then passed through
 * `truncateDescription`.
 *
 * @param markdown - Markdown source
 * @returns Prose description, or empty string when nothing qualifies
 */
export function extractStrongProseLine(markdown: string): string {
  let inFence = false;
  const buf: ParagraphBuffer = { lines: [], byteCount: 0 };
  for (const raw of markdown.split('\n')) {
    const line = raw.trim();
    // A fence marker toggles the fenced state and is never prose itself.
    if (line.startsWith('```') || line.startsWith('~~~')) {
      inFence = !inFence;
      continue;
    }
    if (inFence) continue;
    const directive = collectProseLine(line, buf);
    if (directive === 'continue') continue;
    // Restored `if`: the source carried a stray coverage marker (`Iif`).
    if (directive === 'break') break;
    // A line was collected: stop as soon as the paragraph is long enough.
    if (buf.byteCount >= DESCRIPTION_MAX_LENGTH) break;
  }
  if (buf.lines.length === 0) return '';
  return truncateDescription(buf.lines.join(' '));
}
/**
 * Classification emitted by {@link classifyLedeLine} for one line of
 * the document and consumed by {@link applyLedeDirective}:
 *
 * - `fence`: the line starts with a code-fence marker; the walker
 *   toggles its in-fence flag.
 * - `heading`: the line is a `##` / `###` heading; `inLede` tells
 *   whether its text matched the editorial-lede whitelist, i.e. whether
 *   the lines that follow belong to a lede section.
 * - `collect`: the line lies inside a lede section and should be
 *   offered to {@link collectProseLine}.
 * - `pause`: the line is ignored (for example inside a fenced block or
 *   outside any lede section).
 */
type LedeDirective =
| { kind: 'fence' }
| { kind: 'heading'; inLede: boolean }
| { kind: 'collect' }
| { kind: 'pause' };
/**
 * Classify one Markdown line for the lede walkers. The returned
 * directive is then applied to walker state by
 * {@link applyLedeDirective}.
 *
 * Checks run in this order: fence markers, fenced content, `##` / `###`
 * headings, then ordinary lines. A heading that arrives while prose is
 * already buffered is still reported as a `heading`, so that
 * {@link applyLedeDirective} ends the walk there. Previously it was
 * reported as `pause`, which skipped the heading and let text from the
 * following section be appended to the lede paragraph.
 *
 * @param line - Trimmed Markdown line
 * @param isInFence - True when the walker is currently inside a fenced block
 * @param inLede - True when the walker is currently inside a lede heading block
 * @param hasBuffered - True when at least one prose line has been collected
 * @returns Directive describing how the walker should treat this line
 */
function classifyLedeLine(
  line: string,
  isInFence: boolean,
  inLede: boolean,
  hasBuffered: boolean
): LedeDirective {
  if (line.startsWith('```') || line.startsWith('~~~')) return { kind: 'fence' };
  if (isInFence) return { kind: 'pause' };

  const headingPrefix = /^#{2,3}\s+/.exec(line);
  if (headingPrefix !== null) {
    // The paragraph being collected ends at this heading. Whether the
    // heading opens another lede section is irrelevant because the walk
    // stops, so the whitelist lookup is skipped.
    if (hasBuffered) return { kind: 'heading', inLede: false };
    const headingText = normaliseHeadingText(line.slice(headingPrefix[0].length));
    const match = EDITORIAL_LEDE_HEADINGS.some((h) => isLedeHeadingMatch(headingText, h));
    return { kind: 'heading', inLede: match };
  }

  return inLede ? { kind: 'collect' } : { kind: 'pause' };
}
/**
 * Apply one directive emitted by {@link classifyLedeLine} to the walk
 * state and translate it into loop control for the caller:
 *
 * - `'continue'`: nothing to collect on this line (fence marker,
 *   heading with an empty buffer, or paused line).
 * - `'break'`: a heading was reached after prose had been collected, so
 *   the paragraph is complete.
 * - `'collect'`: the caller should now run {@link collectProseLine}.
 *
 * @param directive - Classification of the current line
 * @param state - Walk state (mutated in place)
 * @param state.inFence - Toggled whenever a fence directive is applied
 * @param state.inLede - Set from a heading directive when no prose is buffered
 * @param hasBuffered - Whether any prose has already been collected
 * @returns Loop control directive
 */
function applyLedeDirective(
  directive: LedeDirective,
  state: { inFence: boolean; inLede: boolean },
  hasBuffered: boolean
): 'break' | 'continue' | 'collect' {
  if (directive.kind === 'fence') {
    state.inFence = !state.inFence;
    return 'continue';
  }
  if (directive.kind === 'heading') {
    // Restored `if`: the source carried a stray coverage marker (`Iif`).
    // `state.inLede` is left untouched when stopping, since the walk ends here.
    if (hasBuffered) return 'break';
    state.inLede = directive.inLede;
    return 'continue';
  }
  if (directive.kind === 'pause') return 'continue';
  return 'collect';
}
/**
 * Shared walker behind {@link extractLedeAfterHeading} and
 * {@link extractExtendedLedeAfterHeading}: collect the first prose
 * paragraph that follows a whitelisted lede heading, stopping once the
 * buffered size reaches `maxLength`.
 *
 * @param markdown - Markdown source to walk
 * @param maxLength - Buffered size at which collection stops
 * @returns Collected lines joined with single spaces, or `''` when
 *   nothing was collected
 */
function collectLedeParagraph(markdown: string, maxLength: number): string {
  const state = { inFence: false, inLede: false };
  const buf: ParagraphBuffer = { lines: [], byteCount: 0 };
  for (const raw of markdown.split('\n')) {
    const line = raw.trim();
    const hasBuffered = buf.lines.length > 0;
    const directive = classifyLedeLine(line, state.inFence, state.inLede, hasBuffered);
    const action = applyLedeDirective(directive, state, hasBuffered);
    // Restored `if`: the source carried a stray coverage marker (`Iif`).
    if (action === 'break') break;
    if (action === 'continue') continue;
    const collect = collectProseLine(line, buf);
    if (collect === 'continue') continue;
    if (collect === 'break') break;
    // A line was collected: stop as soon as the paragraph is long enough.
    if (buf.byteCount >= maxLength) break;
  }
  return buf.lines.join(' ');
}

/**
 * Walk the body of an editorial artefact and, when it contains a `##` /
 * `###` heading whose text matches one of `EDITORIAL_LEDE_HEADINGS`,
 * return the first prose paragraph that follows that heading.
 * Consecutive non-blank lines are joined with a single space.
 *
 * Returns the empty string when no lede heading is found or no
 * qualifying prose follows it. Otherwise the paragraph is passed
 * through `truncateDescription` so it fits `<meta description>`.
 *
 * @param markdown - Editorial artefact source
 * @returns Lede paragraph, or empty string when none matched
 */
export function extractLedeAfterHeading(markdown: string): string {
  const paragraph = collectLedeParagraph(markdown, DESCRIPTION_MAX_LENGTH);
  // The truncation helper is only consulted when something was collected.
  if (paragraph === '') return '';
  return truncateDescription(paragraph);
}

/**
 * Same parsing rules as {@link extractLedeAfterHeading} but with the
 * larger `EXTENDED_DESCRIPTION_MAX_LENGTH` budget, so a longer stretch
 * of the lede paragraph is captured for use as `og:description` /
 * `twitter:description`. The joined paragraph is clamped via
 * `truncateExtendedDescription`, which (per the original documentation;
 * not visible from this module) returns `''` when the result would not
 * be longer than the regular meta description.
 *
 * @param markdown - Brief body (SPDX preamble already stripped)
 * @returns Extended lede paragraph, or `''` when not worth emitting
 */
export function extractExtendedLedeAfterHeading(markdown: string): string {
  const paragraph = collectLedeParagraph(markdown, EXTENDED_DESCRIPTION_MAX_LENGTH);
  // The truncation helper is only consulted when something was collected.
  if (paragraph === '') return '';
  return truncateExtendedDescription(paragraph);
}
|