aggregator article-metadata.ts

91.24% Statements 198/217
87.14% Branches 122/140
100% Functions 25/25
93.29% Lines 167/179
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3x
 
 
3x
 
 
3x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138x
 
 
95x
56x
52x
50x
47x
45x
 
 
45x
43x
 
 
567x
 
 
 
37x
37x
720x
720x
720x
 
 
 
 
 
7x
 
 
 
 
30x
 
29x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362x
2x
2x
2x
362x
 
 
 
 
 
 
 
 
 
 
359x
2x
2x
2x
359x
 
 
 
 
 
 
 
 
 
 
 
29x
35x
35x
 
31x
 
 
 
28x
28x
28x
 
1x
 
 
 
 
 
 
 
 
 
 
 
33x
119x
119x
28x
28x
24x
 
9x
 
 
 
 
 
 
 
 
 
 
 
77x
 
116x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61x
61x
 
60x
60x
 
 
 
 
 
 
 
 
 
60x
60x
237x
227x
222x
 
 
 
 
44x
44x
 
 
 
44x
 
 
 
 
 
 
 
 
 
 
44x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14x
 
 
13x
27x
27x
13x
13x
13x
13x
12x
12x
 
 
 
 
1x
1x
 
 
 
 
 
 
 
 
 
 
1x
 
 
 
 
 
 
 
 
 
 
 
13x
13x
 
 
 
13x
 
13x
13x
16x
16x
1x
1x
 
15x
2x
2x
 
13x
 
13x
 
 
 
 
 
 
 
 
 
 
1x
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35x
 
 
 
35x
35x
35x
 
35x
490x
 
 
 
 
 
 
490x
 
 
 
 
 
 
35x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490x
 
 
336x
 
28x
 
28x
 
28x
 
14x
 
14x
 
14x
 
14x
 
14x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39x
39x
 
38x
 
38x
38x
38x
38x
 
 
 
 
 
 
 
 
 
 
 
 
37x
37x
37x
 
 
 
 
 
 
 
 
 
 
 
 
 
37x
37x
 
 
 
 
 
 
 
 
 
 
76x
75x
75x
 
 
 
 
 
 
 
 
 
76x
76x
76x
76x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672x
644x
 
 
 
 
 
42x
42x
84x
84x
 
42x
42x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24x
 
 
24x
11x
11x
11x
 
 
 
 
 
 
 
13x
13x
13x
9x
 
 
 
 
 
 
4x
1x
 
 
3x
 
 
 
 
 
 
 
 
 
 
24x
24x
24x
 
24x
 
 
 
 
24x
336x
336x
336x
 
 
 
 
 
336x
336x
 
 
336x
 
 
 
336x
336x
 
336x
 
 
 
 
 
 
 
 
 
 
24x
 
 
 
 
 
 
 
 
 
 
672x
1320x
 
 
 
  // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Aggregator/ArticleMetadata
 * @description Resolve per-language `{title, description}` for an article
 * rendered by the aggregator pipeline. The resolver follows a strict
 * priority ladder that prefers *real editorial highlights* over boring,
 * repeated templates — satisfying the core SEO requirement that every
 * published article carry a unique, content-reflective headline and
 * description in every language variant.
 *
 * Priority ladder (per language, highest wins):
 *
 * 1. **Manifest override** — `manifest.title` / `manifest.description` on
 *    the analysis-run manifest, either as a plain string (applied to every
 *    language) or a `LanguageMap<string>` object for explicit per-language
 *    values. Authored by Stage-B agents when they have an editorial
 *    headline for the day.
 * 2. **Artefact editorial H1** — first `# …` heading from the first
 *    substantive artefact under the run directory (e.g.
 *    `intelligence/synthesis-summary.md`, `breaking-news-analysis.md`).
 *    Accepted only when the heading is not a generic
 *    `${humanize(articleType)} — ${date}` form.
 * 3. **Aggregated-markdown H1** — the first `# …` heading in the aggregator
 *    output, accepted under the same non-generic rule. In practice this
 *    tier rarely fires because the aggregator itself writes the generic
 *    default, but it covers hand-edited or legacy aggregates.
 * 4. **First strong prose paragraph** — the first line of the aggregated
 *    Markdown that survives {@link shouldSkipDescriptionLine}. Used for
 *    `description`; also used for `title` as a last editorial-content
 *    resort when every heading-level source is generic.
 * 5. **Localized template** — the per-article-type `*_TITLES` generator
 *    from `src/constants/language-articles.ts`. Always parameterised by
 *    date (or derived values), so the title changes from run to run even
 *    when this last tier fires — but still the "boring repeated" option.
 *
 * English highlights (tiers 2–4) are reserved for the `en` language
 * variant; non-English variants skip them and drop to the localized
 * template (tier 5) unless an explicit `manifest.title.<lang>` /
 * `manifest.description.<lang>` override is present. This guarantees
 * every variant's `<title>` and `<meta description>` are in the correct
 * locale even while the article body itself is still rendered from an
 * English source (until per-language body translations ship).
 */
 
import fs from 'fs';
import path from 'path';
import { ALL_LANGUAGES, getLocalizedString } from '../constants/language-core.js';
import {
  BREAKING_NEWS_TITLES,
  COMMITTEE_REPORTS_TITLES,
  MONTH_AHEAD_TITLES,
  MONTHLY_REVIEW_TITLES,
  MOTIONS_TITLES,
  PROPOSITIONS_TITLES,
  WEEK_AHEAD_TITLES,
  WEEKLY_REVIEW_TITLES,
} from '../constants/language-articles.js';
import type { LangTitleSubtitle, LanguageCode, LanguageMap } from '../types/index.js';
 
/**
 * One resolved `(title, description)` pair for a single language.
 *
 * Entries produced by {@link resolveArticleMetadata} have already been
 * passed through {@link truncateTitle} and {@link truncateDescription}.
 */
export interface ResolvedMetadataEntry {
  /** Headline text for the language variant. */
  readonly title: string;
  /** Description text for the language variant. */
  readonly description: string;
}
 
/**
 * Fully resolved metadata — one {@link ResolvedMetadataEntry} per supported
 * language, keyed by language code.
 */
export type ResolvedMetadata = LanguageMap<ResolvedMetadataEntry>;
 
/**
 * Raw manifest subset consumed by the resolver. Deliberately narrower
 * than the full {@link AnalysisManifest} shape so the resolver stays
 * usable for backport (which only has the manifest in text form) and for
 * callers that don't need the full typed structure.
 *
 * Every field is optional; the resolver treats an absent manifest as `{}`.
 */
export interface MetadataManifest {
  /**
   * Article type slug as recorded in the manifest.
   * NOTE(review): not read by the resolver code in this module, which takes
   * the type from {@link ResolveMetadataOptions.articleType} — confirm
   * whether other callers rely on it.
   */
  readonly articleType?: string;
  /**
   * Run date as recorded in the manifest.
   * NOTE(review): presumably `YYYY-MM-DD`; not read by the resolver code in
   * this module — confirm against the manifest writer.
   */
  readonly date?: string;
  /**
   * Run identifier as recorded in the manifest.
   * NOTE(review): not read by the resolver code in this module.
   */
  readonly runId?: string;
  /**
   * Optional editorial-title override. `string` is applied to every
   * language; an object allows explicit per-language overrides.
   */
  readonly title?: string | Partial<Record<LanguageCode, string>>;
  /**
   * Optional editorial-description override. Same shape rules as
   * {@link title}.
   */
  readonly description?: string | Partial<Record<LanguageCode, string>>;
  /**
   * Optional committee code (e.g. `ENVI`) used by
   * {@link COMMITTEE_REPORTS_TITLES} when the template fallback fires.
   */
  readonly committee?: string;
}
 
/**
 * Inputs to {@link resolveArticleMetadata}.
 *
 * Only `articleType`, `date` and `markdown` are required; `manifest` and
 * `runDir` enable the manifest-override and artefact-H1 tiers respectively.
 */
export interface ResolveMetadataOptions {
  /** Article type slug (e.g. `breaking`, `motions`, `week-ahead`). */
  readonly articleType: string;
  /** ISO date of the run (`YYYY-MM-DD`). */
  readonly date: string;
  /** Aggregated Markdown document body (after provenance/header). */
  readonly markdown: string;
  /** Parsed analysis manifest (may be empty for legacy/backport callers). */
  readonly manifest?: MetadataManifest;
  /**
   * Absolute path to the analysis run directory so the resolver can
   * peek at individual artefact files. Omit for callers that only have
   * the aggregated Markdown (the artefact-H1 tier is then skipped).
   */
  readonly runDir?: string;
}
 
/**
 * Maximum `<meta description>` length we will emit, in UTF-16 code units.
 * {@link truncateDescription} clips longer text below this limit and appends
 * a single `…`.
 */
const DESCRIPTION_MAX_LENGTH = 300;
 
/**
 * Maximum `<title>` length, in UTF-16 code units — anything longer is
 * truncated with an ellipsis by {@link truncateTitle}.
 */
const TITLE_MAX_LENGTH = 140;
 
/**
 * Ordered list of artefact filenames that typically carry the editorial H1.
 *
 * Paths are relative to the run directory. {@link extractArtifactHighlight}
 * probes them in this order and stops at the first existing file whose H1
 * is non-generic, so earlier entries take precedence over later ones.
 */
const EDITORIAL_ARTEFACT_CANDIDATES: readonly string[] = [
  'intelligence/synthesis-summary.md',
  'intelligence/executive-summary.md',
  'intelligence/intelligence-briefing.md',
  'executive-summary.md',
  'intelligence-briefing.md',
  'synthesis-summary.md',
  'breaking-news-analysis.md',
  'committee-activity-report.md',
  'legislative-pipeline-analysis.md',
  'weekly-outlook.md',
  'monthly-outlook.md',
  'week-in-review.md',
  'month-in-review.md',
  'motions-analysis.md',
  'propositions-analysis.md',
];
 
/**
 * Emoji-banner prefixes that Stage-B agents use to decorate metadata rows
 * (e.g. `📋 Analysis Owner:`). Any line starting with one of these is
 * metadata, never prose.
 *
 * Typed `readonly` like the other lookup tables in this module so the list
 * cannot be mutated by accident at runtime call sites.
 */
const EMOJI_BANNER_CHARS: readonly string[] = [
  '📋',
  '📅',
  '🔍',
  '🏛',
  '📰',
  '📊',
  '🏷',
  '📈',
  '📉',
  '⚠',
  '🔔',
  '🎯',
  '🗳',
  '🏢',
  '📄',
];
 
/**
 * Label prefixes that a prose description must never start with. Every
 * entry matches case-insensitively at the start of a trimmed line, followed
 * by optional space and a colon.
 *
 * Consumed by {@link shouldSkipDescriptionLine}, which lower-cases both the
 * line and each entry before comparing, so the casing used here is purely
 * for readability.
 */
const METADATA_LINE_PREFIXES: readonly string[] = [
  'Analysis Date',
  'Analysis Owner',
  'Article Type',
  'Assessment Date',
  'Classification',
  'Classification Date',
  'Confidence',
  'Data Sources',
  'Document Type',
  'Generated',
  'Last Updated',
  'Parliamentary Status',
  'Parliamentary Term',
  'Period',
  'Run',
  'Run ID',
  'Series',
  'Series Run',
  'SPDX-FileCopyrightText',
  'SPDX-License-Identifier',
  'Type',
  'Window',
];
 
/**
 * Return `true` when a line cannot serve as a prose description. Rejects
 * Markdown structural lines (headings, blockquotes, tables, HTML, rules,
 * code fences), mermaid/chart directives, emoji-banner metadata rows, and
 * the known `Key: value` banners listed in {@link METADATA_LINE_PREFIXES}.
 *
 * @param line - Trimmed line from the aggregated Markdown source
 * @returns `true` when the line is not prose and should be skipped
 */
export function shouldSkipDescriptionLine(line: string): boolean {
  if (line.length === 0) return true;

  // Markdown structural openers
  if (line.startsWith('#')) return true;
  if (line.startsWith('>')) return true;
  if (line.startsWith('<')) return true;
  if (line.startsWith('|')) return true;
  if (line.startsWith('---') || line.startsWith('===')) return true;
  if (line.startsWith('```') || line.startsWith('~~~')) return true;

  // Mermaid / chart init blocks and the `title <text>` directive inside them
  if (line.startsWith('%%')) return true;
  if (/^title\s/i.test(line)) return true;

  // Emoji-banner metadata rows
  if (EMOJI_BANNER_CHARS.some((char) => line.startsWith(char))) return true;

  // `Key: value` metadata banners. Strip any opening emphasis markers, then
  // accept the label followed by a colon, optionally preceded by a space or
  // by the closing emphasis marker (`**Key**:`, `*Key*:`, `__Key__:`,
  // `_Key_:`). Lower-casing happens once, outside the loop.
  const label = line.replace(/^\*+/, '').replace(/^_+/, '').trim().toLowerCase();
  const closers = [':', ' :', '**:', '*:', '__:', '_:'];
  for (const prefix of METADATA_LINE_PREFIXES) {
    const prefixLower = prefix.toLowerCase();
    if (closers.some((closer) => label.startsWith(`${prefixLower}${closer}`))) {
      return true;
    }
  }

  // Pure punctuation / decorative separators
  if (/^[-*_=~.]{3,}$/.test(line)) return true;

  return false;
}
 
/**
 * Strip inline Markdown decorations so the remaining text can be used as
 * plain-text meta-tag content. Removes image and link syntax, inline code
 * backticks, bold/italic emphasis and strike-through, then collapses runs
 * of whitespace to a single space and trims the result.
 *
 * Underscore emphasis is only recognised when the underscores are not
 * flanked by a letter or digit, so identifiers such as `snake_case_name`
 * or `EP_2024_vote` keep their underscores instead of being mangled.
 *
 * @param raw - Trimmed Markdown line
 * @returns Plain-text variant
 */
export function stripInlineMarkdown(raw: string): string {
  // All inner character classes are length-bounded to eliminate the
  // polynomial-regex worst case that CodeQL flags on uncontrolled input —
  // none of these decorations are legitimately longer than 500 chars.
  return raw
    .replace(/!\[([^\]\n]{0,500})\]\(([^)\n]{0,500})\)/g, '$1') // ![alt](img) — must precede [text](url)
    .replace(/\[([^\]\n]{1,500})\]\(([^)\n]{0,500})\)/g, '$1') // [text](url) → text
    .replace(/`([^`\n]{1,500})`/g, '$1') // inline code
    .replace(/\*\*([^*\n]{1,500})\*\*/g, '$1') // **bold**
    .replace(/(?<![\p{L}\p{N}])__([^_\n]{1,500})__(?![\p{L}\p{N}])/gu, '$1') // __bold__ (not intraword)
    .replace(/\*([^*\n]{1,500})\*/g, '$1') // *italic*
    .replace(/(?<![\p{L}\p{N}])_([^_\n]{1,500})_(?![\p{L}\p{N}])/gu, '$1') // _italic_ (not intraword)
    .replace(/~~([^~\n]{1,500})~~/g, '$1') // ~~strike~~
    .replace(/\s+/g, ' ')
    .trim();
}
 
/**
 * Clamp a string to {@link DESCRIPTION_MAX_LENGTH} characters, appending
 * an ellipsis when truncation actually happens. Does not break words if
 * avoidable — a trailing partial word is trimmed back to the previous
 * space when that space lies within the last ~60 characters of the limit.
 *
 * The cut never leaves half of a surrogate pair behind, and trailing
 * whitespace/punctuation is removed so the ellipsis attaches directly to
 * the last word.
 *
 * @param text - Raw description text
 * @returns Truncated description with trailing ellipsis when clipped
 */
export function truncateDescription(text: string): string {
  if (text.length <= DESCRIPTION_MAX_LENGTH) return text;
  let cut = text.slice(0, DESCRIPTION_MAX_LENGTH - 3);
  // `slice` works on UTF-16 code units: drop a dangling high surrogate so an
  // emoji or other astral character is never split into invalid text.
  const lastUnit = cut.charCodeAt(cut.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut = cut.slice(0, -1);
  const lastSpace = cut.lastIndexOf(' ');
  const safe = lastSpace > DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
  return `${safe.replace(/[\s.,;:—-]+$/, '')}…`;
}
 
/**
 * Clamp a title to {@link TITLE_MAX_LENGTH} characters, appending an
 * ellipsis when truncation actually happens. A trailing partial word is
 * trimmed back to the previous space when that space lies within the last
 * ~40 characters of the limit.
 *
 * The cut never leaves half of a surrogate pair behind, and trailing
 * whitespace/punctuation is removed so the ellipsis attaches directly to
 * the last word.
 *
 * @param text - Raw title text
 * @returns Truncated title with trailing ellipsis when clipped
 */
export function truncateTitle(text: string): string {
  if (text.length <= TITLE_MAX_LENGTH) return text;
  let cut = text.slice(0, TITLE_MAX_LENGTH - 3);
  // `slice` works on UTF-16 code units: drop a dangling high surrogate so an
  // emoji or other astral character is never split into invalid text.
  const lastUnit = cut.charCodeAt(cut.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut = cut.slice(0, -1);
  const lastSpace = cut.lastIndexOf(' ');
  const safe = lastSpace > TITLE_MAX_LENGTH - 40 ? cut.slice(0, lastSpace) : cut;
  return `${safe.replace(/[\s.,;:—-]+$/, '')}…`;
}
 
/**
 * Return the first Markdown H1 (`# …`) in the supplied text, stripped of
 * the leading `#`, any closing `#` run, and inline Markdown decorations.
 * Lines inside fenced code blocks (``` or ~~~) are ignored so a shell or
 * Python comment such as `# install deps` is never mistaken for the
 * document title. Returns an empty string when no H1 is present.
 *
 * @param markdown - Markdown source
 * @returns Plain-text H1, or empty string when none found
 */
export function extractFirstH1(markdown: string): string {
  // Marker of the fence we are currently inside, or '' when outside.
  let openFence = '';
  for (const raw of markdown.split('\n')) {
    const line = raw.trim();

    const fence = line.startsWith('```') ? '```' : line.startsWith('~~~') ? '~~~' : '';
    if (fence !== '') {
      // Only the same marker type closes a fence; the other type is content.
      if (openFence === '') openFence = fence;
      else if (openFence === fence) openFence = '';
      continue;
    }
    if (openFence !== '') continue;

    // Accept `# Title` but not `## Sub-heading`.
    if (!/^#\s+/.test(line)) continue;
    // Strip the leading `# ` marker, then trim trailing `#` characters
    // without an unbounded `\s*#+\s*$` regex (CodeQL flags that form as
    // polynomial on pathological repeated-`#` input).
    let text = line.replace(/^#\s+/, '').trimEnd();
    while (text.endsWith('#')) text = text.slice(0, -1).trimEnd();
    return stripInlineMarkdown(text);
  }
  return '';
}
 
/**
 * Walk every line of the Markdown source and return the first line that
 * survives {@link shouldSkipDescriptionLine} and is at least 40 characters
 * long once inline Markdown is stripped. The result is truncated to fit
 * `<meta description>`.
 *
 * Everything between a pair of code fences (``` or ~~~) is skipped, so long
 * mermaid node definitions or code lines are never picked up as prose.
 *
 * @param markdown - Markdown source
 * @returns Prose description, or empty string when nothing qualifies
 */
export function extractStrongProseLine(markdown: string): string {
  // Marker of the fence we are currently inside, or '' when outside.
  let openFence = '';
  for (const raw of markdown.split('\n')) {
    const line = raw.trim();

    const fence = line.startsWith('```') ? '```' : line.startsWith('~~~') ? '~~~' : '';
    if (fence !== '') {
      // Only the same marker type closes a fence; the other type is content.
      if (openFence === '') openFence = fence;
      else if (openFence === fence) openFence = '';
      continue;
    }
    if (openFence !== '') continue;

    if (shouldSkipDescriptionLine(line)) continue;
    const plain = stripInlineMarkdown(line);
    // Short fragments (labels, list stubs) make poor descriptions.
    if (plain.length < 40) continue;
    return truncateDescription(plain);
  }
  return '';
}
 
/**
 * Turn an `article-type` slug into its display form: split on `-` / `_`,
 * upper-case the first character of every segment, and join the segments
 * with single spaces. Mirrors the aggregator's private `humanize` helper
 * (`src/aggregator/analysis-aggregator.ts`) by value rather than by import.
 *
 * @param slug - Slug like `week-ahead` or `breaking_news`
 * @returns Title-cased humanised form (`Week Ahead`, `Breaking News`)
 */
export function humanizeSlug(slug: string): string {
  const words: string[] = [];
  for (const segment of slug.split(/[-_]/g)) {
    // Empty segments (from leading/doubled separators) pass through as ''.
    words.push(segment.substring(0, 1).toUpperCase() + segment.substring(1));
  }
  return words.join(' ').trim();
}
 
/**
 * Return `true` when the supplied heading matches the generic
 * `${humanize(articleType)} — ${date}` form that the aggregator writes as
 * its default document title. Accepts em-dash, en-dash, ASCII hyphen,
 * colon and bare-space separators, each for three subjects: the plain
 * humanised type, the legacy `EU Parliament <Type>` form, and the doubled
 * `<Type> <Type>` form that some same-day collision runs produce. An empty
 * heading is also treated as generic.
 *
 * @param heading - Plain-text heading (post-{@link stripInlineMarkdown})
 * @param articleType - Article type slug
 * @param date - ISO date string
 * @returns `true` when the heading carries no editorial information
 */
export function isGenericHeading(heading: string, articleType: string, date: string): boolean {
  const normalized = heading.trim().replace(/\s+/g, ' ');
  if (normalized === '') return true;

  const human = humanizeSlug(articleType);
  const separators = [' — ', ' - ', ' – ', ': ', ' '];
  const subjects = [human, `EU Parliament ${human}`, `${human} ${human}`];
  for (const subject of subjects) {
    for (const separator of separators) {
      if (normalized === `${subject}${separator}${date}`) return true;
    }
  }

  // `${human} — <digits and hyphens>` is generic even when the date-like
  // tail differs from `date` (e.g. a heading written for another run day).
  // eslint-disable-next-line security/detect-non-literal-regexp -- `human` derives from a sanitised slug via escapeRegex
  const trailingDateOnly = new RegExp(`^${escapeRegex(human)}\\s*[—–-]\\s*[\\d-]+$`, 'u');
  if (trailingDateOnly.test(normalized)) {
    return true;
  }

  return false;
}
 
/**
 * Backslash-escape every regex metacharacter in a string so it can be
 * embedded literally in a pattern that is assembled at runtime.
 *
 * @param input - Raw string
 * @returns Regex-safe form of {@link input}
 */
function escapeRegex(input: string): string {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return input.replace(metacharacters, (ch) => `\\${ch}`);
}
 
/**
 * Find an editorial headline (and first prose line) among the artefacts of
 * a run directory.
 *
 * Lookup order:
 * 1. Each path in {@link EDITORIAL_ARTEFACT_CANDIDATES}, in list order.
 * 2. Every other top-level `.md` file in the run directory, in sorted
 *    filename order so the winner does not depend on the order in which
 *    the filesystem happens to list entries.
 *
 * The first file whose H1 exists and is not generic (see
 * {@link isGenericHeading}) wins. Returns `null` when the directory is
 * missing or no file qualifies.
 *
 * @param runDir - Absolute run directory path
 * @param articleType - Article type slug (used by {@link isGenericHeading})
 * @param date - ISO run date (used by {@link isGenericHeading})
 * @returns `{headline, summary}` where `summary` may be empty, or `null`
 */
export function extractArtifactHighlight(
  runDir: string,
  articleType: string,
  date: string
): { readonly headline: string; readonly summary: string } | null {
  if (!runDir || !fs.existsSync(runDir)) return null;

  // Shared per-file probe: read, take the H1, reject generic headings.
  const probe = (abs: string): { readonly headline: string; readonly summary: string } | null => {
    const body = readArtefactBody(abs);
    const headline = extractFirstH1(body);
    if (!headline) return null;
    if (isGenericHeading(headline, articleType, date)) return null;
    return { headline: truncateTitle(headline), summary: extractStrongProseLine(body) };
  };

  // Direct candidate lookup — cheap and deterministic.
  for (const rel of EDITORIAL_ARTEFACT_CANDIDATES) {
    const abs = path.join(runDir, rel);
    if (!fs.existsSync(abs)) continue;
    const hit = probe(abs);
    if (hit) return hit;
  }

  // Fallback: walk the remaining top-level `.md` files once. Candidates that
  // were already probed above are skipped — they would be rejected again.
  const alreadyProbed = new Set(EDITORIAL_ARTEFACT_CANDIDATES);
  const topLevel = safeReaddir(runDir)
    .filter((name) => name.endsWith('.md') && !alreadyProbed.has(name))
    .sort();
  for (const rel of topLevel) {
    const hit = probe(path.join(runDir, rel));
    if (hit) return hit;
  }

  return null;
}
 
/**
 * Load an artefact file and drop its preamble: the leading run of blank
 * lines and single-line HTML comments (`<!-- … -->`, as used for the
 * SPDX/REUSE header). Everything from the first other line onwards is
 * returned unchanged. An unreadable file yields an empty string.
 *
 * @param abs - Absolute file path
 * @returns File contents with the leading comment/blank lines dropped
 */
function readArtefactBody(abs: string): string {
  let contents: string;
  try {
    contents = fs.readFileSync(abs, 'utf8');
  } catch {
    return '';
  }

  const rows = contents.split('\n');
  const isPreamble = (row: string): boolean => {
    const trimmed = row.trim();
    return trimmed === '' || (trimmed.startsWith('<!--') && trimmed.endsWith('-->'));
  };

  // Index of the first row that is real content; -1 when the file is all preamble.
  const firstContent = rows.findIndex((row) => !isPreamble(row));
  return firstContent === -1 ? '' : rows.slice(firstContent).join('\n');
}
 
/**
 * List a directory without ever throwing: any `fs.readdirSync` failure
 * (missing path, not a directory, no permission) is reported as an empty
 * listing.
 *
 * @param dir - Absolute directory path
 * @returns Entries in {@link dir} or `[]` when unreadable
 */
function safeReaddir(dir: string): string[] {
  let entries: string[] = [];
  try {
    entries = fs.readdirSync(dir);
  } catch {
    // Best-effort lookup: an unreadable directory simply has no entries.
  }
  return entries;
}
 
/**
 * Produce the last-resort `{title, subtitle}` for every supported language
 * by running the article-type–specific `*_TITLES` generator from
 * `src/constants/language-articles.ts`. The generators are fed the run
 * date, its Monday–Sunday week range, its month label, and the committee
 * label (`'Main Committees'` when none is supplied or it is blank).
 *
 * The returned map has a `null` prototype and one enumerable, writable,
 * configurable property per language.
 *
 * @param articleType - Article type slug
 * @param date - ISO run date
 * @param committee - Optional committee code (used by `committee-reports`)
 * @returns Per-language `LangTitleSubtitle`
 */
export function buildTemplateFallback(
  articleType: string,
  date: string,
  committee?: string
): LanguageMap<LangTitleSubtitle> {
  const { start: weekStart, end: weekEnd } = deriveWeekRange(date);
  const month = deriveMonthLabel(date);
  const hasCommittee = committee !== undefined && committee.trim().length > 0;
  const inputs: TemplateInputs = {
    date,
    weekStart,
    weekEnd,
    month,
    committee: hasCommittee ? committee : 'Main Committees',
  };

  const pairs: [LanguageCode, LangTitleSubtitle][] = [];
  for (const lang of ALL_LANGUAGES) {
    pairs.push([lang, templateForType(lang, articleType, inputs)]);
  }

  // Copy onto a prototype-less object so language codes are the only keys.
  const byLanguage = Object.create(null) as Record<LanguageCode, LangTitleSubtitle>;
  Object.assign(byLanguage, Object.fromEntries(pairs));
  return byLanguage;
}
 
/**
 * Inputs for {@link templateForType}. All values are pre-derived once per
 * run by {@link buildTemplateFallback} and reused for every language.
 */
interface TemplateInputs {
  /** Run date; passed to the breaking-news and motions generators and used by the unknown-type default. */
  readonly date: string;
  /** First day of the run's week (from {@link deriveWeekRange}); used by the week-ahead / week-in-review generators. */
  readonly weekStart: string;
  /** Last day of the run's week (from {@link deriveWeekRange}); used by the week-ahead / week-in-review generators. */
  readonly weekEnd: string;
  /** Month label (from {@link deriveMonthLabel}); used by the month-ahead / month-in-review generators. */
  readonly month: string;
  /** Committee label; used by the committee-reports generator. */
  readonly committee: string;
}
 
/**
 * Look up the localized template generator for an article-type slug and
 * invoke it for one language. Slugs without a dedicated generator receive a
 * uniform default built from {@link humanizeSlug} and the run date.
 *
 * @param lang - Target language code
 * @param articleType - Article type slug
 * @param inputs - Pre-derived inputs used by the generators
 * @returns `LangTitleSubtitle` for the requested language
 */
function templateForType(
  lang: LanguageCode,
  articleType: string,
  inputs: TemplateInputs
): LangTitleSubtitle {
  const { date, weekStart, weekEnd, month, committee } = inputs;

  // `breaking-breaking` (same-day collision slug) shares the breaking template.
  const breaking = (): LangTitleSubtitle => getLocalizedString(BREAKING_NEWS_TITLES, lang)(date);

  // A Map (not an object literal) so slugs like `constructor` cannot hit
  // inherited properties. Only the selected thunk is ever invoked.
  const generators = new Map<string, () => LangTitleSubtitle>([
    ['breaking', breaking],
    ['breaking-breaking', breaking],
    ['committee-reports', () => getLocalizedString(COMMITTEE_REPORTS_TITLES, lang)(committee)],
    ['motions', () => getLocalizedString(MOTIONS_TITLES, lang)(date)],
    ['propositions', () => getLocalizedString(PROPOSITIONS_TITLES, lang)()],
    ['week-ahead', () => getLocalizedString(WEEK_AHEAD_TITLES, lang)(weekStart, weekEnd)],
    ['month-ahead', () => getLocalizedString(MONTH_AHEAD_TITLES, lang)(month)],
    ['week-in-review', () => getLocalizedString(WEEKLY_REVIEW_TITLES, lang)(weekStart, weekEnd)],
    ['month-in-review', () => getLocalizedString(MONTHLY_REVIEW_TITLES, lang)(month)],
  ]);

  const generate = generators.get(articleType);
  if (generate !== undefined) return generate();

  return {
    title: `${humanizeSlug(articleType)} — ${date}`,
    subtitle: `EU Parliament analysis — ${date}`,
  };
}
 
/**
 * Compute the Monday–Sunday week that contains an ISO date, working purely
 * in UTC. When the input cannot be parsed, both ends of the range are the
 * input string itself.
 *
 * @param date - ISO date string (`YYYY-MM-DD`)
 * @returns `{ start, end }` both in `YYYY-MM-DD` form
 */
export function deriveWeekRange(date: string): { readonly start: string; readonly end: string } {
  const anchor = parseIsoDate(date);
  if (anchor === null) return { start: date, end: date };

  const dayMs = 86_400_000;
  // getUTCDay() is 0 for Sunday; rotate so Monday counts as 0 and Sunday as 6.
  const daysSinceMonday = (anchor.getUTCDay() + 6) % 7;
  const monday = new Date(anchor.getTime() - daysSinceMonday * dayMs);
  const sunday = new Date(monday.getTime() + 6 * dayMs);

  return { start: formatIsoDate(monday), end: formatIsoDate(sunday) };
}
 
/**
 * Return a human-friendly month label for an ISO date — English month
 * name + four-digit year (e.g. `April 2026`), computed in UTC. The same
 * English label is handed to the template generators for every language.
 *
 * @param date - ISO date string
 * @returns Month label, or the input unchanged when parsing fails
 */
export function deriveMonthLabel(date: string): string {
  const parsed = parseIsoDate(date);
  if (!parsed) return date;
  const monthNames = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
  ];
  // `?? ''` only satisfies noUncheckedIndexedAccess; getUTCMonth() is 0–11.
  const name = monthNames[parsed.getUTCMonth()] ?? '';
  return `${name} ${parsed.getUTCFullYear()}`.trim();
}
 
/**
 * Parse a `YYYY-MM-DD` string as UTC midnight. Returns `null` for input
 * that is malformed or names a day that does not exist (e.g. `2026-02-30`),
 * so callers can skip month/week derivation gracefully.
 *
 * @param iso - ISO date string
 * @returns Parsed `Date` or `null`
 */
function parseIsoDate(iso: string): Date | null {
  if (!/^\d{4}-\d{2}-\d{2}$/.test(iso)) return null;
  const parsed = new Date(`${iso}T00:00:00Z`);
  if (Number.isNaN(parsed.getTime())) return null;
  // Some engines roll an impossible day over into the next month instead of
  // rejecting it; require the parsed date to round-trip to the same string.
  return parsed.toISOString().slice(0, 10) === iso ? parsed : null;
}
 
/**
 * Render the UTC calendar date of a `Date` as `YYYY-MM-DD`. Month and day
 * are zero-padded to two digits; the year is written as-is.
 *
 * @param d - Date object
 * @returns ISO date string
 */
function formatIsoDate(d: Date): string {
  const twoDigits = (value: number): string => String(value).padStart(2, '0');
  return [d.getUTCFullYear(), twoDigits(d.getUTCMonth() + 1), twoDigits(d.getUTCDate())].join('-');
}
 
/**
 * Extract a manifest override value for a single language. Accepts either
 * a plain string (applied to every language) or a per-language object.
 *
 * @param value - Raw manifest value (string or per-lang object)
 * @param lang - Target language code
 * @returns Trimmed override string, or empty string when absent
 */
function manifestOverrideFor(
  value: string | Partial<Record<LanguageCode, string>> | undefined,
  lang: LanguageCode
): string {
  // A plain string is a blanket editorial override — the operator is
  // telling the resolver "use this exact text for every language". This
  // is the one path where a single string is applied cross-locale; the
  // operator takes responsibility for its language.
  if (typeof value === 'string') return value.trim();
  if (!value) return '';
  // Per-language object: respect ONLY the explicit entry for `lang`. We
  // deliberately do NOT fall back to the `en` entry for non-English
  // variants — otherwise an EN-only override would leak English into
  // every other locale's <title>. Missing languages fall through to the
  // localized template tier.
  //
  // Scanning own entries (rather than indexing `value[lang]`) ignores
  // inherited properties, and the `typeof` guard drops non-string values
  // that an untyped manifest may carry.
  for (const [key, candidate] of Object.entries(value as Record<string, unknown>)) {
    if (key === lang && typeof candidate === 'string') return candidate.trim();
  }
  return '';
}
 
/**
 * Internal: best editorial `{headline, summary}` pair available from the
 * aggregator output and artefacts, independent of language. Implements
 * tiers 2–4 of the priority ladder:
 *
 * - Tier 2 — non-generic H1 of an artefact under `runDir` (when given).
 * - Tier 3 — non-generic H1 of the aggregated Markdown.
 * - Tier 4 — first strong prose line of the aggregated Markdown, used as
 *   both headline (clipped to title length) and summary.
 *
 * When tier 2 supplies a headline but the artefact has no usable prose,
 * the summary falls back to the aggregated Markdown's prose line instead
 * of being left empty.
 *
 * @param opts - Resolver inputs
 * @returns Editorial content; both fields are empty when no tier fires
 */
function resolveEditorialContent(opts: ResolveMetadataOptions): {
  readonly headline: string;
  readonly summary: string;
} {
  const { articleType, date, markdown, runDir } = opts;
  const aggregatedSummary = extractStrongProseLine(markdown);

  // Tier 2: first non-generic H1 in the first substantive artefact.
  if (runDir) {
    const highlight = extractArtifactHighlight(runDir, articleType, date);
    if (highlight?.headline) {
      return {
        headline: highlight.headline,
        summary: highlight.summary || aggregatedSummary,
      };
    }
  }

  // Tier 3: first non-generic H1 in the aggregated Markdown itself.
  const aggregatedH1 = extractFirstH1(markdown);
  if (aggregatedH1 && !isGenericHeading(aggregatedH1, articleType, date)) {
    return {
      headline: truncateTitle(aggregatedH1),
      summary: aggregatedSummary,
    };
  }

  // Tier 4: first strong prose paragraph (title = same prose clipped).
  if (aggregatedSummary) {
    return { headline: truncateTitle(aggregatedSummary), summary: aggregatedSummary };
  }

  return { headline: '', summary: '' };
}
 
/**
 * Resolve the `{title, description}` pair of one article for every
 * supported language.
 *
 * Per language the first non-blank candidate wins, in this order: manifest
 * override, editorial headline/summary (English variant only), localized
 * template. Both fields are clamped with {@link truncateTitle} /
 * {@link truncateDescription} before being stored.
 *
 * @param opts - Resolver inputs ({@link ResolveMetadataOptions})
 * @returns One `{title, description}` entry per supported language
 */
export function resolveArticleMetadata(opts: ResolveMetadataOptions): ResolvedMetadata {
  const manifest = opts.manifest ?? {};
  const editorial = resolveEditorialContent(opts);
  const template = buildTemplateFallback(opts.articleType, opts.date, manifest.committee);

  const resolved = Object.create(null) as Record<LanguageCode, ResolvedMetadataEntry>;

  for (const lang of ALL_LANGUAGES) {
    const fallback = template[lang];

    // Candidate lists always start with the manifest override …
    const titleOptions = [manifestOverrideFor(manifest.title, lang)];
    const descriptionOptions = [manifestOverrideFor(manifest.description, lang)];

    // … editorial content is derived from the English source, so only the
    // `en` variant may use it; other locales would otherwise show an
    // English headline inside localized chrome …
    if (lang === 'en') {
      titleOptions.push(editorial.headline);
      descriptionOptions.push(editorial.summary);
    }

    // … and the localized template closes every list.
    titleOptions.push(fallback.title);
    descriptionOptions.push(fallback.subtitle);

    const entry: ResolvedMetadataEntry = {
      title: truncateTitle(pickFirstNonEmpty(titleOptions) || fallback.title),
      description: truncateDescription(pickFirstNonEmpty(descriptionOptions) || fallback.subtitle),
    };

    Object.defineProperty(resolved, lang, {
      value: entry,
      enumerable: true,
      writable: true,
      configurable: true,
    });
  }

  return resolved;
}
 
/**
 * Scan an ordered candidate list and return the first entry that still has
 * content after trimming, in its trimmed form. Yields the empty string when
 * no entry qualifies.
 *
 * @param candidates - Ordered list of candidate strings
 * @returns First non-empty entry, trimmed
 */
function pickFirstNonEmpty(candidates: readonly string[]): string {
  const winner = candidates.find(
    (candidate) => typeof candidate === 'string' && candidate.trim().length > 0
  );
  return winner === undefined ? '' : winner.trim();
}