// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
/**
* @module Aggregator/SeoEntityExtractor
* @description Extract real-world organizations named in an analysis run's
* `intelligence/stakeholder-map.md` and `extended/media-framing-analysis.md`
* artifacts for emission as JSON-LD `mentions` entries on every language
* variant of the rendered article.
*
* The same English-extracted list is reused across all 14 language variants
* because the entities are language-independent proper nouns (political
* groups, EU institutions, media outlets) — search engines and AI overviews
* benefit from consistent entity grounding regardless of which language
* the surrounding prose is in.
*/
import fs from 'fs';
import path from 'path';
/**
 * Upper bound on the number of names returned by `extractRunMentions`,
 * i.e. the maximum size of the JSON-LD `mentions` array.
 *
 * Rationale recorded by the original author: Schema.org accepts
 * arbitrarily many entries, but indexers commonly cap structured-data
 * entity lists at ~30 — staying at or below that avoids truncation and
 * keeps the rendered JSON-LD blob compact.
 * NOTE(review): the ~30 figure is not established anywhere in this file;
 * confirm before relying on it.
 */
const MAX_MENTIONS = 30;
/**
 * Minimum length (in UTF-16 code units, as compared against
 * `String.length` in `isValidEntityName`) for an extracted entity name.
 * Shorter candidates are rejected as fragments (a single capital letter,
 * a lone particle); two-character names are accepted.
 */
const MIN_ENTITY_LENGTH = 2;
/**
 * Maximum length (in UTF-16 code units) for an extracted entity name.
 * Longer candidates are rejected by `isValidEntityName` because they are
 * almost certainly misparsed sentence fragments rather than names.
 */
const MAX_ENTITY_LENGTH = 80;
/**
 * Load a UTF-8 text artifact located under an analysis run directory.
 *
 * Absent or unreadable artifacts are not treated as errors: the caller
 * simply gets `null` and emits no mentions for that artifact.
 *
 * @param runDir - Absolute path to the analysis run directory
 * @param relPath - Forward-slash path under `runDir`
 * @returns The file contents, or `null` when the file is missing or any
 *   filesystem call throws
 */
function readRunFile(runDir: string, relPath: string): string | null {
  const target = path.join(runDir, relPath);
  let contents: string | null = null;
  try {
    if (fs.existsSync(target)) {
      contents = fs.readFileSync(target, 'utf8');
    }
  } catch {
    // Unreadable (permissions, directory, race with deletion): soft-fail.
    contents = null;
  }
  return contents;
}
/**
 * Extract organization names from the H3 headings of
 * `intelligence/stakeholder-map.md`. Each tier-1/tier-2 stakeholder
 * appears as a heading shaped like:
 *   `### EPP — Manfred Weber / 185 MEPs (25.73%)`
 *   `### European Commission — Ursula von der Leyen (EPP)`
 *   `### Tech Industry (Big Tech Gatekeepers)`
 *
 * The entity name is everything before the first em-dash, en-dash, slash,
 * opening parenthesis, or colon — whichever comes first. Markdown bold /
 * italic asterisks wrapping the name (`### **EPP** — …`) are removed on
 * both sides, so the result never carries stray `*` decoration. Names are
 * de-duplicated case-insensitively, keeping the first spelling seen.
 * "Risk N: …" headings are skipped because they describe risk scenarios
 * rather than organizations.
 *
 * @param markdown - Raw stakeholder-map.md contents
 * @returns Ordered, de-duplicated stakeholder names
 */
export function extractStakeholderNames(markdown: string): readonly string[] {
  // Matches `Risk 1: …`, `risk 12 : …` at the start of the heading text.
  const riskHeading = /^risk\s+\d+\s*:/i;
  const names: string[] = [];
  const seen = new Set<string>();
  for (const rawLine of markdown.split('\n')) {
    if (!rawLine.startsWith('### ')) continue;
    // Drop the `### ` marker, surrounding whitespace (including a trailing
    // `\r` from CRLF input) and any leading emphasis asterisks so that a
    // bold heading is handled the same way as a plain one.
    const headingText = rawLine.slice(4).trim().replace(/^\*+/, '').trimStart();
    if (!headingText) continue;
    if (riskHeading.test(headingText)) continue;
    const splitIdx = findFirstSplitChar(headingText);
    const candidate = splitIdx >= 0 ? headingText.slice(0, splitIdx) : headingText;
    // Closing emphasis asterisks sit right before the separator.
    const name = candidate.trim().replace(/\*+$/, '').trim();
    if (!isValidEntityName(name)) continue;
    const key = name.toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);
    names.push(name);
  }
  return names;
}
/**
 * Locate the earliest stakeholder-heading separator in `text`.
 *
 * The recognised separators are the em-dash, en-dash, slash, opening
 * parenthesis, and colon. Plain `indexOf` lookups are used rather than a
 * regex to satisfy CodeQL's regex-injection / catastrophic-backtracking
 * lints (cf. `replaceFirstStringIn` in `html/localize-body.ts`).
 *
 * @param text - Heading text (without the leading `### `)
 * @returns Index of the first separator, or `-1` if none found
 */
function findFirstSplitChar(text: string): number {
  const positions = ['—', '–', '/', '(', ':']
    .map((marker) => text.indexOf(marker))
    .filter((position) => position !== -1);
  return positions.length === 0 ? -1 : Math.min(...positions);
}
/**
 * Extract media-outlet names from `extended/media-framing-analysis.md`.
 * Editorial convention is a series of bold "framing buckets":
 *   `**Centre-Left Media (Le Monde, Der Spiegel, Guardian EU section):**`
 *   `**Tech-Beat Media (TechCrunch EU, The Verge, Politico Tech):**`
 *
 * For every line containing `Media (`, the text between that opening
 * parenthesis and the next `)` is split on commas. Each piece is trimmed,
 * stripped of wrapping emphasis asterisks, validated, and de-duplicated
 * case-insensitively (first spelling wins). Only the first `Media (`
 * occurrence on a line is considered; lines without a closing
 * parenthesis are skipped.
 *
 * @param markdown - Raw media-framing-analysis.md contents
 * @returns Ordered, de-duplicated media-outlet names
 */
export function extractMediaOutletNames(markdown: string): readonly string[] {
  // Anchoring on `Media (` avoids matching unrelated parentheticals in
  // surrounding prose. The anchor ends with the opening parenthesis, so
  // the outlet list starts immediately after it.
  const anchor = 'Media (';
  const names: string[] = [];
  const seen = new Set<string>();
  for (const rawLine of markdown.split('\n')) {
    const mediaIdx = rawLine.indexOf(anchor);
    if (mediaIdx < 0) continue;
    const listStart = mediaIdx + anchor.length;
    const closeParen = rawLine.indexOf(')', listStart);
    if (closeParen < 0) continue;
    for (const piece of rawLine.slice(listStart, closeParen).split(',')) {
      const candidate = piece.trim().replace(/^\*+/, '').replace(/\*+$/, '').trim();
      if (!isValidEntityName(candidate)) continue;
      const key = candidate.toLowerCase();
      if (seen.has(key)) continue;
      seen.add(key);
      names.push(candidate);
    }
  }
  return names;
}
/**
 * Guard for extracted-entity sanity: rejects empty strings, candidates
 * shorter than {@link MIN_ENTITY_LENGTH} or longer than
 * {@link MAX_ENTITY_LENGTH}, and candidates that contain no letter at all
 * (pure punctuation, digits, or decoration).
 *
 * "Letter" means any Unicode letter, so names written in non-Latin
 * scripts (for example Cyrillic or Greek) are accepted; lengths are
 * measured in UTF-16 code units via `String.length`.
 *
 * @param name - Candidate entity name
 * @returns `true` when the name is a plausible organization label
 */
function isValidEntityName(name: string): boolean {
  if (name.length < MIN_ENTITY_LENGTH || name.length > MAX_ENTITY_LENGTH) {
    return false;
  }
  // At least one letter in any script; rejects "--", "***", "2024", etc.
  return /\p{L}/u.test(name);
}
/**
 * Collect SEO `mentions` entities for an analysis run by combining the
 * stakeholder names from `intelligence/stakeholder-map.md` with the
 * media-outlet names from `extended/media-framing-analysis.md`.
 *
 * Stakeholders are listed first (high-signal political-group /
 * institution entities), media outlets second. Entries are de-duplicated
 * case-insensitively across both sources, keeping the first spelling
 * seen, and the list never exceeds {@link MAX_MENTIONS} entries. A
 * missing, unreadable, or empty artifact simply contributes nothing.
 *
 * @param runDir - Absolute analysis run directory path
 * @returns Ordered, de-duplicated mentions list (may be empty)
 */
export function extractRunMentions(runDir: string): readonly string[] {
  const stakeholderMd = readRunFile(runDir, 'intelligence/stakeholder-map.md');
  const mediaMd = readRunFile(runDir, 'extended/media-framing-analysis.md');
  const stakeholders = stakeholderMd ? extractStakeholderNames(stakeholderMd) : [];
  const mediaOutlets = mediaMd ? extractMediaOutletNames(mediaMd) : [];
  const merged: string[] = [];
  const seen = new Set<string>();
  for (const name of [...stakeholders, ...mediaOutlets]) {
    // Check the cap before adding so the bound holds for any cap value.
    if (merged.length >= MAX_MENTIONS) break;
    const key = name.toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);
    merged.push(name);
  }
  return merged;
}
|