Files
indiekit-endpoint-activitypub/lib/og-unfurl.js
Ricardo 9a61145d97 feat: FEP-8fcf/fe34 compliance, custom emoji, manual follow approval (v2.13.0)
- FEP-8fcf: add syncCollection to Undo(Announce) sendActivity
- FEP-fe34: centralized lookupWithSecurity() helper with crossOrigin: "ignore" on all 23 lookupObject call sites
- Custom emoji: replaceCustomEmoji() renders :shortcode: as inline <img> in content and actor display names
- Manual follow approval: profile toggle, ap_pending_follows collection, approve/reject controllers with federation, pending tab on followers page, follow_request notification type
- Coverage audit updated to v2.12.x (overall ~70% → ~82%)

Confab-Link: http://localhost:8080/sessions/1f1e729b-0087-499e-a991-f36f46211fe4
2026-03-17 08:21:36 +01:00

340 lines
11 KiB
JavaScript

/**
* OpenGraph metadata fetching with concurrency limiting
* @module og-unfurl
*/
import { unfurl } from "unfurl.js";
import { extractObjectData } from "./timeline-store.js";
import { lookupWithSecurity } from "./lookup-helpers.js";
const USER_AGENT =
"Mozilla/5.0 (compatible; Indiekit/1.0; +https://getindiekit.com)";
const TIMEOUT_MS = 10000; // 10 seconds per URL
const MAX_CONCURRENT = 3; // Lower than theme's 5 (inbox context)
const MAX_PREVIEWS = 3; // Max previews per post
// Concurrency limiter — prevents overwhelming outbound network
let activeRequests = 0;
const queue = [];
function runNext() {
if (queue.length === 0 || activeRequests >= MAX_CONCURRENT) return;
activeRequests++;
const { resolve: res, fn } = queue.shift();
fn()
.then(res)
.finally(() => {
activeRequests--;
runNext();
});
}
function throttled(fn) {
return new Promise((res) => {
queue.push({ resolve: res, fn });
runNext();
});
}
function extractDomain(url) {
try {
return new URL(url).hostname.replace(/^www\./, "");
} catch {
return url;
}
}
/**
* Check if a URL points to a private/reserved IP or localhost (SSRF protection)
* @param {string} url - URL to check
* @returns {boolean} True if URL targets a private network
*/
function isPrivateUrl(url) {
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname.toLowerCase();
// Block non-http(s) schemes
if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") {
return true;
}
// Block localhost variants
if (hostname === "localhost" || hostname === "127.0.0.1" || hostname === "::1" || hostname === "[::1]") {
return true;
}
// Block private IPv4 ranges
const ipv4Match = hostname.match(/^(\d+)\.(\d+)\.(\d+)\.(\d+)$/);
if (ipv4Match) {
const [, a, b] = ipv4Match.map(Number);
if (a === 10) return true; // 10.0.0.0/8
if (a === 172 && b >= 16 && b <= 31) return true; // 172.16.0.0/12
if (a === 192 && b === 168) return true; // 192.168.0.0/16
if (a === 169 && b === 254) return true; // 169.254.0.0/16 (link-local / cloud metadata)
if (a === 127) return true; // 127.0.0.0/8
if (a === 0) return true; // 0.0.0.0/8
}
// Block IPv6 private ranges (basic check)
if (hostname.startsWith("[fc") || hostname.startsWith("[fd") || hostname.startsWith("[fe80")) {
return true;
}
return false;
} catch {
return true; // Invalid URL, treat as private
}
}
/**
* Extract links from HTML content
* @param {string} html - Sanitized HTML content
* @returns {Array<{url: string, classes: string}>} Links with their class attributes
*/
function extractLinks(html) {
if (!html) return [];
const links = [];
// Match complete <a> tags and extract href + class from anywhere in attributes
const anchorRegex = /<a\s([^>]+)>/gi;
let match;
while ((match = anchorRegex.exec(html)) !== null) {
const attrs = match[1];
const hrefMatch = attrs.match(/href="([^"]+)"/);
const classMatch = attrs.match(/class="([^"]+)"/);
if (hrefMatch) {
links.push({ url: hrefMatch[1], classes: classMatch ? classMatch[1] : "" });
}
}
return links;
}
/**
* Check if URL is likely an ActivityPub object or media file
* @param {string} url - URL to check
* @returns {boolean} True if URL should be skipped
*/
function shouldSkipUrl(url) {
try {
const urlObj = new URL(url);
// SSRF protection — skip private/internal URLs
if (isPrivateUrl(url)) {
return true;
}
// Skip media extensions
const mediaExtensions = /\.(jpg|jpeg|png|gif|webp|mp4|webm|mov|mp3|wav|ogg)$/i;
if (mediaExtensions.test(urlObj.pathname)) {
return true;
}
// Skip common AP object patterns (heuristic - not exhaustive)
const apPatterns = [
/\/@[\w.-]+\/\d+/, // Mastodon /@user/12345
/\/@[\w.-]+\/statuses\/[\w]+/, // GoToSocial /@user/statuses/id
/\/users\/[\w.-]+\/statuses\/\d+/, // Mastodon/Pleroma /users/user/statuses/12345
/\/objects\/[\w-]+/, // Pleroma/Akkoma /objects/uuid
/\/notice\/[\w]+/, // Pleroma /notice/id
/\/notes\/[\w]+/, // Misskey /notes/id
];
return apPatterns.some((pattern) => pattern.test(urlObj.pathname));
} catch {
return true; // Invalid URL, skip
}
}
/**
* Fetch OpenGraph metadata for external links in HTML content
* @param {string} html - Sanitized HTML content
* @returns {Promise<Array<{url: string, title: string, description: string, image: string, favicon: string, domain: string, fetchedAt: string}>>} Link preview objects
*/
export async function fetchLinkPreviews(html) {
if (!html) return [];
const links = extractLinks(html);
// Filter links
const urlsToFetch = links
.filter((link) => {
// Skip mention links (class="mention")
if (link.classes.includes("mention")) return false;
// Skip hashtag links (class="hashtag")
if (link.classes.includes("hashtag")) return false;
// Skip AP object URLs and media files
if (shouldSkipUrl(link.url)) return false;
return true;
})
.map((link) => link.url)
.filter((url, index, self) => self.indexOf(url) === index) // Dedupe
.slice(0, MAX_PREVIEWS); // Cap at max
if (urlsToFetch.length === 0) return [];
// Fetch metadata for each URL (throttled)
const previews = await Promise.all(
urlsToFetch.map(async (url) => {
const metadata = await throttled(async () => {
try {
return await unfurl(url, {
timeout: TIMEOUT_MS,
headers: { "User-Agent": USER_AGENT },
});
} catch (error) {
console.warn(`[og-unfurl] Failed to fetch ${url}: ${error.message}`);
return null;
}
});
if (!metadata) return null;
const og = metadata.open_graph || {};
const tc = metadata.twitter_card || {};
const title = og.title || tc.title || metadata.title || extractDomain(url);
const description = og.description || tc.description || metadata.description || "";
const image = og.images?.[0]?.url || tc.images?.[0]?.url || null;
const favicon = metadata.favicon || null;
const domain = extractDomain(url);
// Truncate description
const maxDesc = 160;
const desc =
description.length > maxDesc
? description.slice(0, maxDesc).trim() + "\u2026"
: description;
return {
url,
title,
description: desc,
image,
favicon,
domain,
fetchedAt: new Date().toISOString(),
};
}),
);
// Filter out failed fetches (null results)
return previews.filter((preview) => preview !== null);
}
/**
* Fetch link previews and store them on a timeline item
* Fire-and-forget — caller does NOT await. Errors are caught and logged.
* @param {object} collections - MongoDB collections
* @param {string} uid - Timeline item UID
* @param {string} html - Post content HTML
* @returns {Promise<void>}
*/
export async function fetchAndStorePreviews(collections, uid, html) {
try {
const linkPreviews = await fetchLinkPreviews(html);
await collections.ap_timeline.updateOne(
{ uid },
{ $set: { linkPreviews } },
);
} catch (error) {
// Fire-and-forget — log errors but don't throw
console.error(
`[og-unfurl] Failed to store previews for ${uid}: ${error.message}`,
);
}
}
/**
* Fetch a quoted post's data and store it on the timeline item.
* Fire-and-forget — caller does NOT await. Errors are caught and logged.
* @param {object} collections - MongoDB collections
* @param {string} uid - Timeline item UID (the quoting post)
* @param {string} quoteUrl - URL of the quoted post
* @param {object} ctx - Fedify context (for lookupObject)
* @param {object} documentLoader - Authenticated DocumentLoader
* @returns {Promise<void>}
*/
export async function fetchAndStoreQuote(collections, uid, quoteUrl, ctx, documentLoader) {
try {
const object = await lookupWithSecurity(ctx,new URL(quoteUrl), { documentLoader });
if (!object) return;
const quoteData = await extractObjectData(object, { documentLoader });
// If author photo is empty, try fetching the actor directly
if (!quoteData.author.photo && quoteData.author.url) {
try {
const actor = await lookupWithSecurity(ctx,new URL(quoteData.author.url), { documentLoader });
if (actor) {
const { extractActorInfo } = await import("./timeline-store.js");
const actorInfo = await extractActorInfo(actor, { documentLoader });
if (actorInfo.photo) {
quoteData.author.photo = actorInfo.photo;
}
}
} catch {
// Actor fetch failed — keep existing author data
}
}
const quote = {
url: quoteData.url || quoteData.uid,
uid: quoteData.uid,
author: quoteData.author,
content: quoteData.content,
published: quoteData.published,
name: quoteData.name,
photo: quoteData.photo?.slice(0, 1) || [],
};
// Strip the "RE: <link>" paragraph from the parent post's content
// Mastodon adds this as: <p>RE: <a href="QUOTE_URL">...</a></p>
const update = { $set: { quote } };
const parentItem = await collections.ap_timeline.findOne({ uid });
if (parentItem?.content?.html) {
const cleaned = stripQuoteReferenceHtml(parentItem.content.html, quoteUrl);
if (cleaned !== parentItem.content.html) {
update.$set["content.html"] = cleaned;
}
}
await collections.ap_timeline.updateOne({ uid }, update);
} catch (error) {
console.error(`[og-unfurl] Failed to fetch quote for ${uid}: ${error.message}`);
}
}
/**
* Strip the "RE: <link>" paragraph that Mastodon adds for quoted posts.
* Removes <p> elements containing "RE:" followed by a link to the quote URL.
* @param {string} html - Content HTML
* @param {string} quoteUrl - URL of the quoted post
* @returns {string} Cleaned HTML
*/
export function stripQuoteReferenceHtml(html, quoteUrl) {
if (!html || !quoteUrl) return html;
// Match <p> containing "RE:" and a link whose href contains the quote domain+path
// Mastodon uses both /users/X/statuses/Y and /@X/Y URL formats
try {
const quoteUrlObj = new URL(quoteUrl);
const quoteDomain = quoteUrlObj.hostname;
// Escape special regex chars in domain
const domainEscaped = quoteDomain.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
// Match <p>RE: <a href="...DOMAIN...">...</a></p> (with optional whitespace)
const re = new RegExp(
`<p>\\s*RE:\\s*<a\\s[^>]*href="[^"]*${domainEscaped}[^"]*"[^>]*>.*?</a>\\s*</p>`,
"i",
);
return html.replace(re, "").trim();
} catch {
return html;
}
}