/** * OpenGraph metadata fetching with concurrency limiting * @module og-unfurl */ import { unfurl } from "unfurl.js"; import { extractObjectData } from "./timeline-store.js"; import { lookupWithSecurity } from "./lookup-helpers.js"; const USER_AGENT = "Mozilla/5.0 (compatible; Indiekit/1.0; +https://getindiekit.com)"; const TIMEOUT_MS = 10000; // 10 seconds per URL const MAX_CONCURRENT = 3; // Lower than theme's 5 (inbox context) const MAX_PREVIEWS = 3; // Max previews per post // Concurrency limiter — prevents overwhelming outbound network let activeRequests = 0; const queue = []; function runNext() { if (queue.length === 0 || activeRequests >= MAX_CONCURRENT) return; activeRequests++; const { resolve: res, fn } = queue.shift(); fn() .then(res) .finally(() => { activeRequests--; runNext(); }); } function throttled(fn) { return new Promise((res) => { queue.push({ resolve: res, fn }); runNext(); }); } function extractDomain(url) { try { return new URL(url).hostname.replace(/^www\./, ""); } catch { return url; } } /** * Check if a URL points to a private/reserved IP or localhost (SSRF protection) * @param {string} url - URL to check * @returns {boolean} True if URL targets a private network */ function isPrivateUrl(url) { try { const urlObj = new URL(url); const hostname = urlObj.hostname.toLowerCase(); // Block non-http(s) schemes if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") { return true; } // Block localhost variants if (hostname === "localhost" || hostname === "127.0.0.1" || hostname === "::1" || hostname === "[::1]") { return true; } // Block private IPv4 ranges const ipv4Match = hostname.match(/^(\d+)\.(\d+)\.(\d+)\.(\d+)$/); if (ipv4Match) { const [, a, b] = ipv4Match.map(Number); if (a === 10) return true; // 10.0.0.0/8 if (a === 172 && b >= 16 && b <= 31) return true; // 172.16.0.0/12 if (a === 192 && b === 168) return true; // 192.168.0.0/16 if (a === 169 && b === 254) return true; // 169.254.0.0/16 (link-local / cloud metadata) if (a === 127) return true; // 127.0.0.0/8 if (a === 0) return true; // 0.0.0.0/8 } // Block IPv6 private ranges (basic check) if (hostname.startsWith("[fc") || hostname.startsWith("[fd") || hostname.startsWith("[fe80")) { return true; } return false; } catch { return true; // Invalid URL, treat as private } } /** * Extract links from HTML content * @param {string} html - Sanitized HTML content * @returns {Array<{url: string, classes: string}>} Links with their class attributes */ function extractLinks(html) { if (!html) return []; const links = []; // Match complete tags and extract href + class from anywhere in attributes const anchorRegex = /]+)>/gi; let match; while ((match = anchorRegex.exec(html)) !== null) { const attrs = match[1]; const hrefMatch = attrs.match(/href="([^"]+)"/); const classMatch = attrs.match(/class="([^"]+)"/); if (hrefMatch) { links.push({ url: hrefMatch[1], classes: classMatch ? classMatch[1] : "" }); } } return links; } /** * Check if URL is likely an ActivityPub object or media file * @param {string} url - URL to check * @returns {boolean} True if URL should be skipped */ function shouldSkipUrl(url) { try { const urlObj = new URL(url); // SSRF protection — skip private/internal URLs if (isPrivateUrl(url)) { return true; } // Skip media extensions const mediaExtensions = /\.(jpg|jpeg|png|gif|webp|mp4|webm|mov|mp3|wav|ogg)$/i; if (mediaExtensions.test(urlObj.pathname)) { return true; } // Skip common AP object patterns (heuristic - not exhaustive) const apPatterns = [ /\/@[\w.-]+\/\d+/, // Mastodon /@user/12345 /\/@[\w.-]+\/statuses\/[\w]+/, // GoToSocial /@user/statuses/id /\/users\/[\w.-]+\/statuses\/\d+/, // Mastodon/Pleroma /users/user/statuses/12345 /\/objects\/[\w-]+/, // Pleroma/Akkoma /objects/uuid /\/notice\/[\w]+/, // Pleroma /notice/id /\/notes\/[\w]+/, // Misskey /notes/id ]; return apPatterns.some((pattern) => pattern.test(urlObj.pathname)); } catch { return true; // Invalid URL, skip } } /** * Fetch OpenGraph metadata for external links in HTML content * @param {string} html - Sanitized HTML content * @returns {Promise>} Link preview objects */ export async function fetchLinkPreviews(html) { if (!html) return []; const links = extractLinks(html); // Filter links const urlsToFetch = links .filter((link) => { // Skip mention links (class="mention") if (link.classes.includes("mention")) return false; // Skip hashtag links (class="hashtag") if (link.classes.includes("hashtag")) return false; // Skip AP object URLs and media files if (shouldSkipUrl(link.url)) return false; return true; }) .map((link) => link.url) .filter((url, index, self) => self.indexOf(url) === index) // Dedupe .slice(0, MAX_PREVIEWS); // Cap at max if (urlsToFetch.length === 0) return []; // Fetch metadata for each URL (throttled) const previews = await Promise.all( urlsToFetch.map(async (url) => { const metadata = await throttled(async () => { try { return await unfurl(url, { timeout: TIMEOUT_MS, headers: { "User-Agent": USER_AGENT }, }); } catch (error) { console.warn(`[og-unfurl] Failed to fetch ${url}: ${error.message}`); return null; } }); if (!metadata) return null; const og = metadata.open_graph || {}; const tc = metadata.twitter_card || {}; const title = og.title || tc.title || metadata.title || extractDomain(url); const description = og.description || tc.description || metadata.description || ""; const image = og.images?.[0]?.url || tc.images?.[0]?.url || null; const favicon = metadata.favicon || null; const domain = extractDomain(url); // Truncate description const maxDesc = 160; const desc = description.length > maxDesc ? description.slice(0, maxDesc).trim() + "\u2026" : description; return { url, title, description: desc, image, favicon, domain, fetchedAt: new Date().toISOString(), }; }), ); // Filter out failed fetches (null results) return previews.filter((preview) => preview !== null); } /** * Fetch link previews and store them on a timeline item * Fire-and-forget — caller does NOT await. Errors are caught and logged. * @param {object} collections - MongoDB collections * @param {string} uid - Timeline item UID * @param {string} html - Post content HTML * @returns {Promise} */ export async function fetchAndStorePreviews(collections, uid, html) { try { const linkPreviews = await fetchLinkPreviews(html); await collections.ap_timeline.updateOne( { uid }, { $set: { linkPreviews } }, ); } catch (error) { // Fire-and-forget — log errors but don't throw console.error( `[og-unfurl] Failed to store previews for ${uid}: ${error.message}`, ); } } /** * Fetch a quoted post's data and store it on the timeline item. * Fire-and-forget — caller does NOT await. Errors are caught and logged. * @param {object} collections - MongoDB collections * @param {string} uid - Timeline item UID (the quoting post) * @param {string} quoteUrl - URL of the quoted post * @param {object} ctx - Fedify context (for lookupObject) * @param {object} documentLoader - Authenticated DocumentLoader * @returns {Promise} */ export async function fetchAndStoreQuote(collections, uid, quoteUrl, ctx, documentLoader) { try { const object = await lookupWithSecurity(ctx,new URL(quoteUrl), { documentLoader }); if (!object) return; const quoteData = await extractObjectData(object, { documentLoader }); // If author photo is empty, try fetching the actor directly if (!quoteData.author.photo && quoteData.author.url) { try { const actor = await lookupWithSecurity(ctx,new URL(quoteData.author.url), { documentLoader }); if (actor) { const { extractActorInfo } = await import("./timeline-store.js"); const actorInfo = await extractActorInfo(actor, { documentLoader }); if (actorInfo.photo) { quoteData.author.photo = actorInfo.photo; } } } catch { // Actor fetch failed — keep existing author data } } const quote = { url: quoteData.url || quoteData.uid, uid: quoteData.uid, author: quoteData.author, content: quoteData.content, published: quoteData.published, name: quoteData.name, photo: quoteData.photo?.slice(0, 1) || [], }; // Strip the "RE: " paragraph from the parent post's content // Mastodon adds this as:

RE: ...

const update = { $set: { quote } }; const parentItem = await collections.ap_timeline.findOne({ uid }); if (parentItem?.content?.html) { const cleaned = stripQuoteReferenceHtml(parentItem.content.html, quoteUrl); if (cleaned !== parentItem.content.html) { update.$set["content.html"] = cleaned; } } await collections.ap_timeline.updateOne({ uid }, update); } catch (error) { console.error(`[og-unfurl] Failed to fetch quote for ${uid}: ${error.message}`); } } /** * Strip the "RE: " paragraph that Mastodon adds for quoted posts. * Removes

elements containing "RE:" followed by a link to the quote URL. * @param {string} html - Content HTML * @param {string} quoteUrl - URL of the quoted post * @returns {string} Cleaned HTML */ export function stripQuoteReferenceHtml(html, quoteUrl) { if (!html || !quoteUrl) return html; // Match

containing "RE:" and a link whose href contains the quote domain+path // Mastodon uses both /users/X/statuses/Y and /@X/Y URL formats try { const quoteUrlObj = new URL(quoteUrl); const quoteDomain = quoteUrlObj.hostname; // Escape special regex chars in domain const domainEscaped = quoteDomain.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // Match

RE: ...

(with optional whitespace) const re = new RegExp( `

\\s*RE:\\s*]*href="[^"]*${domainEscaped}[^"]*"[^>]*>.*?\\s*

`, "i", ); return html.replace(re, "").trim(); } catch { return html; } }