From ee2cd2620813d77b2eb6f686a89a19b170f74c93 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 25 Feb 2026 15:22:04 +0100 Subject: [PATCH] feat: extract images from HTML content for photo-based feeds Feeds like xkcd embed images as <img> tags in the RSS description rather than using enclosures or media:content. Similarly, h-feed photo posts may have images only in e-content HTML without explicit u-photo properties. Add extractImagesFromHtml() that pulls src URLs from sanitized HTML content as a fallback when no explicit photos exist. Applied to all three normalizers (RSS/Atom, JSON Feed, h-feed). This makes comics, photo posts, and other image-centric feeds display their images in the reader timeline card's photo grid. --- lib/feeds/normalizer.js | 46 +++++++++++++++++++++++++++++++++++++++++ package.json | 2 +- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/lib/feeds/normalizer.js b/lib/feeds/normalizer.js index ffd87e2..e010389 100644 --- a/lib/feeds/normalizer.js +++ b/lib/feeds/normalizer.js @@ -7,6 +7,28 @@ import crypto from "node:crypto"; import sanitizeHtml from "sanitize-html"; +/** + * Extract image URLs from HTML content. + * Used as a fallback when no explicit photo/enclosure is provided. 
+ * @param {string} html - HTML content (already sanitized) + * @returns {string[]} Array of image URLs + */ +function extractImagesFromHtml(html) { + if (!html) { + return []; + } + const urls = []; + const imgRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi; + let match; + while ((match = imgRegex.exec(html)) !== null) { + const src = match[1]; + if (src && !urls.includes(src)) { + urls.push(src); + } + } + return urls; +} + /** * Parse a date string with fallback for non-standard formats * @param {string|Date} dateInput - Date string or Date object @@ -232,6 +254,14 @@ export function normalizeItem(item, feedUrl, feedType) { } } + // Extract images from HTML content as fallback + if (!normalized.photo && normalized.content?.html) { + const extracted = extractImagesFromHtml(normalized.content.html); + if (extracted.length > 0) { + normalized.photo = extracted; + } + } + return normalized; } @@ -395,6 +425,14 @@ export function normalizeJsonFeedItem(item, feedUrl) { normalized["bookmark-of"] = [item.external_url]; } + // Extract images from HTML content as fallback + if (!normalized.photo && normalized.content?.html) { + const extracted = extractImagesFromHtml(normalized.content.html); + if (extracted.length > 0) { + normalized.photo = extracted; + } + } + return normalized; } @@ -568,6 +606,14 @@ export function normalizeHfeedItem(entry, feedUrl) { normalized.syndication = properties.syndication; } + // Extract images from HTML content as fallback + if (!normalized.photo && normalized.content?.html) { + const extracted = extractImagesFromHtml(normalized.content.html); + if (extracted.length > 0) { + normalized.photo = extracted; + } + } + return normalized; } diff --git a/package.json b/package.json index da64cd1..8ca2424 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@rmdes/indiekit-endpoint-microsub", - "version": "1.0.34", + "version": "1.0.35", "description": "Microsub endpoint for Indiekit. 
Enables subscribing to feeds and reading content using the Microsub protocol.", "keywords": [ "indiekit",