feat: extract images from HTML content for photo-based feeds

Feeds like xkcd embed images as <img> tags in the RSS description
rather than using enclosures or media:content. Similarly, h-feed
photo posts may have images only in e-content HTML without explicit
u-photo properties.

Add extractImagesFromHtml() that pulls <img> src URLs from sanitized
HTML content as a fallback when no explicit photos exist. Applied to
all three normalizers (RSS/Atom, JSON Feed, h-feed).

This makes comics, photo posts, and other image-centric feeds display
their images in the reader timeline card's photo grid.
This commit is contained in:
Ricardo
2026-02-25 15:22:04 +01:00
parent b4d2b7418d
commit ee2cd26208
2 changed files with 47 additions and 1 deletions

View File

@@ -7,6 +7,28 @@ import crypto from "node:crypto";
import sanitizeHtml from "sanitize-html"; import sanitizeHtml from "sanitize-html";
/**
 * Extract image URLs from HTML content.
 * Used as a fallback when no explicit photo/enclosure is provided.
 *
 * Each `<img ...>` tag is located first, then its own `src` attribute is read.
 * The attribute name must be preceded by whitespace, so look-alike attributes
 * such as `data-src` are ignored, and the value is read up to the *matching*
 * quote, so a URL containing the other quote character is kept intact.
 * Empty `src` values are skipped and duplicates are removed, preserving the
 * order of first appearance.
 *
 * Limitations: only quoted attribute values are recognised, and a literal `>`
 * inside an attribute value ends the tag match early.
 * NOTE(review): values are returned exactly as written in the markup, so
 * entity references (e.g. `&amp;`) are not decoded - confirm whether
 * consumers expect decoded URLs.
 *
 * @param {string} html - HTML content (already sanitized)
 * @returns {string[]} Array of unique image URLs
 */
function extractImagesFromHtml(html) {
  if (!html) {
    return [];
  }

  // Matches one complete <img ...> tag at a time.
  const imgTagRegex = /<img\b[^>]*>/gi;
  // Not global on purpose: exec() stays stateless across tags.
  const srcAttrRegex = /\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)')/i;

  // A Set keeps insertion order and makes de-duplication O(1) per URL.
  const urls = new Set();
  for (const [tag] of String(html).matchAll(imgTagRegex)) {
    const attr = srcAttrRegex.exec(tag);
    const src = attr?.[1] ?? attr?.[2];
    if (src) {
      urls.add(src);
    }
  }
  return [...urls];
}
/** /**
* Parse a date string with fallback for non-standard formats * Parse a date string with fallback for non-standard formats
* @param {string|Date} dateInput - Date string or Date object * @param {string|Date} dateInput - Date string or Date object
@@ -232,6 +254,14 @@ export function normalizeItem(item, feedUrl, feedType) {
} }
} }
// Extract images from HTML content as fallback
if (!normalized.photo && normalized.content?.html) {
const extracted = extractImagesFromHtml(normalized.content.html);
if (extracted.length > 0) {
normalized.photo = extracted;
}
}
return normalized; return normalized;
} }
@@ -395,6 +425,14 @@ export function normalizeJsonFeedItem(item, feedUrl) {
normalized["bookmark-of"] = [item.external_url]; normalized["bookmark-of"] = [item.external_url];
} }
// Extract images from HTML content as fallback
if (!normalized.photo && normalized.content?.html) {
const extracted = extractImagesFromHtml(normalized.content.html);
if (extracted.length > 0) {
normalized.photo = extracted;
}
}
return normalized; return normalized;
} }
@@ -568,6 +606,14 @@ export function normalizeHfeedItem(entry, feedUrl) {
normalized.syndication = properties.syndication; normalized.syndication = properties.syndication;
} }
// Extract images from HTML content as fallback
if (!normalized.photo && normalized.content?.html) {
const extracted = extractImagesFromHtml(normalized.content.html);
if (extracted.length > 0) {
normalized.photo = extracted;
}
}
return normalized; return normalized;
} }

View File

@@ -1,6 +1,6 @@
{ {
"name": "@rmdes/indiekit-endpoint-microsub", "name": "@rmdes/indiekit-endpoint-microsub",
"version": "1.0.34", "version": "1.0.35",
"description": "Microsub endpoint for Indiekit. Enables subscribing to feeds and reading content using the Microsub protocol.", "description": "Microsub endpoint for Indiekit. Enables subscribing to feeds and reading content using the Microsub protocol.",
"keywords": [ "keywords": [
"indiekit", "indiekit",