feat: extract images from HTML content for photo-based feeds

Feeds like xkcd embed images as <img> tags in the RSS description rather than using enclosures or media:content. Similarly, h-feed photo posts may have images only in e-content HTML without explicit u-photo properties. Add extractImagesFromHtml() that pulls <img> src URLs from sanitized HTML content as a fallback when no explicit photos exist. Applied to all three normalizers (RSS/Atom, JSON Feed, h-feed). This makes comics, photo posts, and other image-centric feeds display their images in the reader timeline card's photo grid.
2026-04-02 15:35:00 +02:00 · 2026-02-25 15:22:04 +01:00
parent b4d2b7418d
commit ee2cd26208
2 changed files with 47 additions and 1 deletions
--- a/lib/feeds/normalizer.js
+++ b/lib/feeds/normalizer.js
@@ -7,6 +7,28 @@ import crypto from "node:crypto";
 import sanitizeHtml from "sanitize-html";
 /**
 * Extract image URLs from HTML content.
 * Used as a fallback when no explicit photo/enclosure is provided.
 *
 * Only the real `src` attribute of each `<img>` tag is read; look-alike
 * attributes such as `data-src` are ignored. Because the input is expected to
 * be sanitizer output (where `&` is written as `&amp;`), basic character
 * references in the attribute value are decoded so that callers receive a
 * usable URL rather than an HTML-escaped one.
 *
 * @param {string} html - HTML content (already sanitized)
 * @returns {string[]} Unique image URLs, in order of first appearance
 */
 function extractImagesFromHtml(html) {
  if (!html) {
    return [];
  }

  const namedEntities = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };

  // Single-pass decode so that "&amp;lt;" becomes "&lt;" and is not decoded twice.
  const decodeEntities = (value) =>
    value.replace(
      /&(#[xX][0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g,
      (entity, body) => {
        if (body[0] !== "#") {
          return namedEntities[body];
        }
        const isHex = body[1] === "x" || body[1] === "X";
        const codePoint = Number.parseInt(
          body.slice(isHex ? 2 : 1),
          isHex ? 16 : 10,
        );
        // Leave out-of-range references untouched instead of throwing.
        return codePoint > 0 && codePoint <= 0x10ffff
          ? String.fromCodePoint(codePoint)
          : entity;
      },
    );

  // `\ssrc` requires whitespace before the attribute name, so `data-src` and
  // similar are skipped. The backreference `\1` closes the value with the same
  // quote that opened it, allowing the other quote character inside the URL.
  const imgPattern = /<img\b[^>]*?\ssrc\s*=\s*(["'])([\s\S]*?)\1[^>]*>/gi;

  // A Set preserves insertion order and removes duplicates in O(1) per URL.
  const urls = new Set();
  for (const match of String(html).matchAll(imgPattern)) {
    const src = decodeEntities(match[2]).trim();
    if (src) {
      urls.add(src);
    }
  }
  return [...urls];
 }
 /**
 * Parse a date string with fallback for non-standard formats
 * @param {string|Date} dateInput - Date string or Date object
@@ -232,6 +254,14 @@ export function normalizeItem(item, feedUrl, feedType) {
    }
  }
  // Extract images from HTML content as fallback
  if (!normalized.photo && normalized.content?.html) {
    const extracted = extractImagesFromHtml(normalized.content.html);
    if (extracted.length > 0) {
      normalized.photo = extracted;
    }
  }
  return normalized;
 }
@@ -395,6 +425,14 @@ export function normalizeJsonFeedItem(item, feedUrl) {
    normalized["bookmark-of"] = [item.external_url];
  }
  // Extract images from HTML content as fallback
  if (!normalized.photo && normalized.content?.html) {
    const extracted = extractImagesFromHtml(normalized.content.html);
    if (extracted.length > 0) {
      normalized.photo = extracted;
    }
  }
  return normalized;
 }
@@ -568,6 +606,14 @@ export function normalizeHfeedItem(entry, feedUrl) {
    normalized.syndication = properties.syndication;
  }
  // Extract images from HTML content as fallback
  if (!normalized.photo && normalized.content?.html) {
    const extracted = extractImagesFromHtml(normalized.content.html);
    if (extracted.length > 0) {
      normalized.photo = extracted;
    }
  }
  return normalized;
 }
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@rmdes/indiekit-endpoint-microsub",
-  "version": "1.0.34",
+  "version": "1.0.35",
  "description": "Microsub endpoint for Indiekit. Enables subscribing to feeds and reading content using the Microsub protocol.",
  "keywords": [
    "indiekit",