From ee2cd2620813d77b2eb6f686a89a19b170f74c93 Mon Sep 17 00:00:00 2001
From: Ricardo <rick@rmendes.net>
Date: Wed, 25 Feb 2026 15:22:04 +0100
Subject: [PATCH] feat: extract images from HTML content for photo-based feeds

Feeds like xkcd embed images as <img> tags in the RSS description
rather than using enclosures or media:content. Similarly, h-feed
photo posts may have images only in e-content HTML without explicit
u-photo properties.

Add extractImagesFromHtml() that pulls <img> src URLs from sanitized
HTML content as a fallback when no explicit photos exist. Applied to
all three normalizers (RSS/Atom, JSON Feed, h-feed).

This makes comics, photo posts, and other image-centric feeds display
their images in the reader timeline card's photo grid.
---
 lib/feeds/normalizer.js | 46 +++++++++++++++++++++++++++++++++++++++++
 package.json            |  2 +-
 2 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/lib/feeds/normalizer.js b/lib/feeds/normalizer.js
index ffd87e2..e010389 100644
--- a/lib/feeds/normalizer.js
+++ b/lib/feeds/normalizer.js
@@ -7,6 +7,28 @@ import crypto from "node:crypto";
 
 import sanitizeHtml from "sanitize-html";
 
+/**
+ * Collect unique <img> src URLs from an HTML string, in first-seen order.
+ * Used as a fallback when no explicit photo/enclosure is provided.
+ * @param {string} html - HTML content (already sanitized); a falsy value yields []
+ * @returns {string[]} Quoted src values, verbatim (not entity-decoded or resolved)
+ */
+function extractImagesFromHtml(html) {
+  if (!html) {
+    return [];
+  }
+  const urls = [];
+  const imgRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi; // quoted src only
+  let match;
+  while ((match = imgRegex.exec(html)) !== null) {
+    const src = match[1];
+    if (src && !urls.includes(src)) {
+      urls.push(src);
+    }
+  }
+  return urls;
+}
+
 /**
  * Parse a date string with fallback for non-standard formats
  * @param {string|Date} dateInput - Date string or Date object
@@ -232,6 +254,14 @@ export function normalizeItem(item, feedUrl, feedType) {
     }
   }
 
+  // Extract images from HTML content as fallback
+  if (!normalized.photo && normalized.content?.html) {
+    const extracted = extractImagesFromHtml(normalized.content.html);
+    if (extracted.length > 0) {
+      normalized.photo = extracted;
+    }
+  }
+
   return normalized;
 }
 
@@ -395,6 +425,14 @@ export function normalizeJsonFeedItem(item, feedUrl) {
     normalized["bookmark-of"] = [item.external_url];
   }
 
+  // Extract images from HTML content as fallback
+  if (!normalized.photo && normalized.content?.html) {
+    const extracted = extractImagesFromHtml(normalized.content.html);
+    if (extracted.length > 0) {
+      normalized.photo = extracted;
+    }
+  }
+
   return normalized;
 }
 
@@ -568,6 +606,14 @@ export function normalizeHfeedItem(entry, feedUrl) {
     normalized.syndication = properties.syndication;
   }
 
+  // Extract images from HTML content as fallback
+  if (!normalized.photo && normalized.content?.html) {
+    const extracted = extractImagesFromHtml(normalized.content.html);
+    if (extracted.length > 0) {
+      normalized.photo = extracted;
+    }
+  }
+
   return normalized;
 }
 
diff --git a/package.json b/package.json
index da64cd1..8ca2424 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@rmdes/indiekit-endpoint-microsub",
-  "version": "1.0.34",
+  "version": "1.0.35",
   "description": "Microsub endpoint for Indiekit. Enables subscribing to feeds and reading content using the Microsub protocol.",
   "keywords": [
     "indiekit",