fix: extract images from HTML content at read time for existing items

The normalizer fix (1.0.35) only applies to newly ingested items. Existing items in MongoDB still lack photo arrays because deduplication prevents them from being re-processed. Add the same extractImagesFromHtml() fallback in transformToJf2() so that, when an item has no explicit photos, images are extracted from content.html at read time. This makes existing xkcd comics and photo posts display their images immediately, without re-ingesting or modifying the stored items.
2026-04-02 15:35:00 +02:00 · 2026-02-25 17:20:53 +01:00
parent ee2cd26208
commit cdd4a58015
2 changed files with 30 additions and 1 deletions
--- a/lib/storage/items.js
+++ b/lib/storage/items.js
@@ -12,6 +12,27 @@ import {
  parseLimit,
 } from "../utils/pagination.js";

+/**
+ * Extract unique image URLs from the quoted src attributes of <img> tags in HTML content (fallback for items without explicit photos)
+ * @param {string} html - HTML content; a falsy value yields an empty array
+ * @returns {string[]} Array of image URLs in order of first match, without duplicates. NOTE(review): the greedy [^>]+ means a later attribute ending in src= (e.g. data-src) is captured instead of src - confirm this is intended
+ */
+function extractImagesFromHtml(html) {
+  if (!html) {
+    return [];
+  }
+  const urls = [];
+  const imgRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
+  let match;
+  while ((match = imgRegex.exec(html)) !== null) {
+    const src = match[1];
+    if (src && !urls.includes(src)) {
+      urls.push(src);
+    }
+  }
+  return urls;
+}
+
 /**
 * Get items collection from application
 * @param {object} application - Indiekit application
@@ -201,6 +222,14 @@ function transformToJf2(item, userId) {
  const videos = normalizeMediaArray(item.video);
  const audios = normalizeMediaArray(item.audio);

+  // Fallback: extract images from HTML content if no explicit photos
+  if (photos.length === 0 && item.content?.html) {
+    const extracted = extractImagesFromHtml(item.content.html);
+    if (extracted.length > 0) {
+      photos.push(...extracted);
+    }
+  }
+
  if (photos.length > 0) jf2.photo = photos;
  if (videos.length > 0) jf2.video = videos;
  if (audios.length > 0) jf2.audio = audios;