mirror of
https://github.com/svemagie/indiekit-endpoint-microsub.git
synced 2026-04-02 15:35:00 +02:00
feat: extract images from HTML content for photo-based feeds
Feeds like xkcd embed images as <img> tags in the RSS description rather than using enclosures or media:content. Similarly, h-feed photo posts may have images only in e-content HTML without explicit u-photo properties. Add extractImagesFromHtml() that pulls <img> src URLs from sanitized HTML content as a fallback when no explicit photos exist. Applied to all three normalizers (RSS/Atom, JSON Feed, h-feed). This makes comics, photo posts, and other image-centric feeds display their images in the reader timeline card's photo grid.
This commit is contained in:
@@ -7,6 +7,28 @@ import crypto from "node:crypto";
|
||||
|
||||
import sanitizeHtml from "sanitize-html";
|
||||
|
||||
/**
 * Extract image URLs from HTML content.
 * Used as a fallback when no explicit photo/enclosure is provided.
 *
 * Only the real `src` attribute of each `<img>` tag is read: look-alike
 * attributes such as `data-src` are ignored, the value must be wrapped in a
 * matching pair of quotes, and the basic HTML entities that a serializer
 * writes into attribute values (e.g. `&amp;`) are decoded so the returned
 * strings are usable URLs.
 * @param {string} html - HTML content (already sanitized)
 * @returns {string[]} Unique image URLs in document order; empty when the
 *   input is not a non-empty string or contains no usable `<img src>`
 */
function extractImagesFromHtml(html) {
  if (typeof html !== "string" || html === "") {
    return [];
  }

  // `\ssrc` requires whitespace before the attribute name so `data-src=`
  // does not match; the two alternatives pair the opening and closing quote.
  // A fresh literal per call keeps the /g `lastIndex` state local.
  const imgSrcPattern =
    /<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>/gi;

  // Entities that appear in escaped attribute values. Decoding happens in a
  // single pass so "&amp;lt;" becomes "&lt;" rather than "<".
  const entityPattern = /&(?:amp|quot|lt|gt|#39|#x27);/gi;
  const entityMap = {
    "&amp;": "&",
    "&quot;": '"',
    "&lt;": "<",
    "&gt;": ">",
    "&#39;": "'",
    "&#x27;": "'",
  };

  // A Set preserves insertion order and removes duplicates in O(1) per URL.
  const urls = new Set();
  for (const match of html.matchAll(imgSrcPattern)) {
    const rawValue = match[1] ?? match[2] ?? "";
    const src = rawValue
      .replace(entityPattern, (entity) => entityMap[entity.toLowerCase()])
      .trim();
    if (src) {
      urls.add(src);
    }
  }

  return [...urls];
}
|
||||
|
||||
/**
|
||||
* Parse a date string with fallback for non-standard formats
|
||||
* @param {string|Date} dateInput - Date string or Date object
|
||||
@@ -232,6 +254,14 @@ export function normalizeItem(item, feedUrl, feedType) {
|
||||
}
|
||||
}
|
||||
|
||||
// Extract images from HTML content as fallback
|
||||
if (!normalized.photo && normalized.content?.html) {
|
||||
const extracted = extractImagesFromHtml(normalized.content.html);
|
||||
if (extracted.length > 0) {
|
||||
normalized.photo = extracted;
|
||||
}
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
@@ -395,6 +425,14 @@ export function normalizeJsonFeedItem(item, feedUrl) {
|
||||
normalized["bookmark-of"] = [item.external_url];
|
||||
}
|
||||
|
||||
// Extract images from HTML content as fallback
|
||||
if (!normalized.photo && normalized.content?.html) {
|
||||
const extracted = extractImagesFromHtml(normalized.content.html);
|
||||
if (extracted.length > 0) {
|
||||
normalized.photo = extracted;
|
||||
}
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
@@ -568,6 +606,14 @@ export function normalizeHfeedItem(entry, feedUrl) {
|
||||
normalized.syndication = properties.syndication;
|
||||
}
|
||||
|
||||
// Extract images from HTML content as fallback
|
||||
if (!normalized.photo && normalized.content?.html) {
|
||||
const extracted = extractImagesFromHtml(normalized.content.html);
|
||||
if (extracted.length > 0) {
|
||||
normalized.photo = extracted;
|
||||
}
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user