mirror of
https://github.com/svemagie/indiekit-endpoint-microsub.git
synced 2026-04-02 15:35:00 +02:00
feat: extract images from HTML content for photo-based feeds
Feeds like xkcd embed images as <img> tags in the RSS description rather than using enclosures or media:content. Similarly, h-feed photo posts may have images only in e-content HTML without explicit u-photo properties. Add extractImagesFromHtml() that pulls <img> src URLs from sanitized HTML content as a fallback when no explicit photos exist. Applied to all three normalizers (RSS/Atom, JSON Feed, h-feed). This makes comics, photo posts, and other image-centric feeds display their images in the reader timeline card's photo grid.
This commit is contained in:
@@ -7,6 +7,28 @@ import crypto from "node:crypto";
|
|||||||
|
|
||||||
import sanitizeHtml from "sanitize-html";
|
import sanitizeHtml from "sanitize-html";
|
||||||
|
|
||||||
|
/**
 * Extract image URLs from HTML content.
 * Used as a fallback when no explicit photo/enclosure is provided.
 *
 * Only the real `src` attribute of each `<img>` tag is considered; look-alike
 * attributes such as `data-src` are ignored. Common HTML character references
 * in the attribute value (e.g. `&amp;` in query strings) are decoded so the
 * result is a usable URL rather than escaped markup. Duplicates are removed
 * and first-seen order is preserved.
 *
 * @param {string} html - HTML content (already sanitized)
 * @returns {string[]} Array of unique image URLs, in document order
 */
function extractImagesFromHtml(html) {
  if (!html) {
    return [];
  }

  // Character references that can appear in an escaped attribute value.
  const entities = {
    "&amp;": "&",
    "&quot;": '"',
    "&#39;": "'",
    "&#x27;": "'",
    "&lt;": "<",
    "&gt;": ">",
  };
  const entityRegex = /&(?:amp|quot|lt|gt|#39|#x27);/gi;

  // `\s` before `src` is the attribute boundary: it keeps `data-src=` and
  // similar from matching. The lazy `[^>]*?` makes the first `src` in the tag
  // win. Double- and single-quoted values are captured separately so a URL
  // containing the other quote character is not truncated.
  const imgRegex = /<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>/gi;

  // A Set dedupes while keeping insertion order.
  const urls = new Set();

  for (const match of String(html).matchAll(imgRegex)) {
    const rawValue = match[1] ?? match[2] ?? "";
    // Decode in a single pass so `&amp;lt;` becomes `&lt;`, not `<`.
    const src = rawValue
      .replace(entityRegex, (entity) => entities[entity.toLowerCase()])
      .trim();

    if (src) {
      urls.add(src);
    }
  }

  return [...urls];
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse a date string with fallback for non-standard formats
|
* Parse a date string with fallback for non-standard formats
|
||||||
* @param {string|Date} dateInput - Date string or Date object
|
* @param {string|Date} dateInput - Date string or Date object
|
||||||
@@ -232,6 +254,14 @@ export function normalizeItem(item, feedUrl, feedType) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract images from HTML content as fallback
|
||||||
|
if (!normalized.photo && normalized.content?.html) {
|
||||||
|
const extracted = extractImagesFromHtml(normalized.content.html);
|
||||||
|
if (extracted.length > 0) {
|
||||||
|
normalized.photo = extracted;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return normalized;
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -395,6 +425,14 @@ export function normalizeJsonFeedItem(item, feedUrl) {
|
|||||||
normalized["bookmark-of"] = [item.external_url];
|
normalized["bookmark-of"] = [item.external_url];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract images from HTML content as fallback
|
||||||
|
if (!normalized.photo && normalized.content?.html) {
|
||||||
|
const extracted = extractImagesFromHtml(normalized.content.html);
|
||||||
|
if (extracted.length > 0) {
|
||||||
|
normalized.photo = extracted;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return normalized;
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -568,6 +606,14 @@ export function normalizeHfeedItem(entry, feedUrl) {
|
|||||||
normalized.syndication = properties.syndication;
|
normalized.syndication = properties.syndication;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract images from HTML content as fallback
|
||||||
|
if (!normalized.photo && normalized.content?.html) {
|
||||||
|
const extracted = extractImagesFromHtml(normalized.content.html);
|
||||||
|
if (extracted.length > 0) {
|
||||||
|
normalized.photo = extracted;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return normalized;
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@rmdes/indiekit-endpoint-microsub",
|
"name": "@rmdes/indiekit-endpoint-microsub",
|
||||||
"version": "1.0.34",
|
"version": "1.0.35",
|
||||||
"description": "Microsub endpoint for Indiekit. Enables subscribing to feeds and reading content using the Microsub protocol.",
|
"description": "Microsub endpoint for Indiekit. Enables subscribing to feeds and reading content using the Microsub protocol.",
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"indiekit",
|
"indiekit",
|
||||||
|
|||||||
Reference in New Issue
Block a user