Changeset

4426:3fe2c264aac4

mod_pubsub_summary: Mangle HTML payloads in Atom/RSS feeds. This module is meant for use with mod_pubsub_feeds and tries to improve on mod_pubsub's built-in Atom summary generator.
author Kim Alvefur <zash@zash.se>
date Thu, 04 Feb 2021 01:12:41 +0100
parents 4425:b3e0295e14a3
children 4427:c402b273f2e3
files mod_pubsub_summary/mod_pubsub_summary.lua
diffstat 1 files changed, 42 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_pubsub_summary/mod_pubsub_summary.lua	Thu Feb 04 01:12:41 2021 +0100
@@ -0,0 +1,42 @@
+-- No, not trying to parse HTML here. It's an illusion. Just trying to read RSS feeds.
+--
+-- Compose a textual representation of Atom payloads
+--
+-- Handler for the "pubsub-summary/http://www.w3.org/2005/Atom" event.
+-- Reads event.payload and returns a plain-text summary assembled from its
+-- <title> text, the text of its <content> child (falling back to <summary>)
+-- and the href attribute of a <link> child. Returns nil when none of these
+-- produce anything.
+--
+-- When the content element carries type="html", a handful of common tags are
+-- rewritten into plain-text equivalents using Lua patterns, any remaining
+-- tags are dropped, and a few named character entities are decoded.
+module:hook("pubsub-summary/http://www.w3.org/2005/Atom", function (event)
+	local payload = event.payload;
+	local title = payload:get_child_text("title");
+	local content_tag = payload:get_child("content") or payload:get_child("summary");
+	-- The payload may have neither <content> nor <summary>; avoid indexing nil
+	-- and fall through to a title/link-only summary in that case.
+	local content = content_tag and content_tag:get_text();
+	-- Only mangle when there is actual text to work on (the summary logic
+	-- further down already treats a missing content as possible).
+	if content and content_tag.attr.type == "html" then
+		-- Paragraphs become text blocks separated by a blank line.
+		content = content:gsub("\n*<p[^>]*>\n*(.-)\n*</p>\n*", "%1\n\n");
+		-- List items become "* item" bullet lines.
+		content = content:gsub("<li>(.-)</li>\n", "* %1\n");
+		-- Anchors become "link text <url>".
+		content = content:gsub("<a[^>]*href=[\"'](.-)[\"'][^>]*>(.-)</a>", "%2 <%1>");
+		-- Bold and italic markup alike is rendered as *emphasis*.
+		content = content:gsub("<b>(.-)</b>", "*%1*");
+		content = content:gsub("<strong>(.-)</strong>", "*%1*");
+		content = content:gsub("<em>(.-)</em>", "*%1*");
+		content = content:gsub("<i>(.-)</i>", "*%1*");
+		-- Images are replaced by their source URL.
+		content = content:gsub("<img[^>]*src=[\"'](.-)[\"'][^>]*>", " %1 "); -- TODO alt= would have been nice to grab
+		content = content:gsub("<br[^>]*>", "\n");
+		-- Drop every tag that was not handled above.
+		content = content:gsub("<[^>]+>", "");
+		-- Trim surrounding whitespace and collapse runs of 3+ newlines.
+		content = content:gsub("^%s*", ""):gsub("%s*$", "");
+		content = content:gsub("\n\n\n+", "\n\n");
+		-- Decode named entities in a single pass, after tag stripping so that
+		-- decoded "<" and ">" are not mistaken for markup. Names missing from
+		-- the table are left untouched by gsub.
+		content = content:gsub("&(%w+);", {
+				apos = "'";
+				quot = '"';
+				lt = "<";
+				gt = ">";
+				amp = "&";
+				-- Plain space where the utf8 library is unavailable.
+				nbsp = utf8 and utf8.char(0xa0) or " ";
+			});
+	end
+	local link = payload:get_child("link");
+	local summary;
+	if title and content then
+		summary = title .. "\n\n" .. content;
+	elseif title or content then
+		summary = content or title;
+	end
+	-- Append the link on its own line unless the content is just that URL.
+	if link and link.attr.href and link.attr.href ~= content then
+		summary = (summary and summary .. "\n" or "") .. link.attr.href;
+	end
+	return summary;
+end, 1);