Software /
code /
prosody-modules
Annotate
mod_pubsub_summary/mod_pubsub_summary.lua @ 4513:ade2064160e3
mod_pubsub_summary: Fix to not strip inline links
Links were turned into `label <url>` and then a later pass removed the
`<url>` part leaving only the label. This should avoid that. Escaping is
hard.
author | Kim Alvefur <zash@zash.se> |
---|---|
date | Wed, 17 Mar 2021 15:52:12 +0100 |
parent | 4507:86a97e7572b2 |
child | 4600:98864dffb231 |
rev | line source |
---|---|
4426
3fe2c264aac4
mod_pubsub_summary: Mangle HTML payloads in Atom/RSS feeds
Kim Alvefur <zash@zash.se>
parents:
diff
changeset
|
1 -- No, not trying to parse HTML here. It's an illusion. Just trying to read RSS feeds. |
3fe2c264aac4
mod_pubsub_summary: Mangle HTML payloads in Atom/RSS feeds
Kim Alvefur <zash@zash.se>
parents:
diff
changeset
|
2 -- |
3fe2c264aac4
mod_pubsub_summary: Mangle HTML payloads in Atom/RSS feeds
Kim Alvefur <zash@zash.se>
parents:
diff
changeset
|
3 -- Compose a textual representation of Atom payloads |
3fe2c264aac4
mod_pubsub_summary: Mangle HTML payloads in Atom/RSS feeds
Kim Alvefur <zash@zash.se>
parents:
diff
changeset
|
-- Named character references understood by the HTML-to-text pass.
-- Built once at load time instead of on every event.
local named_entities = {
	apos = "'";
	quot = '"';
	lt = "<";
	gt = ">";
	amp = "&";
	-- utf8 is only present on Lua 5.3+; fall back to a plain space elsewhere.
	nbsp = utf8 and utf8.char(0xa0) or " ";
};

-- True when the code point is a legal XML 1.0 character. Anything else
-- (control bytes, surrogates, out-of-range values) must not be produced from
-- a numeric character reference.
local function is_xml_char(cp)
	return cp == 0x9 or cp == 0xA or cp == 0xD
		or (cp >= 0x20 and cp <= 0xD7FF)
		or (cp >= 0xE000 and cp <= 0xFFFD)
		or (cp >= 0x10000 and cp <= 0x10FFFF);
end

-- Replacement callback for gsub("&(#?%w+);", ...).
-- ref is the text between "&" and ";", e.g. "amp", "#39" or "#x2019".
-- Returns the decoded text, or nil to leave the reference untouched (gsub
-- keeps the original match when the callback returns nil).
local function decode_entity(ref)
	local named = named_entities[ref];
	if named then
		return named;
	end
	local hex = ref:match("^#[xX](%x+)$");
	local dec = not hex and ref:match("^#(%d+)$");
	local digits = hex or dec;
	-- Bound the digit count so absurdly long references cannot overflow.
	if not digits or #digits > 7 then
		return nil;
	end
	local cp = tonumber(digits, hex and 16 or 10);
	if not cp or not is_xml_char(cp) then
		return nil;
	end
	if cp < 0x80 then
		return string.char(cp);
	elseif utf8 then
		return utf8.char(cp);
	end
	-- No UTF-8 encoder available on this Lua version: keep the reference as-is.
	return nil;
end

-- Handler for the "pubsub-summary/http://www.w3.org/2005/Atom" event.
--
-- Reads event.payload (an Atom entry element) and composes a plain-text
-- summary from its <title>, its <content> (or <summary>) and its <link>
-- children. Returns the summary string, or nil when none of them yielded
-- anything.
--
-- Provenance (from the annotate view of rev 4513:ade2064160e3):
--   4426:3fe2c264aac4  Mangle HTML payloads in Atom/RSS feeds
--   4435:a620bf249e63  Explain why it picks content or summary
--   4436:07529dba102d  Include multiple links (e.g. podcast media)
--   4437:09657f758f53  Skip adding title if already part of summary
--   4438:2bb11055e4bb  Make titles *bold* to stand out more
--   4507:86a97e7572b2  Fix traceback when <content> not included
--   4513:ade2064160e3  Fix to not strip inline links
module:hook("pubsub-summary/http://www.w3.org/2005/Atom", function (event)
	local payload = event.payload;
	local title = payload:get_child_text("title");
	-- Note: This prefers content over summary, it was made for a news feed where
	-- the interesting stuff was in the content and the summary was .. meh.
	local content_tag = payload:get_child("content") or payload:get_child("summary");
	-- Guarded because an entry may carry neither <content> nor <summary>.
	local content = content_tag and content_tag:get_text();
	if content and content_tag.attr.type == "html" then
		-- Paragraphs become blank-line separated blocks, list items become bullets.
		content = content:gsub("\n*<p[^>]*>\n*(.-)\n*</p>\n*", "%1\n\n");
		content = content:gsub("<li>(.-)</li>\n", "* %1\n");
		-- Park links as \1 url \2 label \3 so that the generic tag stripper
		-- further down does not eat the "<url>" part; they are turned into
		-- "label <url>" once the stripping is done.
		content = content:gsub("<a[^>]*href=[\"'](.-)[\"'][^>]*>(.-)</a>", "\1%1\2%2\3");
		-- All emphasis flavours collapse to *text*.
		content = content:gsub("<b>(.-)</b>", "*%1*");
		content = content:gsub("<strong>(.-)</strong>", "*%1*");
		content = content:gsub("<em>(.-)</em>", "*%1*");
		content = content:gsub("<i>(.-)</i>", "*%1*");
		content = content:gsub("<img[^>]*src=[\"'](.-)[\"'][^>]*>", " %1 "); -- TODO alt= would have been nice to grab
		content = content:gsub("<br[^>]*>", "\n");
		-- Drop every remaining tag, then restore the parked links.
		content = content:gsub("<[^>]+>", "");
		content = content:gsub("\1(.-)\2(.-)\3", "%2 <%1>");
		-- Trim leading/trailing whitespace and squeeze runs of blank lines.
		content = content:gsub("^%s*", ""):gsub("%s*$", "");
		content = content:gsub("\n\n\n+", "\n\n");
		-- Decode character references last, in a single pass, so that a decoded
		-- "<" or "&" is never re-interpreted as markup or as another reference.
		-- Handles the named entities above plus numeric references such as
		-- &#39; and &#x2019;, which feeds use heavily.
		content = content:gsub("&(#?%w+);", decode_entity);
	end
	local summary;
	if title and content and content:sub(1, #title) ~= title then
		-- Title is not already the start of the content: put it on top, in
		-- *bold* so it stands out.
		summary = "*" .. title .. "*\n\n" .. content;
	elseif title or content then
		summary = content or title;
	end
	-- Append every <link> (there may be several, e.g. podcast media), one per
	-- line, skipping a link whose href is identical to the content itself.
	for link in payload:childtags("link") do
		if link and link.attr.href and link.attr.href ~= content then
			summary = (summary and summary .. "\n" or "") .. link.attr.href;
			if link.attr.rel then summary = summary .. " [" .. link.attr.rel .. "]" end
		end
	end
	return summary;
end, 1); -- NOTE(review): the trailing 1 is presumably the hook priority; confirm against module:hook