Changeset

12797:be09ac8300a7 0.11 0.11.14

util.stanza: Allow U+7F. It is allowed by XML despite arguably being a control character. This change drops the part of the range meant to rule out octets invalid in UTF-8 (\247 starts a 4-byte sequence), since UTF-8 correctness is validated by util.encodings.utf8.valid().
author Kim Alvefur <zash@zash.se>
date Tue, 22 Nov 2022 23:56:01 +0100
parents 12210:458c5f8d5d3e
children 12798:c4b1b5cbc20b
files util/stanza.lua
diffstat 1 files changed, 5 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/util/stanza.lua	Mon Jan 24 13:58:04 2022 +0000
+++ b/util/stanza.lua	Tue Nov 22 23:56:01 2022 +0100
@@ -45,8 +45,12 @@
 local stanza_mt = { __name = "stanza" };
 stanza_mt.__index = stanza_mt;
 
+-- Basic check for valid XML character data.
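+-- Disallow control characters (NOTE: "\20" is a decimal escape in Lua, i.e. U+14 rather than U+20, so U+14..U+1F are not rejected).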
+-- Tab U+09, newline U+0A and carriage return U+0D are allowed.
+-- For attributes, allow the \1 separator between namespace and name.
 local function valid_xml_cdata(str, attr)
-	return not s_find(str, attr and "[^\1\9\10\13\20-~\128-\247]" or "[^\9\10\13\20-~\128-\247]");
+	return not s_find(str, attr and "[^\1\9\10\13\20-\255]" or "[^\9\10\13\20-\255]");
 end
 
 local function check_name(name, name_type)