# HG changeset patch
# User Kim Alvefur
# Date 1639090105 -3600
# Node ID 9c2af2146ee236c58529f67b4162184183fed4b9
# Parent 5f12c75fd210685d228e43cbf907a6f99f2ad3f0
mod_export_skeletons: Command to aid in analysis of archive contents

diff -r 5f12c75fd210 -r 9c2af2146ee2 mod_export_skeletons/README.md
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_export_skeletons/README.md	Thu Dec 09 23:48:25 2021 +0100
@@ -0,0 +1,41 @@
+---
+summary: Export message archives in sanitized minimal form for analysis
+---
+
+Exports message archives in a format stripped of private information
+and message content.
+
+# Usage
+
+    prosodyctl mod_export_skeletons [options] user@host*
+
+Multiple user JIDs can be given.
+
+Some storage drivers such as [SQL][doc:modules:mod_storage_sql] allow
+exporting all users at once by giving the special username `*`, i.e.
+`prosodyctl mod_export_skeletons \*@example.com`.
+
+`--start=timestamp`
+: Start of time span to export in [XEP-0082] format
+
+`--end=timestamp`
+: End of time span to export in [XEP-0082] format
+
+# Output
+
+All content is stripped, leaving only the basic XML structure, with
+child tags sorted.
+
+Top level attributes are given special treatment since they carry
+protocol semantics. Notably the `@to` and `@from` JIDs are replaced by
+symbolic labels to convey what form (bare, full or host) they had. The
+`@id` attribute is replaced with a string of the same length.
+
+## Example
+
+```xml
+
+
+
+
+```
diff -r 5f12c75fd210 -r 9c2af2146ee2 mod_export_skeletons/mod_export_skeletons.lua
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_export_skeletons/mod_export_skeletons.lua	Thu Dec 09 23:48:25 2021 +0100
@@ -0,0 +1,107 @@
+
+local t_insert = table.insert;
+local t_sort = table.sort;
+
+local sm = require "core.storagemanager";
+local um = require "core.usermanager";
+
+local argparse = require "util.argparse";
+local dt = require "util.datetime";
+local jid = require "util.jid";
+local st = require "util.stanza";
+
+-- Strict "less than" for sibling skeletons: namespace first, then tag name.
+-- A missing xmlns is treated as "" in both the equality test and the
+-- ordering, so an absent and an empty namespace never confuse table.sort.
+local function skeleton_lt(a, b)
+	local a_ns, b_ns = a.attr.xmlns or "", b.attr.xmlns or "";
+	if a_ns == b_ns then return a.name < b.name; end
+	return a_ns < b_ns;
+end
+
+-- Builds a copy of `s` that keeps only each element's name and xmlns.
+-- Only the entries of `s.tags` are carried over; each of them is
+-- reduced recursively and the results are added in sorted order.
+local function skeleton(s)
+	local o = st.stanza(s.name, { xmlns = s.attr.xmlns });
+
+	local children = {};
+	for _, child in ipairs(s.tags) do t_insert(children, skeleton(child)) end
+	t_sort(children, skeleton_lt);
+	for _, child in ipairs(children) do o:add_direct_child(child); end
+	return o;
+end
+
+-- Maps a JID string to a label for its form: "full" (has a resource),
+-- "bare" (user@host), "host", or "invalid" when jid.split() yields nothing.
+-- A missing value (nil or false) maps to the empty string.
+local function classify_jid(s)
+	if not s then return "" end
+	local u, h, r = jid.split(s);
+	if r then
+		return "full"
+	elseif u then
+		return "bare"
+	elseif h then
+		return "host"
+	else
+		return "invalid"
+	end
+end
+
+-- Reports a problem on standard error; returns the exit code to use.
+local function fail(fmt, ...)
+	io.stderr:write(string.format(fmt, ...), "\n");
+	return 1;
+end
+
+-- Implements `prosodyctl mod_export_skeletons [options] user@host*`.
+-- Options: --store=name (default "archive"), --with=jid, --start=timestamp,
+-- --end=timestamp. For every JID given, each archived stanza is printed to
+-- standard output as a skeleton with normalized top level attributes.
+-- Returns 1 after reporting invalid options or JIDs on standard error.
+function module.command(arg)
+	local opts, err, where = argparse.parse(arg, { value_params = { store = true; with = true; start = true; ["end"] = true } });
+	if not opts then
+		-- NOTE(review): assumes util.argparse signals failure as nil, error, parameter
+		return fail("Error: %s: %s", tostring(err), tostring(where));
+	end
+	local store = opts.store or "archive"; -- so you can pass 'archive2'
+	opts.store = nil;
+
+	-- An option that was given but does not parse must not silently widen the export
+	local query = { with = jid.prep(opts.with); start = dt.parse(opts.start); ["end"] = dt.parse(opts["end"]) };
+	if opts.with and not query.with then return fail("Error: invalid JID in --with: %s", opts.with); end
+	if opts.start and not query.start then return fail("Error: invalid timestamp in --start: %s", opts.start); end
+	if opts["end"] and not query["end"] then return fail("Error: invalid timestamp in --end: %s", opts["end"]); end
+
+	local host_initialized = {};
+	for _, export_jid in ipairs(arg) do
+
+		local username, host = jid.split(export_jid);
+		-- Without a host there is nothing to initialize and no store to open
+		if not host then return fail("Error: invalid JID: %s", export_jid); end
+		if not host_initialized[host] then
+			sm.initialize_host(host);
+			um.initialize_host(host);
+			host_initialized[host] = true;
+		end
+
+		local archive = module:context(host):open_store(store, "archive");
+		-- The special username "*" is passed to find() as false instead of a name
+		local iter, total = assert(archive:find(username ~= "*" and username, query))
+		if total then io.stderr:write(string.format("Processing %d entries\n", total)); end
+		for _, item in iter do
+			local clean = skeleton(item);
+
+			-- Normalize top level attributes
+			clean.attr.type = item.attr.type;
+			if clean.attr.type == nil and clean.name == "message" then clean.attr.type = "normal"; end
+			clean.attr.id = string.rep("x", #(item.attr.id or "")); -- worth rounding to nearest power of two or so?
+			clean.attr.from = classify_jid(item.attr.from);
+			clean.attr.to = classify_jid(item.attr.to);
+			print(clean);
+		end
+
+	end
+end