Diff

plugins/mod_admin_shell.lua @ 11523:5f15ab7c6ae5

Statistics: Rewrite statistics backends to use OpenMetrics The metric subsystem of Prosody has had some shortcomings from the perspective of the current state-of-the-art in metric observability. The OpenMetrics standard [0] is a formalization of the data model (and serialization format) of the well-known and widely-used Prometheus [1] software stack. The previous stats subsystem of Prosody did not map well to that format (see e.g. [2] and [3]); the key reason is that it was trying to do too much math on its own ([2]) while lacking first-class support for "families" of metrics ([3]) and structured metric metadata (despite the `extra` argument to metrics, there was no standard way of representing common things like "tags" or "labels"). Even though OpenMetrics has grown from the Prometheus world of monitoring, it maps well to other popular monitoring stacks such as: - InfluxDB (labels can be mapped to tags and fields as necessary) - Carbon/Graphite (labels can be attached to the metric name with dot-separation) - StatsD (see graphite when assuming that graphite is used as backend, which is the default) The util.statsd module has been ported to use the OpenMetrics model as a proof of concept. An implementation which exposes the util.statistics backend data as Prometheus metrics is ready for publishing in prosody-modules (most likely as mod_openmetrics_prometheus to avoid breaking existing 0.11 deployments). At the same time, the previous measure()-based API had one major advantage: It is really simple and easy to use without requiring lots of knowledge about OpenMetrics or similar concepts. For that reason as well as compatibility with existing code, it is preserved and may even be extended in the future. However, code relying on the `stats-updated` event as well as `get_stats` from `statsmanager` will break because the data model has changed completely; in case of `stats-updated`, the code will simply not run (as the event was renamed in order to avoid conflicts); the `get_stats` function has been removed completely (so it will cause a traceback when it is attempted to be used). Note that the measure_*_event methods have been removed from the module API. I was unable to find any uses or documentation and thus deemed they should not be ported. Re-implementation is possible when necessary. [0]: https://openmetrics.io/ [1]: https://prometheus.io/ [2]: #959 [3]: #960
author Jonas Schäfer <jonas@wielicki.name>
date Sun, 18 Apr 2021 11:47:41 +0200
parent 11504:1f700f5f62cb
child 11601:9483728f890f
line wrap: on
line diff
--- a/plugins/mod_admin_shell.lua	Sun Apr 25 17:32:27 2021 +0200
+++ b/plugins/mod_admin_shell.lua	Sun Apr 18 11:47:41 2021 +0200
@@ -36,6 +36,9 @@
 local serialize_config = serialization.new ({ fatal = false, unquoted = true});
 local time = require "util.time";
 
+local t_insert = table.insert;
+local t_concat = table.concat;
+
 local format_number = require "util.human.units".format;
 local format_table = require "util.human.io".table;
 
@@ -1342,187 +1345,112 @@
 	bytes = "B",
 };
 
-local function format_stat(type, unit, value, ref_value)
-	ref_value = ref_value or value;
-	--do return tostring(value) end
-	if not unit then
-		if type == "duration" then
-			unit = "seconds"
-		elseif type == "size" then
-			unit = "bytes";
-		elseif type == "rate" then
-			unit = " events/sec"
-			if ref_value < 0.9 then
-				unit = "events/min"
-				value = value*60;
-				if ref_value < 0.6/60 then
-					unit = "events/h"
-					value = value*60;
-				end
+local stats_methods = {};
+
+function stats_methods:render_single_fancy_histogram_ex(print, prefix, metric_family, metric, cumulative)
+	local creation_timestamp, sum, count
+	local buckets = {}
+	local prev_bucket_count = 0
+	for suffix, extra_labels, value in metric:iter_samples() do
+		if suffix == "_created" then
+			creation_timestamp = value
+		elseif suffix == "_sum" then
+			sum = value
+		elseif suffix == "_count" then
+			count = value
+		else
+			local bucket_threshold = extra_labels["le"]
+			local bucket_count
+			if cumulative then
+				bucket_count = value
+			else
+				bucket_count = value - prev_bucket_count
+				prev_bucket_count = value
 			end
-			return ("%.3g %s"):format(value, unit);
+			if bucket_threshold == "+Inf" then
+				t_insert(buckets, {threshold = 1/0, count = bucket_count})
+			elseif bucket_threshold ~= nil then
+				t_insert(buckets, {threshold = tonumber(bucket_threshold), count = bucket_count})
+			end
 		end
 	end
-	return format_number(value, short_units[unit] or unit or "", unit == "bytes" and 'b' or nil);
-end
+
+	if #buckets == 0 or not creation_timestamp or not sum or not count then
+		print("[no data or not a histogram]")
+		return false
+	end
 
-local stats_methods = {};
-function stats_methods:bounds(_lower, _upper)
-	for _, stat_info in ipairs(self) do
-		local data = stat_info[4];
-		if data then
-			local lower = _lower or data.min;
-			local upper = _upper or data.max;
-			local new_data = {
-				min = lower;
-				max = upper;
-				samples = {};
-				sample_count = 0;
-				count = data.count;
-				units = data.units;
-			};
-			local sum = 0;
-			for _, v in ipairs(data.samples) do
-				if v > upper then
-					break;
-				elseif v>=lower then
-					table.insert(new_data.samples, v);
-					sum = sum + v;
-				end
-			end
-			new_data.sample_count = #new_data.samples;
-			stat_info[4] = new_data;
-			stat_info[3] = sum/new_data.sample_count;
+	local graph_width, graph_height, wscale = #buckets, 10, 1;
+	if graph_width < 8 then
+		wscale = 8
+	elseif graph_width < 16 then
+		wscale = 4
+	elseif graph_width < 32 then
+		wscale = 2
+	end
+	local eighth_chars = "   ▁▂▃▄▅▆▇█";
+
+	local max_bin_samples = 0
+	for _, bucket in ipairs(buckets) do
+		if bucket.count > max_bin_samples then
+			max_bin_samples = bucket.count
 		end
 	end
-	return self;
+
+	print("");
+	print(prefix)
+	print(("_"):rep(graph_width*wscale).." "..max_bin_samples);
+	for row = graph_height, 1, -1 do
+		local row_chars = {};
+		local min_eighths, max_eighths = 8, 0;
+		for i = 1, #buckets do
+			local char_eighths = math.ceil(math.max(math.min((graph_height/(max_bin_samples/buckets[i].count))-(row-1), 1), 0)*8);
+			if char_eighths < min_eighths then
+				min_eighths = char_eighths;
+			end
+			if char_eighths > max_eighths then
+				max_eighths = char_eighths;
+			end
+			if char_eighths == 0 then
+				row_chars[i] = ("-"):rep(wscale);
+			else
+				local char = eighth_chars:sub(char_eighths*3+1, char_eighths*3+3);
+				row_chars[i] = char:rep(wscale);
+			end
+		end
+		print(table.concat(row_chars).."|- "..string.format("%.8g", math.ceil((max_bin_samples/graph_height)*(row-0.5))));
+	end
+
+	local legend_pat = string.format("%%%d.%dg", wscale-1, wscale-1)
+	local row = {}
+	for i = 1, #buckets do
+		local threshold = buckets[i].threshold
+		t_insert(row, legend_pat:format(threshold))
+	end
+	t_insert(row, " " .. metric_family.unit)
+	print(t_concat(row, "/"))
+
+	return true
 end
 
-function stats_methods:trim(lower, upper)
-	upper = upper or (100-lower);
-	local statistics = require "util.statistics";
-	for _, stat_info in ipairs(self) do
-		-- Strip outliers
-		local data = stat_info[4];
-		if data then
-			local new_data = {
-				min = statistics.get_percentile(data, lower);
-				max = statistics.get_percentile(data, upper);
-				samples = {};
-				sample_count = 0;
-				count = data.count;
-				units = data.units;
-			};
-			local sum = 0;
-			for _, v in ipairs(data.samples) do
-				if v > new_data.max then
-					break;
-				elseif v>=new_data.min then
-					table.insert(new_data.samples, v);
-					sum = sum + v;
-				end
-			end
-			new_data.sample_count = #new_data.samples;
-			stat_info[4] = new_data;
-			stat_info[3] = sum/new_data.sample_count;
-		end
-	end
-	return self;
+function stats_methods:render_single_fancy_histogram(print, prefix, metric_family, metric)
+	return self:render_single_fancy_histogram_ex(print, prefix, metric_family, metric, false)
 end
 
-function stats_methods:max(upper)
-	return self:bounds(nil, upper);
-end
-
-function stats_methods:min(lower)
-	return self:bounds(lower, nil);
-end
-
-function stats_methods:summary()
-	local statistics = require "util.statistics";
-	for _, stat_info in ipairs(self) do
-		local type, value, data = stat_info[2], stat_info[3], stat_info[4];
-		if data and data.samples then
-			table.insert(stat_info.output, string.format("Count: %d (%d captured)",
-				data.count,
-				data.sample_count
-			));
-			table.insert(stat_info.output, string.format("Min: %s  Mean: %s  Max: %s",
-				format_stat(type, data.units, data.min),
-				format_stat(type, data.units, value),
-				format_stat(type, data.units, data.max)
-			));
-			table.insert(stat_info.output, string.format("Q1: %s  Median: %s  Q3: %s",
-				format_stat(type, data.units, statistics.get_percentile(data, 25)),
-				format_stat(type, data.units, statistics.get_percentile(data, 50)),
-				format_stat(type, data.units, statistics.get_percentile(data, 75))
-			));
-		end
-	end
-	return self;
+function stats_methods:render_single_fancy_histogram_cf(print, prefix, metric_family, metric)
+	-- cf = cumulative frequency
+	return self:render_single_fancy_histogram_ex(print, prefix, metric_family, metric, true)
 end
 
 function stats_methods:cfgraph()
 	for _, stat_info in ipairs(self) do
-		local name, type, value, data = unpack(stat_info, 1, 4); -- luacheck: ignore 211
+		local family_name, metric_family = unpack(stat_info, 1, 2)
 		local function print(s)
 			table.insert(stat_info.output, s);
 		end
 
-		if data and data.sample_count and data.sample_count > 0 then
-			local raw_histogram = require "util.statistics".get_histogram(data);
-
-			local graph_width, graph_height = 50, 10;
-			local eighth_chars = "   ▁▂▃▄▅▆▇█";
-
-			local range = data.max - data.min;
-
-			if range > 0 then
-				local x_scaling = #raw_histogram/graph_width;
-				local histogram = {};
-				for i = 1, graph_width do
-					histogram[i] = math.max(raw_histogram[i*x_scaling-1] or 0, raw_histogram[i*x_scaling] or 0);
-				end
-
-				print("");
-				print(("_"):rep(52)..format_stat(type, data.units, data.max));
-				for row = graph_height, 1, -1 do
-					local row_chars = {};
-					local min_eighths, max_eighths = 8, 0;
-					for i = 1, #histogram do
-						local char_eighths = math.ceil(math.max(math.min((graph_height/(data.max/histogram[i]))-(row-1), 1), 0)*8);
-						if char_eighths < min_eighths then
-							min_eighths = char_eighths;
-						end
-						if char_eighths > max_eighths then
-							max_eighths = char_eighths;
-						end
-						if char_eighths == 0 then
-							row_chars[i] = "-";
-						else
-							local char = eighth_chars:sub(char_eighths*3+1, char_eighths*3+3);
-							row_chars[i] = char;
-						end
-					end
-					print(table.concat(row_chars).."|-"..format_stat(type, data.units, data.max/(graph_height/(row-0.5))));
-				end
-				print(("\\    "):rep(11));
-				local x_labels = {};
-				for i = 1, 11 do
-					local s = ("%-4s"):format((i-1)*10);
-					if #s > 4 then
-						s = s:sub(1, 3).."…";
-					end
-					x_labels[i] = s;
-				end
-				print(" "..table.concat(x_labels, " "));
-				local units = "%";
-				local margin = math.floor((graph_width-#units)/2);
-				print((" "):rep(margin)..units);
-			else
-				print("[range too small to graph]");
-			end
-			print("");
+		if not self:render_family(print, family_name, metric_family, self.render_single_fancy_histogram_cf) then
+			return self
 		end
 	end
 	return self;
@@ -1530,83 +1458,92 @@
 
 function stats_methods:histogram()
 	for _, stat_info in ipairs(self) do
-		local name, type, value, data = unpack(stat_info, 1, 4); -- luacheck: ignore 211
+		local family_name, metric_family = unpack(stat_info, 1, 2)
 		local function print(s)
 			table.insert(stat_info.output, s);
 		end
 
-		if not data then
-			print("[no data]");
-			return self;
-		elseif not data.sample_count then
-			print("[not a sampled metric type]");
-			return self;
+		if not self:render_family(print, family_name, metric_family, self.render_single_fancy_histogram) then
+			return self
 		end
-
-		local graph_width, graph_height = 50, 10;
-		local eighth_chars = "   ▁▂▃▄▅▆▇█";
-
-		local range = data.max - data.min;
-
-		if range > 0 then
-			local n_buckets = graph_width;
-
-			local histogram = {};
-			for i = 1, n_buckets do
-				histogram[i] = 0;
-			end
-			local max_bin_samples = 0;
-			for _, d in ipairs(data.samples) do
-				local bucket = math.floor(1+(n_buckets-1)/(range/(d-data.min)));
-				histogram[bucket] = histogram[bucket] + 1;
-				if histogram[bucket] > max_bin_samples then
-					max_bin_samples = histogram[bucket];
-				end
-			end
-
-			print("");
-			print(("_"):rep(52)..max_bin_samples);
-			for row = graph_height, 1, -1 do
-				local row_chars = {};
-				local min_eighths, max_eighths = 8, 0;
-				for i = 1, #histogram do
-					local char_eighths = math.ceil(math.max(math.min((graph_height/(max_bin_samples/histogram[i]))-(row-1), 1), 0)*8);
-					if char_eighths < min_eighths then
-						min_eighths = char_eighths;
-					end
-					if char_eighths > max_eighths then
-						max_eighths = char_eighths;
-					end
-					if char_eighths == 0 then
-						row_chars[i] = "-";
-					else
-						local char = eighth_chars:sub(char_eighths*3+1, char_eighths*3+3);
-						row_chars[i] = char;
-					end
-				end
-				print(table.concat(row_chars).."|-"..math.ceil((max_bin_samples/graph_height)*(row-0.5)));
-			end
-			print(("\\    "):rep(11));
-			local x_labels = {};
-			for i = 1, 11 do
-				local s = ("%-4s"):format(format_stat(type, data.units, data.min+range*i/11, data.min):match("^%S+"));
-				if #s > 4 then
-					s = s:sub(1, 3).."…";
-				end
-				x_labels[i] = s;
-			end
-			print(" "..table.concat(x_labels, " "));
-			local units = format_stat(type, data.units, data.min):match("%s+(.+)$") or data.units or "";
-			local margin = math.floor((graph_width-#units)/2);
-			print((" "):rep(margin)..units);
-		else
-			print("[range too small to graph]");
-		end
-		print("");
 	end
 	return self;
 end
 
+function stats_methods:render_single_counter(print, prefix, metric_family, metric)
+	local created_timestamp, current_value
+	for suffix, _, value in metric:iter_samples() do
+		if suffix == "_created" then
+			created_timestamp = value
+		elseif suffix == "_total" then
+			current_value = value
+		end
+	end
+	if current_value and created_timestamp then
+		local base_unit = short_units[metric_family.unit] or metric_family.unit
+		local unit = base_unit .. "/s"
+		local factor = 1
+		if base_unit == "s" then
+			-- be smart!
+			unit = "%"
+			factor = 100
+		elseif base_unit == "" then
+			unit = "events/s"
+		end
+		print(("%-50s %s"):format(prefix, format_number(factor * current_value / (self.now - created_timestamp), unit.." [avg]")));
+	end
+end
+
+function stats_methods:render_single_gauge(print, prefix, metric_family, metric)
+	local current_value
+	for _, _, value in metric:iter_samples() do
+		current_value = value
+	end
+	if current_value then
+		local unit = short_units[metric_family.unit] or metric_family.unit
+		print(("%-50s %s"):format(prefix, format_number(current_value, unit)));
+	end
+end
+
+function stats_methods:render_single_summary(print, prefix, metric_family, metric)
+	local sum, count
+	for suffix, _, value in metric:iter_samples() do
+		if suffix == "_sum" then
+			sum = value
+		elseif suffix == "_count" then
+			count = value
+		end
+	end
+	if sum and count then
+		local unit = short_units[metric_family.unit] or metric_family.unit
+		if count == 0 then
+			print(("%-50s %s"):format(prefix, "no obs."));
+		else
+			print(("%-50s %s"):format(prefix, format_number(sum / count, unit.."/event [avg]")));
+		end
+	end
+end
+
+function stats_methods:render_family(print, family_name, metric_family, render_func)
+	local labelkeys = metric_family.label_keys
+	if #labelkeys > 0 then
+		print(family_name)
+		for labelset, metric in metric_family:iter_metrics() do
+			local labels = {}
+			for i, k in ipairs(labelkeys) do
+				local v = labelset[i]
+				t_insert(labels, ("%s=%s"):format(k, v))
+			end
+			local prefix = "  "..t_concat(labels, " ")
+			render_func(self, print, prefix, metric_family, metric)
+		end
+	else
+		for _, metric in metric_family:iter_metrics() do
+			render_func(self, print, family_name, metric_family, metric)
+		end
+	end
+end
+
 local function stats_tostring(stats)
 	local print = stats.session.print;
 	for _, stat_info in ipairs(stats) do
@@ -1618,7 +1555,14 @@
 			end
 			print("");
 		else
-			print(("%-50s %s"):format(stat_info[1], format_stat(stat_info[2], (stat_info[4] or {}).units, stat_info[3])));
+			local metric_family = stat_info[2]
+			if metric_family.type_ == "counter" then
+				stats:render_family(print, stat_info[1], metric_family, stats.render_single_counter)
+			elseif metric_family.type_ == "gauge" or metric_family.type_ == "unknown" then
+				stats:render_family(print, stat_info[1], metric_family, stats.render_single_gauge)
+			elseif metric_family.type_ == "summary" or metric_family.type_ == "histogram" then
+				stats:render_family(print, stat_info[1], metric_family, stats.render_single_summary)
+			end
 		end
 	end
 	return #stats.." statistics displayed";
@@ -1626,23 +1570,29 @@
 
 local stats_mt = {__index = stats_methods, __tostring = stats_tostring }
 local function new_stats_context(self)
-	return setmetatable({ session = self.session, stats = true }, stats_mt);
+	-- TODO: instead of now(), it might be better to take the time of the last
+	-- interval, if the statistics backend is set to use periodic collection
+	-- Otherwise we get strange stuff like average cpu usage decreasing until
+	-- the next sample and so on.
+	return setmetatable({ session = self.session, stats = true, now = time.now() }, stats_mt);
 end
 
-function def_env.stats:show(filter)
-	-- luacheck: ignore 211/changed
-	local stats, changed, extra = require "core.statsmanager".get_stats();
-	local available, displayed = 0, 0;
+function def_env.stats:show(name_filter)
+	local statsman = require "core.statsmanager"
+	local collect = statsman.collect
+	if collect then
+		-- force collection if in manual mode
+		collect()
+	end
+	local metric_registry = statsman.get_metric_registry();
 	local displayed_stats = new_stats_context(self);
-	for name, value in iterators.sorted_pairs(stats) do
-		available = available + 1;
-		if not filter or name:match(filter) then
-			displayed = displayed + 1;
-			local type = name:match(":(%a+)$");
+	for family_name, metric_family in iterators.sorted_pairs(metric_registry:get_metric_families()) do
+		if not name_filter or family_name:match(name_filter) then
 			table.insert(displayed_stats, {
-				name, type, value, extra[name];
-				output = {};
-			});
+				family_name,
+				metric_family,
+				output = {}
+			})
 		end
 	end
 	return displayed_stats;