Comparison

util/statistics.lua @ 11523:5f15ab7c6ae5

Statistics: Rewrite statistics backends to use OpenMetrics

The metric subsystem of Prosody has had some shortcomings from the perspective of the current state of the art in metric observability.

The OpenMetrics standard [0] is a formalization of the data model (and serialization format) of the well-known and widely used Prometheus [1] software stack.

The previous stats subsystem of Prosody did not map well to that format (see e.g. [2] and [3]); the key reason is that it was trying to do too much math on its own ([2]) while lacking first-class support for "families" of metrics ([3]) and structured metric metadata (despite the `extra` argument to metrics, there was no standard way of representing common things like "tags" or "labels").

Even though OpenMetrics grew out of the Prometheus world of monitoring, it maps well to other popular monitoring stacks such as:

- InfluxDB (labels can be mapped to tags and fields as necessary)
- Carbon/Graphite (labels can be attached to the metric name with dot-separation)
- StatsD (handled like Graphite, assuming that Graphite is used as the backend, which is the default)

The util.statsd module has been ported to use the OpenMetrics model as a proof of concept. An implementation which exposes the util.statistics backend data as Prometheus metrics is ready for publishing in prosody-modules (most likely as mod_openmetrics_prometheus to avoid breaking existing 0.11 deployments).

At the same time, the previous measure()-based API had one major advantage: it is really simple and easy to use without requiring lots of knowledge about OpenMetrics or similar concepts. For that reason, as well as for compatibility with existing code, it is preserved and may even be extended in the future.
However, code relying on the `stats-updated` event as well as `get_stats` from `statsmanager` will break because the data model has changed completely; in the case of `stats-updated`, the code will simply not run (as the event was renamed in order to avoid conflicts); the `get_stats` function has been removed completely (so any attempt to call it will cause a traceback).

Note that the measure_*_event methods have been removed from the module API. I was unable to find any uses or documentation and thus decided that they should not be ported. Re-implementation is possible when necessary.

[0]: https://openmetrics.io/
[1]: https://prometheus.io/
[2]: #959
[3]: #960
author Jonas Schäfer <jonas@wielicki.name>
date Sun, 18 Apr 2021 11:47:41 +0200
parent 10883:d75d805c852f
child 12123:7ba686696250
comparison
equal deleted inserted replaced
11522:5bd38d9197e1 11523:5f15ab7c6ae5
1 local t_sort = table.sort
2 local m_floor = math.floor;
3 local time = require "util.time".now; 1 local time = require "util.time".now;
2 local new_metric_registry = require "util.openmetrics".new_metric_registry;
3 local render_histogram_le = require "util.openmetrics".render_histogram_le;
4 4
5 local function nop_function() end 5 -- BEGIN of Metric implementations
6 6
7 local function percentile(arr, length, pc) 7 -- Gauges
8 local n = pc/100 * (length + 1); 8 local gauge_metric_mt = {}
9 local k, d = m_floor(n), n%1; 9 gauge_metric_mt.__index = gauge_metric_mt
10 if k == 0 then 10
11 return arr[1] or 0; 11 local function new_gauge_metric()
12 elseif k >= length then 12 local metric = { value = 0 }
13 return arr[length]; 13 setmetatable(metric, gauge_metric_mt)
14 end 14 return metric
15 return arr[k] + d*(arr[k+1] - arr[k]);
16 end 15 end
17 16
18 local function new_registry(config) 17 function gauge_metric_mt:set(value)
19 config = config or {}; 18 self.value = value
20 local duration_sample_interval = config.duration_sample_interval or 5; 19 end
21 local duration_max_samples = config.duration_max_stored_samples or 5000;
22 20
23 local function get_distribution_stats(events, n_actual_events, since, new_time, units) 21 function gauge_metric_mt:add(delta)
24 local n_stored_events = #events; 22 self.value = self.value + delta
25 t_sort(events); 23 end
26 local sum = 0; 24
27 for i = 1, n_stored_events do 25 function gauge_metric_mt:reset()
28 sum = sum + events[i]; 26 self.value = 0
27 end
28
29 function gauge_metric_mt:iter_samples()
30 local done = false
31 return function(_s)
32 if done then
33 return nil, true
29 end 34 end
35 done = true
36 return "", nil, _s.value
37 end, self
38 end
30 39
31 return { 40 -- Counters
32 samples = events; 41 local counter_metric_mt = {}
33 sample_count = n_stored_events; 42 counter_metric_mt.__index = counter_metric_mt
34 count = n_actual_events, 43
35 rate = n_actual_events/(new_time-since); 44 local function new_counter_metric()
36 average = n_stored_events > 0 and sum/n_stored_events or 0, 45 local metric = {
37 min = events[1] or 0, 46 _created = time(),
38 max = events[n_stored_events] or 0, 47 value = 0,
39 units = units, 48 }
40 }; 49 setmetatable(metric, counter_metric_mt)
50 return metric
51 end
52
--- Overwrite the counter's stored value.
-- The given value replaces whatever was stored before; no check is made
-- that the new value is not smaller than the old one.
-- @param value the value to store
function counter_metric_mt:set(value)
	self.value = value
end
56
--- Add an amount to the counter.
-- A missing (nil or false) stored value is treated as 0 before adding.
-- @param value the amount to add to the current value
function counter_metric_mt:add(value)
	local current = self.value or 0
	self.value = current + value
end
60
--- Iterate over the samples of this counter.
-- Returns an iterator function plus the metric itself as its state, for use
-- in a generic `for`. The iterator yields, in order:
--   1. "_created", nil, <creation timestamp>
--   2. "_total",   nil, <current value>
-- and on every later call returns nil, nil, true; the leading nil ends the loop.
function counter_metric_mt:iter_samples()
	-- number of times the iterator has been invoked so far
	local calls = 0

	local function emit(metric)
		calls = calls + 1
		if calls > 2 then
			return nil, nil, true
		end
		if calls == 2 then
			return "_total", nil, metric.value
		end
		return "_created", nil, metric._created
	end

	return emit, self
end
74
--- Reset the counter to its initial state.
-- Zeroes the value and refreshes the creation timestamp. The histogram and
-- summary metrics in this file also renew `_created` when they are reset;
-- without doing the same here, a counter restarted from 0 would still be
-- reported with the timestamp of its original creation.
function counter_metric_mt:reset()
	self._created = time()
	self.value = 0
end
78
79 -- Histograms
80 local histogram_metric_mt = {}
81 histogram_metric_mt.__index = histogram_metric_mt
82
83 local function new_histogram_metric(buckets)
84 local metric = {
85 _created = time(),
86 _sum = 0,
87 _count = 0,
88 }
89 -- the order of buckets matters unfortunately, so we cannot directly use
90 -- the thresholds as table keys
91 for i, threshold in ipairs(buckets) do
92 metric[i] = {
93 threshold = threshold,
94 threshold_s = render_histogram_le(threshold),
95 count = 0
96 }
41 end 97 end
98 setmetatable(metric, histogram_metric_mt)
99 return metric
100 end
42 101
--- Record one observation in the histogram.
-- Buckets are cumulative: the observation is counted in every bucket whose
-- upper bound is greater than or equal to it. Bucket bounds are exported
-- under the "le" (less-or-equal) label, so an observation that lies exactly
-- on a bound must be counted in that bucket as well.
-- Also adds the observation to the running sum and bumps the total count.
-- @param value the observed value (a number)
function histogram_metric_mt:sample(value)
	-- According to the I-D, values must be part of all buckets.
	-- The buckets occupy the integer keys 1..n of the metric table; the
	-- string keys (_created, _sum, _count) are not visited by ipairs.
	for _, bucket in ipairs(self) do
		if value <= bucket.threshold then
			bucket.count = bucket.count + 1
		end
	end
	self._sum = self._sum + value
	self._count = self._count + 1
end
43 112
44 local registry = {}; 113 function histogram_metric_mt:iter_samples()
45 local methods; 114 local key = nil
46 methods = { 115 return function (_s)
47 amount = function (name, conf) 116 local data
48 local v = conf and conf.initial or 0; 117 key, data = next(_s, key)
49 registry[name..":amount"] = function () 118 if key == "_created" or key == "_sum" or key == "_count" then
50 return "amount", v, conf; 119 return key, nil, data
51 end 120 elseif key ~= nil then
52 return function (new_v) v = new_v; end 121 return "_bucket", {["le"] = data.threshold_s}, data.count
53 end; 122 else
54 counter = function (name, conf) 123 return nil, nil, nil
55 local v = conf and conf.initial or 0; 124 end
56 registry[name..":amount"] = function () 125 end, self
57 return "amount", v, conf; 126 end
58 end
59 return function (delta)
60 v = v + delta;
61 end;
62 end;
63 rate = function (name, conf)
64 local since, n, total = time(), 0, 0;
65 registry[name..":rate"] = function ()
66 total = total + n;
67 local t = time();
68 local stats = {
69 rate = n/(t-since);
70 count = n;
71 total = total;
72 units = conf and conf.units;
73 type = conf and conf.type;
74 };
75 since, n = t, 0;
76 return "rate", stats.rate, stats;
77 end;
78 return function ()
79 n = n + 1;
80 end;
81 end;
82 distribution = function (name, conf)
83 local units = conf and conf.units;
84 local type = conf and conf.type or "distribution";
85 local events, last_event = {}, 0;
86 local n_actual_events = 0;
87 local since = time();
88 127
89 registry[name..":"..type] = function () 128 function histogram_metric_mt:reset()
90 local new_time = time(); 129 self._created = time()
91 local stats = get_distribution_stats(events, n_actual_events, since, new_time, units); 130 self._count = 0
92 events, last_event = {}, 0; 131 self._sum = 0
93 n_actual_events = 0; 132 for i, bucket in pairs(self) do
94 since = new_time; 133 if "number" == type(i) then
95 return type, stats.average, stats; 134 bucket.count = 0
96 end; 135 end
136 end
137 end
97 138
98 return function (value) 139 -- Summary
99 n_actual_events = n_actual_events + 1; 140 local summary_metric_mt = {}
100 if n_actual_events%duration_sample_interval == 1 then 141 summary_metric_mt.__index = summary_metric_mt
101 last_event = (last_event%duration_max_samples) + 1;
102 events[last_event] = value;
103 end
104 end;
105 end;
106 sizes = function (name, conf)
107 conf = conf or { units = "bytes", type = "size" }
108 return methods.distribution(name, conf);
109 end;
110 times = function (name, conf)
111 local units = conf and conf.units or "seconds";
112 local events, last_event = {}, 0;
113 local n_actual_events = 0;
114 local since = time();
115 142
116 registry[name..":duration"] = function () 143 local function new_summary_metric()
117 local new_time = time(); 144 -- quantiles are not supported yet
118 local stats = get_distribution_stats(events, n_actual_events, since, new_time, units); 145 local metric = {
119 events, last_event = {}, 0; 146 _created = time(),
120 n_actual_events = 0; 147 _sum = 0,
121 since = new_time; 148 _count = 0,
122 return "duration", stats.average, stats; 149 }
123 end; 150 setmetatable(metric, summary_metric_mt)
151 return metric
152 end
124 153
125 return function () 154 function summary_metric_mt:sample(value)
126 n_actual_events = n_actual_events + 1; 155 self._sum = self._sum + value
127 if n_actual_events%duration_sample_interval ~= 1 then 156 self._count = self._count + 1
128 return nop_function; 157 end
129 end
130 158
131 local start_time = time(); 159 function summary_metric_mt:iter_samples()
132 return function () 160 local key = nil
133 local end_time = time(); 161 return function (_s)
134 local duration = end_time - start_time; 162 local data
135 last_event = (last_event%duration_max_samples) + 1; 163 key, data = next(_s, key)
136 events[last_event] = duration; 164 return key, nil, data
137 end 165 end, self
138 end; 166 end
139 end;
140 167
141 get_stats = function () 168 function summary_metric_mt:reset()
142 return registry; 169 self._created = time()
143 end; 170 self._count = 0
144 }; 171 self._sum = 0
145 return methods; 172 end
173
-- Constructors of the metric implementations defined above, keyed by metric
-- type name. This table is what new() passes to new_metric_registry.
local pull_backend = {
	["gauge"] = new_gauge_metric;
	["counter"] = new_counter_metric;
	["histogram"] = new_histogram_metric;
	["summary"] = new_summary_metric;
}
180
181 -- END of Metric implementations
182
183 local function new()
184 return {
185 metric_registry = new_metric_registry(pull_backend),
186 }
146 end 187 end
147 188
148 return { 189 return {
149 new = new_registry; 190 new = new;
150 get_histogram = function (duration, n_buckets)
151 n_buckets = n_buckets or 100;
152 local events, n_events = duration.samples, duration.sample_count;
153 if not (events and n_events) then
154 return nil, "not a valid distribution stat";
155 end
156 local histogram = {};
157
158 for i = 1, 100, 100/n_buckets do
159 histogram[i] = percentile(events, n_events, i);
160 end
161 return histogram;
162 end;
163
164 get_percentile = function (duration, pc)
165 local events, n_events = duration.samples, duration.sample_count;
166 if not (events and n_events) then
167 return nil, "not a valid distribution stat";
168 end
169 return percentile(events, n_events, pc);
170 end;
171 } 191 }