Comparison

util/statsd.lua @ 11523:5f15ab7c6ae5

Statistics: Rewrite statistics backends to use OpenMetrics

The metric subsystem of Prosody has had some shortcomings from the perspective of the current state-of-the-art in metric observability. The OpenMetrics standard [0] is a formalization of the data model (and serialization format) of the well-known and widely-used Prometheus [1] software stack.

The previous stats subsystem of Prosody did not map well to that format (see e.g. [2] and [3]); the key reason is that it was trying to do too much math on its own ([2]) while lacking first-class support for "families" of metrics ([3]) and structured metric metadata (despite the `extra` argument to metrics, there was no standard way of representing common things like "tags" or "labels").

Even though OpenMetrics has grown from the Prometheus world of monitoring, it maps well to other popular monitoring stacks such as:

- InfluxDB (labels can be mapped to tags and fields as necessary)
- Carbon/Graphite (labels can be attached to the metric name with dot-separation)
- StatsD (see Carbon/Graphite above, assuming that Graphite is used as the backend, which is the default)

The util.statsd module has been ported to use the OpenMetrics model as a proof of concept. An implementation which exposes the util.statistics backend data as Prometheus metrics is ready for publishing in prosody-modules (most likely as mod_openmetrics_prometheus to avoid breaking existing 0.11 deployments).

At the same time, the previous measure()-based API had one major advantage: It is really simple and easy to use without requiring lots of knowledge about OpenMetrics or similar concepts. For that reason as well as compatibility with existing code, it is preserved and may even be extended in the future.
However, code relying on the `stats-updated` event as well as `get_stats` from `statsmanager` will break because the data model has changed completely; in the case of `stats-updated`, the code will simply not run (as the event was renamed in order to avoid conflicts); the `get_stats` function has been removed completely (so any attempt to use it will cause a traceback).

Note that the measure_*_event methods have been removed from the module API. I was unable to find any uses or documentation and thus deemed they should not be ported. Re-implementation is possible when necessary.

[0]: https://openmetrics.io/
[1]: https://prometheus.io/
[2]: #959
[3]: #960
author Jonas Schäfer <jonas@wielicki.name>
date Sun, 18 Apr 2021 11:47:41 +0200
parent 10924:0c072dd69603
child 12123:7ba686696250
comparison
equal deleted inserted replaced
11522:5bd38d9197e1 11523:5f15ab7c6ae5
1 local socket = require "socket"; 1 local socket = require "socket";
2 2 local time = require "util.time".now;
3 local time = require "util.time".now 3 local array = require "util.array";
4 local t_concat = table.concat;
5
6 local new_metric_registry = require "util.openmetrics".new_metric_registry;
7 local render_histogram_le = require "util.openmetrics".render_histogram_le;
8
9 -- BEGIN of Metric implementations
10
11 -- Gauges
12 local gauge_metric_mt = {}
13 gauge_metric_mt.__index = gauge_metric_mt
14
15 local function new_gauge_metric(full_name, impl)
16 local metric = {
17 _full_name = full_name;
18 _impl = impl;
19 value = 0;
20 }
21 setmetatable(metric, gauge_metric_mt)
22 return metric
23 end
24
25 function gauge_metric_mt:set(value)
26 self.value = value
27 self._impl:push_gauge(self._full_name, value)
28 end
29
30 function gauge_metric_mt:add(delta)
31 self.value = self.value + delta
32 self._impl:push_gauge(self._full_name, self.value)
33 end
34
35 function gauge_metric_mt:reset()
36 self.value = 0
37 self._impl:push_gauge(self._full_name, 0)
38 end
39
40 function gauge_metric_mt.iter_samples()
41 -- statsd backend does not support iteration.
42 return function()
43 return nil
44 end
45 end
46
47 -- Counters
48 local counter_metric_mt = {}
49 counter_metric_mt.__index = counter_metric_mt
50
51 local function new_counter_metric(full_name, impl)
52 local metric = {
53 _full_name = full_name,
54 _impl = impl,
55 value = 0,
56 }
57 setmetatable(metric, counter_metric_mt)
58 return metric
59 end
60
61 function counter_metric_mt:set(value)
62 local delta = value - self.value
63 self.value = value
64 self._impl:push_counter_delta(self._full_name, delta)
65 end
66
67 function counter_metric_mt:add(value)
68 self.value = (self.value or 0) + value
69 self._impl:push_counter_delta(self._full_name, value)
70 end
71
72 function counter_metric_mt.iter_samples()
73 -- statsd backend does not support iteration.
74 return function()
75 return nil
76 end
77 end
78
79 function counter_metric_mt:reset()
80 self.value = 0
81 end
82
83 -- Histograms
84 local histogram_metric_mt = {}
85 histogram_metric_mt.__index = histogram_metric_mt
86
87 local function new_histogram_metric(buckets, full_name, impl)
88 -- NOTE: even though the more or less proprietary dogstatsd has its own
89 -- histogram implementation, we push the individual buckets in this statsd
90 -- backend for both consistency and compatibility across statsd
91 -- implementations.
92 local metric = {
93 _sum_name = full_name..".sum",
94 _count_name = full_name..".count",
95 _impl = impl,
96 _created = time(),
97 _sum = 0,
98 _count = 0,
99 }
100 -- the order of buckets matters unfortunately, so we cannot directly use
101 -- the thresholds as table keys
102 for i, threshold in ipairs(buckets) do
103 local threshold_s = render_histogram_le(threshold)
104 metric[i] = {
105 threshold = threshold,
106 threshold_s = threshold_s,
107 count = 0,
108 _full_name = full_name..".bucket."..(threshold_s:gsub("%.", "_")),
109 }
110 end
111 setmetatable(metric, histogram_metric_mt)
112 return metric
113 end
114
115 function histogram_metric_mt:sample(value)
116 -- According to the I-D, values must be part of all buckets
117 for i, bucket in pairs(self) do
118 if "number" == type(i) and bucket.threshold > value then
119 bucket.count = bucket.count + 1
120 self._impl:push_counter_delta(bucket._full_name, 1)
121 end
122 end
123 self._sum = self._sum + value
124 self._count = self._count + 1
125 self._impl:push_gauge(self._sum_name, self._sum)
126 self._impl:push_counter_delta(self._count_name, 1)
127 end
128
129 function histogram_metric_mt.iter_samples()
130 -- statsd backend does not support iteration.
131 return function()
132 return nil
133 end
134 end
135
136 function histogram_metric_mt:reset()
137 self._created = time()
138 self._count = 0
139 self._sum = 0
140 for i, bucket in pairs(self) do
141 if "number" == type(i) then
142 bucket.count = 0
143 end
144 end
145 self._impl:push_gauge(self._sum_name, self._sum)
146 end
147
148 -- Summaries
149 local summary_metric_mt = {}
150 summary_metric_mt.__index = summary_metric_mt
151
152 local function new_summary_metric(full_name, impl)
153 local metric = {
154 _sum_name = full_name..".sum",
155 _count_name = full_name..".count",
156 _impl = impl,
157 }
158 setmetatable(metric, summary_metric_mt)
159 return metric
160 end
161
162 function summary_metric_mt:sample(value)
163 self._impl:push_counter_delta(self._sum_name, value)
164 self._impl:push_counter_delta(self._count_name, 1)
165 end
166
167 function summary_metric_mt.iter_samples()
168 -- statsd backend does not support iteration.
169 return function()
170 return nil
171 end
172 end
173
174 function summary_metric_mt.reset()
175 end
176
177 -- BEGIN of statsd client implementation
178
179 local statsd_mt = {}
180 statsd_mt.__index = statsd_mt
181
182 function statsd_mt:cork()
183 self.corked = true
184 self.cork_buffer = self.cork_buffer or {}
185 end
186
187 function statsd_mt:uncork()
188 self.corked = false
189 self:_flush_cork_buffer()
190 end
191
192 function statsd_mt:_flush_cork_buffer()
193 local buffer = self.cork_buffer
194 for metric_name, value in pairs(buffer) do
195 self:_send_gauge(metric_name, value)
196 buffer[metric_name] = nil
197 end
198 end
199
200 function statsd_mt:push_gauge(metric_name, value)
201 if self.corked then
202 self.cork_buffer[metric_name] = value
203 else
204 self:_send_gauge(metric_name, value)
205 end
206 end
207
208 function statsd_mt:_send_gauge(metric_name, value)
209 self:_send(self.prefix..metric_name..":"..tostring(value).."|g")
210 end
211
212 function statsd_mt:push_counter_delta(metric_name, delta)
213 self:_send(self.prefix..metric_name..":"..tostring(delta).."|c")
214 end
215
216 function statsd_mt:_send(s)
217 return self.sock:send(s)
218 end
219
220 -- END of statsd client implementation
221
222 local function build_metric_name(family_name, labels)
223 local parts = array { family_name }
224 if labels then
225 parts:append(labels)
226 end
227 return t_concat(parts, "/"):gsub("%.", "_"):gsub("/", ".")
228 end
4 229
5 local function new(config) 230 local function new(config)
6 if not config or not config.statsd_server then 231 if not config or not config.statsd_server then
7 return nil, "No statsd server specified in the config, please see https://prosody.im/doc/statistics"; 232 return nil, "No statsd server specified in the config, please see https://prosody.im/doc/statistics";
8 end 233 end
10 local sock = socket.udp(); 235 local sock = socket.udp();
11 sock:setpeername(config.statsd_server, config.statsd_port or 8125); 236 sock:setpeername(config.statsd_server, config.statsd_port or 8125);
12 237
13 local prefix = (config.prefix or "prosody").."."; 238 local prefix = (config.prefix or "prosody")..".";
14 239
15 local function send_metric(s) 240 local impl = {
16 return sock:send(prefix..s); 241 metric_registry = nil;
17 end 242 sock = sock;
18 243 prefix = prefix;
19 local function send_gauge(name, amount, relative)
20 local s_amount = tostring(amount);
21 if relative and amount > 0 then
22 s_amount = "+"..s_amount;
23 end
24 return send_metric(name..":"..s_amount.."|g");
25 end
26
27 local function send_counter(name, amount)
28 return send_metric(name..":"..tostring(amount).."|c");
29 end
30
31 local function send_duration(name, duration)
32 return send_metric(name..":"..tostring(duration).."|ms");
33 end
34
35 local function send_histogram_sample(name, sample)
36 return send_metric(name..":"..tostring(sample).."|h");
37 end
38
39 local methods;
40 methods = {
41 amount = function (name, conf)
42 if conf and conf.initial then
43 send_gauge(name, conf.initial);
44 end
45 return function (new_v) send_gauge(name, new_v); end
46 end;
47 counter = function (name, conf) --luacheck: ignore 212/conf
48 return function (delta)
49 send_gauge(name, delta, true);
50 end;
51 end;
52 rate = function (name)
53 return function ()
54 send_counter(name, 1);
55 end;
56 end;
57 distribution = function (name, conf) --luacheck: ignore 212/conf
58 return function (value)
59 send_histogram_sample(name, value);
60 end;
61 end;
62 sizes = function (name)
63 name = name.."_size";
64 return function (value)
65 send_histogram_sample(name, value);
66 end;
67 end;
68 times = function (name)
69 return function ()
70 local start_time = time();
71 return function ()
72 local end_time = time();
73 local duration = end_time - start_time;
74 send_duration(name, duration*1000);
75 end
76 end;
77 end;
78 }; 244 };
79 return methods; 245 setmetatable(impl, statsd_mt)
246
247 local backend = {
248 gauge = function(family_name, labels)
249 return new_gauge_metric(build_metric_name(family_name, labels), impl)
250 end;
251 counter = function(family_name, labels)
252 return new_counter_metric(build_metric_name(family_name, labels), impl)
253 end;
254 histogram = function(buckets, family_name, labels)
255 return new_histogram_metric(buckets, build_metric_name(family_name, labels), impl)
256 end;
257 summary = function(family_name, labels, extra)
258 return new_summary_metric(build_metric_name(family_name, labels), impl, extra)
259 end;
260 };
261
262 impl.metric_registry = new_metric_registry(backend);
263
264 return impl;
80 end 265 end
81 266
82 return { 267 return {
83 new = new; 268 new = new;
84 } 269 }