Comparison

util/openmetrics.lua @ 11523:5f15ab7c6ae5

Statistics: Rewrite statistics backends to use OpenMetrics The metric subsystem of Prosody has had some shortcomings from the perspective of the current state-of-the-art in metric observability. The OpenMetrics standard [0] is a formalization of the data model (and serialization format) of the well-known and widely-used Prometheus [1] software stack. The previous stats subsystem of Prosody did not map well to that format (see e.g. [2] and [3]); the key reason is that it was trying to do too much math on its own ([2]) while lacking first-class support for "families" of metrics ([3]) and structured metric metadata (despite the `extra` argument to metrics, there was no standard way of representing common things like "tags" or "labels"). Even though OpenMetrics has grown from the Prometheus world of monitoring, it maps well to other popular monitoring stacks such as: - InfluxDB (labels can be mapped to tags and fields as necessary) - Carbon/Graphite (labels can be attached to the metric name with dot-separation) - StatsD (see graphite when assuming that graphite is used as backend, which is the default) The util.statsd module has been ported to use the OpenMetrics model as a proof of concept. An implementation which exposes the util.statistics backend data as Prometheus metrics is ready for publishing in prosody-modules (most likely as mod_openmetrics_prometheus to avoid breaking existing 0.11 deployments). At the same time, the previous measure()-based API had one major advantage: It is really simple and easy to use without requiring lots of knowledge about OpenMetrics or similar concepts. For that reason as well as compatibility with existing code, it is preserved and may even be extended in the future. 
However, code relying on the `stats-updated` event as well as `get_stats` from `statsmanager` will break because the data model has changed completely; in case of `stats-updated`, the code will simply not run (as the event was renamed in order to avoid conflicts); the `get_stats` function has been removed completely (so it will cause a traceback when it is attempted to be used). Note that the measure_*_event methods have been removed from the module API. I was unable to find any uses or documentation and thus deemed they should not be ported. Re-implementation is possible when necessary. [0]: https://openmetrics.io/ [1]: https://prometheus.io/ [2]: #959 [3]: #960
author Jonas Schäfer <jonas@wielicki.name>
date Sun, 18 Apr 2021 11:47:41 +0200
parent 10883:util/statistics.lua@d75d805c852f
child 11593:0db763f3f3be
comparison
equal deleted inserted replaced
11522:5bd38d9197e1 11523:5f15ab7c6ae5
1 --[[
2 This module implements a subset of the OpenMetrics Internet Draft version 00.
3
4 URL: https://tools.ietf.org/html/draft-richih-opsawg-openmetrics-00
5
6 The following metric types are supported:
7
8 - Counter
9 - Gauge
10 - Histogram
11 - Summary
12
13 It is used by util.statsd and util.statistics to provide the OpenMetrics API.
14
15 To understand what this module is about, it is useful to familiarize oneself
16 with the terms MetricFamily, Metric, LabelSet, Label and MetricPoint as
17 defined in the I-D linked above.
18 --]]
19 -- metric constructor interface:
20 -- metric_ctor(..., family_name, labels, extra)
21
22 local time = require "util.time".now;
23 local select = select;
24 local array = require "util.array";
25 local log = require "util.logger".init("util.openmetrics");
26 local new_multitable = require "util.multitable".new;
27 local iter_multitable = require "util.multitable".iter;
28
29 -- BEGIN of Utility: "metric proxy"
30 -- This allows to wrap a MetricFamily in a proxy which only provides the
31 -- `with_labels` and `with_partial_label` methods. This allows to pre-set one
32 -- or more labels on a metric family. This is used in particular via
33 -- `with_partial_label` by the moduleapi in order to pre-set the `host` label
34 -- on metrics created in non-global modules.
-- Metatable shared by every metric proxy. It defines no methods of its own
-- (each proxy carries its methods as closures); attaching it merely gives all
-- proxies one common metatable.
local metric_proxy_mt = {}
metric_proxy_mt.__index = metric_proxy_mt

--- Wrap a metric family in a proxy with one or more labels pre-set.
-- The proxy only offers `with_labels` and `with_partial_label`.
-- @param metric_family the family object being wrapped; it is handed back to
--   `with_labels_proxy_fun` on every `with_labels` call
-- @param with_labels_proxy_fun function(family, ...) which resolves the
--   metric for the remaining label values `...`, typically by prepending the
--   pre-set label values and calling `family:with_labels`
-- @return the proxy table
local function new_metric_proxy(metric_family, with_labels_proxy_fun)
	return setmetatable({
		_family = metric_family,
		with_labels = function(self, ...)
			return with_labels_proxy_fun(self._family, ...)
		end;
		with_partial_label = function(self, label)
			-- Delegate to this proxy's own resolver rather than straight to
			-- the family: otherwise the labels pre-set on this proxy would be
			-- lost as soon as a second partial label is chained on top.
			return new_metric_proxy(self._family, function(family, ...)
				return with_labels_proxy_fun(family, label, ...)
			end)
		end
	}, metric_proxy_mt)
end
51
52 -- END of Utility: "metric proxy"
53
--- Render a histogram bucket threshold as text.
-- @param v the numeric threshold
-- @return "+Inf" for positive infinity, otherwise `v` formatted with "%g"
local function render_histogram_le(v)
	if v ~= math.huge then
		return string.format("%g", v)
	end
	-- I-D-00: 4.1.2.2.1:
	-- Exposers MUST produce output for positive infinity as +Inf.
	return "+Inf"
end
63
64 -- BEGIN of generic MetricFamily implementation
65
-- Metatable (and method table) of MetricFamily objects.
local metric_family_mt = {}
metric_family_mt.__index = metric_family_mt

--- Bind a bucket list to a histogram constructor.
-- @param orig_ctor constructor called as
--   orig_ctor(buckets, family_name, labels, extra)
-- @param buckets value passed as the first argument on every call
-- @return a constructor with the generic signature
--   (family_name, labels, extra)
local function histogram_metric_ctor(orig_ctor, buckets)
	local function bound_ctor(family_name, labels, extra)
		return orig_ctor(buckets, family_name, labels, extra)
	end
	return bound_ctor
end
74
--- Create a new MetricFamily object.
-- @param backend table mapping a metric type name to its metric constructor
-- @param type_ metric type name, used as key into `backend`
-- @param family_name name of the family (passed on to the constructor)
-- @param unit unit of the family (stored on the family object)
-- @param description description (stored on the family object)
-- @param label_keys optional list of label key names
-- @param extra optional table of extra options; for histograms it must
--   contain `buckets`
-- @return the new family object
-- Raises an error if the backend has no constructor for `type_`, or if a
-- histogram is requested without `extra.buckets`.
local function new_metric_family(backend, type_, family_name, unit, description, label_keys, extra)
	local metric_ctor = assert(backend[type_], "statistics backend does not support "..type_.." metrics families")
	local labels = label_keys or {}
	local user_labels = #labels
	if type_ == "histogram" then
		local buckets = extra and extra.buckets
		if not buckets then
			error("no buckets given for histogram metric")
		end
		if type(buckets) == "table" then
			-- Work on a private copy so that the caller's table is never
			-- modified when +Inf is appended below.
			-- NOTE(review): util.array may wrap the given table in place
			-- rather than copy it; confirm. Copying is harmless either way.
			local copy = {}
			for i, threshold in ipairs(buckets) do
				copy[i] = threshold
			end
			buckets = copy
		end
		buckets = array(buckets)
		-- must have +inf bucket, but only one of them
		if buckets[#buckets] ~= 1/0 then
			buckets:push(1/0)
		end

		metric_ctor = histogram_metric_ctor(metric_ctor, buckets)
	end

	-- Without labels there is exactly one metric, created right away;
	-- otherwise metrics are created on demand and kept in a multitable.
	local data
	if #labels == 0 then
		data = metric_ctor(family_name, nil, extra)
	else
		data = new_multitable()
	end

	local mf = {
		family_name = family_name,
		data = data,
		type_ = type_,
		unit = unit,
		description = description,
		user_labels = user_labels,
		label_keys = labels,
		extra = extra,
		_metric_ctor = metric_ctor,
	}
	setmetatable(mf, metric_family_mt);
	return mf
end
111
--- Construct a metric of this family for the given label values.
-- The result is not stored in the family by this method.
-- @param labels label values handed to the metric constructor
-- @return whatever the family's metric constructor returns
function metric_family_mt:new_metric(labels)
	local ctor = self._metric_ctor
	return ctor(self.family_name, labels, self.extra)
end
115
--- Call `reset()` on every metric currently yielded by `iter_metrics`.
function metric_family_mt:clear()
	for _labels, member in self:iter_metrics() do
		member:reset()
	end
end
121
--- Get the metric for the given label values, creating it on first use.
-- @param ... one value per label key of the family, in order
-- @return the metric for that combination of label values
-- Raises an error if the number of values differs from the number of label
-- keys of the family.
function metric_family_mt:with_labels(...)
	local count = select('#', ...)
	if count ~= self.user_labels then
		error("number of labels passed to with_labels does not match number of label keys")
	end
	if count == 0 then
		-- A family without labels holds its single metric directly.
		return self.data
	end
	local metric = self.data:get(...)
	if not metric then
		-- The constructor gets a table of its own holding only the label
		-- values. The argument list for `set` is built separately, so the
		-- metric never ends up inside its own labels table.
		metric = self:new_metric(table.pack(...))
		local path = table.pack(...)
		path[path.n + 1] = metric
		self.data:set(table.unpack(path, 1, path.n + 1))
	end
	return metric
end
139
--- Create a proxy for this family with its first label value pre-set.
-- @param label value which the proxy passes first to `with_labels`
-- @return a metric proxy as created by new_metric_proxy
function metric_family_mt:with_partial_label(label)
	local function prepend_label(family, ...)
		return family:with_labels(label, ...)
	end
	return new_metric_proxy(self, prepend_label)
end
145
--- Iterate over the metrics of this family.
-- For use in a generic `for`; each step yields a table of label values and
-- the metric belonging to them.
function metric_family_mt:iter_metrics()
	local nlabels = #self.label_keys
	if nlabels == 0 then
		-- No labels: yield the single metric once, with empty label values.
		local pending = true
		return function()
			if not pending then
				return nil
			end
			pending = false
			return {}, self.data
		end
	end
	-- Search the multitable with `nlabels` nil keys.
	local wildcard = {}
	local inner, state = iter_multitable(self.data, table.unpack(wildcard, 1, nlabels))
	return function(_s)
		local row = table.pack(inner(_s))
		local n = row.n
		if n == 0 then
			return nil, nil
		end
		-- The last value of each result is the metric; everything before it
		-- is the list of label values.
		local metric = row[n]
		row[n] = nil
		row.n = n - 1
		return row, metric
	end, state
end
174
175 -- END of generic MetricFamily implementation
176
177 -- BEGIN of MetricRegistry implementation
178
179
180 -- Helper to test whether two metrics are "equal".
--- Check whether two metric families have the same declaration.
-- Compares the type and the ordered list of label keys.
-- @return true if both match, false otherwise
local function equal_metric_family(mf1, mf2)
	local keys1, keys2 = mf1.label_keys, mf2.label_keys
	if mf1.type_ ~= mf2.type_ or #keys1 ~= #keys2 then
		return false
	end
	-- The unit is deliberately not compared: in general it is part of the
	-- name anyway. So either the unit was moved into/out of the name (which
	-- is a valid thing to do on an upgrade), or we would not expect to see
	-- any conflicts anyway.
	for idx, key in ipairs(keys1) do
		if keys2[idx] ~= key then
			return false
		end
	end
	return true
end
204
205 -- If the unit is not empty, add it to the full name as per the I-D spec.
--- Build the full metric family name from a base name and a unit.
-- @param name base name
-- @param unit optional unit; appended as "_<unit>" unless missing or empty
-- @return the full name
local function compose_name(name, unit)
	-- TODO: prohibit certain suffixes used by metrics where they may cause
	-- conflicts
	if not unit or unit == "" then
		return name
	end
	return name .. "_" .. unit
end
215
-- Metatable (and method table) of MetricRegistry objects.
local metric_registry_mt = {}
metric_registry_mt.__index = metric_registry_mt

--- Create an empty metric registry.
-- @param backend stored as `backend` on the registry
-- @return the new registry, with an empty `families` table
local function new_metric_registry(backend)
	return setmetatable({
		families = {},
		backend = backend,
	}, metric_registry_mt)
end
227
--- Register a metric family under a name.
-- @param name key under which the family is stored
-- @param metric_family the family to register
-- @return the family already registered under `name` if it has an equal
--   declaration, otherwise `metric_family` (now stored under `name`)
function metric_registry_mt:register_metric_family(name, metric_family)
	local previous = self.families[name];
	if previous and equal_metric_family(metric_family, previous) then
		return previous
	end
	if previous then
		-- We could either be strict about this, or replace the existing
		-- metric family with the new one. Being strict is nice to avoid
		-- programming errors / conflicts, but causes issues when a new
		-- version of a module is loaded.
		--
		-- We thus assume that the new metric is the correct one. That is
		-- probably OK because unless you're reaching down into the
		-- util.openmetrics or core.statsmanager API, your metric name is
		-- going to be scoped to `prosody_mod_$modulename` anyway and the
		-- damage is thus controlled.
		--
		-- To make debugging such issues easier, we still log.
		log("debug", "replacing incompatible existing metric family %s", name)
		-- Below is the code to be strict.
		--error("conflicting declarations for metric family "..name)
	end
	self.families[name] = metric_family
	return metric_family
end
255
--- Declare a gauge metric family in this registry.
-- The family is registered under the name composed from `name` and `unit`.
-- @return the family returned by register_metric_family
function metric_registry_mt:gauge(name, unit, description, labels, extra)
	local full_name = compose_name(name, unit)
	local family = new_metric_family(self.backend, "gauge", full_name, unit, description, labels, extra)
	family = self:register_metric_family(full_name, family)
	return family
end
262
--- Declare a counter metric family in this registry.
-- The family is registered under the name composed from `name` and `unit`.
-- @return the family returned by register_metric_family
function metric_registry_mt:counter(name, unit, description, labels, extra)
	local full_name = compose_name(name, unit)
	local family = new_metric_family(self.backend, "counter", full_name, unit, description, labels, extra)
	family = self:register_metric_family(full_name, family)
	return family
end
269
--- Declare a histogram metric family in this registry.
-- The family is registered under the name composed from `name` and `unit`.
-- new_metric_family raises an error unless `extra.buckets` is given.
-- @return the family returned by register_metric_family
function metric_registry_mt:histogram(name, unit, description, labels, extra)
	local full_name = compose_name(name, unit)
	local family = new_metric_family(self.backend, "histogram", full_name, unit, description, labels, extra)
	family = self:register_metric_family(full_name, family)
	return family
end
276
--- Declare a summary metric family in this registry.
-- The family is registered under the name composed from `name` and `unit`.
-- @return the family returned by register_metric_family
function metric_registry_mt:summary(name, unit, description, labels, extra)
	local full_name = compose_name(name, unit)
	local family = new_metric_family(self.backend, "summary", full_name, unit, description, labels, extra)
	family = self:register_metric_family(full_name, family)
	return family
end
283
--- Get the registry's table of metric families, keyed by registered name.
-- The table itself is returned, not a copy.
function metric_registry_mt:get_metric_families()
	local families = self.families
	return families
end
287
288 -- END of MetricRegistry implementation
289
290 -- BEGIN of general helpers for implementing high-level APIs on top of OpenMetrics
291
--- Start a timer whose elapsed time is submitted to a metric.
-- The start time is taken when this function is called.
-- @param metric object with a `sample` or a `set` method (`sample` is
--   preferred when both exist)
-- @return a function which, each time it is called, submits the time
--   elapsed since the start to the metric
-- Raises an error if the metric has neither method.
local function timed(metric)
	local started_at = time()
	local submit = metric.sample or metric.set
	assert(submit, "metric type cannot be used with timed()")
	return function()
		submit(metric, time() - started_at)
	end
end
300
301 -- END of general helpers
302
-- Public interface of the module.
return {
	new_metric_proxy = new_metric_proxy,
	new_metric_registry = new_metric_registry,
	render_histogram_le = render_histogram_le,
	timed = timed,
}