Comparison

plugins/mod_admin_shell.lua @ 11523:5f15ab7c6ae5

Statistics: Rewrite statistics backends to use OpenMetrics

The metric subsystem of Prosody has had some shortcomings from the perspective of the current state-of-the-art in metric observability.

The OpenMetrics standard [0] is a formalization of the data model (and serialization format) of the well-known and widely-used Prometheus [1] software stack.

The previous stats subsystem of Prosody did not map well to that format (see e.g. [2] and [3]); the key reason is that it was trying to do too much math on its own ([2]) while lacking first-class support for "families" of metrics ([3]) and structured metric metadata (despite the `extra` argument to metrics, there was no standard way of representing common things like "tags" or "labels").

Even though OpenMetrics has grown from the Prometheus world of monitoring, it maps well to other popular monitoring stacks such as:

- InfluxDB (labels can be mapped to tags and fields as necessary)
- Carbon/Graphite (labels can be attached to the metric name with dot-separation)
- StatsD (see Carbon/Graphite above, assuming that Graphite is used as the backend, which is the default)

The util.statsd module has been ported to use the OpenMetrics model as a proof of concept. An implementation which exposes the util.statistics backend data as Prometheus metrics is ready for publishing in prosody-modules (most likely as mod_openmetrics_prometheus to avoid breaking existing 0.11 deployments).

At the same time, the previous measure()-based API had one major advantage: it is really simple and easy to use without requiring lots of knowledge about OpenMetrics or similar concepts. For that reason, as well as for compatibility with existing code, it is preserved and may even be extended in the future.
However, code relying on the `stats-updated` event as well as `get_stats` from `statsmanager` will break because the data model has changed completely; in the case of `stats-updated`, the code will simply not run (as the event was renamed in order to avoid conflicts); the `get_stats` function has been removed completely (so any attempt to call it will cause a traceback).

Note that the measure_*_event methods have been removed from the module API. I was unable to find any uses or documentation and thus deemed they should not be ported. Re-implementation is possible when necessary.

[0]: https://openmetrics.io/
[1]: https://prometheus.io/
[2]: #959
[3]: #960
author Jonas Schäfer <jonas@wielicki.name>
date Sun, 18 Apr 2021 11:47:41 +0200
parent 11504:1f700f5f62cb
child 11601:9483728f890f
comparison
equal deleted inserted replaced
11522:5bd38d9197e1 11523:5f15ab7c6ae5
33 local has_pposix, pposix = pcall(require, "util.pposix"); 33 local has_pposix, pposix = pcall(require, "util.pposix");
34 local async = require "util.async"; 34 local async = require "util.async";
35 local serialization = require "util.serialization"; 35 local serialization = require "util.serialization";
36 local serialize_config = serialization.new ({ fatal = false, unquoted = true}); 36 local serialize_config = serialization.new ({ fatal = false, unquoted = true});
37 local time = require "util.time"; 37 local time = require "util.time";
38
39 local t_insert = table.insert;
40 local t_concat = table.concat;
38 41
39 local format_number = require "util.human.units".format; 42 local format_number = require "util.human.units".format;
40 local format_table = require "util.human.io".table; 43 local format_table = require "util.human.io".table;
41 44
42 local commands = module:shared("commands") 45 local commands = module:shared("commands")
1340 local short_units = { 1343 local short_units = {
1341 seconds = "s", 1344 seconds = "s",
1342 bytes = "B", 1345 bytes = "B",
1343 }; 1346 };
1344 1347
1345 local function format_stat(type, unit, value, ref_value)
1346 ref_value = ref_value or value;
1347 --do return tostring(value) end
1348 if not unit then
1349 if type == "duration" then
1350 unit = "seconds"
1351 elseif type == "size" then
1352 unit = "bytes";
1353 elseif type == "rate" then
1354 unit = " events/sec"
1355 if ref_value < 0.9 then
1356 unit = "events/min"
1357 value = value*60;
1358 if ref_value < 0.6/60 then
1359 unit = "events/h"
1360 value = value*60;
1361 end
1362 end
1363 return ("%.3g %s"):format(value, unit);
1364 end
1365 end
1366 return format_number(value, short_units[unit] or unit or "", unit == "bytes" and 'b' or nil);
1367 end
1368
1369 local stats_methods = {}; 1348 local stats_methods = {};
1370 function stats_methods:bounds(_lower, _upper) 1349
1371 for _, stat_info in ipairs(self) do 1350 function stats_methods:render_single_fancy_histogram_ex(print, prefix, metric_family, metric, cumulative)
1372 local data = stat_info[4]; 1351 local creation_timestamp, sum, count
1373 if data then 1352 local buckets = {}
1374 local lower = _lower or data.min; 1353 local prev_bucket_count = 0
1375 local upper = _upper or data.max; 1354 for suffix, extra_labels, value in metric:iter_samples() do
1376 local new_data = { 1355 if suffix == "_created" then
1377 min = lower; 1356 creation_timestamp = value
1378 max = upper; 1357 elseif suffix == "_sum" then
1379 samples = {}; 1358 sum = value
1380 sample_count = 0; 1359 elseif suffix == "_count" then
1381 count = data.count; 1360 count = value
1382 units = data.units; 1361 else
1383 }; 1362 local bucket_threshold = extra_labels["le"]
1384 local sum = 0; 1363 local bucket_count
1385 for _, v in ipairs(data.samples) do 1364 if cumulative then
1386 if v > upper then 1365 bucket_count = value
1387 break; 1366 else
1388 elseif v>=lower then 1367 bucket_count = value - prev_bucket_count
1389 table.insert(new_data.samples, v); 1368 prev_bucket_count = value
1390 sum = sum + v; 1369 end
1391 end 1370 if bucket_threshold == "+Inf" then
1392 end 1371 t_insert(buckets, {threshold = 1/0, count = bucket_count})
1393 new_data.sample_count = #new_data.samples; 1372 elseif bucket_threshold ~= nil then
1394 stat_info[4] = new_data; 1373 t_insert(buckets, {threshold = tonumber(bucket_threshold), count = bucket_count})
1395 stat_info[3] = sum/new_data.sample_count; 1374 end
1396 end 1375 end
1397 end 1376 end
1398 return self; 1377
1399 end 1378 if #buckets == 0 or not creation_timestamp or not sum or not count then
1400 1379 print("[no data or not a histogram]")
1401 function stats_methods:trim(lower, upper) 1380 return false
1402 upper = upper or (100-lower); 1381 end
1403 local statistics = require "util.statistics"; 1382
1404 for _, stat_info in ipairs(self) do 1383 local graph_width, graph_height, wscale = #buckets, 10, 1;
1405 -- Strip outliers 1384 if graph_width < 8 then
1406 local data = stat_info[4]; 1385 wscale = 8
1407 if data then 1386 elseif graph_width < 16 then
1408 local new_data = { 1387 wscale = 4
1409 min = statistics.get_percentile(data, lower); 1388 elseif graph_width < 32 then
1410 max = statistics.get_percentile(data, upper); 1389 wscale = 2
1411 samples = {}; 1390 end
1412 sample_count = 0; 1391 local eighth_chars = " ▁▂▃▄▅▆▇█";
1413 count = data.count; 1392
1414 units = data.units; 1393 local max_bin_samples = 0
1415 }; 1394 for _, bucket in ipairs(buckets) do
1416 local sum = 0; 1395 if bucket.count > max_bin_samples then
1417 for _, v in ipairs(data.samples) do 1396 max_bin_samples = bucket.count
1418 if v > new_data.max then 1397 end
1419 break; 1398 end
1420 elseif v>=new_data.min then 1399
1421 table.insert(new_data.samples, v); 1400 print("");
1422 sum = sum + v; 1401 print(prefix)
1423 end 1402 print(("_"):rep(graph_width*wscale).." "..max_bin_samples);
1424 end 1403 for row = graph_height, 1, -1 do
1425 new_data.sample_count = #new_data.samples; 1404 local row_chars = {};
1426 stat_info[4] = new_data; 1405 local min_eighths, max_eighths = 8, 0;
1427 stat_info[3] = sum/new_data.sample_count; 1406 for i = 1, #buckets do
1428 end 1407 local char_eighths = math.ceil(math.max(math.min((graph_height/(max_bin_samples/buckets[i].count))-(row-1), 1), 0)*8);
1429 end 1408 if char_eighths < min_eighths then
1430 return self; 1409 min_eighths = char_eighths;
1431 end 1410 end
1432 1411 if char_eighths > max_eighths then
1433 function stats_methods:max(upper) 1412 max_eighths = char_eighths;
1434 return self:bounds(nil, upper); 1413 end
1435 end 1414 if char_eighths == 0 then
1436 1415 row_chars[i] = ("-"):rep(wscale);
1437 function stats_methods:min(lower) 1416 else
1438 return self:bounds(lower, nil); 1417 local char = eighth_chars:sub(char_eighths*3+1, char_eighths*3+3);
1439 end 1418 row_chars[i] = char:rep(wscale);
1440 1419 end
1441 function stats_methods:summary() 1420 end
1442 local statistics = require "util.statistics"; 1421 print(table.concat(row_chars).."|- "..string.format("%.8g", math.ceil((max_bin_samples/graph_height)*(row-0.5))));
1443 for _, stat_info in ipairs(self) do 1422 end
1444 local type, value, data = stat_info[2], stat_info[3], stat_info[4]; 1423
1445 if data and data.samples then 1424 local legend_pat = string.format("%%%d.%dg", wscale-1, wscale-1)
1446 table.insert(stat_info.output, string.format("Count: %d (%d captured)", 1425 local row = {}
1447 data.count, 1426 for i = 1, #buckets do
1448 data.sample_count 1427 local threshold = buckets[i].threshold
1449 )); 1428 t_insert(row, legend_pat:format(threshold))
1450 table.insert(stat_info.output, string.format("Min: %s Mean: %s Max: %s", 1429 end
1451 format_stat(type, data.units, data.min), 1430 t_insert(row, " " .. metric_family.unit)
1452 format_stat(type, data.units, value), 1431 print(t_concat(row, "/"))
1453 format_stat(type, data.units, data.max) 1432
1454 )); 1433 return true
1455 table.insert(stat_info.output, string.format("Q1: %s Median: %s Q3: %s", 1434 end
1456 format_stat(type, data.units, statistics.get_percentile(data, 25)), 1435
1457 format_stat(type, data.units, statistics.get_percentile(data, 50)), 1436 function stats_methods:render_single_fancy_histogram(print, prefix, metric_family, metric)
1458 format_stat(type, data.units, statistics.get_percentile(data, 75)) 1437 return self:render_single_fancy_histogram_ex(print, prefix, metric_family, metric, false)
1459 )); 1438 end
1460 end 1439
1461 end 1440 function stats_methods:render_single_fancy_histogram_cf(print, prefix, metric_family, metric)
1462 return self; 1441 -- cf = cumulative frequency
1442 return self:render_single_fancy_histogram_ex(print, prefix, metric_family, metric, true)
1463 end 1443 end
1464 1444
1465 function stats_methods:cfgraph() 1445 function stats_methods:cfgraph()
1466 for _, stat_info in ipairs(self) do 1446 for _, stat_info in ipairs(self) do
1467 local name, type, value, data = unpack(stat_info, 1, 4); -- luacheck: ignore 211 1447 local family_name, metric_family = unpack(stat_info, 1, 2)
1468 local function print(s) 1448 local function print(s)
1469 table.insert(stat_info.output, s); 1449 table.insert(stat_info.output, s);
1470 end 1450 end
1471 1451
1472 if data and data.sample_count and data.sample_count > 0 then 1452 if not self:render_family(print, family_name, metric_family, self.render_single_fancy_histogram_cf) then
1473 local raw_histogram = require "util.statistics".get_histogram(data); 1453 return self
1474
1475 local graph_width, graph_height = 50, 10;
1476 local eighth_chars = " ▁▂▃▄▅▆▇█";
1477
1478 local range = data.max - data.min;
1479
1480 if range > 0 then
1481 local x_scaling = #raw_histogram/graph_width;
1482 local histogram = {};
1483 for i = 1, graph_width do
1484 histogram[i] = math.max(raw_histogram[i*x_scaling-1] or 0, raw_histogram[i*x_scaling] or 0);
1485 end
1486
1487 print("");
1488 print(("_"):rep(52)..format_stat(type, data.units, data.max));
1489 for row = graph_height, 1, -1 do
1490 local row_chars = {};
1491 local min_eighths, max_eighths = 8, 0;
1492 for i = 1, #histogram do
1493 local char_eighths = math.ceil(math.max(math.min((graph_height/(data.max/histogram[i]))-(row-1), 1), 0)*8);
1494 if char_eighths < min_eighths then
1495 min_eighths = char_eighths;
1496 end
1497 if char_eighths > max_eighths then
1498 max_eighths = char_eighths;
1499 end
1500 if char_eighths == 0 then
1501 row_chars[i] = "-";
1502 else
1503 local char = eighth_chars:sub(char_eighths*3+1, char_eighths*3+3);
1504 row_chars[i] = char;
1505 end
1506 end
1507 print(table.concat(row_chars).."|-"..format_stat(type, data.units, data.max/(graph_height/(row-0.5))));
1508 end
1509 print(("\\ "):rep(11));
1510 local x_labels = {};
1511 for i = 1, 11 do
1512 local s = ("%-4s"):format((i-1)*10);
1513 if #s > 4 then
1514 s = s:sub(1, 3).."…";
1515 end
1516 x_labels[i] = s;
1517 end
1518 print(" "..table.concat(x_labels, " "));
1519 local units = "%";
1520 local margin = math.floor((graph_width-#units)/2);
1521 print((" "):rep(margin)..units);
1522 else
1523 print("[range too small to graph]");
1524 end
1525 print("");
1526 end 1454 end
1527 end 1455 end
1528 return self; 1456 return self;
1529 end 1457 end
1530 1458
1531 function stats_methods:histogram() 1459 function stats_methods:histogram()
1532 for _, stat_info in ipairs(self) do 1460 for _, stat_info in ipairs(self) do
1533 local name, type, value, data = unpack(stat_info, 1, 4); -- luacheck: ignore 211 1461 local family_name, metric_family = unpack(stat_info, 1, 2)
1534 local function print(s) 1462 local function print(s)
1535 table.insert(stat_info.output, s); 1463 table.insert(stat_info.output, s);
1536 end 1464 end
1537 1465
1538 if not data then 1466 if not self:render_family(print, family_name, metric_family, self.render_single_fancy_histogram) then
1539 print("[no data]"); 1467 return self
1540 return self; 1468 end
1541 elseif not data.sample_count then 1469 end
1542 print("[not a sampled metric type]"); 1470 return self;
1543 return self; 1471 end
1544 end 1472
1545 1473 function stats_methods:render_single_counter(print, prefix, metric_family, metric)
1546 local graph_width, graph_height = 50, 10; 1474 local created_timestamp, current_value
1547 local eighth_chars = " ▁▂▃▄▅▆▇█"; 1475 for suffix, _, value in metric:iter_samples() do
1548 1476 if suffix == "_created" then
1549 local range = data.max - data.min; 1477 created_timestamp = value
1550 1478 elseif suffix == "_total" then
1551 if range > 0 then 1479 current_value = value
1552 local n_buckets = graph_width; 1480 end
1553 1481 end
1554 local histogram = {}; 1482 if current_value and created_timestamp then
1555 for i = 1, n_buckets do 1483 local base_unit = short_units[metric_family.unit] or metric_family.unit
1556 histogram[i] = 0; 1484 local unit = base_unit .. "/s"
1557 end 1485 local factor = 1
1558 local max_bin_samples = 0; 1486 if base_unit == "s" then
1559 for _, d in ipairs(data.samples) do 1487 -- be smart!
1560 local bucket = math.floor(1+(n_buckets-1)/(range/(d-data.min))); 1488 unit = "%"
1561 histogram[bucket] = histogram[bucket] + 1; 1489 factor = 100
1562 if histogram[bucket] > max_bin_samples then 1490 elseif base_unit == "" then
1563 max_bin_samples = histogram[bucket]; 1491 unit = "events/s"
1564 end 1492 end
1565 end 1493 print(("%-50s %s"):format(prefix, format_number(factor * current_value / (self.now - created_timestamp), unit.." [avg]")));
1566 1494 end
1567 print(""); 1495 end
1568 print(("_"):rep(52)..max_bin_samples); 1496
1569 for row = graph_height, 1, -1 do 1497 function stats_methods:render_single_gauge(print, prefix, metric_family, metric)
1570 local row_chars = {}; 1498 local current_value
1571 local min_eighths, max_eighths = 8, 0; 1499 for _, _, value in metric:iter_samples() do
1572 for i = 1, #histogram do 1500 current_value = value
1573 local char_eighths = math.ceil(math.max(math.min((graph_height/(max_bin_samples/histogram[i]))-(row-1), 1), 0)*8); 1501 end
1574 if char_eighths < min_eighths then 1502 if current_value then
1575 min_eighths = char_eighths; 1503 local unit = short_units[metric_family.unit] or metric_family.unit
1576 end 1504 print(("%-50s %s"):format(prefix, format_number(current_value, unit)));
1577 if char_eighths > max_eighths then 1505 end
1578 max_eighths = char_eighths; 1506 end
1579 end 1507
1580 if char_eighths == 0 then 1508 function stats_methods:render_single_summary(print, prefix, metric_family, metric)
1581 row_chars[i] = "-"; 1509 local sum, count
1582 else 1510 for suffix, _, value in metric:iter_samples() do
1583 local char = eighth_chars:sub(char_eighths*3+1, char_eighths*3+3); 1511 if suffix == "_sum" then
1584 row_chars[i] = char; 1512 sum = value
1585 end 1513 elseif suffix == "_count" then
1586 end 1514 count = value
1587 print(table.concat(row_chars).."|-"..math.ceil((max_bin_samples/graph_height)*(row-0.5))); 1515 end
1588 end 1516 end
1589 print(("\\ "):rep(11)); 1517 if sum and count then
1590 local x_labels = {}; 1518 local unit = short_units[metric_family.unit] or metric_family.unit
1591 for i = 1, 11 do 1519 if count == 0 then
1592 local s = ("%-4s"):format(format_stat(type, data.units, data.min+range*i/11, data.min):match("^%S+")); 1520 print(("%-50s %s"):format(prefix, "no obs."));
1593 if #s > 4 then
1594 s = s:sub(1, 3).."…";
1595 end
1596 x_labels[i] = s;
1597 end
1598 print(" "..table.concat(x_labels, " "));
1599 local units = format_stat(type, data.units, data.min):match("%s+(.+)$") or data.units or "";
1600 local margin = math.floor((graph_width-#units)/2);
1601 print((" "):rep(margin)..units);
1602 else 1521 else
1603 print("[range too small to graph]"); 1522 print(("%-50s %s"):format(prefix, format_number(sum / count, unit.."/event [avg]")));
1604 end 1523 end
1605 print(""); 1524 end
1606 end 1525 end
1607 return self; 1526
1527 function stats_methods:render_family(print, family_name, metric_family, render_func)
1528 local labelkeys = metric_family.label_keys
1529 if #labelkeys > 0 then
1530 print(family_name)
1531 for labelset, metric in metric_family:iter_metrics() do
1532 local labels = {}
1533 for i, k in ipairs(labelkeys) do
1534 local v = labelset[i]
1535 t_insert(labels, ("%s=%s"):format(k, v))
1536 end
1537 local prefix = " "..t_concat(labels, " ")
1538 render_func(self, print, prefix, metric_family, metric)
1539 end
1540 else
1541 for _, metric in metric_family:iter_metrics() do
1542 render_func(self, print, family_name, metric_family, metric)
1543 end
1544 end
1608 end 1545 end
1609 1546
1610 local function stats_tostring(stats) 1547 local function stats_tostring(stats)
1611 local print = stats.session.print; 1548 local print = stats.session.print;
1612 for _, stat_info in ipairs(stats) do 1549 for _, stat_info in ipairs(stats) do
1616 for _, v in ipairs(stat_info.output) do 1553 for _, v in ipairs(stat_info.output) do
1617 print(v); 1554 print(v);
1618 end 1555 end
1619 print(""); 1556 print("");
1620 else 1557 else
1621 print(("%-50s %s"):format(stat_info[1], format_stat(stat_info[2], (stat_info[4] or {}).units, stat_info[3]))); 1558 local metric_family = stat_info[2]
1559 if metric_family.type_ == "counter" then
1560 stats:render_family(print, stat_info[1], metric_family, stats.render_single_counter)
1561 elseif metric_family.type_ == "gauge" or metric_family.type_ == "unknown" then
1562 stats:render_family(print, stat_info[1], metric_family, stats.render_single_gauge)
1563 elseif metric_family.type_ == "summary" or metric_family.type_ == "histogram" then
1564 stats:render_family(print, stat_info[1], metric_family, stats.render_single_summary)
1565 end
1622 end 1566 end
1623 end 1567 end
1624 return #stats.." statistics displayed"; 1568 return #stats.." statistics displayed";
1625 end 1569 end
1626 1570
1627 local stats_mt = {__index = stats_methods, __tostring = stats_tostring } 1571 local stats_mt = {__index = stats_methods, __tostring = stats_tostring }
1628 local function new_stats_context(self) 1572 local function new_stats_context(self)
1629 return setmetatable({ session = self.session, stats = true }, stats_mt); 1573 -- TODO: instead of now(), it might be better to take the time of the last
1630 end 1574 -- interval, if the statistics backend is set to use periodic collection
1631 1575 -- Otherwise we get strange stuff like average cpu usage decreasing until
1632 function def_env.stats:show(filter) 1576 -- the next sample and so on.
1633 -- luacheck: ignore 211/changed 1577 return setmetatable({ session = self.session, stats = true, now = time.now() }, stats_mt);
1634 local stats, changed, extra = require "core.statsmanager".get_stats(); 1578 end
1635 local available, displayed = 0, 0; 1579
1580 function def_env.stats:show(name_filter)
1581 local statsman = require "core.statsmanager"
1582 local collect = statsman.collect
1583 if collect then
1584 -- force collection if in manual mode
1585 collect()
1586 end
1587 local metric_registry = statsman.get_metric_registry();
1636 local displayed_stats = new_stats_context(self); 1588 local displayed_stats = new_stats_context(self);
1637 for name, value in iterators.sorted_pairs(stats) do 1589 for family_name, metric_family in iterators.sorted_pairs(metric_registry:get_metric_families()) do
1638 available = available + 1; 1590 if not name_filter or family_name:match(name_filter) then
1639 if not filter or name:match(filter) then
1640 displayed = displayed + 1;
1641 local type = name:match(":(%a+)$");
1642 table.insert(displayed_stats, { 1591 table.insert(displayed_stats, {
1643 name, type, value, extra[name]; 1592 family_name,
1644 output = {}; 1593 metric_family,
1645 }); 1594 output = {}
1595 })
1646 end 1596 end
1647 end 1597 end
1648 return displayed_stats; 1598 return displayed_stats;
1649 end 1599 end
1650 1600