diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 9b0b034ca2b..312e9a77bd9 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -59,6 +59,7 @@ async-task,https://github.com/smol-rs/async-task,Apache-2.0 OR MIT,Stjepan Glavi async-trait,https://github.com/dtolnay/async-trait,MIT OR Apache-2.0,David Tolnay atoi,https://github.com/pacman82/atoi-rs,MIT,Markus Klein atomic-waker,https://github.com/smol-rs/atomic-waker,Apache-2.0 OR MIT,"Stjepan Glavina , Contributors to futures-rs" +atomic_float,https://github.com/thomcc/atomic_float,Apache-2.0 OR MIT OR Unlicense,Thom Chiovoloni aws-config,https://github.com/smithy-lang/smithy-rs,Apache-2.0,"AWS Rust SDK Team , Russell Cohen " aws-credential-types,https://github.com/smithy-lang/smithy-rs,Apache-2.0,AWS Rust SDK Team aws-lc-rs,https://github.com/aws/aws-lc-rs,ISC AND (Apache-2.0 OR ISC),AWS-LibCrypto @@ -179,6 +180,8 @@ const-oid,https://github.com/RustCrypto/formats/tree/master/const-oid,Apache-2.0 const-random,https://github.com/tkaitchuck/constrandom,MIT OR Apache-2.0,Tom Kaitchuck const-random-macro,https://github.com/tkaitchuck/constrandom,MIT OR Apache-2.0,Tom Kaitchuck const_fn,https://github.com/taiki-e/const_fn,Apache-2.0 OR MIT,The const_fn Authors +const_format,https://github.com/rodrimati1992/const_format_crates,Zlib,rodrimati1992 +const_format_proc_macros,https://github.com/rodrimati1992/const_format_crates,Zlib,rodrimati1992 constant_time_eq,https://github.com/cesarb/constant_time_eq,CC0-1.0 OR MIT-0 OR Apache-2.0,Cesar Eduardo Barros convert_case,https://github.com/rutrum/convert-case,MIT,rutrum core-foundation,https://github.com/servo/core-foundation-rs,MIT OR Apache-2.0,The Servo Project Developers @@ -190,6 +193,7 @@ crc-catalog,https://github.com/akhilles/crc-catalog,MIT OR Apache-2.0,Akhil Vela crc-fast,https://github.com/awesomized/crc-fast-rust,MIT OR Apache-2.0,Don MacAskill crc32fast,https://github.com/srijs/rust-crc32fast,MIT OR Apache-2.0,"Sam Rijs , Alex Crichton " 
criterion-plot,https://github.com/criterion-rs/criterion.rs,Apache-2.0 OR MIT,"Jorge Aparicio , Brook Heisler " +critical-section,https://github.com/rust-embedded/critical-section,MIT OR Apache-2.0,The critical-section Authors cron,https://github.com/zslayton/cron,MIT OR Apache-2.0,Zack Slayton crossbeam-channel,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crossbeam-channel Authors crossbeam-deque,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crossbeam-deque Authors @@ -280,6 +284,7 @@ embedded-io,https://github.com/rust-embedded/embedded-hal,MIT OR Apache-2.0,The ena,https://github.com/rust-lang/ena,MIT OR Apache-2.0,Niko Matsakis encode_unicode,https://github.com/tormol/encode_unicode,Apache-2.0 OR MIT,Torbjørn Birch Moltu encoding_rs,https://github.com/hsivonen/encoding_rs,(Apache-2.0 OR MIT) AND BSD-3-Clause,Henri Sivonen +endian-type,https://github.com/Lolirofle/endian-type,MIT,Lolirofle enum-iterator,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux enum-iterator-derive,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux env_filter,https://github.com/rust-cli/env_logger,MIT OR Apache-2.0,The env_filter Authors @@ -294,6 +299,7 @@ etcetera,https://github.com/lunacookies/etcetera,MIT OR Apache-2.0,The etcetera event-listener,https://github.com/smol-rs/event-listener,Apache-2.0 OR MIT,Stjepan Glavina event-listener,https://github.com/smol-rs/event-listener,Apache-2.0 OR MIT,"Stjepan Glavina , John Nunley " event-listener-strategy,https://github.com/smol-rs/event-listener-strategy,Apache-2.0 OR MIT,John Nunley +evmap,https://github.com/jonhoo/evmap,MIT OR Apache-2.0,Jon Gjengset fail,https://github.com/tikv/fail-rs,Apache-2.0,The TiKV Project Developers fancy-regex,https://github.com/fancy-regex/fancy-regex,MIT,"Raph Levien , Robin Stocker , Keith Hall " fastdivide,https://github.com/fulmicoton/fastdivide,zlib-acknowledgement OR MIT,Paul Masurel @@ -329,6 +335,7 @@ 
futures-sink,https://github.com/rust-lang/futures-rs,MIT OR Apache-2.0,The futur futures-task,https://github.com/rust-lang/futures-rs,MIT OR Apache-2.0,The futures-task Authors futures-timer,https://github.com/async-rs/futures-timer,MIT OR Apache-2.0,Alex Crichton futures-util,https://github.com/rust-lang/futures-rs,MIT OR Apache-2.0,The futures-util Authors +generator,https://github.com/Xudong-Huang/generator-rs,MIT OR Apache-2.0,Xudong Huang generic-array,https://github.com/fizyk20/generic-array,MIT,"Bartłomiej Kamiński , Aaron Trent " getrandom,https://github.com/rust-random/getrandom,MIT OR Apache-2.0,The Rand Project Developers gimli,https://github.com/gimli-rs/gimli,MIT OR Apache-2.0,The gimli Authors @@ -345,6 +352,7 @@ grok,https://github.com/mmastrac/grok,Apache-2.0,"Matt Mastracci , Jack Grigg " h2,https://github.com/hyperium/h2,MIT,"Carl Lerche , Sean McArthur " half,https://github.com/VoidStarKat/half-rs,MIT OR Apache-2.0,Kathryn Long +hashbag,https://github.com/jonhoo/hashbag,MIT OR Apache-2.0,Jon Gjengset hashbrown,https://github.com/rust-lang/hashbrown,MIT OR Apache-2.0,Amanieu d'Antras hashlink,https://github.com/kyren/hashlink,MIT OR Apache-2.0,kyren hdrhistogram,https://github.com/HdrHistogram/HdrHistogram_rust,MIT OR Apache-2.0,"Jon Gjengset , Marshall Pierce " @@ -417,12 +425,15 @@ json_comments,https://github.com/tmccombs/json-comments-rs,Apache-2.0,Thayne McC jsonschema,https://github.com/Stranger6667/jsonschema,MIT,Dmitry Dygalo jsonwebtoken,https://github.com/Keats/jsonwebtoken,MIT,Vincent Prouillet keccak,https://github.com/RustCrypto/sponges/tree/master/keccak,Apache-2.0 OR MIT,RustCrypto Developers +konst,https://github.com/rodrimati1992/konst,Zlib,rodrimati1992 +konst_macro_rules,https://github.com/rodrimati1992/konst,Zlib,rodrimati1992 krb5-src,https://github.com/MaterializeInc/rust-krb5-src,Apache-2.0,"Materialize, Inc." 
lalrpop-util,https://github.com/lalrpop/lalrpop,Apache-2.0 OR MIT,Niko Matsakis lambda_runtime,https://github.com/awslabs/aws-lambda-rust-runtime,Apache-2.0,"David Calavera , Harold Sun " lambda_runtime_api_client,https://github.com/awslabs/aws-lambda-rust-runtime,Apache-2.0,"David Calavera , Harold Sun " lazy_static,https://github.com/rust-lang-nursery/lazy-static.rs,MIT OR Apache-2.0,Marvin Löbel leb128fmt,https://github.com/bluk/leb128fmt,MIT OR Apache-2.0,Bryant Luk +left-right,https://github.com/jonhoo/left-right,MIT OR Apache-2.0,Jon Gjengset levenshtein_automata,https://github.com/tantivy-search/levenshtein-automata,MIT,Paul Masurel lexical-core,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh lexical-parse-float,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh @@ -444,6 +455,7 @@ linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH LLVM- litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers lock_api,https://github.com/Amanieu/parking_lot,MIT OR Apache-2.0,Amanieu d'Antras log,https://github.com/rust-lang/log,MIT OR Apache-2.0,The Rust Project Developers +loom,https://github.com/tokio-rs/loom,MIT,Carl Lerche lru,https://github.com/jeromefroe/lru-rs,MIT,Jerome Froelich lru-slab,https://github.com/Ralith/lru-slab,MIT OR Apache-2.0 OR Zlib,Benjamin Saunders lz4,https://github.com/10xGenomics/lz4-rs,MIT,"Jens Heyens , Artem V. 
Navrotskiy , Patrick Marks " @@ -456,6 +468,10 @@ md5,https://github.com/stainless-steel/md5,Apache-2.0 OR MIT,"Ivan Ukhov memchr,https://github.com/BurntSushi/memchr,Unlicense OR MIT,"Andrew Gallant , bluss" memmap2,https://github.com/RazrFalcon/memmap2-rs,MIT OR Apache-2.0,"Dan Burkert , Yevhenii Reizner , The Contributors" +metrics,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence +metrics-exporter-otel,https://github.com/palindrom615/metrics,MIT,Whoemoon Jang +metrics-exporter-prometheus,https://github.com/metrics-rs/metrics,MIT AND Apache-2.0,Toby Lawrence +metrics-util,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence mime,https://github.com/hyperium/mime,MIT OR Apache-2.0,Sean McArthur mime_guess,https://github.com/abonander/mime_guess,MIT,Austin Bonander mini-internal,https://github.com/dtolnay/miniserde,MIT OR Apache-2.0,David Tolnay @@ -474,6 +490,7 @@ murmurhash32,https://github.com/quickwit-inc/murmurhash32,MIT,Paul Masurel new_debug_unreachable,https://github.com/mbrubeck/rust-debug-unreachable,MIT,"Matt Brubeck , Jonathan Reem " new_string_template,https://github.com/hasezoey/new_string_template,MIT,hasezoey +nibble_vec,https://github.com/michaelsproul/rust_nibble_vec,MIT,Michael Sproul nix,https://github.com/nix-rust/nix,MIT,The nix-rust Project Developers no-std-net,https://github.com/dunmatt/no-std-net,MIT,M@ Dunlap nohash-hasher,https://github.com/paritytech/nohash-hasher,Apache-2.0 OR MIT,Parity Technologies @@ -635,12 +652,15 @@ quinn-udp,https://github.com/quinn-rs/quinn,MIT OR Apache-2.0,The quinn-udp Auth quote,https://github.com/dtolnay/quote,MIT OR Apache-2.0,David Tolnay quoted_printable,https://github.com/staktrace/quoted-printable,0BSD,Kartikaya Gupta r-efi,https://github.com/r-efi/r-efi,MIT OR Apache-2.0 OR LGPL-2.1-or-later,The r-efi Authors +radix_trie,https://github.com/michaelsproul/rust_radix_trie,MIT,Michael Sproul rand,https://github.com/rust-random/rand,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project 
Developers" rand_chacha,https://github.com/rust-random/rand,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers, The CryptoCorrosion Contributors" rand_core,https://github.com/rust-random/rand,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers" rand_core,https://github.com/rust-random/rand_core,MIT OR Apache-2.0,The Rand Project Developers rand_hc,https://github.com/rust-random/rand,MIT OR Apache-2.0,The Rand Project Developers rand_xorshift,https://github.com/rust-random/rngs,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers" +rand_xoshiro,https://github.com/rust-random/rngs,MIT OR Apache-2.0,The Rand Project Developers +rapidhash,https://github.com/hoxxep/rapidhash,MIT OR Apache-2.0,Liam Gray raw-cpuid,https://github.com/gz/rust-cpuid,MIT,Gerd Zellweger rayon,https://github.com/rayon-rs/rayon,MIT OR Apache-2.0,The rayon Authors rayon-core,https://github.com/rayon-rs/rayon,MIT OR Apache-2.0,The rayon-core Authors @@ -705,6 +725,7 @@ seahash,https://gitlab.redox-os.org/redox-os/seahash,MIT,"ticki , Kornel " security-framework-sys,https://github.com/kornelski/rust-security-framework,MIT OR Apache-2.0,"Steven Fackler , Kornel " +seize,https://github.com/ibraheemdev/seize,MIT,Ibraheem Ahmed semver,https://github.com/dtolnay/semver,MIT OR Apache-2.0,David Tolnay separator,https://github.com/saghm/rust-separator,MIT,Saghm Rossi seq-macro,https://github.com/dtolnay/seq-macro,MIT OR Apache-2.0,David Tolnay diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index ebf6e4727cd..debb6b5ea06 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -704,6 +704,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "atomic_float" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"628d228f918ac3b82fe590352cc719d30664a0c13ca3a60266fe02c7132d480a" + [[package]] name = "autocfg" version = "1.5.0" @@ -2324,6 +2330,27 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413d67b29ef1021b4d60f4aa1e925ca031751e213832b4b1d588fae623c05c60" +[[package]] +name = "const_format" +version = "0.2.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4481a617ad9a412be3b97c5d403fef8ed023103368908b9c50af598ff467cc1e" +dependencies = [ + "const_format_proc_macros", + "konst", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d57c2eccfb16dbac1f4e61e206105db5820c9d26c3c472bc17c774259ef7744" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -2454,6 +2481,12 @@ dependencies = [ "itertools 0.13.0", ] +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + [[package]] name = "cron" version = "0.16.0" @@ -3893,6 +3926,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "endian-type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" + [[package]] name = "enum-iterator" version = "2.3.0" @@ -4028,6 +4067,17 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "evmap" +version = "11.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8874945f036109c72242964c1174cf99434e30cfa45bf45fedc983f50046f8" +dependencies = [ + "hashbag", + "left-right", + "smallvec", +] + [[package]] name = "fail" version = "0.5.1" @@ -4408,6 +4458,21 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.8.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -4688,6 +4753,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "hashbag" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7040a10f52cba493ddb09926e15d10a9d8a28043708a405931fe4c6f19fac064" + [[package]] name = "hashbrown" version = "0.12.3" @@ -5566,6 +5637,21 @@ dependencies = [ "cpufeatures 0.2.17", ] +[[package]] +name = "konst" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "128133ed7824fcd73d6e7b17957c5eb7bacb885649bd8c69708b2331a10bcefb" +dependencies = [ + "konst_macro_rules", +] + +[[package]] +name = "konst_macro_rules" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4933f3f57a8e9d9da04db23fb153356ecaf00cbd14aee46279c33dc80925c37" + [[package]] name = "krb5-src" version = "0.3.4" @@ -5670,6 +5756,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "left-right" +version = "0.11.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f0c21e4c8ff95f487fb34e6f9182875f42c84cef966d29216bf115d9bba835a" +dependencies = [ + "crossbeam-utils", + "loom", + "slab", +] + [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -5854,6 +5951,19 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "lru" version = "0.16.4" @@ -5977,6 +6087,71 @@ dependencies = [ "libc", ] +[[package]] +name = "metrics" +version = "0.24.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7cd3e9eb685089c784f5769b1197d348c7274bc20d4e1349650f63b91b6d0af" +dependencies = [ + "portable-atomic", + "rapidhash", +] + +[[package]] +name = "metrics-exporter-otel" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58b8984fa38406b80c094943c0ba90e53d5fff0aea051ff9fac96cf6940993c8" +dependencies = [ + "metrics", + "metrics-util", + "opentelemetry", + "portable-atomic", + "scc", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c0ca2990f7f78a72c4000ddce186db7d1b700477426563ee851c95ea3c0d0c4" +dependencies = [ + "base64 0.22.1", + "evmap", + "http-body-util", + "hyper 1.9.0", + "hyper-util", + "indexmap 2.14.0", + "ipnet", + "metrics", + "metrics-util", + "quanta", + "thiserror 2.0.18", + "tokio", + "tracing", +] + +[[package]] +name = "metrics-util" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ff5c12b797ebf094dc7c1d87e905efc0329cba332f96d51db03875441012b5" +dependencies = [ + "aho-corasick", + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.16.1", + "indexmap 2.14.0", + "metrics", + "ordered-float 5.3.0", + "quanta", + "radix_trie", + "rand 0.9.4", + "rand_xoshiro", + "rapidhash", + "sketches-ddsketch 0.3.1", +] + [[package]] name = "mime" version = "0.3.17" @@ -6167,6 +6342,15 @@ dependencies = [ "regex", ] +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + [[package]] name = "nix" version = "0.26.4" @@ -6905,6 +7089,16 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "papaya" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "997ee03cd38c01469a7046643714f0ad28880bcb9e6679ff0666e24817ca19b7" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "papergrid" version = "0.17.0" @@ -7549,6 +7743,9 @@ name = "portable-atomic" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +dependencies = [ + "critical-section", +] [[package]] name = "portable-atomic-util" @@ -8108,6 +8305,7 @@ dependencies = [ "flume 0.12.0", "futures", "quickwit-common", + "quickwit-metrics", "rand 0.10.1", "serde", "serde_json", @@ -8137,12 +8335,10 @@ name = "quickwit-cli" version = "0.8.0" dependencies = [ "anyhow", - "backtrace", "bytesize", "chrono", "clap", "colored", - "console-subscriber", "dialoguer", "futures", "humantime", @@ -8150,10 +8346,6 @@ dependencies = [ "itertools 0.14.0", "numfmt", "openssl-probe 0.1.6", - "opentelemetry", - "opentelemetry-appender-tracing", - "opentelemetry-otlp", - "opentelemetry_sdk", "predicates", "quickwit-actors", "quickwit-cluster", @@ -8163,12 +8355,14 @@ dependencies = [ "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-proto", "quickwit-rest-client", "quickwit-search", "quickwit-serve", "quickwit-storage", "quickwit-telemetry", + "quickwit-telemetry-exporters", "reqwest", "rustls 0.23.38", "serde_json", @@ -8182,8 +8376,6 @@ dependencies = [ "tokio", "toml", "tracing", - "tracing-opentelemetry", - "tracing-subscriber", ] [[package]] @@ -8199,6 +8391,7 @@ dependencies = [ "pin-project", "quickwit-common", "quickwit-config", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", 
"serde", @@ -8273,10 +8466,13 @@ dependencies = [ "hyper 1.9.0", "hyper-util", "itertools 0.14.0", + "metrics", + "metrics-util", "pin-project", "pnet", "prometheus", "proptest", + "quickwit-metrics", "rand 0.10.1", "rayon", "regex", @@ -8348,6 +8544,7 @@ dependencies = [ "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", "serde", @@ -8499,6 +8696,7 @@ dependencies = [ "quickwit-config", "quickwit-indexing", "quickwit-metastore", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "quickwit-storage", @@ -8550,6 +8748,7 @@ dependencies = [ "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-opentelemetry", "quickwit-parquet-engine", "quickwit-proto", @@ -8595,6 +8794,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-doc-mapper", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", "rand_distr", @@ -8667,6 +8867,7 @@ dependencies = [ "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-opentelemetry", "quickwit-proto", "quickwit-query", @@ -8699,6 +8900,7 @@ dependencies = [ "quickwit-index-management", "quickwit-indexing", "quickwit-metastore", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "quickwit-query", @@ -8731,6 +8933,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-lambda-server", + "quickwit-metrics", "quickwit-proto", "quickwit-search", "quickwit-storage", @@ -8793,6 +8996,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-doc-mapper", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "quickwit-query", @@ -8820,6 +9024,31 @@ dependencies = [ "uuid", ] +[[package]] +name = "quickwit-metrics" +version = "0.8.0" +dependencies = [ + "atomic_float", + "const_format", + "criterion", + "dashmap 6.1.0", + "inventory", + "metrics", + "metrics-exporter-prometheus", + "metrics-util", + "papaya", + "proptest", 
+ "quanta", + "rustc-hash", +] + +[[package]] +name = "quickwit-metrics-inventory" +version = "0.8.0" +dependencies = [ + "quickwit-metrics", +] + [[package]] name = "quickwit-opentelemetry" version = "0.8.0" @@ -8832,6 +9061,7 @@ dependencies = [ "quickwit-config", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "serde", @@ -8854,8 +9084,8 @@ dependencies = [ "parquet", "proptest", "prost 0.14.3", - "quickwit-common", "quickwit-dst", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", "regex", @@ -8984,6 +9214,7 @@ dependencies = [ "quickwit-doc-mapper", "quickwit-indexing", "quickwit-metastore", + "quickwit-metrics", "quickwit-proto", "quickwit-query", "quickwit-storage", @@ -9045,12 +9276,14 @@ dependencies = [ "quickwit-janitor", "quickwit-lambda-client", "quickwit-metastore", + "quickwit-metrics", "quickwit-opentelemetry", "quickwit-proto", "quickwit-query", "quickwit-search", "quickwit-storage", "quickwit-telemetry", + "quickwit-telemetry-exporters", "regex", "rust-embed", "rustls 0.23.38", @@ -9113,6 +9346,7 @@ dependencies = [ "quickwit-aws", "quickwit-common", "quickwit-config", + "quickwit-metrics", "quickwit-proto", "regex", "reqwest", @@ -9148,6 +9382,30 @@ dependencies = [ "uuid", ] +[[package]] +name = "quickwit-telemetry-exporters" +version = "0.8.0" +dependencies = [ + "anyhow", + "backtrace", + "console-subscriber", + "metrics", + "metrics-exporter-otel", + "metrics-exporter-prometheus", + "metrics-util", + "opentelemetry", + "opentelemetry-appender-tracing", + "opentelemetry-otlp", + "opentelemetry_sdk", + "quickwit-common", + "quickwit-metrics", + "serde_json", + "time", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", +] + [[package]] name = "quinn" version = "0.11.9" @@ -9230,6 +9488,16 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = 
"radix_trie" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +dependencies = [ + "endian-type", + "nibble_vec", +] + [[package]] name = "rand" version = "0.7.3" @@ -9366,6 +9634,24 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + +[[package]] +name = "rapidhash" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59" +dependencies = [ + "rustversion", +] + [[package]] name = "raw-cpuid" version = "11.6.0" @@ -10240,6 +10526,16 @@ dependencies = [ "libc", ] +[[package]] +name = "seize" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b55fb86dfd3a2f5f76ea78310a88f96c4ea21a3031f8d212443d56123fd0521" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "semver" version = "1.0.28" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index e844fc086ac..e36af9f2a5f 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -26,6 +26,9 @@ members = [ "quickwit-lambda-server", "quickwit-macros", "quickwit-metastore", + "quickwit-metrics", + "quickwit-telemetry-exporters", + "quickwit-metrics-inventory", # Disabling metastore-utils from the quickwit projects to ease build/deps. # We can reenable it when we need it. 
@@ -69,6 +72,9 @@ default-members = [ "quickwit-lambda-server", "quickwit-macros", "quickwit-metastore", + "quickwit-metrics", + "quickwit-telemetry-exporters", + "quickwit-metrics-inventory", "quickwit-opentelemetry", "quickwit-parquet-engine", "quickwit-proto", @@ -97,6 +103,7 @@ assert-json-diff = "2" async-compression = { version = "0.4", features = ["tokio", "gzip"] } async-speed-limit = "0.4" async-trait = "0.1" +atomic_float = "1.1" backtrace = "0.3" base64 = "0.22" binggan = { version = "0.15" } @@ -113,8 +120,10 @@ clap = { version = "4.5", features = ["env", "string"] } coarsetime = "0.1" colored = "3.0" console-subscriber = "0.5" +const_format = "0.2" criterion = { version = "0.8", features = ["async_tokio"] } cron = "0.16" +dashmap = "6.1" dialoguer = { version = "0.12", default-features = false } dotenvy = "0.15" dyn-clone = "1.0" @@ -154,6 +163,7 @@ hyper-util = { version = "0.1", default-features = false, features = [ ] } indexmap = { version = "2.12", features = ["serde"] } indicatif = "0.18" +inventory = "0.3" itertools = "0.14" lambda_runtime = "0.13" json_comments = "0.2" @@ -161,6 +171,10 @@ libz-sys = "1.1" lru = "0.16" matches = "0.1" md5 = "0.8" +metrics = "0.24" +metrics-exporter-otel = "0.3" +metrics-exporter-prometheus = { version = "0.18", default-features = false } +metrics-util = "0.20" mime_guess = "2.0" mini-moka = "0.10.3" mockall = "0.14" @@ -199,6 +213,7 @@ pulsar = { version = "6.6", default-features = false, features = [ "compression", "tokio-runtime", ] } +quanta = "0.12" quick_cache = "0.6.18" quote = "1.0" rand = "0.10" @@ -375,6 +390,9 @@ quickwit-lambda-client = { path = "quickwit-lambda-client" } quickwit-lambda-server = { path = "quickwit-lambda-server" } quickwit-macros = { path = "quickwit-macros" } quickwit-metastore = { path = "quickwit-metastore" } +quickwit-metrics = { path = "quickwit-metrics" } +quickwit-telemetry-exporters = { path = "quickwit-telemetry-exporters" } +quickwit-metrics-inventory = { path = 
"quickwit-metrics-inventory" } quickwit-opentelemetry = { path = "quickwit-opentelemetry" } quickwit-parquet-engine = { path = "quickwit-parquet-engine" } quickwit-proto = { path = "quickwit-proto" } diff --git a/quickwit/quickwit-actors/Cargo.toml b/quickwit/quickwit-actors/Cargo.toml index 7832c48e967..08c63e62add 100644 --- a/quickwit/quickwit-actors/Cargo.toml +++ b/quickwit/quickwit-actors/Cargo.toml @@ -23,6 +23,7 @@ tokio = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } [features] testsuite = [] diff --git a/quickwit/quickwit-actors/src/actor_context.rs b/quickwit/quickwit-actors/src/actor_context.rs index 3186e210647..9a55a6f91eb 100644 --- a/quickwit/quickwit-actors/src/actor_context.rs +++ b/quickwit/quickwit-actors/src/actor_context.rs @@ -20,8 +20,8 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use quickwit_common::metrics::IntCounter; use quickwit_common::{KillSwitch, Progress, ProtectedZoneGuard}; +use quickwit_metrics::Counter; use tokio::sync::{oneshot, watch}; use tracing::{debug, error}; @@ -61,7 +61,7 @@ pub struct ActorContextInner { self_mailbox: Mailbox, progress: Progress, actor_state: AtomicState, - backpressure_micros_counter_opt: Option, + backpressure_micros_counter_opt: Option, observable_state_tx: watch::Sender, // Boolean marking the presence of an observe message in the actor's high priority queue. 
observe_enqueued: AtomicBool, @@ -72,7 +72,7 @@ impl ActorContext { self_mailbox: Mailbox, spawn_ctx: SpawnContext, observable_state_tx: watch::Sender, - backpressure_micros_counter_opt: Option, + backpressure_micros_counter_opt: Option, ) -> Self { ActorContext { inner: ActorContextInner { diff --git a/quickwit/quickwit-actors/src/mailbox.rs b/quickwit/quickwit-actors/src/mailbox.rs index f222294e4c2..2cb173ba109 100644 --- a/quickwit/quickwit-actors/src/mailbox.rs +++ b/quickwit/quickwit-actors/src/mailbox.rs @@ -19,7 +19,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, LazyLock, Weak}; use std::time::Instant; -use quickwit_common::metrics::{GaugeGuard, IntCounter, IntGauge}; +use quickwit_metrics::{Counter, Gauge, GaugeGuard, gauge}; use tokio::sync::oneshot; use crate::channel_with_priority::{Receiver, Sender, TrySendError}; @@ -191,7 +191,7 @@ impl Mailbox { pub async fn send_message_with_backpressure_counter( &self, message: M, - backpressure_micros_counter_opt: Option<&IntCounter>, + backpressure_micros_counter_opt: Option<&Counter>, ) -> Result, SendError> where A: DeferableReplyHandler, @@ -205,7 +205,7 @@ impl Mailbox { let now = Instant::now(); self.inner.tx.send_low_priority(envelope).await?; let elapsed = now.elapsed(); - backpressure_micros_counter.inc_by(elapsed.as_micros() as u64); + backpressure_micros_counter.increment(elapsed.as_micros() as u64); } else { self.inner.tx.send_low_priority(envelope).await?; } @@ -273,7 +273,7 @@ impl Mailbox { pub async fn ask_with_backpressure_counter( &self, message: M, - backpressure_micros_counter_opt: Option<&IntCounter>, + backpressure_micros_counter_opt: Option<&Counter>, ) -> Result> where A: DeferableReplyHandler, @@ -308,9 +308,17 @@ impl Mailbox { struct InboxInner { rx: Receiver>, - _inboxes_count_gauge_guard: GaugeGuard<'static>, + _inboxes_count_gauge_guard: GaugeGuard, } +static INBOX_GAUGE: LazyLock = LazyLock::new(|| { + gauge!( + name: "inboxes_count", + description: 
"overall count of actors", + subsystem: "actor", + ) +}); + pub struct Inbox { inner: Arc>, } @@ -385,20 +393,6 @@ impl Inbox { } } -fn get_actor_inboxes_count_gauge_guard() -> GaugeGuard<'static> { - static INBOX_GAUGE: LazyLock = LazyLock::new(|| { - quickwit_common::metrics::new_gauge( - "inboxes_count", - "overall count of actors", - "actor", - &[], - ) - }); - let mut gauge_guard = GaugeGuard::from_gauge(&INBOX_GAUGE); - gauge_guard.add(1); - gauge_guard -} - pub(crate) fn create_mailbox( actor_name: String, queue_capacity: QueueCapacity, @@ -416,7 +410,7 @@ pub(crate) fn create_mailbox( }; let inner = InboxInner { rx, - _inboxes_count_gauge_guard: get_actor_inboxes_count_gauge_guard(), + _inboxes_count_gauge_guard: GaugeGuard::new(&INBOX_GAUGE, 1.0), }; let inbox = Inbox { inner: Arc::new(inner), @@ -452,6 +446,8 @@ mod tests { use std::mem; use std::time::Duration; + use quickwit_metrics::counter; + use super::*; use crate::tests::{Ping, PingReceiverActor}; use crate::{ActorContext, ActorExitStatus, Handler, Universe}; @@ -519,8 +515,11 @@ mod tests { .await .unwrap(); // At this point the actor was started and even processed a message entirely. 
- let backpressure_micros_counter = - IntCounter::new("test_counter", "help for test_counter").unwrap(); + let backpressure_micros_counter = counter!( + name: "test_counter_low_backpressure", + description: "help for test_counter", + subsystem: "actor", + ); let wait_duration = Duration::from_millis(1); let processed = mailbox .send_message_with_backpressure_counter( @@ -546,8 +545,11 @@ mod tests { .ask_with_backpressure_counter(Duration::default(), None) .await .unwrap(); - let backpressure_micros_counter = - IntCounter::new("test_counter", "help for test_counter").unwrap(); + let backpressure_micros_counter = counter!( + name: "test_counter_backpressure", + description: "help for test_counter", + subsystem: "actor", + ); let wait_duration = Duration::from_millis(1); mailbox .send_message_with_backpressure_counter( @@ -578,8 +580,11 @@ mod tests { .ask_with_backpressure_counter(Duration::default(), None) .await .unwrap(); - let backpressure_micros_counter = - IntCounter::new("test_counter", "help for test_counter").unwrap(); + let backpressure_micros_counter = counter!( + name: "test_counter_no_waiting_backpressure", + description: "help for test_counter", + subsystem: "actor", + ); let start = Instant::now(); mailbox .ask_with_backpressure_counter(Duration::from_millis(1), None) diff --git a/quickwit/quickwit-actors/src/spawn_builder.rs b/quickwit/quickwit-actors/src/spawn_builder.rs index 6dfc1aa9155..922cfc4d71d 100644 --- a/quickwit/quickwit-actors/src/spawn_builder.rs +++ b/quickwit/quickwit-actors/src/spawn_builder.rs @@ -16,7 +16,7 @@ use std::fmt; use std::time::Duration; use anyhow::Context; -use quickwit_common::metrics::IntCounter; +use quickwit_metrics::Counter; use sync_wrapper::SyncWrapper; use tokio::sync::watch; use tracing::{debug, error, info}; @@ -91,7 +91,7 @@ pub struct SpawnBuilder { spawn_ctx: SpawnContext, #[allow(clippy::type_complexity)] mailboxes: Option<(Mailbox, Inbox)>, - backpressure_micros_counter_opt: Option, + 
backpressure_micros_counter_opt: Option, } impl SpawnBuilder { @@ -129,10 +129,7 @@ impl SpawnBuilder { /// /// When using `.ask` the amount of time counted may be misleading. /// (See `Mailbox::ask_with_backpressure_counter` for more details) - pub fn set_backpressure_micros_counter( - mut self, - backpressure_micros_counter: IntCounter, - ) -> Self { + pub fn set_backpressure_micros_counter(mut self, backpressure_micros_counter: Counter) -> Self { self.backpressure_micros_counter_opt = Some(backpressure_micros_counter); self } diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index 80ab6ae0203..d095ec4765f 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -22,12 +22,10 @@ path = "src/generate_markdown.rs" [dependencies] anyhow = { workspace = true } -backtrace = { workspace = true, optional = true } bytesize = { workspace = true } chrono = { workspace = true } clap = { workspace = true } colored = { workspace = true } -console-subscriber = { workspace = true, optional = true } dialoguer = { workspace = true } futures = { workspace = true } humantime = { workspace = true } @@ -35,10 +33,6 @@ indicatif = { workspace = true } itertools = { workspace = true } numfmt = { workspace = true } openssl-probe = { workspace = true, optional = true } -opentelemetry = { workspace = true } -opentelemetry-appender-tracing = { workspace = true } -opentelemetry_sdk = { workspace = true } -opentelemetry-otlp = { workspace = true } reqwest = { workspace = true } rustls = { workspace = true } serde_json = { workspace = true } @@ -52,17 +46,17 @@ time = { workspace = true } tokio = { workspace = true } toml = { workspace = true } tracing = { workspace = true } -tracing-opentelemetry = { workspace = true } -tracing-subscriber = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } 
quickwit-config = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } quickwit-ingest = { workspace = true } quickwit-metastore = { workspace = true } +quickwit-telemetry-exporters = { workspace = true } quickwit-proto = { workspace = true } quickwit-rest-client = { workspace = true } quickwit-search = { workspace = true } @@ -85,8 +79,8 @@ quickwit-storage = { workspace = true, features = ["testsuite"] } datafusion = ["quickwit-serve/datafusion"] jemalloc = ["dep:tikv-jemalloc-ctl", "dep:tikv-jemallocator"] jemalloc-profiled = [ - "dep:backtrace", "quickwit-common/jemalloc-profiled", + "quickwit-telemetry-exporters/jemalloc-profiled", "quickwit-serve/jemalloc-profiled" ] ci-test = [] @@ -96,7 +90,10 @@ openssl-support = ["openssl-probe"] # (this is not about quickwit's metrics themselves) metrics = [ "quickwit-indexing/metrics" ] # Requires to enable tokio unstable via RUSTFLAGS="--cfg tokio_unstable" -tokio-console = ["console-subscriber", "quickwit-common/named_tasks"] +tokio-console = [ + "quickwit-common/named_tasks", + "quickwit-telemetry-exporters/tokio-console", +] release-feature-set = [ "jemalloc", "openssl-support", diff --git a/quickwit/quickwit-cli/src/jemalloc.rs b/quickwit/quickwit-cli/src/jemalloc.rs index f22caff5a37..66ca7c8f4df 100644 --- a/quickwit/quickwit-cli/src/jemalloc.rs +++ b/quickwit/quickwit-cli/src/jemalloc.rs @@ -14,7 +14,6 @@ use std::time::Duration; -use quickwit_common::metrics::MEMORY_METRICS; use tikv_jemallocator::Jemalloc; use tracing::error; @@ -30,8 +29,6 @@ pub static GLOBAL: Jemalloc = Jemalloc; const JEMALLOC_METRICS_POLLING_INTERVAL: Duration = Duration::from_secs(1); pub async fn jemalloc_metrics_loop() -> tikv_jemalloc_ctl::Result<()> { - let memory_metrics = MEMORY_METRICS.clone(); - // Obtain a MIB for the `epoch`, `stats.active`, `stats.allocated`, and `stats.resident` keys: let epoch_mib = tikv_jemalloc_ctl::epoch::mib()?; let active_mib = 
tikv_jemalloc_ctl::stats::active::mib()?; @@ -48,13 +45,13 @@ pub async fn jemalloc_metrics_loop() -> tikv_jemalloc_ctl::Result<()> { // Read statistics using MIB keys: let active = active_mib.read()?; - memory_metrics.active_bytes.set(active as i64); + quickwit_common::metrics::MEMORY_ACTIVE_BYTES.set(active as f64); let allocated = allocated_mib.read()?; - memory_metrics.allocated_bytes.set(allocated as i64); + quickwit_common::metrics::MEMORY_ALLOCATED_BYTES.set(allocated as f64); let resident = resident_mib.read()?; - memory_metrics.resident_bytes.set(resident as i64); + quickwit_common::metrics::MEMORY_RESIDENT_BYTES.set(resident as f64); } } diff --git a/quickwit/quickwit-cli/src/lib.rs b/quickwit/quickwit-cli/src/lib.rs index 45275c3ff5d..4a12a087aec 100644 --- a/quickwit/quickwit-cli/src/lib.rs +++ b/quickwit/quickwit-cli/src/lib.rs @@ -49,7 +49,6 @@ pub mod cli; pub mod index; #[cfg(feature = "jemalloc")] pub mod jemalloc; -pub mod logger; pub mod metrics; pub mod service; pub mod source; @@ -60,11 +59,6 @@ pub mod tool; /// Throughput calculation window size. 
const THROUGHPUT_WINDOW_SIZE: usize = 5; -pub const QW_ENABLE_TOKIO_CONSOLE_ENV_KEY: &str = "QW_ENABLE_TOKIO_CONSOLE"; - -pub const QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER_ENV_KEY: &str = - "QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER"; - fn config_cli_arg() -> Arg { Arg::new("config") .long("config") @@ -354,7 +348,7 @@ pub mod busy_detector { use tracing::debug; - use crate::metrics::CLI_METRICS; + use crate::metrics::THREAD_UNPARK_DURATION_MICROSECONDS; // we need that time reference to use an atomic and not a mutex for LAST_UNPARK static TIME_REF: LazyLock = LazyLock::new(Instant::now); @@ -393,10 +387,7 @@ pub mod busy_detector { .unwrap_or_default(); let now = now.as_micros() as u64; let delta = now - time.load(Ordering::Relaxed); - CLI_METRICS - .thread_unpark_duration_microseconds - .with_label_values([]) - .observe(delta as f64); + THREAD_UNPARK_DURATION_MICROSECONDS.record(delta as f64); if delta > ALLOWED_DELAY_MICROS { emit_debug(delta, now); } diff --git a/quickwit/quickwit-cli/src/main.rs b/quickwit/quickwit-cli/src/main.rs index 176c33bee8f..b3064f4dfae 100644 --- a/quickwit/quickwit-cli/src/main.rs +++ b/quickwit/quickwit-cli/src/main.rs @@ -14,18 +14,17 @@ #![recursion_limit = "256"] -use std::collections::BTreeMap; - use anyhow::Context; use colored::Colorize; use quickwit_cli::checklist::RED_COLOR; use quickwit_cli::cli::{CliCommand, build_cli}; #[cfg(feature = "jemalloc")] use quickwit_cli::jemalloc::start_jemalloc_metrics_loop; -use quickwit_cli::logger::setup_logging_and_tracing; +use quickwit_cli::metrics::register_build_info_metric; use quickwit_cli::{busy_detector, install_default_crypto_ring_provider}; use quickwit_common::runtimes::scrape_tokio_runtime_metrics; use quickwit_serve::BuildInfo; +use quickwit_telemetry_exporters::TelemetryHandle; use tracing::error; /// The main tokio runtime takes num_cores / 3 threads by default, and can be overridden by the @@ -47,6 +46,8 @@ fn main() -> anyhow::Result<()> { openssl_probe::init_openssl_env_vars() 
}; + let (command, ansi_colors) = parse_cli_command(); + let main_runtime_num_threads: usize = get_main_runtime_num_threads(); let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -57,29 +58,25 @@ fn main() -> anyhow::Result<()> { .build() .context("failed to start main Tokio runtime")?; - scrape_tokio_runtime_metrics(rt.handle(), "main"); + rt.block_on(async move { + install_default_crypto_ring_provider(); - rt.block_on(main_impl()) -} + let build_info = BuildInfo::get(); + let telemetry_handle = quickwit_telemetry_exporters::init_telemetry( + &build_info.version, + command.default_log_level(), + ansi_colors, + )?; + register_build_info_metric(build_info); -fn register_build_info_metric() { - use itertools::Itertools; - let build_info = BuildInfo::get(); - let mut build_kvs = BTreeMap::default(); - build_kvs.insert("build_date", build_info.build_date.to_string()); - build_kvs.insert("commit_hash", build_info.commit_short_hash.to_string()); - build_kvs.insert("version", build_info.version.to_string()); - if !build_info.commit_tags.is_empty() { - let tags_str = build_info.commit_tags.iter().join(","); - build_kvs.insert("commit_tags", tags_str); - } - build_kvs.insert("target", build_info.build_target.to_string()); - quickwit_common::metrics::register_info("build_info", "Quickwit's build info", build_kvs); -} + let runtime_handle = tokio::runtime::Handle::current(); + scrape_tokio_runtime_metrics(&runtime_handle, "main"); -async fn main_impl() -> anyhow::Result<()> { - register_build_info_metric(); + main_impl(command, telemetry_handle).await + }) +} +fn parse_cli_command() -> (CliCommand, bool) { let about_text = about_text(); let version_text = BuildInfo::get_version_text(); @@ -94,17 +91,17 @@ async fn main_impl() -> anyhow::Result<()> { std::process::exit(1); } }; + (command, ansi_colors) +} - install_default_crypto_ring_provider(); - +async fn main_impl(command: CliCommand, telemetry_handle: TelemetryHandle) -> anyhow::Result<()> { 
#[cfg(feature = "jemalloc")] start_jemalloc_metrics_loop(); - let build_info = BuildInfo::get(); - let (env_filter_reload_fn, tracer_provider_opt) = - setup_logging_and_tracing(command.default_log_level(), ansi_colors, build_info)?; - - let return_code: i32 = if let Err(command_error) = command.execute(env_filter_reload_fn).await { + let return_code: i32 = if let Err(command_error) = command + .execute(telemetry_handle.env_filter_reload_fn()) + .await + { error!(error=%command_error, "command failed"); eprintln!( "{} command failed: {:?}\n", @@ -116,14 +113,7 @@ async fn main_impl() -> anyhow::Result<()> { 0 }; - if let Some((trace_provider, logs_provider)) = tracer_provider_opt { - trace_provider - .shutdown() - .context("failed to shutdown OpenTelemetry tracer provider")?; - logs_provider - .shutdown() - .context("failed to shutdown OpenTelemetry logs provider")?; - } + telemetry_handle.shutdown()?; std::process::exit(return_code) } diff --git a/quickwit/quickwit-cli/src/metrics.rs b/quickwit/quickwit-cli/src/metrics.rs index c51d010c9ea..ccec7c6a320 100644 --- a/quickwit/quickwit-cli/src/metrics.rs +++ b/quickwit/quickwit-cli/src/metrics.rs @@ -14,26 +14,34 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{HistogramVec, new_histogram_vec}; +use quickwit_common::metrics::exponential_buckets; +use quickwit_metrics::{Counter, Histogram, counter, histogram, labels}; +use quickwit_serve::BuildInfo; -pub struct CliMetrics { - pub thread_unpark_duration_microseconds: HistogramVec<0>, +static BUILD_INFO: LazyLock = LazyLock::new(|| { + counter!( + name: "build_info", + description: "Quickwit's build info", + subsystem: "", + ) +}); +pub fn register_build_info_metric(build_info: &BuildInfo) { + let commit_tags = build_info.commit_tags.join(","); + let labels = labels!( + "build_date" => build_info.build_date, + "commit_hash" => build_info.commit_short_hash, + "version" => build_info.version.clone(), + "commit_tags" => commit_tags, + "target" => 
build_info.build_target, + ); + counter!(parent: BUILD_INFO, labels: [labels]).increment(1); } -impl Default for CliMetrics { - fn default() -> Self { - CliMetrics { - thread_unpark_duration_microseconds: new_histogram_vec( - "thread_unpark_duration_microseconds", - "Duration for which a thread of the main tokio runtime is unparked.", - "cli", - &[], - [], - quickwit_common::metrics::exponential_buckets(5.0, 5.0, 5).unwrap(), - ), - } - } -} - -/// Serve counters exposes a bunch a set of metrics about the request received to quickwit. -pub static CLI_METRICS: LazyLock = LazyLock::new(CliMetrics::default); +pub(crate) static THREAD_UNPARK_DURATION_MICROSECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "thread_unpark_duration_microseconds", + description: "Duration for which a thread of the main tokio runtime is unparked.", + subsystem: "cli", + buckets: exponential_buckets(5.0, 5.0, 5).unwrap(), + ) +}); diff --git a/quickwit/quickwit-cluster/Cargo.toml b/quickwit/quickwit-cluster/Cargo.toml index ab01f587cb9..758b3af3cdb 100644 --- a/quickwit/quickwit-cluster/Cargo.toml +++ b/quickwit/quickwit-cluster/Cargo.toml @@ -29,6 +29,7 @@ tracing = { workspace = true } utoipa = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-cluster/src/grpc_gossip.rs b/quickwit/quickwit-cluster/src/grpc_gossip.rs index 0ebf9f662d2..cdbc76d4075 100644 --- a/quickwit/quickwit-cluster/src/grpc_gossip.rs +++ b/quickwit/quickwit-cluster/src/grpc_gossip.rs @@ -31,7 +31,7 @@ use tracing::{info, warn}; use crate::grpc_service::cluster_grpc_client; use crate::member::NodeStateExt; -use crate::metrics::CLUSTER_METRICS; +use crate::metrics::GRPC_GOSSIP_ROUNDS_TOTAL; const MAX_GOSSIP_PEERS: usize = 3; @@ -108,7 +108,7 @@ async fn perform_grpc_gossip_rounds( warn!("failed to fetch cluster state from node `{node_id}`"); continue; }; - 
CLUSTER_METRICS.grpc_gossip_rounds_total.inc(); + GRPC_GOSSIP_ROUNDS_TOTAL.increment(1); let mut chitchat_guard = chitchat.lock().await; diff --git a/quickwit/quickwit-cluster/src/lib.rs b/quickwit/quickwit-cluster/src/lib.rs index 0f2dbebf749..0ac923dfc7f 100644 --- a/quickwit/quickwit-cluster/src/lib.rs +++ b/quickwit/quickwit-cluster/src/lib.rs @@ -31,7 +31,6 @@ use chitchat::transport::{Socket, Transport, UdpSocket}; use chitchat::{ChitchatMessage, Serializable}; pub use chitchat::{FailureDetectorConfig, KeyChangeEvent, ListenerHandle}; pub use grpc_service::cluster_grpc_server; -use quickwit_common::metrics::IntCounter; use quickwit_common::tower::ClientGrpcConfig; use quickwit_config::service::QuickwitService; use quickwit_config::{GrpcConfig, NodeConfig, TlsConfig}; @@ -49,6 +48,10 @@ pub use crate::cluster::{ create_cluster_for_test, create_cluster_for_test_with_id, grpc_addr_from_listen_addr_for_test, }; pub use crate::member::{ClusterMember, INDEXING_CPU_CAPACITY_KEY}; +use crate::metrics::{ + GOSSIP_RECV_BYTES_TOTAL, GOSSIP_RECV_MESSAGES_TOTAL, GOSSIP_SENT_BYTES_TOTAL, + GOSSIP_SENT_MESSAGES_TOTAL, +}; pub use crate::node::ClusterNode; #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -74,10 +77,6 @@ struct CountingUdpTransport; struct CountingUdpSocket { socket: UdpSocket, - gossip_recv: IntCounter, - gossip_recv_bytes: IntCounter, - gossip_send: IntCounter, - gossip_send_bytes: IntCounter, } #[async_trait] @@ -85,16 +84,16 @@ impl Socket for CountingUdpSocket { async fn send(&mut self, to: SocketAddr, msg: ChitchatMessage) -> anyhow::Result<()> { let msg_len = msg.serialized_len() as u64; self.socket.send(to, msg).await?; - self.gossip_send.inc(); - self.gossip_send_bytes.inc_by(msg_len); + GOSSIP_SENT_MESSAGES_TOTAL.increment(1); + GOSSIP_SENT_BYTES_TOTAL.increment(msg_len); Ok(()) } async fn recv(&mut self) -> anyhow::Result<(SocketAddr, ChitchatMessage)> { let (socket_addr, msg) = self.socket.recv().await?; - self.gossip_recv.inc(); + 
GOSSIP_RECV_MESSAGES_TOTAL.increment(1); let msg_len = msg.serialized_len() as u64; - self.gossip_recv_bytes.inc_by(msg_len); + GOSSIP_RECV_BYTES_TOTAL.increment(msg_len); Ok((socket_addr, msg)) } } @@ -103,21 +102,7 @@ impl Socket for CountingUdpSocket { impl Transport for CountingUdpTransport { async fn open(&self, listen_addr: SocketAddr) -> anyhow::Result> { let socket = UdpSocket::open(listen_addr).await?; - Ok(Box::new(CountingUdpSocket { - socket, - gossip_recv: crate::metrics::CLUSTER_METRICS - .gossip_recv_messages_total - .clone(), - gossip_recv_bytes: crate::metrics::CLUSTER_METRICS - .gossip_recv_bytes_total - .clone(), - gossip_send: crate::metrics::CLUSTER_METRICS - .gossip_sent_messages_total - .clone(), - gossip_send_bytes: crate::metrics::CLUSTER_METRICS - .gossip_sent_bytes_total - .clone(), - })) + Ok(Box::new(CountingUdpSocket { socket })) } } diff --git a/quickwit/quickwit-cluster/src/metrics.rs b/quickwit/quickwit-cluster/src/metrics.rs index a5ac5d4a9ef..ab6fffe66af 100644 --- a/quickwit/quickwit-cluster/src/metrics.rs +++ b/quickwit/quickwit-cluster/src/metrics.rs @@ -18,106 +18,106 @@ use std::sync::{LazyLock, Weak}; use std::time::Duration; use chitchat::{Chitchat, ChitchatId}; -use quickwit_common::metrics::{IntCounter, IntGauge, new_counter, new_gauge}; +use quickwit_metrics::{Counter, Gauge, counter, gauge}; use tokio::sync::Mutex; use crate::member::NodeStateExt; -pub struct ClusterMetrics { - pub live_nodes: IntGauge, - pub ready_nodes: IntGauge, - pub zombie_nodes: IntGauge, - pub dead_nodes: IntGauge, - pub cluster_state_size_bytes: IntGauge, - pub node_state_size_bytes: IntGauge, - pub node_state_keys: IntGauge, - pub gossip_recv_messages_total: IntCounter, - pub gossip_recv_bytes_total: IntCounter, - pub gossip_sent_messages_total: IntCounter, - pub gossip_sent_bytes_total: IntCounter, - pub grpc_gossip_rounds_total: IntCounter, -} - -impl Default for ClusterMetrics { - fn default() -> Self { - ClusterMetrics { - live_nodes: 
new_gauge( - "live_nodes", - "The number of live nodes observed locally.", - "cluster", - &[], - ), - ready_nodes: new_gauge( - "ready_nodes", - "The number of ready nodes observed locally.", - "cluster", - &[], - ), - zombie_nodes: new_gauge( - "zombie_nodes", - "The number of zombie nodes observed locally.", - "cluster", - &[], - ), - dead_nodes: new_gauge( - "dead_nodes", - "The number of dead nodes observed locally.", - "cluster", - &[], - ), - cluster_state_size_bytes: new_gauge( - "cluster_state_size_bytes", - "The size of the cluster state in bytes.", - "cluster", - &[], - ), - node_state_keys: new_gauge( - "node_state_keys", - "The number of keys in the node state.", - "cluster", - &[], - ), - node_state_size_bytes: new_gauge( - "node_state_size_bytes", - "The size of the node state in bytes.", - "cluster", - &[], - ), - gossip_recv_messages_total: new_counter( - "gossip_recv_messages_total", - "Total number of gossip messages received.", - "cluster", - &[], - ), - gossip_recv_bytes_total: new_counter( - "gossip_recv_bytes_total", - "Total amount of gossip data received in bytes.", - "cluster", - &[], - ), - gossip_sent_messages_total: new_counter( - "gossip_sent_messages_total", - "Total number of gossip messages sent.", - "cluster", - &[], - ), - gossip_sent_bytes_total: new_counter( - "gossip_sent_bytes_total", - "Total amount of gossip data sent in bytes.", - "cluster", - &[], - ), - grpc_gossip_rounds_total: new_counter( - "grpc_gossip_rounds_total", - "Total number of gRPC gossip rounds performed with peer nodes.", - "cluster", - &[], - ), - } - } -} - -pub static CLUSTER_METRICS: LazyLock = LazyLock::new(ClusterMetrics::default); +pub(crate) static LIVE_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "live_nodes", + description: "The number of live nodes observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static READY_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "ready_nodes", + description: "The number of ready nodes 
observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static ZOMBIE_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "zombie_nodes", + description: "The number of zombie nodes observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static DEAD_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "dead_nodes", + description: "The number of dead nodes observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static CLUSTER_STATE_SIZE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "cluster_state_size_bytes", + description: "The size of the cluster state in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static NODE_STATE_KEYS: LazyLock = LazyLock::new(|| { + gauge!( + name: "node_state_keys", + description: "The number of keys in the node state.", + subsystem: "cluster", + ) +}); + +pub(crate) static NODE_STATE_SIZE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "node_state_size_bytes", + description: "The size of the node state in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_RECV_MESSAGES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_recv_messages_total", + description: "Total number of gossip messages received.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_RECV_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_recv_bytes_total", + description: "Total amount of gossip data received in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_SENT_MESSAGES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_sent_messages_total", + description: "Total number of gossip messages sent.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_SENT_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_sent_bytes_total", + description: "Total amount of gossip data sent in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static GRPC_GOSSIP_ROUNDS_TOTAL: LazyLock = 
LazyLock::new(|| { + counter!( + name: "grpc_gossip_rounds_total", + description: "Total number of gRPC gossip rounds performed with peer nodes.", + subsystem: "cluster", + ) +}); pub(crate) fn spawn_metrics_task( weak_chitchat: Weak>, @@ -155,24 +155,18 @@ pub(crate) fn spawn_metrics_task( cluster_state_size_bytes += chitchat_id_size_bytes + node_state_size_bytes; if *chitchat_id == self_chitchat_id { - CLUSTER_METRICS - .node_state_keys - .set(node_state.num_key_values() as i64); - CLUSTER_METRICS - .node_state_size_bytes - .set(node_state_size_bytes as i64); + NODE_STATE_KEYS.set(node_state.num_key_values() as f64); + NODE_STATE_SIZE_BYTES.set(node_state_size_bytes as f64); } } drop(chitchat_guard); - CLUSTER_METRICS.live_nodes.set(num_live_nodes as i64); - CLUSTER_METRICS.ready_nodes.set(num_ready_nodes as i64); - CLUSTER_METRICS.zombie_nodes.set(num_zombie_nodes as i64); - CLUSTER_METRICS.dead_nodes.set(num_dead_nodes as i64); + LIVE_NODES.set(num_live_nodes as f64); + READY_NODES.set(num_ready_nodes as f64); + ZOMBIE_NODES.set(num_zombie_nodes as f64); + DEAD_NODES.set(num_dead_nodes as f64); - CLUSTER_METRICS - .cluster_state_size_bytes - .set(cluster_state_size_bytes as i64); + CLUSTER_STATE_SIZE_BYTES.set(cluster_state_size_bytes as f64); } }; tokio::spawn(future); diff --git a/quickwit/quickwit-common/Cargo.toml b/quickwit/quickwit-common/Cargo.toml index 14c05e19c5e..8760f895704 100644 --- a/quickwit/quickwit-common/Cargo.toml +++ b/quickwit/quickwit-common/Cargo.toml @@ -28,9 +28,11 @@ http = { workspace = true } hyper = { workspace = true } hyper-util = { workspace = true, optional = true } itertools = { workspace = true } +metrics = { workspace = true } pin-project = { workspace = true } pnet = { workspace = true } prometheus = { workspace = true } +quickwit-metrics = { workspace = true } rand = { workspace = true } rayon = { workspace = true } regex = { workspace = true } @@ -62,6 +64,7 @@ jemalloc-profiled = [ [dev-dependencies] hyper-util = { 
workspace = true } +metrics-util = { workspace = true } proptest = { workspace = true } serde_json = { workspace = true } serial_test = { workspace = true } diff --git a/quickwit/quickwit-common/src/io.rs b/quickwit/quickwit-common/src/io.rs index e1d9ad796f1..8f3818fb3b3 100644 --- a/quickwit/quickwit-common/src/io.rs +++ b/quickwit/quickwit-common/src/io.rs @@ -34,10 +34,9 @@ use async_speed_limit::clock::StandardClock; use async_speed_limit::limiter::Consume; use bytesize::ByteSize; use pin_project::pin_project; -use prometheus::IntCounter; +use quickwit_metrics::{Counter, counter}; use tokio::io::AsyncWrite; -use crate::metrics::{IntCounterVec, new_counter_vec}; use crate::{KillSwitch, Progress, ProtectedZoneGuard}; // Max 1MB at a time. @@ -48,25 +47,13 @@ fn truncate_bytes(bytes: &[u8]) -> &[u8] { &bytes[..num_bytes] } -struct IoMetrics { - write_bytes: IntCounterVec<1>, -} - -impl Default for IoMetrics { - fn default() -> Self { - let write_bytes = new_counter_vec( - "write_bytes", - "Number of bytes written by a given component in [indexer, merger, deleter, \ - split_downloader_{merge,delete}]", - "", - &[], - ["component"], - ); - Self { write_bytes } - } -} - -static IO_METRICS: LazyLock = LazyLock::new(IoMetrics::default); +static WRITE_BYTES: LazyLock = LazyLock::new(|| { + counter!( + name: "write_bytes", + description: "Number of bytes written by a given component in [indexer, merger, deleter, split_downloader_{merge,delete}]", + subsystem: "", + ) +}); /// Parameter used in `async_speed_limit`. 
/// @@ -91,20 +78,18 @@ pub fn limiter(throughput: ByteSize) -> Limiter { #[derive(Clone)] pub struct IoControls { throughput_limiter_opt: Option, - bytes_counter: IntCounter, + bytes_counter: Counter, progress: Progress, kill_switch: KillSwitch, } impl Default for IoControls { fn default() -> Self { - let default_bytes_counter = - IntCounter::new("default_write_num_bytes", "Default write counter.").unwrap(); - IoControls { + Self { throughput_limiter_opt: None, + bytes_counter: Counter::local(), progress: Progress::default(), kill_switch: KillSwitch::default(), - bytes_counter: default_bytes_counter, } } } @@ -131,8 +116,11 @@ impl IoControls { Ok(guard) } - pub fn set_component(mut self, component: &str) -> Self { - self.bytes_counter = IO_METRICS.write_bytes.with_label_values([component]); + pub fn set_component(mut self, component: &'static str) -> Self { + self.bytes_counter = counter!( + parent: WRITE_BYTES, + "component" => component, + ); self } @@ -148,7 +136,7 @@ impl IoControls { self } - pub fn set_bytes_counter(mut self, bytes_counter: IntCounter) -> Self { + pub fn set_bytes_counter(mut self, bytes_counter: Counter) -> Self { self.bytes_counter = bytes_counter; self } @@ -167,7 +155,7 @@ impl IoControls { if let Some(throughput_limiter) = &self.throughput_limiter_opt { throughput_limiter.blocking_consume(num_bytes); } - self.bytes_counter.inc_by(num_bytes as u64); + self.bytes_counter.increment(num_bytes as u64); Ok(()) } } @@ -220,7 +208,7 @@ impl ControlledWrite { let len = *obj.as_ref().unwrap_or(&0); if len > 0 { let waiter = this.io_controls_access.apply(|io_controls| { - io_controls.bytes_counter.inc_by(len as u64); + io_controls.bytes_counter.increment(len as u64); io_controls .throughput_limiter_opt .as_ref() diff --git a/quickwit/quickwit-common/src/metrics.rs b/quickwit/quickwit-common/src/metrics.rs index 193def5e01a..ad1455d8f7f 100644 --- a/quickwit/quickwit-common/src/metrics.rs +++ b/quickwit/quickwit-common/src/metrics.rs @@ -12,442 
+12,118 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::{BTreeMap, HashMap}; -use std::sync::{LazyLock, OnceLock}; +use std::sync::LazyLock; -use prometheus::{Gauge, HistogramOpts, Opts, TextEncoder}; -pub use prometheus::{ - Histogram, HistogramTimer, HistogramVec as PrometheusHistogramVec, IntCounter, - IntCounterVec as PrometheusIntCounterVec, IntGauge, IntGaugeVec as PrometheusIntGaugeVec, - exponential_buckets, linear_buckets, -}; +pub use prometheus::{exponential_buckets, linear_buckets}; +use quickwit_metrics::{Gauge, gauge}; -#[derive(Clone)] -pub struct HistogramVec { - underlying: PrometheusHistogramVec, -} - -impl HistogramVec { - pub fn with_label_values(&self, label_values: [&str; N]) -> Histogram { - self.underlying.with_label_values(&label_values) - } -} - -#[derive(Clone)] -pub struct IntCounterVec { - underlying: PrometheusIntCounterVec, -} - -impl IntCounterVec { - pub fn new( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], - ) -> IntCounterVec { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let counter_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let underlying = PrometheusIntCounterVec::new(counter_opts, &label_names) - .expect("failed to create counter vec"); - IntCounterVec { underlying } - } - - pub fn with_label_values(&self, label_values: [&str; N]) -> IntCounter { - self.underlying.with_label_values(&label_values) - } -} - -#[derive(Clone)] -pub struct IntGaugeVec { - underlying: PrometheusIntGaugeVec, -} - -impl IntGaugeVec { - pub fn with_label_values(&self, label_values: [&str; N]) -> IntGauge { - self.underlying.with_label_values(&label_values) - } -} +pub fn index_label(index_id: &str) -> &str { + static 
PER_INDEX_METRICS_ENABLED: LazyLock = + LazyLock::new(|| !crate::get_bool_from_env("QW_DISABLE_PER_INDEX_METRICS", false)); -pub fn register_info(name: &'static str, help: &'static str, kvs: BTreeMap<&'static str, String>) { - let mut counter_opts = Opts::new(name, help).namespace("quickwit"); - for (k, v) in kvs { - counter_opts = counter_opts.const_label(k, v); + if *PER_INDEX_METRICS_ENABLED { + index_id + } else { + "__any__" } - let counter = IntCounter::with_opts(counter_opts).expect("failed to create counter"); - counter.inc(); - prometheus::register(Box::new(counter)).expect("failed to register counter"); -} - -pub fn new_counter( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], -) -> IntCounter { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let counter_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let counter = IntCounter::with_opts(counter_opts).expect("failed to create counter"); - prometheus::register(Box::new(counter.clone())).expect("failed to register counter"); - counter } -pub fn new_counter_vec( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], -) -> IntCounterVec { - let int_counter_vec = IntCounterVec::new(name, help, subsystem, const_labels, label_names); - let collector = Box::new(int_counter_vec.underlying.clone()); - prometheus::register(collector).expect("failed to register counter vec"); - int_counter_vec -} - -pub fn new_float_gauge( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], -) -> Gauge { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let gauge_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) 
- .const_labels(owned_const_labels); - let gauge = Gauge::with_opts(gauge_opts).expect("failed to create float gauge"); - prometheus::register(Box::new(gauge.clone())).expect("failed to register float gauge"); - gauge -} - -pub fn new_gauge( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], -) -> IntGauge { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let gauge_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let gauge = IntGauge::with_opts(gauge_opts).expect("failed to create gauge"); - prometheus::register(Box::new(gauge.clone())).expect("failed to register gauge"); - gauge -} - -pub fn new_gauge_vec( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], -) -> IntGaugeVec { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let gauge_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let underlying = - PrometheusIntGaugeVec::new(gauge_opts, &label_names).expect("failed to create gauge vec"); - - let collector = Box::new(underlying.clone()); - prometheus::register(collector).expect("failed to register counter vec"); - - IntGaugeVec { underlying } -} +pub static MEMORY_ACTIVE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "active_bytes", + description: "Total number of bytes in active pages allocated by the application, as reported by jemalloc `stats.active`.", + subsystem: "memory", + ) +}); -pub fn new_histogram(name: &str, help: &str, subsystem: &str, buckets: Vec) -> Histogram { - let histogram_opts = HistogramOpts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .buckets(buckets); - let histogram = 
Histogram::with_opts(histogram_opts).expect("failed to create histogram"); - prometheus::register(Box::new(histogram.clone())).expect("failed to register histogram"); - histogram -} +pub static MEMORY_ALLOCATED_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "allocated_bytes", + description: "Total number of bytes allocated by the application, as reported by jemalloc `stats.allocated`.", + subsystem: "memory", + ) +}); -pub fn new_histogram_vec( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], - buckets: Vec, -) -> HistogramVec { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let histogram_opts = HistogramOpts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels) - .buckets(buckets); - let underlying = PrometheusHistogramVec::new(histogram_opts, &label_names) - .expect("failed to create histogram vec"); +pub static MEMORY_RESIDENT_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "resident_bytes", + description: " Total number of bytes in physically resident data pages mapped by the allocator, as reported by jemalloc `stats.resident`.", + subsystem: "memory", + ) +}); - let collector = Box::new(underlying.clone()); - prometheus::register(collector).expect("failed to register histogram vec"); +static IN_FLIGHT_DATA_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "in_flight_data_bytes", + description: "Amount of data in-flight in various buffers in bytes.", + subsystem: "memory", + ) +}); - HistogramVec { underlying } -} +pub static IN_FLIGHT_REST_SERVER: LazyLock = + LazyLock::new(|| in_flight_data_gauge("rest_server")); -pub struct GaugeGuard<'a> { - gauge: &'a IntGauge, - delta: i64, -} +pub static IN_FLIGHT_INGEST_ROUTER: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingest_router")); -impl std::fmt::Debug for 
GaugeGuard<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - self.delta.fmt(f) - } -} +pub static IN_FLIGHT_INGESTER_PERSIST: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingester_persist")); -impl<'a> GaugeGuard<'a> { - pub fn from_gauge(gauge: &'a IntGauge) -> Self { - Self { gauge, delta: 0i64 } - } +pub static IN_FLIGHT_INGESTER_REPLICATE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingester_replicate")); - pub fn get(&self) -> i64 { - self.delta - } +pub static IN_FLIGHT_WAL: LazyLock = LazyLock::new(|| in_flight_data_gauge("wal")); - pub fn add(&mut self, delta: i64) { - self.gauge.add(delta); - self.delta += delta; - } +pub static IN_FLIGHT_FETCH_STREAM: LazyLock = + LazyLock::new(|| in_flight_data_gauge("fetch_stream")); - pub fn sub(&mut self, delta: i64) { - self.gauge.sub(delta); - self.delta -= delta; - } -} +pub static IN_FLIGHT_MULTI_FETCH_STREAM: LazyLock = + LazyLock::new(|| in_flight_data_gauge("multi_fetch_stream")); -impl Drop for GaugeGuard<'_> { - fn drop(&mut self) { - self.gauge.sub(self.delta) - } -} +pub static IN_FLIGHT_DOC_PROCESSOR_MAILBOX: LazyLock = + LazyLock::new(|| in_flight_data_gauge("doc_processor_mailbox")); -pub struct OwnedGaugeGuard { - gauge: IntGauge, - delta: i64, -} +pub static IN_FLIGHT_INDEXER_MAILBOX: LazyLock = + LazyLock::new(|| in_flight_data_gauge("indexer_mailbox")); -impl std::fmt::Debug for OwnedGaugeGuard { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - self.delta.fmt(f) - } -} +pub static IN_FLIGHT_INDEX_WRITER: LazyLock = + LazyLock::new(|| in_flight_data_gauge("index_writer")); -impl OwnedGaugeGuard { - pub fn from_gauge(gauge: IntGauge) -> Self { - Self { gauge, delta: 0i64 } - } +pub static IN_FLIGHT_FILE_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("file_source")); - pub fn get(&self) -> i64 { - self.delta - } +pub static IN_FLIGHT_INGEST_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingest_source")); - pub fn 
add(&mut self, delta: i64) { - self.gauge.add(delta); - self.delta += delta; - } +pub static IN_FLIGHT_KAFKA_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("kafka_source")); - pub fn sub(&mut self, delta: i64) { - self.gauge.sub(delta); - self.delta -= delta; - } -} +pub static IN_FLIGHT_KINESIS_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("kinesis_source")); -impl Drop for OwnedGaugeGuard { - fn drop(&mut self) { - self.gauge.sub(self.delta) - } -} +pub static IN_FLIGHT_PUBSUB_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("pubsub_source")); -pub fn metrics_text_payload() -> Result { - let metric_families = prometheus::gather(); - // Arbitrary non-zero size in order to skip a bunch of - // buffer growth-reallocations when encoding metrics. - let mut buffer = String::with_capacity(1024); - let encoder = TextEncoder::new(); - match encoder.encode_utf8(&metric_families, &mut buffer) { - Ok(()) => Ok(buffer), - Err(e) => Err(e.to_string()), - } -} +pub static IN_FLIGHT_PULSAR_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("pulsar_source")); -#[derive(Clone)] -pub struct MemoryMetrics { - pub active_bytes: IntGauge, - pub allocated_bytes: IntGauge, - pub resident_bytes: IntGauge, - pub in_flight: InFlightDataGauges, -} +pub static IN_FLIGHT_OTHER_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("other_source")); -impl Default for MemoryMetrics { - fn default() -> Self { - Self { - active_bytes: new_gauge( - "active_bytes", - "Total number of bytes in active pages allocated by the application, as reported \ - by jemalloc `stats.active`.", - "memory", - &[], - ), - allocated_bytes: new_gauge( - "allocated_bytes", - "Total number of bytes allocated by the application, as reported by jemalloc \ - `stats.allocated`.", - "memory", - &[], - ), - resident_bytes: new_gauge( - "resident_bytes", - " Total number of bytes in physically resident data pages mapped by the \ - allocator, as reported by jemalloc 
`stats.resident`.", - "memory", - &[], - ), - in_flight: InFlightDataGauges::default(), - } - } +fn in_flight_data_gauge(component: &'static str) -> Gauge { + gauge!(parent: IN_FLIGHT_DATA_BYTES, "component" => component) } -#[derive(Clone)] -pub struct InFlightDataGauges { - pub rest_server: IntGauge, - pub ingest_router: IntGauge, - pub ingester_persist: IntGauge, - pub ingester_replicate: IntGauge, - pub wal: IntGauge, - pub fetch_stream: IntGauge, - pub multi_fetch_stream: IntGauge, - pub doc_processor_mailbox: IntGauge, - pub indexer_mailbox: IntGauge, - pub index_writer: IntGauge, - in_flight_gauge_vec: IntGaugeVec<1>, -} +#[cfg(test)] +mod tests { + use super::*; -impl Default for InFlightDataGauges { - fn default() -> Self { - let in_flight_gauge_vec = new_gauge_vec( - "in_flight_data_bytes", - "Amount of data in-flight in various buffers in bytes.", - "memory", - &[], - ["component"], + #[test] + fn bucket_helpers_are_reexported() { + assert_eq!(linear_buckets(0.0, 1.0, 3).unwrap(), vec![0.0, 1.0, 2.0]); + assert_eq!( + exponential_buckets(1.0, 2.0, 3).unwrap(), + vec![1.0, 2.0, 4.0] ); - Self { - rest_server: in_flight_gauge_vec.with_label_values(["rest_server"]), - ingest_router: in_flight_gauge_vec.with_label_values(["ingest_router"]), - ingester_persist: in_flight_gauge_vec.with_label_values(["ingester_persist"]), - ingester_replicate: in_flight_gauge_vec.with_label_values(["ingester_replicate"]), - wal: in_flight_gauge_vec.with_label_values(["wal"]), - fetch_stream: in_flight_gauge_vec.with_label_values(["fetch_stream"]), - multi_fetch_stream: in_flight_gauge_vec.with_label_values(["multi_fetch_stream"]), - doc_processor_mailbox: in_flight_gauge_vec.with_label_values(["doc_processor_mailbox"]), - indexer_mailbox: in_flight_gauge_vec.with_label_values(["indexer_mailbox"]), - index_writer: in_flight_gauge_vec.with_label_values(["index_writer"]), - in_flight_gauge_vec: in_flight_gauge_vec.clone(), - } - } -} - -impl InFlightDataGauges { - #[inline] - pub 
fn file(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| self.in_flight_gauge_vec.with_label_values(["file_source"])) - } - - #[inline] - pub fn ingest(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["ingest_source"]) - }) - } - - #[inline] - pub fn kafka(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| self.in_flight_gauge_vec.with_label_values(["kafka_source"])) - } - - #[inline] - pub fn kinesis(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["kinesis_source"]) - }) - } - - #[inline] - pub fn pubsub(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["pubsub_source"]) - }) - } - - #[inline] - pub fn pulsar(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["pulsar_source"]) - }) - } - - #[inline] - pub fn other(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["pulsar_source"]) - }) - } -} - -/// This function returns `index_id` as is if per-index metrics are enabled, or projects it to -/// `"__any__"` otherwise. 
-pub fn index_label(index_id: &str) -> &str { - static PER_INDEX_METRICS_ENABLED: LazyLock = - LazyLock::new(|| !crate::get_bool_from_env("QW_DISABLE_PER_INDEX_METRICS", false)); - - if *PER_INDEX_METRICS_ENABLED { - index_id - } else { - "__any__" } } - -pub static MEMORY_METRICS: LazyLock = LazyLock::new(MemoryMetrics::default); diff --git a/quickwit/quickwit-common/src/runtimes.rs b/quickwit/quickwit-common/src/runtimes.rs index 79ac2611bd9..04c3a6cef8f 100644 --- a/quickwit/quickwit-common/src/runtimes.rs +++ b/quickwit/quickwit-common/src/runtimes.rs @@ -17,14 +17,45 @@ use std::sync::OnceLock; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Duration; -use prometheus::{Gauge, IntCounter, IntGauge}; +use quickwit_metrics::{Counter, Gauge, counter, gauge, labels}; use tokio::runtime::Runtime; use tokio_metrics::{RuntimeMetrics, RuntimeMonitor}; -use crate::metrics::{new_counter, new_float_gauge, new_gauge}; - static RUNTIMES: OnceLock> = OnceLock::new(); +static TOKIO_SCHEDULED_TASKS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "tokio_scheduled_tasks", + description: "The total number of tasks currently scheduled in workers' local queues.", + subsystem: "runtime", + ) +}); + +static TOKIO_WORKER_BUSY_DURATION_MILLISECONDS_TOTAL: std::sync::LazyLock = + std::sync::LazyLock::new(|| { + counter!( + name: "tokio_worker_busy_duration_milliseconds_total", + description: " The total amount of time worker threads were busy.", + subsystem: "runtime", + ) + }); + +static TOKIO_WORKER_BUSY_RATIO: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "tokio_worker_busy_ratio", + description: "The ratio of time worker threads were busy since the last time runtime metrics were collected.", + subsystem: "runtime", + ) +}); + +static TOKIO_WORKER_THREADS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "tokio_worker_threads", + description: "The number of worker threads used by the runtime.", + 
subsystem: "runtime", + ) +}); + /// Describes which runtime an actor should run on. #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] pub enum RuntimeType { @@ -165,61 +196,43 @@ pub fn scrape_tokio_runtime_metrics(handle: &tokio::runtime::Handle, label: &'st let runtime_monitor = RuntimeMonitor::new(handle); handle.spawn(async move { let mut interval = tokio::time::interval(Duration::from_secs(1)); - let mut prometheus_runtime_metrics = PrometheusRuntimeMetrics::new(label); + let mut runtime_metrics_recorder = RuntimeMetricsRecorder::new(label); for tokio_runtime_metrics in runtime_monitor.intervals() { interval.tick().await; - prometheus_runtime_metrics.update(&tokio_runtime_metrics); + runtime_metrics_recorder.update(&tokio_runtime_metrics); } }); } -struct PrometheusRuntimeMetrics { - scheduled_tasks: IntGauge, - worker_busy_duration_milliseconds_total: IntCounter, +struct RuntimeMetricsRecorder { + scheduled_tasks: Gauge, + worker_busy_duration_milliseconds_total: Counter, worker_busy_ratio: Gauge, - worker_threads: IntGauge, + worker_threads: Gauge, } -impl PrometheusRuntimeMetrics { +impl RuntimeMetricsRecorder { pub fn new(label: &'static str) -> Self { + let labels = labels!("runtime_type" => label); Self { - scheduled_tasks: new_gauge( - "tokio_scheduled_tasks", - "The total number of tasks currently scheduled in workers' local queues.", - "runtime", - &[("runtime_type", label)], - ), - worker_busy_duration_milliseconds_total: new_counter( - "tokio_worker_busy_duration_milliseconds_total", - " The total amount of time worker threads were busy.", - "runtime", - &[("runtime_type", label)], - ), - worker_busy_ratio: new_float_gauge( - "tokio_worker_busy_ratio", - "The ratio of time worker threads were busy since the last time runtime metrics \ - were collected.", - "runtime", - &[("runtime_type", label)], - ), - worker_threads: new_gauge( - "tokio_worker_threads", - "The number of worker threads used by the runtime.", - "runtime", - &[("runtime_type", 
label)], - ), + scheduled_tasks: gauge!(parent: TOKIO_SCHEDULED_TASKS, labels: [labels]), + worker_busy_duration_milliseconds_total: counter!( + parent: TOKIO_WORKER_BUSY_DURATION_MILLISECONDS_TOTAL, + labels: [labels]), + worker_busy_ratio: gauge!(parent: TOKIO_WORKER_BUSY_RATIO, labels: [labels]), + worker_threads: gauge!(parent: TOKIO_WORKER_THREADS, labels: [labels]), } } pub fn update(&mut self, runtime_metrics: &RuntimeMetrics) { self.scheduled_tasks - .set(runtime_metrics.total_local_queue_depth as i64); + .set(runtime_metrics.total_local_queue_depth as f64); self.worker_busy_duration_milliseconds_total - .inc_by(runtime_metrics.total_busy_duration.as_millis() as u64); + .increment(runtime_metrics.total_busy_duration.as_millis() as u64); self.worker_busy_ratio.set(runtime_metrics.busy_ratio()); self.worker_threads - .set(runtime_metrics.workers_count as i64); + .set(runtime_metrics.workers_count as f64); } } diff --git a/quickwit/quickwit-common/src/stream_utils.rs b/quickwit/quickwit-common/src/stream_utils.rs index 00b40ee4b43..e79ce60c350 100644 --- a/quickwit/quickwit-common/src/stream_utils.rs +++ b/quickwit/quickwit-common/src/stream_utils.rs @@ -18,12 +18,11 @@ use std::pin::Pin; use bytesize::ByteSize; use futures::{Stream, StreamExt, TryStreamExt, stream}; -use prometheus::IntGauge; +use quickwit_metrics::{Gauge, GaugeGuard}; use tokio::sync::{mpsc, watch}; use tokio_stream::wrappers::{ReceiverStream, UnboundedReceiverStream, WatchStream}; use tracing::warn; -use crate::metrics::GaugeGuard; use crate::tower::RpcName; pub type BoxStream = Pin + Send + Unpin + 'static>>; @@ -77,7 +76,7 @@ where T: Send + 'static pub fn new_bounded_with_gauge( capacity: usize, - gauge: &'static IntGauge, + gauge: &'static Gauge, ) -> (TrackedSender, Self) { let (sender, receiver) = mpsc::channel(capacity); let tracked_sender = TrackedSender { sender, gauge }; @@ -94,7 +93,7 @@ where T: Send + 'static (sender, receiver.into()) } - pub fn new_unbounded_with_gauge(gauge: 
&'static IntGauge) -> (TrackedUnboundedSender, Self) { + pub fn new_unbounded_with_gauge(gauge: &'static Gauge) -> (TrackedUnboundedSender, Self) { let (sender, receiver) = mpsc::unbounded_channel(); let tracked_sender = TrackedUnboundedSender { sender, gauge }; let receiver_stream = UnboundedReceiverStream::new(receiver) @@ -228,7 +227,7 @@ where T: RpcName } } -pub struct InFlightValue(T, #[allow(dead_code)] GaugeGuard<'static>); +pub struct InFlightValue(T, #[allow(dead_code)] GaugeGuard); impl fmt::Debug for InFlightValue where T: fmt::Debug @@ -239,10 +238,8 @@ where T: fmt::Debug } impl InFlightValue { - pub fn new(value: T, value_size: ByteSize, gauge: &'static IntGauge) -> Self { - let mut gauge_guard = GaugeGuard::from_gauge(gauge); - gauge_guard.add(value_size.as_u64() as i64); - + pub fn new(value: T, value_size: ByteSize, gauge: &'static Gauge) -> Self { + let gauge_guard = GaugeGuard::new(gauge, value_size.as_u64() as f64); Self(value, gauge_guard) } @@ -253,7 +250,7 @@ impl InFlightValue { pub struct TrackedSender { sender: mpsc::Sender>, - gauge: &'static IntGauge, + gauge: &'static Gauge, } impl TrackedSender { @@ -271,7 +268,7 @@ impl TrackedSender { pub struct TrackedUnboundedSender { sender: mpsc::UnboundedSender>, - gauge: &'static IntGauge, + gauge: &'static Gauge, } impl TrackedUnboundedSender { @@ -286,8 +283,9 @@ impl TrackedUnboundedSender { mod tests { use std::sync::LazyLock; + use quickwit_metrics::{Gauge, gauge}; + use super::*; - use crate::metrics::new_gauge; #[tokio::test] async fn test_service_stream_map() { @@ -300,32 +298,35 @@ mod tests { #[tokio::test] async fn test_tracked_service_stream_bounded() { - static TEST_GAUGE: LazyLock = LazyLock::new(|| { - new_gauge("common", "help", "test_tracked_service_stream_bounded", &[]) + static TEST_GAUGE: LazyLock = LazyLock::new(|| { + gauge!( + name: "common", + description: "help", + subsystem: "test_tracked_service_stream_bounded", + ) }); let (service_stream_tx, mut service_stream) = 
ServiceStream::new_bounded_with_gauge(3, &TEST_GAUGE); service_stream_tx.send(1, ByteSize(42)).await.unwrap(); - assert_eq!(TEST_GAUGE.get(), 42); + assert_eq!(TEST_GAUGE.get(), 42.0); service_stream_tx.send(2, ByteSize(1337)).await.unwrap(); - assert_eq!(TEST_GAUGE.get(), 1379); + assert_eq!(TEST_GAUGE.get(), 1379.0); let value = service_stream.next().await.unwrap(); assert_eq!(value, 1); - assert_eq!(TEST_GAUGE.get(), 1337); + assert_eq!(TEST_GAUGE.get(), 1337.0); } #[tokio::test] async fn test_tracked_service_stream_unbounded() { - static TEST_GAUGE: LazyLock = LazyLock::new(|| { - new_gauge( - "common", - "help", - "test_tracked_service_stream_unbounded", - &[], + static TEST_GAUGE: LazyLock = LazyLock::new(|| { + gauge!( + name: "common", + description: "help", + subsystem: "test_tracked_service_stream_unbounded", ) }); @@ -333,13 +334,13 @@ mod tests { ServiceStream::new_unbounded_with_gauge(&TEST_GAUGE); service_stream_tx.send(1, ByteSize(42)).unwrap(); - assert_eq!(TEST_GAUGE.get(), 42); + assert_eq!(TEST_GAUGE.get(), 42.0); service_stream_tx.send(2, ByteSize(1337)).unwrap(); - assert_eq!(TEST_GAUGE.get(), 1379); + assert_eq!(TEST_GAUGE.get(), 1379.0); let value = service_stream.next().await.unwrap(); assert_eq!(value, 1); - assert_eq!(TEST_GAUGE.get(), 1337); + assert_eq!(TEST_GAUGE.get(), 1337.0); } } diff --git a/quickwit/quickwit-common/src/thread_pool.rs b/quickwit/quickwit-common/src/thread_pool.rs index f4b738ef2c0..a57ee56d6f8 100644 --- a/quickwit/quickwit-common/src/thread_pool.rs +++ b/quickwit/quickwit-common/src/thread_pool.rs @@ -13,14 +13,28 @@ // limitations under the License. 
use std::fmt; -use std::sync::{Arc, LazyLock}; +use std::sync::Arc; use futures::{Future, TryFutureExt}; -use prometheus::IntGauge; +use quickwit_metrics::{Gauge, GaugeGuard, gauge, labels}; use tokio::sync::oneshot; use tracing::error; -use crate::metrics::{GaugeGuard, IntGaugeVec, OwnedGaugeGuard, new_gauge_vec}; +static THREAD_POOL_ONGOING_TASKS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "ongoing_tasks", + description: "number of tasks being currently processed by threads in the thread pool", + subsystem: "thread_pool", + ) +}); + +static THREAD_POOL_PENDING_TASKS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "pending_tasks", + description: "number of tasks waiting in the queue before being processed by the thread pool", + subsystem: "thread_pool", + ) +}); /// An executor backed by a thread pool to run CPU-intensive tasks. /// @@ -29,8 +43,8 @@ use crate::metrics::{GaugeGuard, IntGaugeVec, OwnedGaugeGuard, new_gauge_vec}; #[derive(Clone)] pub struct ThreadPool { thread_pool: Arc, - ongoing_tasks: IntGauge, - pending_tasks: IntGauge, + ongoing_tasks: Gauge, + pending_tasks: Gauge, } impl ThreadPool { @@ -46,8 +60,9 @@ impl ThreadPool { let thread_pool = rayon_pool_builder .build() .expect("failed to spawn thread pool"); - let ongoing_tasks = THREAD_POOL_METRICS.ongoing_tasks.with_label_values([name]); - let pending_tasks = THREAD_POOL_METRICS.pending_tasks.with_label_values([name]); + let labels = labels!("pool" => name); + let ongoing_tasks = gauge!(parent: THREAD_POOL_ONGOING_TASKS, labels: [labels]); + let pending_tasks = gauge!(parent: THREAD_POOL_PENDING_TASKS, labels: [labels]); ThreadPool { thread_pool: Arc::new(thread_pool), ongoing_tasks, @@ -84,9 +99,7 @@ impl ThreadPool { { let span = tracing::Span::current(); let ongoing_tasks = self.ongoing_tasks.clone(); - let mut pending_tasks_guard: OwnedGaugeGuard = - OwnedGaugeGuard::from_gauge(self.pending_tasks.clone()); - pending_tasks_guard.add(1i64); + 
let pending_tasks_guard = GaugeGuard::new(&self.pending_tasks, 1.0); let (tx, rx) = oneshot::channel(); self.thread_pool.spawn(move || { drop(pending_tasks_guard); @@ -94,8 +107,7 @@ impl ThreadPool { return; } let _guard = span.enter(); - let mut ongoing_task_guard = GaugeGuard::from_gauge(&ongoing_tasks); - ongoing_task_guard.add(1i64); + let _ongoing_task_guard = GaugeGuard::new(&ongoing_tasks, 1.0); let result = cpu_intensive_fn(); let _ = tx.send(result); }); @@ -134,34 +146,6 @@ impl fmt::Display for Panicked { impl std::error::Error for Panicked {} -struct ThreadPoolMetrics { - ongoing_tasks: IntGaugeVec<1>, - pending_tasks: IntGaugeVec<1>, -} - -impl Default for ThreadPoolMetrics { - fn default() -> Self { - ThreadPoolMetrics { - ongoing_tasks: new_gauge_vec( - "ongoing_tasks", - "number of tasks being currently processed by threads in the thread pool", - "thread_pool", - &[], - ["pool"], - ), - pending_tasks: new_gauge_vec( - "pending_tasks", - "number of tasks waiting in the queue before being processed by the thread pool", - "thread_pool", - &[], - ["pool"], - ), - } - } -} - -static THREAD_POOL_METRICS: LazyLock = LazyLock::new(ThreadPoolMetrics::default); - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/quickwit/quickwit-common/src/tower/circuit_breaker.rs b/quickwit/quickwit-common/src/tower/circuit_breaker.rs index 09ada07e187..b42f68e8cd3 100644 --- a/quickwit/quickwit-common/src/tower/circuit_breaker.rs +++ b/quickwit/quickwit-common/src/tower/circuit_breaker.rs @@ -19,7 +19,7 @@ use std::task::{Context, Poll}; use std::time::Duration; use pin_project::pin_project; -use prometheus::IntCounter; +use quickwit_metrics::Counter; use tokio::time::Instant; use tower::{Layer, Service}; @@ -49,7 +49,7 @@ pub struct CircuitBreakerLayer { time_window: Duration, timeout: Duration, evaluator: Evaluator, - circuit_break_total: prometheus::IntCounter, + circuit_break_total: Counter, } pub trait CircuitBreakerEvaluator: Clone { @@ -61,7 +61,7 @@ pub 
trait CircuitBreakerEvaluator: Clone { self, max_num_errors_per_secs: u32, timeout: Duration, - circuit_break_total: prometheus::IntCounter, + circuit_break_total: Counter, ) -> CircuitBreakerLayer { CircuitBreakerLayer { max_error_count_per_time_window: max_num_errors_per_secs, @@ -102,7 +102,7 @@ struct CircuitBreakerInner { timeout: Duration, evaluator: Evaluator, state: CircuitBreakerState, - circuit_break_total: IntCounter, + circuit_break_total: Counter, } impl CircuitBreakerInner { @@ -125,7 +125,7 @@ impl CircuitBreakerInner { fn receive_error(&mut self) { match self.state { CircuitBreakerState::HalfOpen => { - self.circuit_break_total.inc(); + self.circuit_break_total.increment(1); self.state = CircuitBreakerState::Open { until: Instant::now() + self.timeout, } @@ -144,7 +144,7 @@ impl CircuitBreakerInner { } let now = Instant::now(); if now < error_window_end { - self.circuit_break_total.inc(); + self.circuit_break_total.increment(1); self.state = CircuitBreakerState::Open { until: now + self.timeout, }; @@ -268,6 +268,7 @@ where mod tests { use std::sync::atomic::{AtomicBool, Ordering}; + use quickwit_metrics::counter; use tower::{ServiceBuilder, ServiceExt}; use super::*; @@ -301,8 +302,11 @@ mod tests { const TIMEOUT: Duration = Duration::from_millis(500); - let int_counter: prometheus::IntCounter = - IntCounter::new("circuit_break_total_test", "test circuit breaker counter").unwrap(); + let int_counter = counter!( + name: "circuit_break_total", + description: "test circuit breaker counter", + subsystem: "test", + ); let mut service = ServiceBuilder::new() .layer(TestCircuitBreakerEvaluator.make_layer(10, TIMEOUT, int_counter)) .service_fn(|_| async { diff --git a/quickwit/quickwit-common/src/tower/metrics.rs b/quickwit/quickwit-common/src/tower/metrics.rs index b2d093adbe3..a242212eaf6 100644 --- a/quickwit/quickwit-common/src/tower/metrics.rs +++ b/quickwit/quickwit-common/src/tower/metrics.rs @@ -13,28 +13,52 @@ // limitations under the License. 
use std::pin::Pin; +use std::sync::LazyLock; use std::task::{Context, Poll}; use std::time::Instant; use futures::{Future, ready}; use pin_project::{pin_project, pinned_drop}; -use prometheus::exponential_buckets; +use quickwit_metrics::{Counter, Gauge, Histogram, counter, gauge, histogram, labels}; use tower::{Layer, Service}; -use crate::metrics::{ - HistogramVec, IntCounterVec, IntGaugeVec, new_counter_vec, new_gauge_vec, new_histogram_vec, -}; +use crate::metrics::exponential_buckets; pub trait RpcName { fn rpc_name() -> &'static str; } +static GRPC_REQUESTS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "requests_total", + description: "Total number of gRPC requests processed.", + subsystem: "grpc", + ) +}); + +static GRPC_REQUESTS_IN_FLIGHT: LazyLock = LazyLock::new(|| { + gauge!( + name: "requests_in_flight", + description: "Number of gRPC requests in-flight.", + subsystem: "grpc", + ) +}); + +static GRPC_REQUEST_DURATION_SECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "request_duration_seconds", + description: "Duration of request in seconds.", + subsystem: "grpc", + buckets: exponential_buckets(0.001, 2.0, 12).unwrap(), + ) +}); + #[derive(Clone)] pub struct GrpcMetrics { inner: S, - requests_total: IntCounterVec<2>, - requests_in_flight: IntGaugeVec<1>, - request_duration_seconds: HistogramVec<2>, + requests_total: Counter, + requests_in_flight: Gauge, + request_duration_seconds: Histogram, } impl Service for GrpcMetrics @@ -55,7 +79,11 @@ where let rpc_name = R::rpc_name(); let inner = self.inner.call(request); - self.requests_in_flight.with_label_values([rpc_name]).inc(); + gauge!( + parent: self.requests_in_flight, + "rpc" => rpc_name, + ) + .increment(1.0); ResponseFuture { inner, @@ -71,36 +99,18 @@ where #[derive(Clone)] pub struct GrpcMetricsLayer { - requests_total: IntCounterVec<2>, - requests_in_flight: IntGaugeVec<1>, - request_duration_seconds: HistogramVec<2>, + requests_total: Counter, + requests_in_flight: Gauge, + 
request_duration_seconds: Histogram, } impl GrpcMetricsLayer { pub fn new(subsystem: &'static str, kind: &'static str) -> Self { + let labels = labels!("service" => subsystem, "kind" => kind); Self { - requests_total: new_counter_vec( - "grpc_requests_total", - "Total number of gRPC requests processed.", - subsystem, - &[("kind", kind)], - ["rpc", "status"], - ), - requests_in_flight: new_gauge_vec( - "grpc_requests_in_flight", - "Number of gRPC requests in-flight.", - subsystem, - &[("kind", kind)], - ["rpc"], - ), - request_duration_seconds: new_histogram_vec( - "grpc_request_duration_seconds", - "Duration of request in seconds.", - subsystem, - &[("kind", kind)], - ["rpc", "status"], - exponential_buckets(0.001, 2.0, 12).unwrap(), - ), + requests_total: counter!(parent: GRPC_REQUESTS_TOTAL, labels: [labels]), + requests_in_flight: gauge!(parent: GRPC_REQUESTS_IN_FLIGHT, labels: [labels]), + request_duration_seconds: histogram!(parent: GRPC_REQUEST_DURATION_SECONDS, labels: [labels]), } } } @@ -118,7 +128,7 @@ impl Layer for GrpcMetricsLayer { } } -/// Response future for [`PrometheusMetrics`]. +/// Response future for [`GrpcMetrics`]. 
#[pin_project(PinnedDrop)] pub struct ResponseFuture { #[pin] @@ -126,24 +136,21 @@ pub struct ResponseFuture { start: Instant, rpc_name: &'static str, status: &'static str, - requests_total: IntCounterVec<2>, - requests_in_flight: IntGaugeVec<1>, - request_duration_seconds: HistogramVec<2>, + requests_total: Counter, + requests_in_flight: Gauge, + request_duration_seconds: Histogram, } #[pinned_drop] impl PinnedDrop for ResponseFuture { fn drop(self: Pin<&mut Self>) { let elapsed = self.start.elapsed().as_secs_f64(); - let label_values = [self.rpc_name, self.status]; - - self.requests_total.with_label_values(label_values).inc(); - self.request_duration_seconds - .with_label_values(label_values) - .observe(elapsed); - self.requests_in_flight - .with_label_values([self.rpc_name]) - .dec(); + let rpc_label = labels!("rpc" => self.rpc_name); + let status_label = labels!("status" => self.status); + counter!(parent: self.requests_total, labels: [rpc_label, status_label]).increment(1); + histogram!(parent: self.request_duration_seconds, labels: [rpc_label, status_label]) + .record(elapsed); + gauge!(parent: self.requests_in_flight, labels: [rpc_label]).decrement(1.0); } } @@ -162,6 +169,9 @@ where F: Future> #[cfg(test)] mod tests { + use metrics::with_local_recorder; + use metrics_util::debugging::{DebugValue, DebuggingRecorder}; + use super::*; struct HelloRequest; @@ -180,59 +190,67 @@ mod tests { } } - #[tokio::test] - async fn test_grpc_metrics() { - let layer = GrpcMetricsLayer::new("quickwit_test", "server"); - - let mut hello_service = - layer - .clone() - .layer(tower::service_fn(|request: HelloRequest| async move { - Ok::<_, ()>(request) - })); - let mut goodbye_service = - layer - .clone() - .layer(tower::service_fn(|request: GoodbyeRequest| async move { - Ok::<_, ()>(request) - })); - - hello_service.call(HelloRequest).await.unwrap(); - + #[test] + fn test_grpc_metrics() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); 
+ + with_local_recorder(&recorder, || { + futures::executor::block_on(async { + let layer = GrpcMetricsLayer::new("quickwit_test", "server"); + + let mut hello_service = + layer + .clone() + .layer(tower::service_fn(|request: HelloRequest| async move { + Ok::<_, ()>(request) + })); + let mut goodbye_service = + layer + .clone() + .layer(tower::service_fn(|request: GoodbyeRequest| async move { + Ok::<_, ()>(request) + })); + + hello_service.call(HelloRequest).await.unwrap(); + goodbye_service.call(GoodbyeRequest).await.unwrap(); + + let hello_future = hello_service.call(HelloRequest); + drop(hello_future); + }); + }); + + let snapshot = snapshotter.snapshot().into_vec(); + let counter_value = |rpc: &str, status: &str| { + snapshot.iter().find_map(|(composite_key, _, _, value)| { + let (_, key) = composite_key.clone().into_parts(); + let labels = key + .labels() + .map(|label| (label.key(), label.value())) + .collect::>(); + if key.name() == "quickwit_grpc_requests_total" + && labels.contains(&("service", "quickwit_test")) + && labels.contains(&("kind", "server")) + && labels.contains(&("rpc", rpc)) + && labels.contains(&("status", status)) + { + Some(value) + } else { + None + } + }) + }; assert_eq!( - layer - .requests_total - .with_label_values(["hello", "success"]) - .get(), - 1 + counter_value("hello", "success"), + Some(&DebugValue::Counter(1)) ); assert_eq!( - layer - .requests_total - .with_label_values(["goodbye", "success"]) - .get(), - 0 + counter_value("goodbye", "success"), + Some(&DebugValue::Counter(1)) ); - - goodbye_service.call(GoodbyeRequest).await.unwrap(); - - assert_eq!( - layer - .requests_total - .with_label_values(["goodbye", "success"]) - .get(), - 1 - ); - - let hello_future = hello_service.call(HelloRequest); - drop(hello_future); - assert_eq!( - layer - .requests_total - .with_label_values(["hello", "cancelled"]) - .get(), - 1 + counter_value("hello", "cancelled"), + Some(&DebugValue::Counter(1)) ); } } diff --git 
a/quickwit/quickwit-config/src/source_config/serialize.rs b/quickwit/quickwit-config/src/source_config/serialize.rs index 224689019d2..3f580ee8fa3 100644 --- a/quickwit/quickwit-config/src/source_config/serialize.rs +++ b/quickwit/quickwit-config/src/source_config/serialize.rs @@ -129,7 +129,9 @@ impl SourceConfigForSerialization { | SourceParams::File(FileSourceParams::Notifications(_)) => {} _ => { if self.num_pipelines > 1 { - bail!("Quickwit currently supports multiple pipelines only for GCP PubSub or Kafka sources. open an issue https://github.com/quickwit-oss/quickwit/issues if you need the feature for other source types"); + bail!( + "Quickwit currently supports multiple pipelines only for GCP PubSub or Kafka sources. open an issue https://github.com/quickwit-oss/quickwit/issues if you need the feature for other source types" + ); } } } diff --git a/quickwit/quickwit-control-plane/Cargo.toml b/quickwit/quickwit-control-plane/Cargo.toml index e7d9d012dd0..e0c8e951e98 100644 --- a/quickwit/quickwit-control-plane/Cargo.toml +++ b/quickwit/quickwit-control-plane/Cargo.toml @@ -30,6 +30,7 @@ ulid = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-ingest = { workspace = true } quickwit-metastore = { workspace = true } diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs index 1056aba6eb8..11cc76298af 100644 --- a/quickwit/quickwit-control-plane/src/control_plane.rs +++ b/quickwit/quickwit-control-plane/src/control_plane.rs @@ -61,6 +61,7 @@ use crate::debouncer::Debouncer; use crate::indexing_scheduler::{IndexingScheduler, IndexingSchedulerState}; use crate::ingest::IngestController; use crate::ingest::ingest_controller::{IngestControllerStats, RebalanceShardsCallback}; +use crate::metrics::{METASTORE_ERROR_ABORTED, 
METASTORE_ERROR_MAYBE_EXECUTED, RESTART_TOTAL}; use crate::model::ControlPlaneModel; /// Interval between two controls (or checks) of the desired plan VS running plan. @@ -219,7 +220,7 @@ impl Actor for ControlPlane { } async fn initialize(&mut self, ctx: &ActorContext) -> Result<(), ActorExitStatus> { - crate::metrics::CONTROL_PLANE_METRICS.restart_total.inc(); + RESTART_TOTAL.increment(1); self.model .load_from_metastore(&mut self.metastore, ctx.progress()) @@ -568,17 +569,13 @@ fn convert_metastore_error( // It will be up to the client to decide what to do there. error!(err=?metastore_error, transaction_outcome="aborted", "metastore error"); } - crate::metrics::CONTROL_PLANE_METRICS - .metastore_error_aborted - .inc(); + METASTORE_ERROR_ABORTED.increment(1); Ok(Err(ControlPlaneError::Metastore(metastore_error))) } else { // If the metastore transaction may have been executed, we need to restart the control plane // so that it gets resynced with the metastore state. error!(error=?metastore_error, transaction_outcome="maybe-executed", "metastore error"); - crate::metrics::CONTROL_PLANE_METRICS - .metastore_error_maybe_executed - .inc(); + METASTORE_ERROR_MAYBE_EXECUTED.increment(1); Err(ActorExitStatus::from(anyhow::anyhow!(metastore_error))) } } diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs index 300f6a9d151..7883ec80447 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs @@ -38,7 +38,7 @@ use tracing::{debug, info, warn}; use crate::indexing_plan::PhysicalIndexingPlan; use crate::indexing_scheduler::change_tracker::{NotifyChangeOnDrop, RebuildNotifier}; use crate::indexing_scheduler::scheduling::build_physical_indexing_plan; -use crate::metrics::ShardLocalityMetrics; +use crate::metrics::{APPLY_PLAN_TOTAL, SCHEDULE_TOTAL, ShardLocalityMetrics}; use crate::model::{ControlPlaneModel, 
ShardEntry, ShardLocations}; use crate::{IndexerNodeInfo, IndexerPool}; @@ -295,7 +295,7 @@ impl IndexingScheduler { // Prefer not calling this method directly, and instead call // `ControlPlane::rebuild_indexing_plan_debounced`. pub(crate) fn rebuild_plan(&mut self, model: &ControlPlaneModel) { - crate::metrics::CONTROL_PLANE_METRICS.schedule_total.inc(); + SCHEDULE_TOTAL.increment(1); let notify_on_drop = self.next_rebuild_tracker.start_rebuild(); @@ -330,7 +330,7 @@ impl IndexingScheduler { ); let shard_locality_metrics = get_shard_locality_metrics(&new_physical_plan, &shard_locations); - crate::metrics::CONTROL_PLANE_METRICS.set_shard_locality_metrics(shard_locality_metrics); + shard_locality_metrics.publish(); if let Some(last_applied_plan) = &self.state.last_applied_physical_plan { let plans_diff = get_indexing_plans_diff( last_applied_plan.indexing_tasks_per_indexer(), @@ -397,7 +397,7 @@ impl IndexingScheduler { notify_on_drop: Option>, ) { debug!(new_physical_plan=?new_physical_plan, "apply physical indexing plan"); - crate::metrics::CONTROL_PLANE_METRICS.apply_plan_total.inc(); + APPLY_PLAN_TOTAL.increment(1); for (node_id, indexing_tasks) in new_physical_plan.indexing_tasks_per_indexer() { // We don't want to block on a slow indexer so we apply this change asynchronously // TODO not blocking is cool, but we need to make sure there is not accumulation diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index 6a64c183361..8dc70b45b28 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -58,6 +58,7 @@ use ulid::Ulid; use super::scaling_arbiter::ScalingArbiter; use crate::control_plane::ControlPlane; use crate::ingest::wait_handle::WaitHandle; +use crate::metrics::REBALANCE_SHARDS; use crate::model::{ControlPlaneModel, ScalingMode, ShardEntry, ShardStats}; const 
CLOSE_SHARDS_REQUEST_TIMEOUT: Duration = if cfg!(test) { @@ -1024,9 +1025,7 @@ impl IngestController { let shards_to_rebalance: Vec = self.compute_shards_to_rebalance(model); - crate::metrics::CONTROL_PLANE_METRICS - .rebalance_shards - .set(shards_to_rebalance.len() as i64); + REBALANCE_SHARDS.set(shards_to_rebalance.len() as f64); if shards_to_rebalance.is_empty() { debug!("skipping rebalance: no shards to rebalance"); @@ -1049,16 +1048,12 @@ impl IngestController { .await .inspect_err(|error| { error!(%error, "failed to open shards during rebalance"); - crate::metrics::CONTROL_PLANE_METRICS - .rebalance_shards - .set(0); + REBALANCE_SHARDS.set(0.0); })?; let num_opened_shards: usize = per_source_num_opened_shards.values().sum(); - crate::metrics::CONTROL_PLANE_METRICS - .rebalance_shards - .set(num_opened_shards as i64); + REBALANCE_SHARDS.set(num_opened_shards as f64); for source_uid in per_source_num_opened_shards.keys() { // We temporarily disable the ability the scale down the number of shards for diff --git a/quickwit/quickwit-control-plane/src/metrics.rs b/quickwit/quickwit-control-plane/src/metrics.rs index 7935f18a1e8..fb2eb4153f4 100644 --- a/quickwit/quickwit-control-plane/src/metrics.rs +++ b/quickwit/quickwit-control-plane/src/metrics.rs @@ -14,9 +14,7 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - IntCounter, IntGauge, IntGaugeVec, new_counter, new_gauge, new_gauge_vec, -}; +use quickwit_metrics::{Counter, Gauge, LabelNames, counter, gauge, label_names}; #[derive(Debug, Clone, Copy)] pub struct ShardLocalityMetrics { @@ -24,114 +22,95 @@ pub struct ShardLocalityMetrics { pub num_local_shards: usize, } -pub struct ControlPlaneMetrics { - // Indexes and shards tracked by the control plane. - pub indexes_total: IntGauge, - pub open_shards: IntGaugeVec<1>, - pub closed_shards: IntGaugeVec<1>, - - // Operations performed by the control plane. 
- pub apply_plan_total: IntCounter, - pub rebalance_shards: IntGauge, - pub restart_total: IntCounter, - pub schedule_total: IntCounter, - - // Metastore errors. - pub metastore_error_aborted: IntCounter, - pub metastore_error_maybe_executed: IntCounter, - - // Indexing plan metrics. - pub local_shards: IntGauge, - pub remote_shards: IntGauge, -} - -impl ControlPlaneMetrics { - pub fn set_shard_locality_metrics(&self, shard_locality_metrics: ShardLocalityMetrics) { - self.local_shards - .set(shard_locality_metrics.num_local_shards as i64); - self.remote_shards - .set(shard_locality_metrics.num_remote_shards as i64); +impl ShardLocalityMetrics { + pub fn publish(self) { + LOCAL_SHARDS.set(self.num_local_shards as f64); + REMOTE_SHARDS.set(self.num_remote_shards as f64); } } -impl Default for ControlPlaneMetrics { - fn default() -> Self { - let open_shards = new_gauge_vec( - "shards", - "Number of open and closed shards tracked by the ingest controller", - "control_plane", - &[("state", "open")], - ["index_id"], - ); - let closed_shards = new_gauge_vec( - "shards", - "Number of open and closed shards tracked by the ingest controller", - "control_plane", - &[("state", "closed")], - ["index_id"], - ); - let indexed_shards = new_gauge_vec( - "indexed_shards", - "Number of (remote/local) shards in the indexing plan", - "control_plane", - &[], - ["locality"], - ); - let local_shards = indexed_shards.with_label_values(["local"]); - let remote_shards = indexed_shards.with_label_values(["remote"]); - - ControlPlaneMetrics { - indexes_total: new_gauge( - "indexes_total", - "Number of indexes tracked by the control plane.", - "control_plane", - &[], - ), - open_shards, - closed_shards, - apply_plan_total: new_counter( - "apply_plan_total", - "Number of control plane `apply plan` operations.", - "control_plane", - &[], - ), - rebalance_shards: new_gauge( - "rebalance_shards", - "Number of shards rebalanced by the control plane.", - "control_plane", - &[], - ), - restart_total: 
new_counter( - "restart_total", - "Number of control plane restarts.", - "control_plane", - &[], - ), - schedule_total: new_counter( - "schedule_total", - "Number of control plane `schedule` operations.", - "control_plane", - &[], - ), - metastore_error_aborted: new_counter( - "metastore_error_aborted", - "Number of aborted metastore transaction (= do not trigger a control plane \ - restart)", - "control_plane", - &[], - ), - metastore_error_maybe_executed: new_counter( - "metastore_error_maybe_executed", - "Number of metastore transaction with an uncertain outcome (= do trigger a \ - control plane restart)", - "control_plane", - &[], - ), - local_shards, - remote_shards, - } - } -} +pub(crate) static INDEXES_TOTAL: LazyLock = LazyLock::new(|| { + gauge!( + name: "indexes_total", + description: "Number of indexes tracked by the control plane.", + subsystem: "control_plane", + ) +}); + +static SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "shards", + description: "Number of open and closed shards tracked by the ingest controller", + subsystem: "control_plane", + ) +}); + +pub(crate) static OPEN_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: SHARDS, "state" => "open")); + +pub(crate) static CLOSED_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: SHARDS, "state" => "closed")); + +pub(crate) const INDEX_ID_LABELS: LabelNames<1> = label_names!("index_id"); + +static INDEXED_SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "indexed_shards", + description: "Number of (remote/local) shards in the indexing plan", + subsystem: "control_plane", + ) +}); + +pub(crate) static LOCAL_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: INDEXED_SHARDS, "locality" => "local")); + +pub(crate) static REMOTE_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: INDEXED_SHARDS, "locality" => "remote")); + +pub(crate) static APPLY_PLAN_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "apply_plan_total", + description: "Number of control plane `apply 
plan` operations.", + subsystem: "control_plane", + ) +}); + +pub(crate) static REBALANCE_SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "rebalance_shards", + description: "Number of shards rebalanced by the control plane.", + subsystem: "control_plane", + ) +}); + +pub(crate) static RESTART_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "restart_total", + description: "Number of control plane restarts.", + subsystem: "control_plane", + ) +}); + +pub(crate) static SCHEDULE_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "schedule_total", + description: "Number of control plane `schedule` operations.", + subsystem: "control_plane", + ) +}); + +pub(crate) static METASTORE_ERROR_ABORTED: LazyLock = LazyLock::new(|| { + counter!( + name: "metastore_error_aborted", + description: "Number of aborted metastore transaction (= do not trigger a control plane restart)", + subsystem: "control_plane", + ) +}); -pub static CONTROL_PLANE_METRICS: LazyLock = - LazyLock::new(ControlPlaneMetrics::default); +pub(crate) static METASTORE_ERROR_MAYBE_EXECUTED: LazyLock = LazyLock::new(|| { + counter!( + name: "metastore_error_maybe_executed", + description: "Number of metastore transaction with an uncertain outcome (= do trigger a control plane restart)", + subsystem: "control_plane", + ) +}); diff --git a/quickwit/quickwit-control-plane/src/model/mod.rs b/quickwit/quickwit-control-plane/src/model/mod.rs index 0d0431a67ce..38eb0ed6cbe 100644 --- a/quickwit/quickwit-control-plane/src/model/mod.rs +++ b/quickwit/quickwit-control-plane/src/model/mod.rs @@ -39,6 +39,8 @@ use quickwit_proto::types::{IndexId, IndexUid, NodeId, ShardId, SourceId, Source pub(super) use shard_table::{ScalingMode, ShardEntry, ShardLocations, ShardStats, ShardTable}; use tracing::{debug, error, info, instrument, warn}; +use crate::metrics::INDEXES_TOTAL; + /// The control plane maintains a model in sync with the metastore. 
/// /// The model stays consistent with the metastore, because all @@ -167,9 +169,7 @@ impl ControlPlaneModel { } fn update_metrics(&self) { - crate::metrics::CONTROL_PLANE_METRICS - .indexes_total - .set(self.index_table.len() as i64); + INDEXES_TOTAL.set(self.index_table.len() as f64); } pub(crate) fn source_configs(&self) -> impl Iterator + '_ { diff --git a/quickwit/quickwit-control-plane/src/model/shard_table.rs b/quickwit/quickwit-control-plane/src/model/shard_table.rs index 623ae3e6224..e2909712844 100644 --- a/quickwit/quickwit-control-plane/src/model/shard_table.rs +++ b/quickwit/quickwit-control-plane/src/model/shard_table.rs @@ -22,10 +22,13 @@ use quickwit_common::metrics::index_label; use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings}; use quickwit_common::tower::ConstantRate; use quickwit_ingest::{RateMibPerSec, ShardInfo, ShardInfos}; +use quickwit_metrics::{gauge, label_values}; use quickwit_proto::ingest::{Shard, ShardState}; use quickwit_proto::types::{IndexUid, NodeId, ShardId, SourceId, SourceUid}; use tracing::{error, info, warn}; +use crate::metrics::{CLOSED_SHARDS, INDEX_ID_LABELS, OPEN_SHARDS}; + /// Limits the number of scale up operations that can happen to a source to 5 per minute. const SCALING_UP_RATE_LIMITER_SETTINGS: RateLimiterSettings = RateLimiterSettings { burst_limit: 5, @@ -461,14 +464,10 @@ impl ShardTable { // can update the metrics for this specific index. 
if index_label == index_id { let shard_stats = table_entry.shards_stats(); - crate::metrics::CONTROL_PLANE_METRICS - .open_shards - .with_label_values([index_label]) - .set(shard_stats.num_open_shards as i64); - crate::metrics::CONTROL_PLANE_METRICS - .closed_shards - .with_label_values([index_label]) - .set(shard_stats.num_closed_shards as i64); + let labels = label_values!(INDEX_ID_LABELS => index_label.to_string()); + gauge!(parent: OPEN_SHARDS, labels: [labels]).set(shard_stats.num_open_shards as f64); + gauge!(parent: CLOSED_SHARDS, labels: [labels]) + .set(shard_stats.num_closed_shards as f64); return; } // Per-index metrics are disabled, so we update the metrics for all sources. @@ -482,14 +481,9 @@ impl ShardTable { num_closed_shards += 1; } } - crate::metrics::CONTROL_PLANE_METRICS - .open_shards - .with_label_values([index_label]) - .set(num_open_shards as i64); - crate::metrics::CONTROL_PLANE_METRICS - .closed_shards - .with_label_values([index_label]) - .set(num_closed_shards as i64); + let labels = label_values!(INDEX_ID_LABELS => index_label.to_string()); + gauge!(parent: OPEN_SHARDS, labels: [labels]).set(num_open_shards as f64); + gauge!(parent: CLOSED_SHARDS, labels: [labels]).set(num_closed_shards as f64); } pub fn update_shards( diff --git a/quickwit/quickwit-directories/src/caching_directory.rs b/quickwit/quickwit-directories/src/caching_directory.rs index 6e9461f5493..c40ac3ec760 100644 --- a/quickwit/quickwit-directories/src/caching_directory.rs +++ b/quickwit/quickwit-directories/src/caching_directory.rs @@ -37,9 +37,8 @@ impl CachingDirectory { /// Warning: The resulting CacheDirectory will cache all information without ever /// removing any item from the cache. 
pub fn new_unbounded(underlying: Arc) -> CachingDirectory { - let byte_range_cache = ByteRangeCache::with_infinite_capacity( - &quickwit_storage::STORAGE_METRICS.shortlived_cache, - ); + let byte_range_cache = + ByteRangeCache::with_infinite_capacity(&quickwit_storage::metrics::SHORTLIVED_CACHE); CachingDirectory::new(underlying, byte_range_cache) } diff --git a/quickwit/quickwit-index-management/Cargo.toml b/quickwit/quickwit-index-management/Cargo.toml index d303125f65f..35978b9f5e7 100644 --- a/quickwit/quickwit-index-management/Cargo.toml +++ b/quickwit/quickwit-index-management/Cargo.toml @@ -21,6 +21,7 @@ tokio = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-indexing = { workspace = true } quickwit-metastore = { workspace = true } diff --git a/quickwit/quickwit-index-management/src/garbage_collection.rs b/quickwit/quickwit-index-management/src/garbage_collection.rs index dc2655dbfcd..2b964b0a4e4 100644 --- a/quickwit/quickwit-index-management/src/garbage_collection.rs +++ b/quickwit/quickwit-index-management/src/garbage_collection.rs @@ -20,13 +20,13 @@ use std::time::Duration; use anyhow::Context; use futures::{Future, StreamExt}; use itertools::Itertools; -use quickwit_common::metrics::IntCounter; use quickwit_common::pretty::PrettySample; use quickwit_common::{Progress, rate_limited_info}; use quickwit_metastore::{ ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitInfo, SplitMetadata, SplitState, }; +use quickwit_metrics::Counter; use quickwit_proto::metastore::{ DeleteSplitsRequest, ListSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, MetastoreService, MetastoreServiceClient, @@ -41,9 +41,9 @@ use tracing::{error, instrument}; const DELETE_SPLITS_BATCH_SIZE: usize = 10_000; pub struct GcMetrics { - pub deleted_splits: IntCounter, - pub deleted_bytes: IntCounter, - pub failed_splits: 
IntCounter, + pub deleted_splits: Counter, + pub deleted_bytes: Counter, + pub failed_splits: Counter, } pub(crate) trait RecordGcMetrics { @@ -53,9 +53,9 @@ pub(crate) trait RecordGcMetrics { impl RecordGcMetrics for Option { fn record(&self, num_deleted_splits: usize, num_deleted_bytes: u64, num_failed_splits: usize) { if let Some(metrics) = self { - metrics.deleted_splits.inc_by(num_deleted_splits as u64); - metrics.deleted_bytes.inc_by(num_deleted_bytes); - metrics.failed_splits.inc_by(num_failed_splits as u64); + metrics.deleted_splits.increment(num_deleted_splits as u64); + metrics.deleted_bytes.increment(num_deleted_bytes); + metrics.failed_splits.increment(num_failed_splits as u64); } } } diff --git a/quickwit/quickwit-indexing/Cargo.toml b/quickwit/quickwit-indexing/Cargo.toml index 2a0d581797d..10a0ef98b75 100644 --- a/quickwit/quickwit-indexing/Cargo.toml +++ b/quickwit/quickwit-indexing/Cargo.toml @@ -55,6 +55,7 @@ quickwit-actors = { workspace = true } quickwit-aws = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-directories = { workspace = true } quickwit-doc-mapper = { workspace = true } diff --git a/quickwit/quickwit-indexing/src/actors/doc_processor.rs b/quickwit/quickwit-indexing/src/actors/doc_processor.rs index 407c55ff526..cab1b6aab43 100644 --- a/quickwit/quickwit-indexing/src/actors/doc_processor.rs +++ b/quickwit/quickwit-indexing/src/actors/doc_processor.rs @@ -20,11 +20,11 @@ use anyhow::{Context, bail}; use async_trait::async_trait; use bytes::Bytes; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; -use quickwit_common::metrics::IntCounter; use quickwit_common::rate_limited_tracing::rate_limited_warn; use quickwit_common::runtimes::RuntimeType; use quickwit_config::{SourceInputFormat, TransformConfig}; use quickwit_doc_mapper::{DocMapper, DocParsingError, 
JsonObject}; +use quickwit_metrics::{Counter, counter, labels}; use quickwit_opentelemetry::otlp::{ JsonLogIterator, JsonSpanIterator, OtlpLogsError, OtlpTracesError, parse_otlp_logs_json, parse_otlp_logs_protobuf, parse_otlp_spans_json, parse_otlp_spans_protobuf, @@ -40,12 +40,12 @@ use tokio::runtime::Handle; #[cfg(feature = "vrl")] use super::vrl_processing::*; use crate::actors::Indexer; +use crate::metrics::{PROCESSED_BYTES, PROCESSED_DOCS_TOTAL}; use crate::models::{ NewPublishLock, NewPublishToken, ProcessedDoc, ProcessedDocBatch, PublishLock, RawDocBatch, }; const PLAIN_TEXT: &str = "plain_text"; - pub(super) struct JsonDoc { json_obj: JsonObject, num_bytes: usize, @@ -270,8 +270,8 @@ impl From> for JsonDocIterator { #[derive(Debug)] pub struct DocProcessorCounter { pub num_docs: AtomicU64, - pub num_docs_metric: IntCounter, - pub num_bytes_metric: IntCounter, + pub num_docs_metric: Counter, + pub num_bytes_metric: Counter, } impl Serialize for DocProcessorCounter { @@ -282,17 +282,18 @@ impl Serialize for DocProcessorCounter { } impl DocProcessorCounter { - fn for_index_and_doc_processor_outcome(index: &str, outcome: &str) -> DocProcessorCounter { - let index_label = quickwit_common::metrics::index_label(index); - let labels = [index_label, outcome]; + fn for_index_and_doc_processor_outcome( + index: &str, + outcome: &'static str, + ) -> DocProcessorCounter { + let labels = labels!( + "index" => quickwit_common::metrics::index_label(index).to_string(), + "docs_processed_status" => outcome + ); DocProcessorCounter { num_docs: Default::default(), - num_docs_metric: crate::metrics::INDEXER_METRICS - .processed_docs_total - .with_label_values(labels), - num_bytes_metric: crate::metrics::INDEXER_METRICS - .processed_bytes - .with_label_values(labels), + num_docs_metric: counter!(parent: PROCESSED_DOCS_TOTAL, labels: [labels]), + num_bytes_metric: counter!(parent: PROCESSED_BYTES, labels: [labels]), } } @@ -303,8 +304,8 @@ impl DocProcessorCounter { fn 
record_doc(&self, num_bytes: u64) { self.num_docs.fetch_add(1, Ordering::Relaxed); - self.num_docs_metric.inc(); - self.num_bytes_metric.inc_by(num_bytes); + self.num_docs_metric.increment(1); + self.num_bytes_metric.increment(num_bytes); } } diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index b2257ed203f..0d49e6db3af 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -27,12 +27,13 @@ use quickwit_actors::{ Actor, ActorContext, ActorExitStatus, Command, Handler, Mailbox, QueueCapacity, }; use quickwit_common::io::IoControls; -use quickwit_common::metrics::GaugeGuard; +use quickwit_common::metrics::IN_FLIGHT_INDEX_WRITER; use quickwit_common::runtimes::RuntimeType; use quickwit_common::temp_dir::TempDirectory; use quickwit_config::IndexingSettings; use quickwit_doc_mapper::DocMapper; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; +use quickwit_metrics::GaugeGuard; use quickwit_proto::indexing::{IndexingPipelineId, PipelineMetrics}; use quickwit_proto::metastore::{ LastDeleteOpstampRequest, MetastoreService, MetastoreServiceClient, @@ -51,6 +52,7 @@ use ulid::Ulid; use super::IndexSerializer; use super::cooperative_indexing::{CooperativeIndexingCycle, CooperativeIndexingPeriod}; +use crate::metrics::SPLIT_BUILDERS; use crate::models::{ CommitTrigger, EmptySplit, IndexedSplitBatchBuilder, IndexedSplitBuilder, NewPublishLock, NewPublishToken, ProcessedDoc, ProcessedDocBatch, PublishLock, @@ -219,9 +221,7 @@ impl IndexerState { let publish_lock = self.publish_lock.clone(); let publish_token_opt = self.publish_token_opt.clone(); - let mut split_builders_guard = - GaugeGuard::from_gauge(&crate::metrics::INDEXER_METRICS.split_builders); - split_builders_guard.add(1); + let split_builders_guard = GaugeGuard::new(&SPLIT_BUILDERS, 1.0); let workbench = IndexingWorkbench { workbench_id, @@ -233,11 +233,7 @@ 
impl IndexerState { publish_lock, publish_token_opt, last_delete_opstamp, - memory_usage: GaugeGuard::from_gauge( - &quickwit_common::metrics::MEMORY_METRICS - .in_flight - .index_writer, - ), + memory_usage: GaugeGuard::new(&IN_FLIGHT_INDEX_WRITER, 0.0), cooperative_indexing_period, split_builders_guard, }; @@ -335,7 +331,7 @@ impl IndexerState { memory_usage_delta += mem_usage_after as i64 - mem_usage_before as i64; ctx.record_progress(); } - memory_usage.add(memory_usage_delta); + memory_usage.increment(memory_usage_delta as f64); Ok(()) } } @@ -358,8 +354,8 @@ struct IndexingWorkbench { // We use this value to set the `delete_opstamp` of the workbench splits. last_delete_opstamp: u64, // Number of bytes declared as used by tantivy. - memory_usage: GaugeGuard<'static>, - split_builders_guard: GaugeGuard<'static>, + memory_usage: GaugeGuard, + split_builders_guard: GaugeGuard, cooperative_indexing_period: Option, } @@ -583,7 +579,7 @@ impl Indexer { fn memory_usage(&self) -> ByteSize { if let Some(workbench) = &self.indexing_workbench_opt { - ByteSize(workbench.memory_usage.get() as u64) + ByteSize(workbench.memory_usage.delta() as u64) } else { ByteSize(0u64) } diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 3b43e47c105..686bd942495 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -23,12 +23,12 @@ use quickwit_actors::{ QueueCapacity, Supervisable, }; use quickwit_common::KillSwitch; -use quickwit_common::metrics::OwnedGaugeGuard; use quickwit_common::pubsub::EventBroker; use quickwit_common::temp_dir::TempDirectory; use quickwit_config::{IndexingSettings, RetentionPolicy, SourceConfig}; use quickwit_doc_mapper::DocMapper; use quickwit_ingest::IngesterPool; +use quickwit_metrics::{GaugeGuard, counter, gauge, label_values}; use quickwit_proto::indexing::IndexingPipelineId; use 
quickwit_proto::metastore::{MetastoreError, MetastoreServiceClient}; use quickwit_proto::types::ShardId; @@ -45,6 +45,7 @@ use crate::actors::sequencer::Sequencer; use crate::actors::uploader::UploaderType; use crate::actors::{Publisher, Uploader}; use crate::merge_policy::MergePolicy; +use crate::metrics::{ACTOR_NAME, BACKPRESSURE_MICROS, INDEXING_PIPELINES}; use crate::models::IndexingStatistics; use crate::source::{ AssignShards, Assignment, SourceActor, SourceRuntime, quickwit_supported_sources, @@ -88,7 +89,7 @@ pub struct IndexingPipeline { // requiring a respawn of the pipeline. // We keep the list of shards here however, to reassign them after a respawn. shard_ids: BTreeSet, - _indexing_pipelines_gauge_guard: OwnedGaugeGuard, + _indexing_pipelines_gauge_guard: GaugeGuard, } #[async_trait] @@ -123,10 +124,8 @@ impl Actor for IndexingPipeline { impl IndexingPipeline { pub fn new(params: IndexingPipelineParams) -> Self { - let indexing_pipelines_gauge = crate::metrics::INDEXER_METRICS - .indexing_pipelines - .with_label_values([¶ms.pipeline_id.index_uid.index_id]); - let indexing_pipelines_gauge_guard = OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + let indexing_pipelines_gauge = gauge!(parent: INDEXING_PIPELINES, "index" => params.pipeline_id.index_uid.index_id.clone()); + let indexing_pipelines_gauge_guard = GaugeGuard::new(&indexing_pipelines_gauge, 1.0); let params_fingerprint = params.params_fingerprint; IndexingPipeline { params, @@ -311,21 +310,13 @@ impl IndexingPipeline { let (publisher_mailbox, publisher_handle) = ctx .spawn_actor() .set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["publisher"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "publisher")])) .spawn(publisher); let sequencer = Sequencer::new(publisher_mailbox); let (sequencer_mailbox, sequencer_handle) 
= ctx .spawn_actor() - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["sequencer"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "sequencer")])) .set_kill_switch(self.kill_switch.clone()) .spawn(sequencer); @@ -342,11 +333,7 @@ impl IndexingPipeline { ); let (uploader_mailbox, uploader_handle) = ctx .spawn_actor() - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["uploader"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "uploader")])) .set_kill_switch(self.kill_switch.clone()) .spawn(uploader); @@ -377,11 +364,7 @@ impl IndexingPipeline { ); let (indexer_mailbox, indexer_handle) = ctx .spawn_actor() - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["indexer"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "indexer")])) .set_kill_switch(self.kill_switch.clone()) .spawn(indexer); @@ -395,11 +378,7 @@ impl IndexingPipeline { )?; let (doc_processor_mailbox, doc_processor_handle) = ctx .spawn_actor() - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["doc_processor"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "doc_processor")])) .set_kill_switch(self.kill_switch.clone()) .spawn(doc_processor); let source_runtime = SourceRuntime { diff --git a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs index b901d9f804a..656200717c2 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs @@ -30,6 +30,7 @@ use 
quickwit_metastore::{ ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitMetadata, SplitState, }; +use quickwit_metrics::{counter, label_values}; use quickwit_proto::indexing::MergePipelineId; use quickwit_proto::metastore::{ ListSplitsRequest, MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceClient, @@ -44,6 +45,7 @@ use crate::actors::pipeline_shared::wait_duration_before_retry; use crate::actors::publisher::DisconnectMergePlanner; use crate::actors::{MergeSchedulerService, Publisher, Uploader, UploaderType}; use crate::merge_policy::MergePolicy; +use crate::metrics::{ACTOR_NAME, BACKPRESSURE_MICROS, ONGOING_MERGE_OPERATIONS}; use crate::models::MergeStatistics; use crate::split_store::IndexingSplitStore; @@ -272,11 +274,7 @@ impl MergePipeline { let (merge_publisher_mailbox, merge_publisher_handle) = ctx .spawn_actor() .set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["merge_publisher"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "merge_publisher")])) .spawn(merge_publisher); // Merge uploader @@ -322,11 +320,7 @@ impl MergePipeline { let (merge_executor_mailbox, merge_executor_handle) = ctx .spawn_actor() .set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["merge_executor"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "merge_executor")])) .spawn(merge_executor); let merge_split_downloader = MergeSplitDownloader { @@ -338,11 +332,7 @@ impl MergePipeline { let (merge_split_downloader_mailbox, merge_split_downloader_handle) = ctx .spawn_actor() .set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - 
.backpressure_micros - .with_label_values(["merge_split_downloader"]), - ) + .set_backpressure_micros_counter(counter!(parent: BACKPRESSURE_MICROS, labels: [label_values!(ACTOR_NAME => "merge_split_downloader")])) .spawn(merge_split_downloader); // Merge planner @@ -397,9 +387,7 @@ impl MergePipeline { handles.merge_planner.refresh_observe(); handles.merge_uploader.refresh_observe(); handles.merge_publisher.refresh_observe(); - let num_ongoing_merges = crate::metrics::INDEXER_METRICS - .ongoing_merge_operations - .get(); + let num_ongoing_merges = ONGOING_MERGE_OPERATIONS.get(); self.statistics = self .previous_generations_statistics .clone() @@ -409,7 +397,7 @@ impl MergePipeline { ) .set_generation(self.statistics.generation) .set_num_spawn_attempts(self.statistics.num_spawn_attempts) - .set_ongoing_merges(usize::try_from(num_ongoing_merges).unwrap_or(0)); + .set_ongoing_merges(num_ongoing_merges.max(0.0) as usize); } async fn perform_health_check( diff --git a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs index 70fe17c621b..ae77c982532 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs @@ -30,6 +30,7 @@ use super::MergeSplitDownloader; #[cfg(feature = "metrics")] use super::metrics_pipeline::{ParquetMergeSplitDownloader, ParquetMergeTask}; use crate::merge_policy::{MergeOperation, MergeTask}; +use crate::metrics::{ONGOING_MERGE_OPERATIONS, PENDING_MERGE_BYTES, PENDING_MERGE_OPERATIONS}; pub struct MergePermit { _semaphore_permit: Option, @@ -226,12 +227,8 @@ impl MergeSchedulerService { _merge_permit: merge_permit, }; self.pending_merge_bytes -= merge_task.merge_operation.total_num_bytes(); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set(self.pending_merge_queue.len() as i64); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - 
.set(self.pending_merge_bytes as i64); + PENDING_MERGE_OPERATIONS.set(self.pending_merge_queue.len() as f64); + PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); match split_downloader_mailbox.try_send_message(merge_task) { Ok(_) => {} Err(quickwit_actors::TrySendError::Full(_)) => { @@ -273,15 +270,10 @@ impl MergeSchedulerService { merge_permit, }; self.pending_merge_bytes -= parquet_merge_task.merge_operation.total_size_bytes(); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set( - self.pending_merge_queue.len() as i64 - + self.pending_parquet_merge_queue.len() as i64, - ); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - .set(self.pending_merge_bytes as i64); + PENDING_MERGE_OPERATIONS.set( + (self.pending_merge_queue.len() + self.pending_parquet_merge_queue.len()) as f64, + ); + PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); match split_downloader_mailbox.try_send_message(parquet_merge_task) { Ok(_) => {} Err(quickwit_actors::TrySendError::Full(_)) => { @@ -295,9 +287,7 @@ impl MergeSchedulerService { let num_merges = self.merge_concurrency as i64 - self.merge_semaphore.available_permits() as i64; - crate::metrics::INDEXER_METRICS - .ongoing_merge_operations - .set(num_merges); + ONGOING_MERGE_OPERATIONS.set(num_merges as f64); } } @@ -381,12 +371,8 @@ impl Handler for MergeSchedulerService { }; self.pending_merge_bytes += scheduled_merge.merge_operation.total_num_bytes(); self.pending_merge_queue.push(scheduled_merge); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set(self.pending_merge_queue.len() as i64); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - .set(self.pending_merge_bytes as i64); + PENDING_MERGE_OPERATIONS.set(self.pending_merge_queue.len() as f64); + PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); self.schedule_pending_merges(ctx); Ok(()) } @@ -467,15 +453,9 @@ impl Handler for MergeSchedulerService { }; self.pending_merge_bytes += 
scheduled.merge_operation.total_size_bytes(); self.pending_parquet_merge_queue.push(scheduled); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set( - self.pending_merge_queue.len() as i64 - + self.pending_parquet_merge_queue.len() as i64, - ); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - .set(self.pending_merge_bytes as i64); + PENDING_MERGE_OPERATIONS + .set((self.pending_merge_queue.len() + self.pending_parquet_merge_queue.len()) as f64); + PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); self.schedule_pending_merges(ctx); Ok(()) } diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs index d1347f2d4bf..5567485f6d6 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs @@ -54,6 +54,7 @@ use super::{METRICS_PUBLISHER_NAME, ParquetUploader}; use crate::actors::pipeline_shared::wait_duration_before_retry; use crate::actors::publisher::DisconnectMergePlanner; use crate::actors::{MergeSchedulerService, Publisher, Sequencer, UploaderType}; +use crate::metrics::ONGOING_MERGE_OPERATIONS; use crate::models::MergeStatistics; /// Limits concurrent Parquet merge pipeline spawns to avoid overwhelming the @@ -367,9 +368,7 @@ impl ParquetMergePipeline { handles.merge_planner.refresh_observe(); handles.merge_uploader.refresh_observe(); handles.merge_publisher.refresh_observe(); - let num_ongoing_merges = crate::metrics::INDEXER_METRICS - .ongoing_merge_operations - .get(); + let num_ongoing_merges = ONGOING_MERGE_OPERATIONS.get(); self.statistics = self .previous_generations_statistics .clone() @@ -379,7 +378,7 @@ impl ParquetMergePipeline { ) .set_generation(self.statistics.generation) .set_num_spawn_attempts(self.statistics.num_spawn_attempts) - 
.set_ongoing_merges(usize::try_from(num_ongoing_merges).unwrap_or(0)); + .set_ongoing_merges(num_ongoing_merges.max(0.0) as usize); } async fn perform_health_check( diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs index 509f9e3de6e..a23a478e12e 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs @@ -27,6 +27,7 @@ use async_trait::async_trait; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; use quickwit_common::spawn_named_task; use quickwit_metastore::StageParquetSplitsRequestExt; +use quickwit_metrics::{gauge, label_values}; use quickwit_parquet_engine::split::{ParquetSplitKind, ParquetSplitMetadata}; use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient}; use quickwit_storage::Storage; @@ -36,7 +37,7 @@ use tracing::{Instrument, Span, debug, info, instrument, warn}; use super::{ParquetSplitBatch, ParquetSplitsUpdate}; use crate::actors::sequencer::{Sequencer, SequencerCommand}; use crate::actors::{Publisher, UploaderCounters, UploaderType}; -use crate::metrics::INDEXER_METRICS; +use crate::metrics::{AVAILABLE_CONCURRENT_UPLOAD_PERMITS, COMPONENT}; /// Concurrent upload permits for metrics ingest uploads. 
static CONCURRENT_UPLOAD_PERMITS_METRICS_INDEX: OnceLock = OnceLock::new(); @@ -121,24 +122,21 @@ impl ParquetUploader { ctx: &ActorContext, ) -> anyhow::Result> { let _guard = ctx.protect_zone(); - let (concurrent_upload_permits_once_cell, concurrent_upload_permits_gauge) = - match self.uploader_type { - UploaderType::IndexUploader => ( - &CONCURRENT_UPLOAD_PERMITS_METRICS_INDEX, - INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["metrics_indexer"]), - ), - UploaderType::MergeUploader | UploaderType::DeleteUploader => ( - &CONCURRENT_UPLOAD_PERMITS_METRICS_MERGE, - INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["metrics_merger"]), - ), - }; + let (concurrent_upload_permits_once_cell, component) = match self.uploader_type { + UploaderType::IndexUploader => { + (&CONCURRENT_UPLOAD_PERMITS_METRICS_INDEX, "metrics_indexer") + } + UploaderType::MergeUploader | UploaderType::DeleteUploader => { + (&CONCURRENT_UPLOAD_PERMITS_METRICS_MERGE, "metrics_merger") + } + }; let concurrent_upload_permits = concurrent_upload_permits_once_cell .get_or_init(|| Semaphore::const_new(self.max_concurrent_uploads)); - concurrent_upload_permits_gauge.set(concurrent_upload_permits.available_permits() as i64); + let gauge = gauge!( + parent: AVAILABLE_CONCURRENT_UPLOAD_PERMITS, + labels: [label_values!(COMPONENT => component)], + ); + gauge.set(concurrent_upload_permits.available_permits() as f64); concurrent_upload_permits .acquire() .await diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs index bf596995b89..629c1457c6a 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs @@ -33,11 +33,11 @@ use quickwit_actors::{ QueueCapacity, Supervisable, }; use quickwit_common::KillSwitch; -use quickwit_common::metrics::OwnedGaugeGuard; use 
quickwit_common::pubsub::EventBroker; use quickwit_common::temp_dir::TempDirectory; use quickwit_config::{IndexingSettings, SourceConfig}; use quickwit_ingest::IngesterPool; +use quickwit_metrics::{GaugeGuard, gauge}; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::metastore::{MetastoreError, MetastoreServiceClient}; use quickwit_proto::types::ShardId; @@ -50,6 +50,7 @@ use crate::actors::pipeline_shared::{ }; use crate::actors::sequencer::Sequencer; use crate::actors::{Publisher, UploaderType}; +use crate::metrics::INDEXING_PIPELINES; use crate::models::IndexingStatistics; use crate::source::{ AssignShards, Assignment, SourceActor, SourceRuntime, quickwit_supported_sources, @@ -111,7 +112,7 @@ pub struct MetricsPipeline { handles_opt: Option, kill_switch: KillSwitch, shard_ids: BTreeSet, - _indexing_pipelines_gauge_guard: OwnedGaugeGuard, + _indexing_pipelines_gauge_guard: GaugeGuard, } #[async_trait] @@ -144,10 +145,11 @@ impl Actor for MetricsPipeline { impl MetricsPipeline { pub fn new(params: MetricsPipelineParams) -> Self { - let indexing_pipelines_gauge = crate::metrics::INDEXER_METRICS - .indexing_pipelines - .with_label_values([¶ms.pipeline_id.index_uid.index_id]); - let indexing_pipelines_gauge_guard = OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + let indexing_pipelines_gauge = gauge!( + parent: INDEXING_PIPELINES, + "index" => params.pipeline_id.index_uid.index_id.clone(), + ); + let indexing_pipelines_gauge_guard = GaugeGuard::new(&indexing_pipelines_gauge, 1.0); let params_fingerprint = params.params_fingerprint; MetricsPipeline { params, diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs index 0599e03bf2c..64f23e08b7e 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs +++ 
b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs @@ -20,8 +20,9 @@ use std::fmt; use arrow::record_batch::RecordBatch; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; +use quickwit_common::metrics::IN_FLIGHT_INDEXER_MAILBOX; use quickwit_metastore::checkpoint::SourceCheckpointDelta; +use quickwit_metrics::GaugeGuard; /// Batch of parquet data as Arrow RecordBatch for the parquet indexing pipeline. /// @@ -35,7 +36,7 @@ pub struct ProcessedParquetBatch { /// Force commit flag - when true, accumulator should flush immediately. pub force_commit: bool, /// Memory tracking gauge guard. - _gauge_guard: GaugeGuard<'static>, + _gauge_guard: GaugeGuard, } impl ProcessedParquetBatch { @@ -65,8 +66,7 @@ impl ProcessedParquetBatch { .map(|col| col.get_array_memory_size() as i64) .sum(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.indexer_mailbox); - gauge_guard.add(memory_size); + let gauge_guard = GaugeGuard::new(&IN_FLIGHT_INDEXER_MAILBOX, memory_size as f64); Self { batches, diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs index 8e1c0d56afb..b432622e739 100644 --- a/quickwit/quickwit-indexing/src/actors/uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/uploader.rs @@ -28,6 +28,7 @@ use quickwit_common::spawn_named_task; use quickwit_config::RetentionPolicy; use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_metastore::{SplitMetadata, StageSplitsRequestExt}; +use quickwit_metrics::{gauge, label_values}; use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, StageSplitsRequest}; use quickwit_proto::search::{ReportSplit, ReportSplitsRequest}; use quickwit_proto::types::{IndexUid, PublishToken}; @@ -40,7 +41,7 @@ use tracing::{Instrument, Span, debug, info, instrument, warn}; use crate::actors::Publisher; use crate::actors::sequencer::{Sequencer, SequencerCommand}; use 
crate::merge_policy::{MergePolicy, MergeTask}; -use crate::metrics::INDEXER_METRICS; +use crate::metrics::{AVAILABLE_CONCURRENT_UPLOAD_PERMITS, COMPONENT}; use crate::models::{ EmptySplit, PackagedSplit, PackagedSplitBatch, PublishLock, SplitsUpdate, create_split_metadata, }; @@ -199,30 +200,25 @@ impl Uploader { ctx: &ActorContext, ) -> anyhow::Result> { let _guard = ctx.protect_zone(); - let (concurrent_upload_permits_once_cell, concurrent_upload_permits_gauge) = - match self.uploader_type { - UploaderType::IndexUploader => ( - &CONCURRENT_UPLOAD_PERMITS_INDEX, - INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["indexer"]), - ), - UploaderType::MergeUploader => ( - &CONCURRENT_UPLOAD_PERMITS_MERGE, - INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["merger"]), - ), - UploaderType::DeleteUploader => ( - &CONCURRENT_UPLOAD_PERMITS_MERGE, - INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["merger"]), - ), - }; + let (concurrent_upload_permits_once_cell, concurrent_upload_permits_gauge) = match self + .uploader_type + { + UploaderType::IndexUploader => ( + &CONCURRENT_UPLOAD_PERMITS_INDEX, + gauge!(parent: AVAILABLE_CONCURRENT_UPLOAD_PERMITS, labels: [label_values!(COMPONENT => "indexer")]), + ), + UploaderType::MergeUploader => ( + &CONCURRENT_UPLOAD_PERMITS_MERGE, + gauge!(parent: AVAILABLE_CONCURRENT_UPLOAD_PERMITS, labels: [label_values!(COMPONENT => "merger")]), + ), + UploaderType::DeleteUploader => ( + &CONCURRENT_UPLOAD_PERMITS_MERGE, + gauge!(parent: AVAILABLE_CONCURRENT_UPLOAD_PERMITS, labels: [label_values!(COMPONENT => "merger")]), + ), + }; let concurrent_upload_permits = concurrent_upload_permits_once_cell .get_or_init(|| Semaphore::const_new(self.max_concurrent_split_uploads)); - concurrent_upload_permits_gauge.set(concurrent_upload_permits.available_permits() as i64); + concurrent_upload_permits_gauge.set(concurrent_upload_permits.available_permits() as f64); 
concurrent_upload_permits .acquire() .await diff --git a/quickwit/quickwit-indexing/src/metrics.rs b/quickwit/quickwit-indexing/src/metrics.rs index 98ca19636a2..2d936a00cb9 100644 --- a/quickwit/quickwit-indexing/src/metrics.rs +++ b/quickwit/quickwit-indexing/src/metrics.rs @@ -14,103 +14,89 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, new_counter, new_counter_vec, new_gauge, - new_gauge_vec, -}; +use quickwit_metrics::{Counter, Gauge, LabelNames, counter, gauge, label_names}; -pub struct IndexerMetrics { - pub processed_docs_total: IntCounterVec<2>, - pub processed_bytes: IntCounterVec<2>, - pub indexing_pipelines: IntGaugeVec<1>, - pub backpressure_micros: IntCounterVec<1>, - pub available_concurrent_upload_permits: IntGaugeVec<1>, - pub split_builders: IntGauge, - pub ongoing_merge_operations: IntGauge, - pub pending_merge_operations: IntGauge, - pub pending_merge_bytes: IntGauge, - // We use a lazy counter, as most users do not use Kafka. 
- #[cfg_attr(not(feature = "kafka"), allow(dead_code))] - pub kafka_rebalance_total: LazyLock, -} +pub(crate) const ACTOR_NAME: LabelNames<1> = label_names!("actor_name"); +pub(crate) const COMPONENT: LabelNames<1> = label_names!("component"); -impl Default for IndexerMetrics { - fn default() -> Self { - IndexerMetrics { - processed_docs_total: new_counter_vec( - "processed_docs_total", - "Number of processed docs by index, source and processed status in [valid, \ - schema_error, parse_error, transform_error]", - "indexing", - &[], - ["index", "docs_processed_status"], - ), - processed_bytes: new_counter_vec( - "processed_bytes", - "Number of bytes of processed documents by index, source and processed status in \ - [valid, schema_error, parse_error, transform_error]", - "indexing", - &[], - ["index", "docs_processed_status"], - ), - indexing_pipelines: new_gauge_vec( - "indexing_pipelines", - "Number of running indexing pipelines", - "indexing", - &[], - ["index"], - ), - backpressure_micros: new_counter_vec( - "backpressure_micros", - "Amount of time spent in backpressure (in micros). 
This time only includes the \ - amount of time spent waiting for a place in the queue of another actor.", - "indexing", - &[], - ["actor_name"], - ), - available_concurrent_upload_permits: new_gauge_vec( - "concurrent_upload_available_permits_num", - "Number of available concurrent upload permits by component in [merger, indexer]", - "indexing", - &[], - ["component"], - ), - split_builders: new_gauge( - "split_builders", - "Number of existing index writer instances.", - "indexing", - &[], - ), - ongoing_merge_operations: new_gauge( - "ongoing_merge_operations", - "Number of ongoing merge operations", - "indexing", - &[], - ), - pending_merge_operations: new_gauge( - "pending_merge_operations", - "Number of pending merge operations", - "indexing", - &[], - ), - pending_merge_bytes: new_gauge( - "pending_merge_bytes", - "Number of pending merge bytes", - "indexing", - &[], - ), - kafka_rebalance_total: LazyLock::new(|| { - new_counter( - "kafka_rebalance_total", - "Number of kafka rebalances", - "indexing", - &[], - ) - }), - } - } -} +pub(crate) static PROCESSED_DOCS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "processed_docs_total", + description: "Number of processed docs by index, source and processed status in [valid, schema_error, parse_error, transform_error]", + subsystem: "indexing", + ) +}); -/// `INDEXER_METRICS` exposes indexing related metrics through a prometheus -/// endpoint. 
-pub static INDEXER_METRICS: LazyLock = LazyLock::new(IndexerMetrics::default); +pub(crate) static PROCESSED_BYTES: LazyLock = LazyLock::new(|| { + counter!( + name: "processed_bytes", + description: "Number of bytes of processed documents by index, source and processed status in [valid, schema_error, parse_error, transform_error]", + subsystem: "indexing", + ) +}); + +pub(crate) static INDEXING_PIPELINES: LazyLock = LazyLock::new(|| { + gauge!( + name: "indexing_pipelines", + description: "Number of running indexing pipelines", + subsystem: "indexing", + ) +}); + +pub(crate) static BACKPRESSURE_MICROS: LazyLock = LazyLock::new(|| { + counter!( + name: "backpressure_micros", + description: "Amount of time spent in backpressure (in micros). This time only includes the amount of time spent waiting for a place in the queue of another actor.", + subsystem: "indexing", + ) +}); + +pub(crate) static AVAILABLE_CONCURRENT_UPLOAD_PERMITS: LazyLock = LazyLock::new(|| { + gauge!( + name: "concurrent_upload_available_permits_num", + description: "Number of available concurrent upload permits by component in [merger, indexer]", + subsystem: "indexing", + ) +}); + +pub(crate) static SPLIT_BUILDERS: LazyLock = LazyLock::new(|| { + gauge!( + name: "split_builders", + description: "Number of existing index writer instances.", + subsystem: "indexing", + ) +}); + +pub(crate) static ONGOING_MERGE_OPERATIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "ongoing_merge_operations", + description: "Number of ongoing merge operations", + subsystem: "indexing", + ) +}); + +pub(crate) static PENDING_MERGE_OPERATIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "pending_merge_operations", + description: "Number of pending merge operations", + subsystem: "indexing", + ) +}); + +pub(crate) static PENDING_MERGE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "pending_merge_bytes", + description: "Number of pending merge bytes", + subsystem: "indexing", + ) +}); + +// We use a lazy 
counter, as most users do not use Kafka. +#[cfg_attr(not(feature = "kafka"), allow(dead_code))] +pub(crate) static KAFKA_REBALANCE_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "kafka_rebalance_total", + description: "Number of kafka rebalances", + subsystem: "indexing", + ) +}); diff --git a/quickwit/quickwit-indexing/src/models/indexed_split.rs b/quickwit/quickwit-indexing/src/models/indexed_split.rs index cd272bdc34c..03728fe2f6a 100644 --- a/quickwit/quickwit-indexing/src/models/indexed_split.rs +++ b/quickwit/quickwit-indexing/src/models/indexed_split.rs @@ -16,9 +16,9 @@ use std::fmt; use std::path::Path; use quickwit_common::io::IoControls; -use quickwit_common::metrics::GaugeGuard; use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::checkpoint::IndexCheckpointDelta; +use quickwit_metrics::GaugeGuard; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::types::{DocMappingUid, IndexUid, PublishToken}; use tantivy::IndexBuilder; @@ -182,8 +182,8 @@ pub struct IndexedSplitBatchBuilder { pub publish_token_opt: Option, pub commit_trigger: CommitTrigger, pub batch_parent_span: Span, - pub memory_usage: GaugeGuard<'static>, - pub _split_builders_guard: GaugeGuard<'static>, + pub memory_usage: GaugeGuard, + pub _split_builders_guard: GaugeGuard, } /// Sends notifications to the Publisher that the last batch of splits was empty. 
diff --git a/quickwit/quickwit-indexing/src/models/processed_doc.rs b/quickwit/quickwit-indexing/src/models/processed_doc.rs index bed695aa1d4..eb9c7179342 100644 --- a/quickwit/quickwit-indexing/src/models/processed_doc.rs +++ b/quickwit/quickwit-indexing/src/models/processed_doc.rs @@ -14,8 +14,9 @@ use std::fmt; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; +use quickwit_common::metrics::IN_FLIGHT_INDEXER_MAILBOX; use quickwit_metastore::checkpoint::SourceCheckpointDelta; +use quickwit_metrics::GaugeGuard; use tantivy::{DateTime, TantivyDocument}; pub struct ProcessedDoc { @@ -41,7 +42,7 @@ pub struct ProcessedDocBatch { pub docs: Vec, pub checkpoint_delta: SourceCheckpointDelta, pub force_commit: bool, - _gauge_guard: GaugeGuard<'static>, + _gauge_guard: GaugeGuard, } impl ProcessedDocBatch { @@ -51,8 +52,7 @@ impl ProcessedDocBatch { force_commit: bool, ) -> Self { let delta = docs.iter().map(|doc| doc.num_bytes as i64).sum::(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.indexer_mailbox); - gauge_guard.add(delta); + let gauge_guard = GaugeGuard::new(&IN_FLIGHT_INDEXER_MAILBOX, delta as f64); Self { docs, checkpoint_delta, diff --git a/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs b/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs index f88d9fcac2b..f84226bfaa6 100644 --- a/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs +++ b/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs @@ -15,8 +15,9 @@ use std::fmt; use bytes::Bytes; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; +use quickwit_common::metrics::IN_FLIGHT_DOC_PROCESSOR_MAILBOX; use quickwit_metastore::checkpoint::SourceCheckpointDelta; +use quickwit_metrics::GaugeGuard; pub struct RawDocBatch { // Do not directly append documents to this vector; otherwise, in-flight metrics will be @@ -24,7 +25,7 @@ pub struct RawDocBatch { pub docs: Vec, pub checkpoint_delta: SourceCheckpointDelta, pub force_commit: bool, - 
_gauge_guard: GaugeGuard<'static>, + _gauge_guard: GaugeGuard, } impl RawDocBatch { @@ -34,9 +35,7 @@ impl RawDocBatch { force_commit: bool, ) -> Self { let delta = docs.iter().map(|doc| doc.len() as i64).sum::(); - let mut gauge_guard = - GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.doc_processor_mailbox); - gauge_guard.add(delta); + let gauge_guard = GaugeGuard::new(&IN_FLIGHT_DOC_PROCESSOR_MAILBOX, delta as f64); Self { docs, @@ -67,7 +66,7 @@ impl fmt::Debug for RawDocBatch { impl Default for RawDocBatch { fn default() -> Self { - let _gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.doc_processor_mailbox); + let _gauge_guard = GaugeGuard::new(&IN_FLIGHT_DOC_PROCESSOR_MAILBOX, 0.0); Self { docs: Vec::new(), checkpoint_delta: SourceCheckpointDelta::default(), diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index 4403fa0f547..c815d2bc1e7 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -665,7 +665,7 @@ mod tests { use itertools::Itertools; use quickwit_actors::{ActorContext, Universe}; use quickwit_common::ServiceStream; - use quickwit_common::metrics::MEMORY_METRICS; + use quickwit_common::metrics::IN_FLIGHT_FETCH_STREAM; use quickwit_common::stream_utils::InFlightValue; use quickwit_config::{IndexingSettings, SourceConfig, SourceParams}; use quickwit_ingest::IngesterPoolEntry; @@ -1434,11 +1434,8 @@ mod tests { }; let batch_size = fetch_payload.estimate_size(); let fetch_message = FetchMessage::new_payload(fetch_payload); - let in_flight_value = InFlightValue::new( - fetch_message, - batch_size, - &MEMORY_METRICS.in_flight.fetch_stream, - ); + let in_flight_value = + InFlightValue::new(fetch_message, batch_size, &IN_FLIGHT_FETCH_STREAM); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); let fetch_payload = FetchPayload { @@ -1451,11 +1448,8 @@ mod tests { }; let batch_size = 
fetch_payload.estimate_size(); let fetch_message = FetchMessage::new_payload(fetch_payload); - let in_flight_value = InFlightValue::new( - fetch_message, - batch_size, - &MEMORY_METRICS.in_flight.fetch_stream, - ); + let in_flight_value = + InFlightValue::new(fetch_message, batch_size, &IN_FLIGHT_FETCH_STREAM); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); let fetch_eof = FetchEof { @@ -1465,11 +1459,8 @@ mod tests { eof_position: Some(Position::eof(23u64)), }; let fetch_message = FetchMessage::new_eof(fetch_eof); - let in_flight_value = InFlightValue::new( - fetch_message, - ByteSize(0), - &MEMORY_METRICS.in_flight.fetch_stream, - ); + let in_flight_value = + InFlightValue::new(fetch_message, ByteSize(0), &IN_FLIGHT_FETCH_STREAM); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); source.emit_batches(&source_sink, &ctx).await.unwrap(); @@ -1526,11 +1517,8 @@ mod tests { }; let batch_size = fetch_payload.estimate_size(); let fetch_message = FetchMessage::new_payload(fetch_payload); - let in_flight_value = InFlightValue::new( - fetch_message, - batch_size, - &MEMORY_METRICS.in_flight.fetch_stream, - ); + let in_flight_value = + InFlightValue::new(fetch_message, batch_size, &IN_FLIGHT_FETCH_STREAM); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); source.emit_batches(&source_sink, &ctx).await.unwrap(); diff --git a/quickwit/quickwit-indexing/src/source/kafka_source.rs b/quickwit/quickwit-indexing/src/source/kafka_source.rs index 93ce5b3dc37..b15430a4909 100644 --- a/quickwit/quickwit-indexing/src/source/kafka_source.rs +++ b/quickwit/quickwit-indexing/src/source/kafka_source.rs @@ -40,6 +40,7 @@ use tokio::task::{JoinHandle, spawn_blocking}; use tokio::time; use tracing::{debug, info, warn}; +use crate::metrics::KAFKA_REBALANCE_TOTAL; use crate::models::{NewPublishLock, PublishLock}; use crate::source::{ BATCH_NUM_BYTES_LIMIT, BatchBuilder, EMIT_BATCHES_TIMEOUT, Source, SourceContext, @@ -127,7 +128,7 @@ macro_rules! 
return_if_err { /// impl ConsumerContext for RdKafkaContext { fn pre_rebalance(&self, _consumer: &BaseConsumer, rebalance: &Rebalance) { - crate::metrics::INDEXER_METRICS.kafka_rebalance_total.inc(); + KAFKA_REBALANCE_TOTAL.increment(1); quickwit_common::rate_limited_info!(limit_per_min = 3, topic = self.topic, "rebalance"); if let Rebalance::Revoke(tpl) = rebalance { let partitions = collect_partitions(tpl, &self.topic); diff --git a/quickwit/quickwit-indexing/src/source/mod.rs b/quickwit/quickwit-indexing/src/source/mod.rs index 0e696eaea0f..8abd92e7114 100644 --- a/quickwit/quickwit-indexing/src/source/mod.rs +++ b/quickwit/quickwit-indexing/src/source/mod.rs @@ -92,7 +92,11 @@ pub use pulsar_source::{PulsarSource, PulsarSourceFactory}; #[cfg(feature = "sqs")] pub use queue_sources::sqs_queue; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler}; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; +use quickwit_common::metrics::{ + IN_FLIGHT_FILE_SOURCE, IN_FLIGHT_INGEST_SOURCE, IN_FLIGHT_KAFKA_SOURCE, + IN_FLIGHT_KINESIS_SOURCE, IN_FLIGHT_OTHER_SOURCE, IN_FLIGHT_PUBSUB_SOURCE, + IN_FLIGHT_PULSAR_SOURCE, +}; use quickwit_common::pubsub::EventBroker; use quickwit_common::runtimes::RuntimeType; use quickwit_config::{ @@ -101,6 +105,7 @@ use quickwit_config::{ use quickwit_ingest::IngesterPool; use quickwit_metastore::IndexMetadataResponseExt; use quickwit_metastore::checkpoint::{SourceCheckpoint, SourceCheckpointDelta}; +use quickwit_metrics::GaugeGuard; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::metastore::{ IndexMetadataRequest, MetastoreError, MetastoreResult, MetastoreService, @@ -519,7 +524,7 @@ pub(super) struct BatchBuilder { num_bytes: u64, checkpoint_delta: SourceCheckpointDelta, force_commit: bool, - gauge_guard: GaugeGuard<'static>, + gauge_guard: GaugeGuard, } impl BatchBuilder { @@ -529,15 +534,15 @@ impl BatchBuilder { pub fn with_capacity(capacity: usize, source_type: SourceType) -> Self { let 
gauge = match source_type { - SourceType::File => MEMORY_METRICS.in_flight.file(), - SourceType::IngestV2 => MEMORY_METRICS.in_flight.ingest(), - SourceType::Kafka => MEMORY_METRICS.in_flight.kafka(), - SourceType::Kinesis => MEMORY_METRICS.in_flight.kinesis(), - SourceType::PubSub => MEMORY_METRICS.in_flight.pubsub(), - SourceType::Pulsar => MEMORY_METRICS.in_flight.pulsar(), - _ => MEMORY_METRICS.in_flight.other(), + SourceType::File => &IN_FLIGHT_FILE_SOURCE, + SourceType::IngestV2 => &IN_FLIGHT_INGEST_SOURCE, + SourceType::Kafka => &IN_FLIGHT_KAFKA_SOURCE, + SourceType::Kinesis => &IN_FLIGHT_KINESIS_SOURCE, + SourceType::PubSub => &IN_FLIGHT_PUBSUB_SOURCE, + SourceType::Pulsar => &IN_FLIGHT_PULSAR_SOURCE, + _ => &IN_FLIGHT_OTHER_SOURCE, }; - let gauge_guard = GaugeGuard::from_gauge(gauge); + let gauge_guard = GaugeGuard::new(gauge, 0.0); Self { docs: Vec::with_capacity(capacity), @@ -551,8 +556,8 @@ impl BatchBuilder { pub fn add_doc(&mut self, doc: Bytes) { let num_bytes = doc.len(); self.docs.push(doc); - self.gauge_guard.add(num_bytes as i64); self.num_bytes += num_bytes as u64; + self.gauge_guard.increment(num_bytes as f64); } pub fn force_commit(&mut self) { @@ -567,7 +572,7 @@ impl BatchBuilder { pub fn clear(&mut self) { self.docs.clear(); self.checkpoint_delta = SourceCheckpointDelta::default(); - self.gauge_guard.sub(self.num_bytes as i64); + self.gauge_guard.decrement(self.num_bytes as f64); self.num_bytes = 0; } } diff --git a/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs b/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs index b839c968043..b3551186341 100644 --- a/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs +++ b/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs @@ -155,7 +155,9 @@ impl QueueSharedState { info!(previous_token = shard.publish_token, "shard re-acquired"); re_acquired_shards.push(shard.shard_id().clone()); } else if is_owned && !position.is_beginning() { 
- bail!("Partition is owned by this indexing pipeline but is not at the beginning. This should never happen! Please, report on https://github.com/quickwit-oss/quickwit/issues.") + bail!( + "Partition is owned by this indexing pipeline but is not at the beginning. This should never happen! Please, report on https://github.com/quickwit-oss/quickwit/issues." + ) } } diff --git a/quickwit/quickwit-ingest/Cargo.toml b/quickwit/quickwit-ingest/Cargo.toml index 1bf15d76fd4..03121cf4cc6 100644 --- a/quickwit/quickwit-ingest/Cargo.toml +++ b/quickwit/quickwit-ingest/Cargo.toml @@ -38,6 +38,7 @@ utoipa = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-doc-mapper = { workspace = true, features = ["testsuite"] } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-ingest/src/ingest_api_service.rs b/quickwit/quickwit-ingest/src/ingest_api_service.rs index 7ee8d0b232b..ba625417a6a 100644 --- a/quickwit/quickwit-ingest/src/ingest_api_service.rs +++ b/quickwit/quickwit-ingest/src/ingest_api_service.rs @@ -22,11 +22,12 @@ use quickwit_actors::{ }; use quickwit_common::runtimes::RuntimeType; use quickwit_common::tower::Cost; +use quickwit_metrics::{counter, label_values}; use quickwit_proto::ingest::RateLimitingCause; use tracing::{error, info}; use ulid::Ulid; -use crate::metrics::INGEST_METRICS; +use crate::metrics::{DOCS_BYTES_TOTAL, DOCS_TOTAL, VALIDITY}; use crate::notifications::Notifications; use crate::{ CommitType, CreateQueueIfNotExistsRequest, CreateQueueIfNotExistsResponse, CreateQueueRequest, @@ -201,12 +202,9 @@ impl IngestApiService { } num_docs += batch_num_docs; - INGEST_METRICS - .ingested_docs_bytes_valid - .inc_by(batch_num_bytes as u64); - INGEST_METRICS - .ingested_docs_valid - .inc_by(batch_num_docs as u64); + let labels = 
label_values!(VALIDITY => "valid"); + counter!(parent: DOCS_BYTES_TOTAL, labels: [labels]).increment(batch_num_bytes as u64); + counter!(parent: DOCS_TOTAL, labels: [labels]).increment(batch_num_docs as u64); } // TODO we could fsync here and disable autosync to have better i/o perfs. Ok(( diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs index 86f59f6101e..bc2409dcedd 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs @@ -30,7 +30,9 @@ use tracing::{debug, warn}; use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key}; use crate::RateMibPerSec; -use crate::ingest_v2::metrics::INGEST_V2_METRICS; +use crate::ingest_v2::metrics::{ + CLOSED_SHARDS, OPEN_SHARDS, SHARD_LT_THROUGHPUT_MIB, SHARD_ST_THROUGHPUT_MIB, +}; use crate::ingest_v2::state::WeakIngesterState; const ONE_MIB: ByteSize = ByteSize::mib(1); @@ -195,12 +197,8 @@ impl ShardThroughputTimeSeriesMap { .average() .as_u64() .div_ceil(ONE_MIB.as_u64()); - INGEST_V2_METRICS - .shard_st_throughput_mib - .observe(short_term_ingestion_rate_mib_per_sec_u64 as f64); - INGEST_V2_METRICS - .shard_lt_throughput_mib - .observe(long_term_ingestion_rate_mib_per_sec_u64 as f64); + SHARD_ST_THROUGHPUT_MIB.record(short_term_ingestion_rate_mib_per_sec_u64 as f64); + SHARD_LT_THROUGHPUT_MIB.record(long_term_ingestion_rate_mib_per_sec_u64 as f64); let short_term_ingestion_rate = RateMibPerSec(short_term_ingestion_rate_mib_per_sec_u64 as u16); @@ -300,10 +298,8 @@ impl BroadcastLocalShardsTask { } } } - INGEST_V2_METRICS.open_shards.set(num_open_shards as i64); - INGEST_V2_METRICS - .closed_shards - .set(num_closed_shards as i64); + OPEN_SHARDS.set(num_open_shards as f64); + CLOSED_SHARDS.set(num_closed_shards as f64); let snapshot = LocalShardsSnapshot { per_source_shard_infos, diff --git 
a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs index f5f5ca166c1..635d0658f08 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs @@ -22,7 +22,7 @@ use bytes::{BufMut, BytesMut}; use bytesize::ByteSize; use futures::StreamExt; use mrecordlog::Record; -use quickwit_common::metrics::MEMORY_METRICS; +use quickwit_common::metrics::{IN_FLIGHT_FETCH_STREAM, IN_FLIGHT_MULTI_FETCH_STREAM}; use quickwit_common::retry::RetryParams; use quickwit_common::stream_utils::{InFlightValue, TrackedSender}; use quickwit_common::{ServiceStream, spawn_named_task}; @@ -83,7 +83,7 @@ impl FetchStreamTask { .map(|offset| offset + 1) .unwrap_or_default(); let (fetch_message_tx, fetch_stream) = - ServiceStream::new_bounded_with_gauge(3, &MEMORY_METRICS.in_flight.fetch_stream); + ServiceStream::new_bounded_with_gauge(3, &IN_FLIGHT_FETCH_STREAM); let mut fetch_task = Self { shard_id: open_fetch_stream_request.shard_id().clone(), queue_id: open_fetch_stream_request.queue_id(), @@ -562,7 +562,7 @@ async fn fault_tolerant_fetch_stream( let in_flight_value = InFlightValue::new( fetch_message, batch_size, - &MEMORY_METRICS.in_flight.multi_fetch_stream, + &IN_FLIGHT_MULTI_FETCH_STREAM, ); if fetch_message_tx.send(Ok(in_flight_value)).await.is_err() { // The consumer was dropped. @@ -575,7 +575,7 @@ async fn fault_tolerant_fetch_stream( let in_flight_value = InFlightValue::new( fetch_message, ByteSize(0), - &MEMORY_METRICS.in_flight.multi_fetch_stream, + &IN_FLIGHT_MULTI_FETCH_STREAM, ); // We ignore the send error if the consumer was dropped because we're going // to return anyway. 
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index 4680472cdce..cb2e4be38f1 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -25,11 +25,12 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use mrecordlog::error::CreateQueueError; use quickwit_cluster::Cluster; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; +use quickwit_common::metrics::IN_FLIGHT_INGESTER_PERSIST; use quickwit_common::pretty::PrettyDisplay; use quickwit_common::pubsub::{EventBroker, EventSubscriber}; use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings}; use quickwit_common::{ServiceStream, rate_limited_error, rate_limited_warn}; +use quickwit_metrics::{GaugeGuard, counter, label_values}; use quickwit_proto::control_plane::{ AdviseResetShardsRequest, ControlPlaneService, ControlPlaneServiceClient, }; @@ -51,7 +52,6 @@ use super::broadcast::{BroadcastIngesterCapacityScoreTask, BroadcastLocalShardsT use super::doc_mapper::validate_doc_batch; use super::fetch::FetchStreamTask; use super::idle::CloseIdleShardsTask; -use super::metrics::INGEST_V2_METRICS; use super::models::IngesterShard; use super::mrecordlog_utils::{ AppendDocBatchError, append_non_empty_doc_batch, check_enough_capacity, @@ -63,8 +63,9 @@ use super::replication::{ }; use super::state::{IngesterState, InnerIngesterState, WeakIngesterState}; use crate::ingest_v2::doc_mapper::get_or_try_build_doc_mapper; -use crate::ingest_v2::metrics::report_wal_usage; +use crate::ingest_v2::metrics::{RESET_SHARDS_OPERATIONS_TOTAL, STATUS, report_wal_usage}; use crate::ingest_v2::models::IngesterShardType; +use crate::metrics::{DOCS_BYTES_TOTAL, DOCS_TOTAL, VALIDITY}; use crate::mrecordlog_async::MultiRecordLogAsync; use crate::{FollowerId, estimate_size}; @@ -337,10 +338,11 @@ impl Ingester { advise_reset_shards_response.shards_to_truncate.len(), 
now.elapsed().pretty_display() ); - INGEST_V2_METRICS - .reset_shards_operations_total - .with_label_values(["success"]) - .inc(); + counter!( + parent: RESET_SHARDS_OPERATIONS_TOTAL, + labels: [label_values!(STATUS => "success")], + ) + .increment(1); let wal_usage = state_guard.mrecordlog.resource_usage(); report_wal_usage(wal_usage); @@ -348,18 +350,20 @@ impl Ingester { Ok(Err(error)) => { warn!("advise reset shards request failed: {error}"); - INGEST_V2_METRICS - .reset_shards_operations_total - .with_label_values(["error"]) - .inc(); + counter!( + parent: RESET_SHARDS_OPERATIONS_TOTAL, + labels: [label_values!(STATUS => "error")], + ) + .increment(1); } Err(_) => { warn!("advise reset shards request timed out"); - INGEST_V2_METRICS - .reset_shards_operations_total - .with_label_values(["timeout"]) - .inc(); + counter!( + parent: RESET_SHARDS_OPERATIONS_TOTAL, + labels: [label_values!(STATUS => "timeout")], + ) + .increment(1); } }; // We still hold the permit while sleeping so we effectively rate limit the reset shards @@ -571,12 +575,16 @@ impl Ingester { }; if valid_doc_batch.is_empty() { - crate::metrics::INGEST_METRICS - .ingested_docs_invalid - .inc_by(parse_failures.len() as u64); - crate::metrics::INGEST_METRICS - .ingested_docs_bytes_invalid - .inc_by(original_batch_num_bytes); + counter!( + parent: DOCS_TOTAL, + labels: [label_values!(VALIDITY => "invalid")], + ) + .increment(parse_failures.len() as u64); + counter!( + parent: DOCS_BYTES_TOTAL, + labels: [label_values!(VALIDITY => "invalid")], + ) + .increment(original_batch_num_bytes); let persist_success = PersistSuccess { subrequest_id: subrequest.subrequest_id, index_uid: subrequest.index_uid, @@ -590,19 +598,27 @@ impl Ingester { continue; }; - crate::metrics::INGEST_METRICS - .ingested_docs_valid - .inc_by(valid_doc_batch.num_docs() as u64); - crate::metrics::INGEST_METRICS - .ingested_docs_bytes_valid - .inc_by(valid_doc_batch.num_bytes() as u64); + counter!( + parent: DOCS_TOTAL, + labels: 
[label_values!(VALIDITY => "valid")], + ) + .increment(valid_doc_batch.num_docs() as u64); + counter!( + parent: DOCS_BYTES_TOTAL, + labels: [label_values!(VALIDITY => "valid")], + ) + .increment(valid_doc_batch.num_bytes() as u64); if !parse_failures.is_empty() { - crate::metrics::INGEST_METRICS - .ingested_docs_invalid - .inc_by(parse_failures.len() as u64); - crate::metrics::INGEST_METRICS - .ingested_docs_bytes_invalid - .inc_by(original_batch_num_bytes - valid_doc_batch.num_bytes() as u64); + counter!( + parent: DOCS_TOTAL, + labels: [label_values!(VALIDITY => "invalid")], + ) + .increment(parse_failures.len() as u64); + counter!( + parent: DOCS_BYTES_TOTAL, + labels: [label_values!(VALIDITY => "invalid")], + ) + .increment(original_batch_num_bytes - valid_doc_batch.num_bytes() as u64); } let valid_batch_num_bytes = valid_doc_batch.num_bytes() as u64; shard.rate_meter.update(valid_batch_num_bytes); @@ -1112,8 +1128,7 @@ impl IngesterService for Ingester { _ => None, }) .sum::(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.ingester_persist); - gauge_guard.add(request_size_bytes as i64); + let _gauge_guard = GaugeGuard::new(&IN_FLIGHT_INGESTER_PERSIST, request_size_bytes as f64); self.persist_inner(persist_request).await } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs index 5cf86397b70..e30dc1c19c3 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs @@ -15,171 +15,159 @@ use std::sync::LazyLock; use mrecordlog::ResourceUsage; -use quickwit_common::metrics::{ - Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, exponential_buckets, - linear_buckets, new_counter_vec, new_gauge, new_gauge_vec, new_histogram, new_histogram_vec, +use quickwit_common::metrics::{IN_FLIGHT_WAL, exponential_buckets, linear_buckets}; +use quickwit_metrics::{ + Counter, Gauge, Histogram, LabelNames, 
counter, gauge, histogram, label_names, }; -// Counter vec counting the different outcomes of ingest requests as -// measure at the end of the router work. -// -// The counter are counting persist subrequests. -pub(crate) struct IngestResultMetrics { - pub success: IntCounter, - pub circuit_breaker: IntCounter, - pub unspecified: IntCounter, - pub index_not_found: IntCounter, - pub source_not_found: IntCounter, - pub internal: IntCounter, - pub no_shards_available: IntCounter, - pub shard_rate_limited: IntCounter, - pub wal_full: IntCounter, - pub timeout: IntCounter, - pub router_timeout: IntCounter, - pub router_load_shedding: IntCounter, - pub load_shedding: IntCounter, - pub shard_not_found: IntCounter, - pub unavailable: IntCounter, -} +pub(super) const STATUS: LabelNames<1> = label_names!("status"); -impl Default for IngestResultMetrics { - fn default() -> Self { - let ingest_result_total_vec = new_counter_vec::<1>( - "ingest_result_total", - "Number of ingest requests by result", - "ingest", - &[], - ["result"], - ); - Self { - success: ingest_result_total_vec.with_label_values(["success"]), - circuit_breaker: ingest_result_total_vec.with_label_values(["circuit_breaker"]), - unspecified: ingest_result_total_vec.with_label_values(["unspecified"]), - index_not_found: ingest_result_total_vec.with_label_values(["index_not_found"]), - source_not_found: ingest_result_total_vec.with_label_values(["source_not_found"]), - internal: ingest_result_total_vec.with_label_values(["internal"]), - no_shards_available: ingest_result_total_vec.with_label_values(["no_shards_available"]), - shard_rate_limited: ingest_result_total_vec.with_label_values(["shard_rate_limited"]), - wal_full: ingest_result_total_vec.with_label_values(["wal_full"]), - timeout: ingest_result_total_vec.with_label_values(["timeout"]), - router_timeout: ingest_result_total_vec.with_label_values(["router_timeout"]), - router_load_shedding: ingest_result_total_vec - 
.with_label_values(["router_load_shedding"]), - load_shedding: ingest_result_total_vec.with_label_values(["load_shedding"]), - unavailable: ingest_result_total_vec.with_label_values(["unavailable"]), - shard_not_found: ingest_result_total_vec.with_label_values(["shard_not_found"]), - } - } -} +static INGEST_RESULT_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "ingest_result_total", + description: "Number of ingest requests by result", + subsystem: "ingest", + ) +}); -pub(super) struct IngestV2Metrics { - pub reset_shards_operations_total: IntCounterVec<1>, - pub open_shards: IntGauge, - pub closed_shards: IntGauge, - pub shard_lt_throughput_mib: Histogram, - pub shard_st_throughput_mib: Histogram, - pub wal_acquire_lock_requests_in_flight: IntGaugeVec<2>, - pub wal_acquire_lock_request_duration_secs: HistogramVec<2>, - pub wal_lock_hold_duration_secs: HistogramVec<2>, - pub wal_disk_used_bytes: IntGauge, - pub wal_memory_used_bytes: IntGauge, - pub ingest_results: IngestResultMetrics, - pub ingest_attempts: IntCounterVec<1>, -} +pub(super) static INGEST_RESULT_SUCCESS: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "success")); -impl Default for IngestV2Metrics { - fn default() -> Self { - Self { - ingest_results: IngestResultMetrics::default(), - ingest_attempts: new_counter_vec::<1>( - "ingest_attempts", - "Number of routing attempts by AZ locality", - "ingest", - &[], - ["az_routing"], - ), - reset_shards_operations_total: new_counter_vec( - "reset_shards_operations_total", - "Total number of reset shards operations performed.", - "ingest", - &[], - ["status"], - ), - open_shards: new_gauge( - "shards", - "Number of shards hosted by the ingester.", - "ingest", - &[("state", "open")], - ), - closed_shards: new_gauge( - "shards", - "Number of shards hosted by the ingester.", - "ingest", - &[("state", "closed")], - ), - shard_lt_throughput_mib: new_histogram( - "shard_lt_throughput_mib", - "Shard long term throughput as 
reported through chitchat", - "ingest", - linear_buckets(0.0f64, 1.0f64, 15).unwrap(), - ), - shard_st_throughput_mib: new_histogram( - "shard_st_throughput_mib", - "Shard short term throughput as reported through chitchat", - "ingest", - linear_buckets(0.0f64, 1.0f64, 15).unwrap(), - ), - wal_acquire_lock_requests_in_flight: new_gauge_vec( - "wal_acquire_lock_requests_in_flight", - "Number of acquire lock requests in-flight.", - "ingest", - &[], - ["operation", "type"], - ), - wal_acquire_lock_request_duration_secs: new_histogram_vec( - "wal_acquire_lock_request_duration_secs", - "Duration of acquire lock requests in seconds.", - "ingest", - &[], - ["operation", "type"], - exponential_buckets(0.001, 2.0, 12).unwrap(), - ), - wal_lock_hold_duration_secs: new_histogram_vec( - "wal_lock_hold_duration_secs", - "Duration for which the WAL lock was held in seconds.", - "ingest", - &[], - ["operation", "type"], - exponential_buckets(0.001, 2.0, 12).unwrap(), - ), - wal_disk_used_bytes: new_gauge( - "wal_disk_used_bytes", - "WAL disk space used in bytes.", - "ingest", - &[], - ), - wal_memory_used_bytes: new_gauge( - "wal_memory_used_bytes", - "WAL memory used in bytes.", - "ingest", - &[], - ), - } - } -} +pub(super) static INGEST_RESULT_CIRCUIT_BREAKER: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "circuit_breaker")); + +pub(super) static INGEST_RESULT_UNSPECIFIED: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "unspecified")); + +pub(super) static INGEST_RESULT_INDEX_NOT_FOUND: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "index_not_found")); + +pub(super) static INGEST_RESULT_SOURCE_NOT_FOUND: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "source_not_found")); + +pub(super) static INGEST_RESULT_INTERNAL: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "internal")); + +pub(super) static 
INGEST_RESULT_NO_SHARDS_AVAILABLE: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "no_shards_available")); + +pub(super) static INGEST_RESULT_SHARD_RATE_LIMITED: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "shard_rate_limited")); + +pub(super) static INGEST_RESULT_WAL_FULL: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "wal_full")); + +pub(super) static INGEST_RESULT_TIMEOUT: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "timeout")); + +pub(super) static INGEST_RESULT_ROUTER_TIMEOUT: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "router_timeout")); + +pub(super) static INGEST_RESULT_ROUTER_LOAD_SHEDDING: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "router_load_shedding")); + +pub(super) static INGEST_RESULT_LOAD_SHEDDING: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "load_shedding")); + +pub(super) static INGEST_RESULT_SHARD_NOT_FOUND: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "shard_not_found")); + +pub(super) static INGEST_RESULT_UNAVAILABLE: LazyLock = + LazyLock::new(|| counter!(parent: INGEST_RESULT_TOTAL, "result" => "unavailable")); + +pub(super) static INGEST_ATTEMPTS: LazyLock = LazyLock::new(|| { + counter!( + name: "ingest_attempts", + description: "Number of routing attempts by AZ locality", + subsystem: "ingest", + ) +}); + +pub(super) static RESET_SHARDS_OPERATIONS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "reset_shards_operations_total", + description: "Total number of reset shards operations performed.", + subsystem: "ingest", + ) +}); + +static SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "shards", + description: "Number of shards hosted by the ingester.", + subsystem: "ingest", + ) +}); + +pub(super) static OPEN_SHARDS: LazyLock = + 
LazyLock::new(|| gauge!(parent: SHARDS, "state" => "open")); + +pub(super) static CLOSED_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: SHARDS, "state" => "closed")); + +pub(super) static SHARD_LT_THROUGHPUT_MIB: LazyLock = LazyLock::new(|| { + histogram!( + name: "shard_lt_throughput_mib", + description: "Shard long term throughput as reported through chitchat", + subsystem: "ingest", + buckets: linear_buckets(0.0f64, 1.0f64, 15).unwrap(), + ) +}); + +pub(super) static SHARD_ST_THROUGHPUT_MIB: LazyLock = LazyLock::new(|| { + histogram!( + name: "shard_st_throughput_mib", + description: "Shard short term throughput as reported through chitchat", + subsystem: "ingest", + buckets: linear_buckets(0.0f64, 1.0f64, 15).unwrap(), + ) +}); + +pub(super) static WAL_ACQUIRE_LOCK_REQUESTS_IN_FLIGHT: LazyLock = LazyLock::new(|| { + gauge!( + name: "wal_acquire_lock_requests_in_flight", + description: "Number of acquire lock requests in-flight.", + subsystem: "ingest", + ) +}); + +pub(super) static WAL_ACQUIRE_LOCK_REQUEST_DURATION_SECS: LazyLock = + LazyLock::new(|| { + histogram!( + name: "wal_acquire_lock_request_duration_secs", + description: "Duration of acquire lock requests in seconds.", + subsystem: "ingest", + buckets: exponential_buckets(0.001, 2.0, 12).unwrap(), + ) + }); + +pub(super) static WAL_LOCK_HOLD_DURATION_SECS: LazyLock = LazyLock::new(|| { + histogram!( + name: "wal_lock_hold_duration_secs", + description: "Duration for which the WAL lock was held in seconds.", + subsystem: "ingest", + buckets: exponential_buckets(0.001, 2.0, 12).unwrap(), + ) +}); + +pub(super) static WAL_DISK_USED_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "wal_disk_used_bytes", + description: "WAL disk space used in bytes.", + subsystem: "ingest", + ) +}); + +pub(super) static WAL_MEMORY_USED_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "wal_memory_used_bytes", + description: "WAL memory used in bytes.", + subsystem: "ingest", + ) +}); pub(super) fn 
report_wal_usage(wal_usage: ResourceUsage) { - INGEST_V2_METRICS - .wal_disk_used_bytes - .set(wal_usage.disk_used_bytes as i64); - quickwit_common::metrics::MEMORY_METRICS - .in_flight - .wal - .set(wal_usage.memory_allocated_bytes as i64); - INGEST_V2_METRICS - .wal_memory_used_bytes - .set(wal_usage.memory_used_bytes as i64); + WAL_DISK_USED_BYTES.set(wal_usage.disk_used_bytes as f64); + IN_FLIGHT_WAL.set(wal_usage.memory_allocated_bytes as f64); + WAL_MEMORY_USED_BYTES.set(wal_usage.memory_used_bytes as f64); } - -pub(super) static INGEST_V2_METRICS: LazyLock = - LazyLock::new(IngestV2Metrics::default); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/replication.rs b/quickwit/quickwit-ingest/src/ingest_v2/replication.rs index 62d20fc8567..5f1687c720d 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/replication.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/replication.rs @@ -18,8 +18,9 @@ use std::time::{Duration, Instant}; use bytesize::ByteSize; use futures::{Future, StreamExt}; use mrecordlog::error::CreateQueueError; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; +use quickwit_common::metrics::IN_FLIGHT_INGESTER_REPLICATE; use quickwit_common::{ServiceStream, rate_limited_warn}; +use quickwit_metrics::GaugeGuard; use quickwit_proto::ingest::ingester::{ AckReplicationMessage, IngesterStatus, InitReplicaRequest, InitReplicaResponse, ReplicateFailure, ReplicateFailureReason, ReplicateRequest, ReplicateResponse, @@ -39,7 +40,7 @@ use super::mrecordlog_utils::check_enough_capacity; use super::state::IngesterState; use crate::estimate_size; use crate::ingest_v2::mrecordlog_utils::{AppendDocBatchError, append_non_empty_doc_batch}; -use crate::metrics::INGEST_METRICS; +use crate::metrics::{REPLICATED_NUM_BYTES_TOTAL, REPLICATED_NUM_DOCS_TOTAL}; pub(super) const SYN_REPLICATION_STREAM_CAPACITY: usize = 5; @@ -503,8 +504,8 @@ impl ReplicationTask { ))); } let request_size_bytes = replicate_request.num_bytes(); - let mut gauge_guard = 
GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.ingester_replicate); - gauge_guard.add(request_size_bytes as i64); + let _gauge_guard = + GaugeGuard::new(&IN_FLIGHT_INGESTER_REPLICATE, request_size_bytes as f64); self.current_replication_seqno += 1; @@ -665,12 +666,8 @@ impl ReplicationTask { .expect("replica shard should be initialized") .set_replication_position_inclusive(current_position_inclusive.clone(), now); - INGEST_METRICS - .replicated_num_bytes_total - .inc_by(batch_num_bytes); - INGEST_METRICS - .replicated_num_docs_total - .inc_by(batch_num_docs); + REPLICATED_NUM_BYTES_TOTAL.increment(batch_num_bytes); + REPLICATED_NUM_DOCS_TOTAL.increment(batch_num_docs); let replicate_success = ReplicateSuccess { subrequest_id: subrequest.subrequest_id, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index e249dd1e0fe..f18d441af6c 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -20,9 +20,10 @@ use std::time::Duration; use async_trait::async_trait; use futures::stream::FuturesUnordered; use futures::{Future, StreamExt}; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; +use quickwit_common::metrics::IN_FLIGHT_INGEST_ROUTER; use quickwit_common::pubsub::{EventBroker, EventSubscriber}; use quickwit_common::{rate_limited_error, rate_limited_warn}; +use quickwit_metrics::{GaugeGuard, counter}; use quickwit_proto::control_plane::{ ControlPlaneService, ControlPlaneServiceClient, GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsSubrequest, @@ -45,12 +46,18 @@ use super::debouncing::{ DebouncedGetOrCreateOpenShardsRequest, GetOrCreateOpenShardsRequestDebouncer, }; use super::ingester::PERSIST_REQUEST_TIMEOUT; -use super::metrics::IngestResultMetrics; use super::routing_table::RoutingTable; use super::workbench::IngestWorkbench; use super::{IngesterPool, pending_subrequests}; use crate::get_ingest_router_buffer_size; -use 
crate::ingest_v2::metrics::INGEST_V2_METRICS; +use crate::ingest_v2::metrics::{ + INGEST_ATTEMPTS, INGEST_RESULT_CIRCUIT_BREAKER, INGEST_RESULT_INDEX_NOT_FOUND, + INGEST_RESULT_INTERNAL, INGEST_RESULT_LOAD_SHEDDING, INGEST_RESULT_NO_SHARDS_AVAILABLE, + INGEST_RESULT_ROUTER_LOAD_SHEDDING, INGEST_RESULT_ROUTER_TIMEOUT, + INGEST_RESULT_SHARD_NOT_FOUND, INGEST_RESULT_SHARD_RATE_LIMITED, + INGEST_RESULT_SOURCE_NOT_FOUND, INGEST_RESULT_SUCCESS, INGEST_RESULT_TIMEOUT, + INGEST_RESULT_UNAVAILABLE, INGEST_RESULT_UNSPECIFIED, INGEST_RESULT_WAL_FULL, +}; /// Duration after which ingest requests time out with [`IngestV2Error::Timeout`]. fn ingest_request_timeout() -> Duration { @@ -371,10 +378,11 @@ impl IngestRouter { let az_locality = state_guard .routing_table .classify_az_locality(&ingester_node.node_id, &self.ingester_pool); - INGEST_V2_METRICS - .ingest_attempts - .with_label_values([az_locality]) - .inc(); + counter!( + parent: INGEST_ATTEMPTS, + "az_routing" => az_locality, + ) + .increment(1); let persist_subrequest = PersistSubrequest { subrequest_id: subrequest.subrequest_id, index_uid: Some(ingester_node.index_uid.clone()), @@ -492,82 +500,69 @@ impl IngestRouter { fn update_ingest_metrics(ingest_result: &IngestV2Result, num_subrequests: usize) { let num_subrequests = num_subrequests as u64; - let ingest_results_metrics: &IngestResultMetrics = &INGEST_V2_METRICS.ingest_results; match ingest_result { Ok(ingest_response) => { - ingest_results_metrics - .success - .inc_by(ingest_response.successes.len() as u64); + INGEST_RESULT_SUCCESS.increment(ingest_response.successes.len() as u64); for ingest_failure in &ingest_response.failures { match ingest_failure.reason() { IngestFailureReason::CircuitBreaker => { - ingest_results_metrics.circuit_breaker.inc(); + INGEST_RESULT_CIRCUIT_BREAKER.increment(1); } - IngestFailureReason::Unspecified => ingest_results_metrics.unspecified.inc(), + IngestFailureReason::Unspecified => INGEST_RESULT_UNSPECIFIED.increment(1), 
IngestFailureReason::IndexNotFound => { - ingest_results_metrics.index_not_found.inc() + INGEST_RESULT_INDEX_NOT_FOUND.increment(1) } IngestFailureReason::SourceNotFound => { - ingest_results_metrics.source_not_found.inc() + INGEST_RESULT_SOURCE_NOT_FOUND.increment(1) } - IngestFailureReason::Internal => ingest_results_metrics.internal.inc(), + IngestFailureReason::Internal => INGEST_RESULT_INTERNAL.increment(1), IngestFailureReason::NoShardsAvailable => { - ingest_results_metrics.no_shards_available.inc() + INGEST_RESULT_NO_SHARDS_AVAILABLE.increment(1) } IngestFailureReason::ShardRateLimited => { - ingest_results_metrics.shard_rate_limited.inc() + INGEST_RESULT_SHARD_RATE_LIMITED.increment(1) } - IngestFailureReason::WalFull => ingest_results_metrics.wal_full.inc(), - IngestFailureReason::Timeout => ingest_results_metrics.timeout.inc(), + IngestFailureReason::WalFull => INGEST_RESULT_WAL_FULL.increment(1), + IngestFailureReason::Timeout => INGEST_RESULT_TIMEOUT.increment(1), IngestFailureReason::RouterLoadShedding => { - ingest_results_metrics.router_load_shedding.inc() + INGEST_RESULT_ROUTER_LOAD_SHEDDING.increment(1) } - IngestFailureReason::LoadShedding => ingest_results_metrics.load_shedding.inc(), + IngestFailureReason::LoadShedding => INGEST_RESULT_LOAD_SHEDDING.increment(1), } } } Err(ingest_error) => match ingest_error { IngestV2Error::TooManyRequests(rate_limiting_cause) => match rate_limiting_cause { RateLimitingCause::RouterLoadShedding => { - ingest_results_metrics - .router_load_shedding - .inc_by(num_subrequests); + INGEST_RESULT_ROUTER_LOAD_SHEDDING.increment(num_subrequests); } RateLimitingCause::LoadShedding => { - ingest_results_metrics.load_shedding.inc_by(num_subrequests) + INGEST_RESULT_LOAD_SHEDDING.increment(num_subrequests) } RateLimitingCause::WalFull => { - ingest_results_metrics.wal_full.inc_by(num_subrequests); + INGEST_RESULT_WAL_FULL.increment(num_subrequests); } RateLimitingCause::CircuitBreaker => { - ingest_results_metrics - 
.circuit_breaker - .inc_by(num_subrequests); + INGEST_RESULT_CIRCUIT_BREAKER.increment(num_subrequests); } RateLimitingCause::ShardRateLimiting => { - ingest_results_metrics - .shard_rate_limited - .inc_by(num_subrequests); + INGEST_RESULT_SHARD_RATE_LIMITED.increment(num_subrequests); } RateLimitingCause::Unknown => { - ingest_results_metrics.unspecified.inc_by(num_subrequests); + INGEST_RESULT_UNSPECIFIED.increment(num_subrequests); } }, IngestV2Error::Timeout(_) => { - ingest_results_metrics - .router_timeout - .inc_by(num_subrequests); + INGEST_RESULT_ROUTER_TIMEOUT.increment(num_subrequests); } IngestV2Error::ShardNotFound { .. } => { - ingest_results_metrics - .shard_not_found - .inc_by(num_subrequests); + INGEST_RESULT_SHARD_NOT_FOUND.increment(num_subrequests); } IngestV2Error::Unavailable(_) => { - ingest_results_metrics.unavailable.inc_by(num_subrequests); + INGEST_RESULT_UNAVAILABLE.increment(num_subrequests); } IngestV2Error::Internal(_) => { - ingest_results_metrics.internal.inc_by(num_subrequests); + INGEST_RESULT_INTERNAL.increment(num_subrequests); } }, } @@ -578,8 +573,7 @@ impl IngestRouterService for IngestRouter { async fn ingest(&self, ingest_request: IngestRequestV2) -> IngestV2Result { let request_size_bytes = ingest_request.num_bytes(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.ingest_router); - gauge_guard.add(request_size_bytes as i64); + let _gauge_guard = GaugeGuard::new(&IN_FLIGHT_INGEST_ROUTER, request_size_bytes as f64); let num_subrequests = ingest_request.subrequests.len(); let _permit = self diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs index 69bcfae1619..accff060c3e 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs @@ -27,6 +27,7 @@ use quickwit_common::pretty::PrettyDisplay; use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings}; use 
quickwit_common::shared_consts::INGESTER_STATUS_KEY; use quickwit_doc_mapper::DocMapper; +use quickwit_metrics::{gauge, histogram, labels}; use quickwit_proto::control_plane::AdviseResetShardsResponse; use quickwit_proto::ingest::ingester::IngesterStatus; use quickwit_proto::ingest::{IngestV2Error, IngestV2Result, ShardIds, ShardState}; @@ -460,10 +461,12 @@ pub(super) fn warn_on_long_lock_hold( ) { let elapsed = acquired_at.elapsed(); - crate::ingest_v2::metrics::INGEST_V2_METRICS - .wal_lock_hold_duration_secs - .with_label_values([operation, lock_type]) - .observe(elapsed.as_secs_f64()); + let labels = labels!("operation" => operation, "type" => lock_type); + histogram!( + parent: crate::ingest_v2::metrics::WAL_LOCK_HOLD_DURATION_SECS, + labels: [labels], + ) + .record(elapsed.as_secs_f64()); if elapsed > Duration::from_secs(1) { quickwit_common::rate_limited_warn!( @@ -488,12 +491,13 @@ pub(super) async fn track_acquire_lock( where F: std::future::Future, { - let metrics = &crate::ingest_v2::metrics::INGEST_V2_METRICS; + let labels = labels!("operation" => operation, "type" => lock_type); - metrics - .wal_acquire_lock_requests_in_flight - .with_label_values([operation, lock_type]) - .inc(); + gauge!( + parent: crate::ingest_v2::metrics::WAL_ACQUIRE_LOCK_REQUESTS_IN_FLIGHT, + labels: [labels], + ) + .increment(1.0); let now = Instant::now(); let guard = acquire_future.await; @@ -510,14 +514,16 @@ where elapsed.pretty_display() ); } - metrics - .wal_acquire_lock_requests_in_flight - .with_label_values([operation, lock_type]) - .dec(); - metrics - .wal_acquire_lock_request_duration_secs - .with_label_values([operation, lock_type]) - .observe(elapsed.as_secs_f64()); + gauge!( + parent: crate::ingest_v2::metrics::WAL_ACQUIRE_LOCK_REQUESTS_IN_FLIGHT, + labels: [labels], + ) + .decrement(1.0); + histogram!( + parent: crate::ingest_v2::metrics::WAL_ACQUIRE_LOCK_REQUEST_DURATION_SECS, + labels: [labels], + ) + .record(elapsed.as_secs_f64()); (guard, acquired_at) } diff 
--git a/quickwit/quickwit-ingest/src/metrics.rs b/quickwit/quickwit-ingest/src/metrics.rs index 7b6888243e5..6c6946437ab 100644 --- a/quickwit/quickwit-ingest/src/metrics.rs +++ b/quickwit/quickwit-ingest/src/metrics.rs @@ -14,69 +14,47 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{IntCounter, IntGauge, new_counter, new_counter_vec, new_gauge}; - -pub struct IngestMetrics { - pub ingested_docs_bytes_valid: IntCounter, - pub ingested_docs_bytes_invalid: IntCounter, - pub ingested_docs_invalid: IntCounter, - pub ingested_docs_valid: IntCounter, - - pub replicated_num_bytes_total: IntCounter, - pub replicated_num_docs_total: IntCounter, - #[allow(dead_code)] // this really shouldn't be dead, it needs to be used somewhere - pub queue_count: IntGauge, -} - -impl Default for IngestMetrics { - fn default() -> Self { - let ingest_docs_bytes_total = new_counter_vec( - "docs_bytes_total", - "Total size of the docs ingested, measured in ingester's leader, after validation and \ - before persistence/replication", - "ingest", - &[], - ["validity"], - ); - let ingested_docs_bytes_valid = ingest_docs_bytes_total.with_label_values(["valid"]); - let ingested_docs_bytes_invalid = ingest_docs_bytes_total.with_label_values(["invalid"]); - - let ingest_docs_total = new_counter_vec( - "docs_total", - "Total number of the docs ingested, measured in ingester's leader, after validation \ - and before persistence/replication", - "ingest", - &[], - ["validity"], - ); - let ingested_docs_valid = ingest_docs_total.with_label_values(["valid"]); - let ingested_docs_invalid = ingest_docs_total.with_label_values(["invalid"]); - - IngestMetrics { - ingested_docs_bytes_valid, - ingested_docs_bytes_invalid, - ingested_docs_valid, - ingested_docs_invalid, - replicated_num_bytes_total: new_counter( - "replicated_num_bytes_total", - "Total size in bytes of the replicated docs.", - "ingest", - &[], - ), - replicated_num_docs_total: new_counter( - "replicated_num_docs_total", - "Total number 
of docs replicated.", - "ingest", - &[], - ), - queue_count: new_gauge( - "queue_count", - "Number of queues currently active", - "ingest", - &[], - ), - } - } -} - -pub static INGEST_METRICS: LazyLock = LazyLock::new(IngestMetrics::default); +use quickwit_metrics::{Counter, Gauge, LabelNames, counter, gauge, label_names}; + +pub(crate) const VALIDITY: LabelNames<1> = label_names!("validity"); + +pub(crate) static DOCS_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "docs_bytes_total", + description: "Total size of the docs ingested, measured in ingester's leader, after validation and before persistence/replication", + subsystem: "ingest", + ) +}); + +pub(crate) static DOCS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "docs_total", + description: "Total number of the docs ingested, measured in ingester's leader, after validation and before persistence/replication", + subsystem: "ingest", + ) +}); + +pub(crate) static REPLICATED_NUM_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "replicated_num_bytes_total", + description: "Total size in bytes of the replicated docs.", + subsystem: "ingest", + ) +}); + +pub(crate) static REPLICATED_NUM_DOCS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "replicated_num_docs_total", + description: "Total number of docs replicated.", + subsystem: "ingest", + ) +}); + +#[allow(dead_code)] // this really shouldn't be dead, it needs to be used somewhere +pub(crate) static QUEUE_COUNT: LazyLock = LazyLock::new(|| { + gauge!( + name: "queue_count", + description: "Number of queues currently active", + subsystem: "ingest", + ) +}); diff --git a/quickwit/quickwit-jaeger/Cargo.toml b/quickwit/quickwit-jaeger/Cargo.toml index 1ebebc8dbfb..a2d686c3f7a 100644 --- a/quickwit/quickwit-jaeger/Cargo.toml +++ b/quickwit/quickwit-jaeger/Cargo.toml @@ -26,6 +26,7 @@ tonic = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = 
true } quickwit-config = { workspace = true } quickwit-opentelemetry = { workspace = true } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-jaeger/src/lib.rs b/quickwit/quickwit-jaeger/src/lib.rs index 1b6dfc27d0c..5371fb1fd4f 100644 --- a/quickwit/quickwit-jaeger/src/lib.rs +++ b/quickwit/quickwit-jaeger/src/lib.rs @@ -22,6 +22,7 @@ use itertools::{Either, Itertools}; use prost::Message; use prost_types::{Duration as WellKnownDuration, Timestamp as WellKnownTimestamp}; use quickwit_config::JaegerConfig; +use quickwit_metrics::{counter, histogram, label_values}; use quickwit_opentelemetry::otlp::{ Event as QwEvent, Link as QwLink, OTEL_TRACES_INDEX_ID, Span as QwSpan, SpanFingerprint, SpanId, SpanKind as QwSpanKind, SpanStatus as QwSpanStatus, TraceId, @@ -51,7 +52,11 @@ use tonic::Status; use tracing::field::Empty; use tracing::{Span as RuntimeSpan, debug, error, instrument, warn}; -pub(crate) use crate::metrics::JAEGER_SERVICE_METRICS; +use crate::metrics::{ + FETCHED_SPANS_TOTAL, FETCHED_TRACES_TOTAL, OPERATION_INDEX_ERROR_LABELS, + OPERATION_INDEX_LABELS, REQUEST_DURATION_SECONDS, REQUEST_ERRORS_TOTAL, + TRANSFERRED_BYTES_TOTAL, +}; mod metrics; mod v1; @@ -415,43 +420,37 @@ impl JaegerService { current_span.record("num_spans", num_spans_total); current_span.record("num_bytes", num_bytes_total); - JAEGER_SERVICE_METRICS - .fetched_traces_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(num_traces); + let labels = label_values!( + OPERATION_INDEX_LABELS => operation_name, OTEL_TRACES_INDEX_ID + ); + counter!(parent: FETCHED_TRACES_TOTAL, labels: [labels]).increment(num_traces); let elapsed = request_start.elapsed().as_secs_f64(); - JAEGER_SERVICE_METRICS - .request_duration_seconds - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID, "false"]) - .observe(elapsed); + let err_labels = label_values!( + OPERATION_INDEX_ERROR_LABELS => + operation_name, OTEL_TRACES_INDEX_ID, "false" + ); + histogram!(parent: 
REQUEST_DURATION_SECONDS, labels: [err_labels]).record(elapsed); }); Ok(ReceiverStream::new(rx)) } } pub(crate) fn record_error(operation_name: &'static str, request_start: Instant) { - JAEGER_SERVICE_METRICS - .request_errors_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc(); + let labels = label_values!(OPERATION_INDEX_LABELS => operation_name, OTEL_TRACES_INDEX_ID); + counter!(parent: REQUEST_ERRORS_TOTAL, labels: [labels]).increment(1); let elapsed = request_start.elapsed().as_secs_f64(); - JAEGER_SERVICE_METRICS - .request_duration_seconds - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID, "true"]) - .observe(elapsed); + let err_labels = label_values!( + OPERATION_INDEX_ERROR_LABELS => operation_name, OTEL_TRACES_INDEX_ID, "true" + ); + histogram!(parent: REQUEST_DURATION_SECONDS, labels: [err_labels]).record(elapsed); } pub(crate) fn record_send(operation_name: &'static str, num_spans: usize, num_bytes: usize) { - JAEGER_SERVICE_METRICS - .fetched_spans_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(num_spans as u64); - JAEGER_SERVICE_METRICS - .transferred_bytes_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(num_bytes as u64); + let labels = label_values!(OPERATION_INDEX_LABELS => operation_name, OTEL_TRACES_INDEX_ID); + counter!(parent: FETCHED_SPANS_TOTAL, labels: [labels]).increment(num_spans as u64); + counter!(parent: TRANSFERRED_BYTES_TOTAL, labels: [labels]).increment(num_bytes as u64); } #[allow(deprecated)] diff --git a/quickwit/quickwit-jaeger/src/metrics.rs b/quickwit/quickwit-jaeger/src/metrics.rs index 3095b68b59f..b2b91ccfdd3 100644 --- a/quickwit/quickwit-jaeger/src/metrics.rs +++ b/quickwit/quickwit-jaeger/src/metrics.rs @@ -14,68 +14,58 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - HistogramVec, IntCounterVec, exponential_buckets, new_counter_vec, new_histogram_vec, -}; +use quickwit_common::metrics::exponential_buckets; +use 
quickwit_metrics::{Counter, Histogram, LabelNames, counter, histogram, label_names}; -pub struct JaegerServiceMetrics { - pub requests_total: IntCounterVec<2>, - pub request_errors_total: IntCounterVec<2>, - pub request_duration_seconds: HistogramVec<3>, - pub fetched_traces_total: IntCounterVec<2>, - pub fetched_spans_total: IntCounterVec<2>, - pub transferred_bytes_total: IntCounterVec<2>, -} +pub(crate) const OPERATION_INDEX_LABELS: LabelNames<2> = label_names!("operation", "index"); +pub(crate) const OPERATION_INDEX_ERROR_LABELS: LabelNames<3> = + label_names!("operation", "index", "error"); -impl Default for JaegerServiceMetrics { - fn default() -> Self { - Self { - requests_total: new_counter_vec( - "requests_total", - "Number of requests", - "jaeger", - &[], - ["operation", "index"], - ), - request_errors_total: new_counter_vec( - "request_errors_total", - "Number of failed requests", - "jaeger", - &[], - ["operation", "index"], - ), - request_duration_seconds: new_histogram_vec( - "request_duration_seconds", - "Duration of requests", - "jaeger", - &[], - ["operation", "index", "error"], - exponential_buckets(0.02, 2.0, 8).unwrap(), - ), - fetched_traces_total: new_counter_vec( - "fetched_traces_total", - "Number of traces retrieved from storage", - "jaeger", - &[], - ["operation", "index"], - ), - fetched_spans_total: new_counter_vec( - "fetched_spans_total", - "Number of spans retrieved from storage", - "jaeger", - &[], - ["operation", "index"], - ), - transferred_bytes_total: new_counter_vec( - "transferred_bytes_total", - "Number of bytes transferred", - "jaeger", - &[], - ["operation", "index"], - ), - } - } -} +pub(crate) static REQUESTS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "requests_total", + description: "Number of requests", + subsystem: "jaeger", + ) +}); -pub static JAEGER_SERVICE_METRICS: LazyLock = - LazyLock::new(JaegerServiceMetrics::default); +pub(crate) static REQUEST_ERRORS_TOTAL: LazyLock = LazyLock::new(|| { + 
counter!( + name: "request_errors_total", + description: "Number of failed requests", + subsystem: "jaeger", + ) +}); + +pub(crate) static REQUEST_DURATION_SECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "request_duration_seconds", + description: "Duration of requests", + subsystem: "jaeger", + buckets: exponential_buckets(0.02, 2.0, 8).unwrap(), + ) +}); + +pub(crate) static FETCHED_TRACES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "fetched_traces_total", + description: "Number of traces retrieved from storage", + subsystem: "jaeger", + ) +}); + +pub(crate) static FETCHED_SPANS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "fetched_spans_total", + description: "Number of spans retrieved from storage", + subsystem: "jaeger", + ) +}); + +pub(crate) static TRANSFERRED_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "transferred_bytes_total", + description: "Number of bytes transferred", + subsystem: "jaeger", + ) +}); diff --git a/quickwit/quickwit-jaeger/src/v1.rs b/quickwit/quickwit-jaeger/src/v1.rs index 11d6935db4e..b0f33d1893d 100644 --- a/quickwit/quickwit-jaeger/src/v1.rs +++ b/quickwit/quickwit-jaeger/src/v1.rs @@ -17,6 +17,7 @@ use std::time::Instant; use async_trait::async_trait; +use quickwit_metrics::{counter, histogram, label_values}; use quickwit_opentelemetry::otlp::{ OTEL_TRACES_INDEX_ID, extract_otel_traces_index_id_patterns_from_metadata, }; @@ -27,26 +28,32 @@ use quickwit_proto::jaeger::storage::v1::{ }; use tonic::{Request, Response, Status}; -use crate::metrics::JAEGER_SERVICE_METRICS; +use crate::metrics::{ + OPERATION_INDEX_ERROR_LABELS, OPERATION_INDEX_LABELS, REQUEST_DURATION_SECONDS, + REQUEST_ERRORS_TOTAL, REQUESTS_TOTAL, +}; use crate::{JaegerService, SpanStream}; macro_rules! 
metrics { - ($expr:expr, [$operation:ident, $($label:expr),*]) => { + ($expr:expr, [$operation:ident, $index:expr]) => { let start = std::time::Instant::now(); - let labels = [stringify!($operation), $($label,)*]; - JAEGER_SERVICE_METRICS.requests_total.with_label_values(labels).inc(); + let operation = stringify!($operation); + let index = $index; + let labels = label_values!(OPERATION_INDEX_LABELS => operation, index); + counter!(parent: REQUESTS_TOTAL, labels: [labels]).increment(1); let (res, is_error) = match $expr { - ok @ Ok(_) => { - (ok, "false") - }, + ok @ Ok(_) => (ok, "false"), err @ Err(_) => { - JAEGER_SERVICE_METRICS.request_errors_total.with_label_values(labels).inc(); + counter!(parent: REQUEST_ERRORS_TOTAL, labels: [labels]).increment(1); (err, "true") }, }; let elapsed = start.elapsed().as_secs_f64(); - let labels = [stringify!($operation), $($label,)* is_error]; - JAEGER_SERVICE_METRICS.request_duration_seconds.with_label_values(labels).observe(elapsed); + let err_labels = label_values!( + OPERATION_INDEX_ERROR_LABELS => operation, index, is_error + ); + histogram!(parent: REQUEST_DURATION_SECONDS, labels: [err_labels]) + .record(elapsed); return res.map(Response::new); }; diff --git a/quickwit/quickwit-jaeger/src/v2.rs b/quickwit/quickwit-jaeger/src/v2.rs index e355c18a8c3..826f82828a1 100644 --- a/quickwit/quickwit-jaeger/src/v2.rs +++ b/quickwit/quickwit-jaeger/src/v2.rs @@ -19,6 +19,7 @@ use std::time::Instant; use async_trait::async_trait; use prost_types::Timestamp as WellKnownTimestamp; +use quickwit_metrics::{counter, histogram, label_values}; use quickwit_opentelemetry::otlp::{ OTEL_TRACES_INDEX_ID, Span as QwSpan, TraceId, extract_otel_traces_index_id_patterns_from_metadata, @@ -50,29 +51,35 @@ use tonic::{Request, Response, Status}; use tracing::field::Empty; use tracing::{Span as RuntimeSpan, debug, error, instrument}; -use crate::metrics::JAEGER_SERVICE_METRICS; +use crate::metrics::{ + FETCHED_TRACES_TOTAL, 
OPERATION_INDEX_ERROR_LABELS, OPERATION_INDEX_LABELS, + REQUEST_DURATION_SECONDS, REQUEST_ERRORS_TOTAL, REQUESTS_TOTAL, +}; use crate::{ JaegerService, TimeIntervalSecs, TracesDataStream, get_operations_impl, get_services_impl, json_deserialize, record_error, record_send, to_duration_millis, }; macro_rules! metrics { - ($expr:expr, [$operation:ident, $($label:expr),*]) => { + ($expr:expr, [$operation:ident, $index:expr]) => { let start = std::time::Instant::now(); - let labels = [stringify!($operation), $($label,)*]; - JAEGER_SERVICE_METRICS.requests_total.with_label_values(labels).inc(); + let operation = stringify!($operation); + let index = $index; + let labels = label_values!(OPERATION_INDEX_LABELS => operation, index); + counter!(parent: REQUESTS_TOTAL, labels: [labels]).increment(1); let (res, is_error) = match $expr { - ok @ Ok(_) => { - (ok, "false") - }, + ok @ Ok(_) => (ok, "false"), err @ Err(_) => { - JAEGER_SERVICE_METRICS.request_errors_total.with_label_values(labels).inc(); + counter!(parent: REQUEST_ERRORS_TOTAL, labels: [labels]).increment(1); (err, "true") }, }; let elapsed = start.elapsed().as_secs_f64(); - let labels = [stringify!($operation), $($label,)* is_error]; - JAEGER_SERVICE_METRICS.request_duration_seconds.with_label_values(labels).observe(elapsed); + let err_labels = label_values!( + OPERATION_INDEX_ERROR_LABELS => operation, index, is_error + ); + histogram!(parent: REQUEST_DURATION_SECONDS, labels: [err_labels]) + .record(elapsed); return res.map(Response::new); }; @@ -426,16 +433,14 @@ async fn stream_otel_spans_impl( record_send(operation_name, num_spans, num_bytes); - JAEGER_SERVICE_METRICS - .fetched_traces_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(trace_ids.len() as u64); + let labels = label_values!(OPERATION_INDEX_LABELS => operation_name, OTEL_TRACES_INDEX_ID); + counter!(parent: FETCHED_TRACES_TOTAL, labels: [labels]).increment(trace_ids.len() as u64); let elapsed = 
request_start.elapsed().as_secs_f64(); - JAEGER_SERVICE_METRICS - .request_duration_seconds - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID, "false"]) - .observe(elapsed); + let err_labels = label_values!( + OPERATION_INDEX_ERROR_LABELS => operation_name, OTEL_TRACES_INDEX_ID, "false" + ); + histogram!(parent: REQUEST_DURATION_SECONDS, labels: [err_labels]).record(elapsed); Ok(qw_spans) } diff --git a/quickwit/quickwit-janitor/Cargo.toml b/quickwit/quickwit-janitor/Cargo.toml index ecb243a9990..e8063895f24 100644 --- a/quickwit/quickwit-janitor/Cargo.toml +++ b/quickwit/quickwit-janitor/Cargo.toml @@ -26,6 +26,7 @@ utoipa = { workspace = true } quickwit-actors = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } diff --git a/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs b/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs index 5e08b7773e6..bf42e949c36 100644 --- a/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs +++ b/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs @@ -26,6 +26,7 @@ use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_indexing::actors::{MergeSchedulerService, MergeSplitDownloader, schedule_merge}; use quickwit_indexing::merge_policy::MergeOperation; use quickwit_metastore::{ListSplitsResponseExt, Split, split_tag_filter, split_time_range_filter}; +use quickwit_metrics::gauge; use quickwit_proto::metastore::{ DeleteTask, LastDeleteOpstampRequest, ListDeleteTasksRequest, ListStaleSplitsRequest, MetastoreResult, MetastoreService, MetastoreServiceClient, UpdateSplitsDeleteOpstampRequest, @@ -37,7 +38,7 @@ use serde::Serialize; use tantivy::Inventory; use tracing::{debug, info}; -use crate::metrics::JANITOR_METRICS; +use crate::metrics::ONGOING_NUM_DELETE_OPERATIONS_TOTAL; const 
PLANNER_REFRESH_INTERVAL: Duration = Duration::from_secs(60); const NUM_STALE_SPLITS_TO_FETCH: usize = 1000; @@ -205,11 +206,13 @@ impl DeleteTaskPlanner { ) .await?; let index_label = - quickwit_common::metrics::index_label(self.index_uid.index_id.as_str()); - JANITOR_METRICS - .ongoing_num_delete_operations_total - .with_label_values([index_label]) - .set(self.ongoing_delete_operations_inventory.list().len() as i64); + quickwit_common::metrics::index_label(self.index_uid.index_id.as_str()) + .to_string(); + gauge!( + parent: ONGOING_NUM_DELETE_OPERATIONS_TOTAL, + "index" => index_label, + ) + .set(self.ongoing_delete_operations_inventory.list().len() as f64); } } diff --git a/quickwit/quickwit-janitor/src/actors/garbage_collector.rs b/quickwit/quickwit-janitor/src/actors/garbage_collector.rs index 21411bb0192..fd40d7e5423 100644 --- a/quickwit/quickwit-janitor/src/actors/garbage_collector.rs +++ b/quickwit/quickwit-janitor/src/actors/garbage_collector.rs @@ -23,6 +23,7 @@ use quickwit_common::is_parquet_pipeline_index; use quickwit_common::shared_consts::split_deletion_grace_period; use quickwit_index_management::{GcMetrics, run_garbage_collect, run_parquet_garbage_collect}; use quickwit_metastore::ListIndexesMetadataResponseExt; +use quickwit_metrics::{counter, label_names, label_values}; use quickwit_proto::metastore::{ ListIndexesMetadataRequest, MetastoreService, MetastoreServiceClient, }; @@ -31,7 +32,7 @@ use quickwit_storage::{Storage, StorageResolver}; use serde::Serialize; use tracing::{debug, error, info}; -use crate::metrics::JANITOR_METRICS; +use crate::metrics::{GC_DELETED_BYTES, GC_DELETED_SPLITS, GC_RUNS, GC_SECONDS_TOTAL}; const RUN_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes @@ -54,20 +55,22 @@ impl GcRunResult { } } -fn gc_metrics(split_type: &str) -> GcMetrics { +fn gc_metrics(split_type: &'static str) -> GcMetrics { GcMetrics { - deleted_splits: JANITOR_METRICS - .gc_deleted_splits - .with_label_values(["success", 
split_type]) - .clone(), - deleted_bytes: JANITOR_METRICS - .gc_deleted_bytes - .with_label_values([split_type]) - .clone(), - failed_splits: JANITOR_METRICS - .gc_deleted_splits - .with_label_values(["error", split_type]) - .clone(), + deleted_splits: counter!( + parent: GC_DELETED_SPLITS, + "result" => "success", + "split_type" => split_type, + ), + failed_splits: counter!( + parent: GC_DELETED_SPLITS, + "result" => "error", + "split_type" => split_type, + ), + deleted_bytes: counter!( + parent: GC_DELETED_BYTES, + "split_type" => split_type, + ), } } @@ -188,7 +191,12 @@ impl GarbageCollector { } // Run Tantivy GC + let labels_result = label_names!("result"); + let labels_split = label_names!("split_type"); + if !tantivy_storages.is_empty() { + let labels_split = label_values!(labels_split => "tantivy"); + let tantivy_start = Instant::now(); let gc_res = run_garbage_collect( tantivy_storages, @@ -202,18 +210,17 @@ impl GarbageCollector { .await; let tantivy_run_duration = tantivy_start.elapsed().as_secs(); - JANITOR_METRICS - .gc_seconds_total - .with_label_values(["tantivy"]) - .inc_by(tantivy_run_duration); + counter!(parent: GC_SECONDS_TOTAL, labels: [labels_split]) + .increment(tantivy_run_duration); let result = match gc_res { Ok(removal_info) => { self.counters.num_successful_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["success", "tantivy"]) - .inc(); + counter!( + parent: GC_RUNS, + labels: [labels_split, label_values!(labels_result => "success")], + ) + .increment(1); GcRunResult { num_deleted_splits: removal_info.removed_split_entries.len(), num_deleted_bytes: removal_info @@ -232,10 +239,11 @@ impl GarbageCollector { } Err(error) => { self.counters.num_failed_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["error", "tantivy"]) - .inc(); + counter!( + parent: GC_RUNS, + labels: [labels_split, label_values!(labels_result => "error")], + ) + .increment(1); error!(error=?error, "failed to run garbage collection"); 
GcRunResult::failed() } @@ -245,6 +253,8 @@ impl GarbageCollector { // Run Parquet GC if !parquet_storages.is_empty() { + let labels_split = label_values!(labels_split => "parquet"); + let parquet_start = Instant::now(); let gc_res = run_parquet_garbage_collect( parquet_storages, @@ -258,18 +268,17 @@ impl GarbageCollector { .await; let parquet_run_duration = parquet_start.elapsed().as_secs(); - JANITOR_METRICS - .gc_seconds_total - .with_label_values(["parquet"]) - .inc_by(parquet_run_duration); + counter!(parent: GC_SECONDS_TOTAL, labels: [labels_split]) + .increment(parquet_run_duration); let result = match gc_res { Ok(removal_info) => { self.counters.num_successful_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["success", "parquet"]) - .inc(); + counter!( + parent: GC_RUNS, + labels: [labels_split, label_values!(labels_result => "success")], + ) + .increment(1); GcRunResult { num_deleted_splits: removal_info.removed_split_count(), num_deleted_bytes: removal_info.removed_bytes() as usize, @@ -284,10 +293,11 @@ impl GarbageCollector { } Err(error) => { self.counters.num_failed_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["error", "parquet"]) - .inc(); + counter!( + parent: GC_RUNS, + labels: [labels_split, label_values!(labels_result => "error")], + ) + .increment(1); error!(error=?error, "failed to run parquet garbage collection"); GcRunResult::failed() } diff --git a/quickwit/quickwit-janitor/src/metrics.rs b/quickwit/quickwit-janitor/src/metrics.rs index aeea26c2674..46bc94bc012 100644 --- a/quickwit/quickwit-janitor/src/metrics.rs +++ b/quickwit/quickwit-janitor/src/metrics.rs @@ -14,58 +14,44 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{IntCounterVec, IntGaugeVec, new_counter_vec, new_gauge_vec}; +use quickwit_metrics::{Counter, Gauge, counter, gauge}; -pub struct JanitorMetrics { - pub ongoing_num_delete_operations_total: IntGaugeVec<1>, - pub gc_deleted_splits: IntCounterVec<2>, - pub gc_deleted_bytes: 
IntCounterVec<1>, - pub gc_runs: IntCounterVec<2>, - pub gc_seconds_total: IntCounterVec<1>, -} +pub(crate) static ONGOING_NUM_DELETE_OPERATIONS_TOTAL: LazyLock = LazyLock::new(|| { + gauge!( + name: "ongoing_num_delete_operations_total", + description: "Num of ongoing delete operations (per index).", + subsystem: "janitor", + ) +}); -impl Default for JanitorMetrics { - fn default() -> Self { - JanitorMetrics { - ongoing_num_delete_operations_total: new_gauge_vec( - "ongoing_num_delete_operations_total", - "Num of ongoing delete operations (per index).", - "quickwit_janitor", - &[], - ["index"], - ), - gc_deleted_splits: new_counter_vec( - "gc_deleted_splits_total", - "Total number of splits deleted by the garbage collector.", - "quickwit_janitor", - &[], - ["result", "split_type"], - ), - gc_deleted_bytes: new_counter_vec( - "gc_deleted_bytes_total", - "Total number of bytes deleted by the garbage collector.", - "quickwit_janitor", - &[], - ["split_type"], - ), - gc_runs: new_counter_vec( - "gc_runs_total", - "Total number of garbage collector execition.", - "quickwit_janitor", - &[], - ["result", "split_type"], - ), - gc_seconds_total: new_counter_vec( - "gc_seconds_total", - "Total time spent running the garbage collector", - "quickwit_janitor", - &[], - ["split_type"], - ), - } - } -} +pub(crate) static GC_DELETED_SPLITS: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_deleted_splits_total", + description: "Total number of splits deleted by the garbage collector.", + subsystem: "janitor", + ) +}); -/// `JANITOR_METRICS` exposes a bunch of related metrics through a prometheus -/// endpoint. 
-pub static JANITOR_METRICS: LazyLock = LazyLock::new(JanitorMetrics::default); +pub(crate) static GC_DELETED_BYTES: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_deleted_bytes_total", + description: "Total number of bytes deleted by the garbage collector.", + subsystem: "janitor", + ) +}); + +pub(crate) static GC_RUNS: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_runs_total", + description: "Total number of garbage collector execution.", + subsystem: "janitor", + ) +}); + +pub(crate) static GC_SECONDS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_seconds_total", + description: "Total time spent running the garbage collector", + subsystem: "janitor", + ) +}); diff --git a/quickwit/quickwit-lambda-client/Cargo.toml b/quickwit/quickwit-lambda-client/Cargo.toml index 9f8318e7c15..1d33060be2e 100644 --- a/quickwit/quickwit-lambda-client/Cargo.toml +++ b/quickwit/quickwit-lambda-client/Cargo.toml @@ -23,6 +23,7 @@ tokio = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-lambda-server = { workspace = true } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-lambda-client/src/invoker.rs b/quickwit/quickwit-lambda-client/src/invoker.rs index 69f7cb6519d..16e39ce6e1f 100644 --- a/quickwit/quickwit-lambda-client/src/invoker.rs +++ b/quickwit/quickwit-lambda-client/src/invoker.rs @@ -25,11 +25,15 @@ use base64::prelude::*; use prost::Message; use quickwit_common::retry::RetryParams; use quickwit_lambda_server::{LambdaSearchRequestPayload, LambdaSearchResponsePayload}; +use quickwit_metrics::{counter, histogram, labels}; use quickwit_proto::search::{LambdaSearchResponses, LambdaSingleSplitResult, LeafSearchRequest}; use quickwit_search::{LambdaLeafSearchInvoker, SearchError}; use tracing::{debug, info, instrument}; -use crate::metrics::LAMBDA_METRICS; +use crate::metrics::{ + 
LEAF_SEARCH_DURATION_SECONDS, LEAF_SEARCH_REQUEST_PAYLOAD_SIZE_BYTES, + LEAF_SEARCH_REQUESTS_TOTAL, LEAF_SEARCH_RESPONSE_PAYLOAD_SIZE_BYTES, +}; /// Upper bound on the retry-after hint we will honor from Lambda rate-limit responses. const MAX_RETRY_AFTER: Duration = Duration::from_secs(10); @@ -171,14 +175,9 @@ impl LambdaLeafSearchInvoker for AwsLambdaInvoker { let result = self.invoke_leaf_search_with_retry(request).await; let elapsed = start.elapsed().as_secs_f64(); let status = if result.is_ok() { "success" } else { "error" }; - LAMBDA_METRICS - .leaf_search_requests_total - .with_label_values([status]) - .inc(); - LAMBDA_METRICS - .leaf_search_duration_seconds - .with_label_values([status]) - .observe(elapsed); + let labels = labels!("status" => status); + counter!(parent: LEAF_SEARCH_REQUESTS_TOTAL, labels: [labels]).increment(1); + histogram!(parent: LEAF_SEARCH_DURATION_SECONDS, labels: [labels]).record(elapsed); result } } @@ -233,9 +232,7 @@ impl AwsLambdaInvoker { let payload_json = serde_json::to_vec(&payload) .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?; - LAMBDA_METRICS - .leaf_search_request_payload_size_bytes - .observe(payload_json.len() as f64); + LEAF_SEARCH_REQUEST_PAYLOAD_SIZE_BYTES.record(payload_json.len() as f64); debug!( payload_size = payload_json.len(), @@ -275,9 +272,7 @@ impl AwsLambdaInvoker { .payload() .ok_or_else(|| SearchError::Internal("no response payload from Lambda".into()))?; - LAMBDA_METRICS - .leaf_search_response_payload_size_bytes - .observe(response_payload.as_ref().len() as f64); + LEAF_SEARCH_RESPONSE_PAYLOAD_SIZE_BYTES.record(response_payload.as_ref().len() as f64); let lambda_response: LambdaSearchResponsePayload = serde_json::from_slice(response_payload.as_ref()) diff --git a/quickwit/quickwit-lambda-client/src/lib.rs b/quickwit/quickwit-lambda-client/src/lib.rs index aebf264df8c..70163f06e84 100644 --- a/quickwit/quickwit-lambda-client/src/lib.rs +++ 
b/quickwit/quickwit-lambda-client/src/lib.rs @@ -32,6 +32,5 @@ mod invoker; mod metrics; pub use deploy::try_get_or_deploy_invoker; -pub use metrics::LAMBDA_METRICS; // Re-export payload types from server crate for convenience pub use quickwit_lambda_server::{LambdaSearchRequestPayload, LambdaSearchResponsePayload}; diff --git a/quickwit/quickwit-lambda-client/src/metrics.rs b/quickwit/quickwit-lambda-client/src/metrics.rs index f136e4249c1..75e56577816 100644 --- a/quickwit/quickwit-lambda-client/src/metrics.rs +++ b/quickwit/quickwit-lambda-client/src/metrics.rs @@ -16,10 +16,8 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - Histogram, HistogramVec, IntCounterVec, exponential_buckets, new_counter_vec, new_histogram, - new_histogram_vec, -}; +use quickwit_common::metrics::exponential_buckets; +use quickwit_metrics::{Counter, Histogram, counter, histogram}; /// From 100ms to 73s seconds fn duration_buckets() -> Vec { @@ -31,45 +29,39 @@ fn payload_size_buckets() -> Vec { exponential_buckets(1024.0, 4.0, 8).unwrap() } -pub struct LambdaMetrics { - pub leaf_search_requests_total: IntCounterVec<1>, - pub leaf_search_duration_seconds: HistogramVec<1>, - pub leaf_search_request_payload_size_bytes: Histogram, - pub leaf_search_response_payload_size_bytes: Histogram, -} +pub(crate) static LEAF_SEARCH_REQUESTS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "leaf_search_requests_total", + description: "Total number of Lambda leaf search invocations.", + subsystem: "lambda", + ) +}); -impl Default for LambdaMetrics { - fn default() -> Self { - LambdaMetrics { - leaf_search_requests_total: new_counter_vec( - "leaf_search_requests_total", - "Total number of Lambda leaf search invocations.", - "lambda", - &[], - ["status"], - ), - leaf_search_duration_seconds: new_histogram_vec( - "leaf_search_duration_seconds", - "Duration of Lambda leaf search invocations in seconds.", - "lambda", - &[], - ["status"], - duration_buckets(), - ), - 
leaf_search_request_payload_size_bytes: new_histogram( - "leaf_search_request_payload_size_bytes", - "Size of the request payload sent to Lambda in bytes.", - "lambda", - payload_size_buckets(), - ), - leaf_search_response_payload_size_bytes: new_histogram( - "leaf_search_response_payload_size_bytes", - "Size of the response payload received from Lambda in bytes.", - "lambda", - payload_size_buckets(), - ), - } - } -} +pub(crate) static LEAF_SEARCH_DURATION_SECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "leaf_search_duration_seconds", + description: "Duration of Lambda leaf search invocations in seconds.", + subsystem: "lambda", + buckets: duration_buckets(), + ) +}); + +pub(crate) static LEAF_SEARCH_REQUEST_PAYLOAD_SIZE_BYTES: LazyLock = + LazyLock::new(|| { + histogram!( + name: "leaf_search_request_payload_size_bytes", + description: "Size of the request payload sent to Lambda in bytes.", + subsystem: "lambda", + buckets: payload_size_buckets(), + ) + }); -pub static LAMBDA_METRICS: LazyLock = LazyLock::new(LambdaMetrics::default); +pub(crate) static LEAF_SEARCH_RESPONSE_PAYLOAD_SIZE_BYTES: LazyLock = + LazyLock::new(|| { + histogram!( + name: "leaf_search_response_payload_size_bytes", + description: "Size of the response payload received from Lambda in bytes.", + subsystem: "lambda", + buckets: payload_size_buckets(), + ) + }); diff --git a/quickwit/quickwit-metastore/Cargo.toml b/quickwit/quickwit-metastore/Cargo.toml index 8a8a4755feb..ac3e8d5406f 100644 --- a/quickwit/quickwit-metastore/Cargo.toml +++ b/quickwit/quickwit-metastore/Cargo.toml @@ -40,6 +40,7 @@ uuid = { workspace = true } utoipa = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-doc-mapper = { workspace = true } quickwit-parquet-engine = { workspace = true } diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs 
b/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs index 59cea1db805..b45d005366a 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs @@ -14,39 +14,28 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{IntGauge, new_gauge}; +use quickwit_metrics::{Gauge, gauge}; -#[derive(Clone)] -pub(super) struct PostgresMetrics { - pub acquire_connections: IntGauge, - pub active_connections: IntGauge, - pub idle_connections: IntGauge, -} +pub(super) static ACQUIRE_CONNECTIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "acquire_connections", + description: "Number of connections being acquired.", + subsystem: "metastore", + ) +}); -impl Default for PostgresMetrics { - fn default() -> Self { - Self { - acquire_connections: new_gauge( - "acquire_connections", - "Number of connections being acquired.", - "metastore", - &[], - ), - active_connections: new_gauge( - "active_connections", - "Number of active (used + idle) connections.", - "metastore", - &[], - ), - idle_connections: new_gauge( - "idle_connections", - "Number of idle connections.", - "metastore", - &[], - ), - } - } -} +pub(super) static ACTIVE_CONNECTIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "active_connections", + description: "Number of active (used + idle) connections.", + subsystem: "metastore", + ) +}); -pub(super) static POSTGRES_METRICS: LazyLock = - LazyLock::new(PostgresMetrics::default); +pub(super) static IDLE_CONNECTIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "idle_connections", + description: "Number of idle connections.", + subsystem: "metastore", + ) +}); diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs b/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs index a4c1e790e5b..984a2724472 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs @@ -14,15 +14,13 
@@ use futures::future::BoxFuture; use futures::stream::BoxStream; -use quickwit_common::metrics::GaugeGuard; +use quickwit_metrics::GaugeGuard; use sqlx::pool::PoolConnection; use sqlx::pool::maybe::MaybePoolConnection; use sqlx::{ Acquire, Database, Describe, Either, Error, Execute, Executor, Pool, Postgres, Transaction, }; -use super::metrics::POSTGRES_METRICS; - #[derive(Debug)] pub(super) struct TrackedPool { inner_pool: Pool, @@ -50,16 +48,11 @@ impl<'a, DB: Database> Acquire<'a> for &TrackedPool { fn acquire(self) -> BoxFuture<'static, Result> { let acquire_conn_fut = self.inner_pool.acquire(); - POSTGRES_METRICS - .active_connections - .set(self.inner_pool.size() as i64); - POSTGRES_METRICS - .idle_connections - .set(self.inner_pool.num_idle() as i64); + super::metrics::ACTIVE_CONNECTIONS.set(self.inner_pool.size() as f64); + super::metrics::IDLE_CONNECTIONS.set(self.inner_pool.num_idle() as f64); Box::pin(async move { - let mut gauge_guard = GaugeGuard::from_gauge(&POSTGRES_METRICS.acquire_connections); - gauge_guard.add(1); + let _gauge_guard = GaugeGuard::new(&super::metrics::ACQUIRE_CONNECTIONS, 1.0); let conn = acquire_conn_fut.await?; Ok(conn) diff --git a/quickwit/quickwit-metrics-inventory/Cargo.toml b/quickwit/quickwit-metrics-inventory/Cargo.toml new file mode 100644 index 00000000000..504eaa8cec7 --- /dev/null +++ b/quickwit/quickwit-metrics-inventory/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "quickwit-metrics-inventory" +version = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +description = "Enumerates all quickwit-metrics declarations across workspace crates" + +[dependencies] +quickwit-metrics = { workspace = true } diff --git a/quickwit/quickwit-metrics-inventory/build.rs b/quickwit/quickwit-metrics-inventory/build.rs new file mode 100644 index 
00000000000..9659de6a894 --- /dev/null +++ b/quickwit/quickwit-metrics-inventory/build.rs @@ -0,0 +1,36 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +fn main() { + let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap(); + // Prevent the linker from stripping inventory-submitted statics that + // live in dependency rlibs. Without this, the linker sees that the + // inventory binary never references symbols from dependency crates + // and drops them — along with the MetricInfo entries registered via + // inventory::submit!(). + match target_os.as_str() { + "macos" => { + println!("cargo::rustc-link-arg-bins=-Wl,-all_load"); + } + "linux" => { + println!("cargo::rustc-link-arg-bins=-Wl,--whole-archive"); + } + other => { + eprintln!( + "cargo:warning=quickwit-metrics-inventory: no whole-archive linker flag for \ + target OS '{other}'; inventory discovery from dependency crates may not work" + ); + } + } +} diff --git a/quickwit/quickwit-metrics-inventory/scripts/run.sh b/quickwit/quickwit-metrics-inventory/scripts/run.sh new file mode 100755 index 00000000000..08f23e49cee --- /dev/null +++ b/quickwit/quickwit-metrics-inventory/scripts/run.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +# Discovers quickwit-metrics reverse dependencies, patches Cargo.toml and +# src/main.rs, builds and runs the inventory binary, then restores +# Cargo.toml, Cargo.lock, and src/main.rs via git. 
+# Files are always restored — even on Ctrl-C or failure. +# +# Usage: +# ./scripts/run.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +CRATE_DIR="$(dirname "$SCRIPT_DIR")" +WORKSPACE_DIR="$(dirname "$CRATE_DIR")" +CARGO_TOML="$CRATE_DIR/Cargo.toml" +CARGO_LOCK="$WORKSPACE_DIR/Cargo.lock" +MAIN_RS="$CRATE_DIR/src/main.rs" + +trap 'git restore "$CARGO_TOML" "$CARGO_LOCK" "$MAIN_RS"' EXIT + +# --format '{lib}' outputs the Rust crate name (underscores, no version/path). +# --prefix none removes tree decorators. tail skips the root (quickwit-metrics itself). +REVERSE_DEPS=$(cargo tree --manifest-path "$WORKSPACE_DIR/Cargo.toml" \ + --workspace --all-features --depth 1 --invert quickwit-metrics \ + --prefix none --format '{lib}' 2>/dev/null \ + | tail -n +2) + +for rust_name in $REVERSE_DEPS; do + pkg_name=$(echo "$rust_name" | tr '_' '-') + echo "$pkg_name = { workspace = true }" >> "$CARGO_TOML" + echo "extern crate $rust_name;" >> "$MAIN_RS" +done + +cargo run --manifest-path "$CARGO_TOML" diff --git a/quickwit/quickwit-metrics-inventory/src/main.rs b/quickwit/quickwit-metrics-inventory/src/main.rs new file mode 100644 index 00000000000..7322044afa7 --- /dev/null +++ b/quickwit/quickwit-metrics-inventory/src/main.rs @@ -0,0 +1,71 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Enumerates all registered `MetricInfo` entries via `inventory`. +//! +//! 
**Do not run this binary directly** — it will only see metrics from +//! crates listed in this crate's `Cargo.toml` dependencies. To discover +//! metrics from all workspace crates, use the wrapper script which patches +//! in reverse dependencies: +//! +//! ```sh +//! ./scripts/run.sh +//! ``` +//! +//! The script temporarily adds `extern crate` lines and `Cargo.toml` +//! dependencies for every crate that depends on `quickwit-metrics`, then +//! restores the files on exit. The `build.rs` ensures the linker pulls in +//! all inventory submissions even without explicit symbol references. + +use std::collections::BTreeMap; + +fn format_key(info: &quickwit_metrics::MetricInfo) -> String { + if info.static_labels.is_empty() { + info.key_name.to_string() + } else { + let pairs: Vec = info + .static_labels + .iter() + .map(|(k, v)| format!("{k}={v}")) + .collect(); + format!("{}{{{}}}", info.key_name, pairs.join(", ")) + } +} + +fn main() { + let mut by_module: BTreeMap<&str, BTreeMap> = + BTreeMap::new(); + + for info in quickwit_metrics::metrics_info() { + let module = info.metadata.module_path().unwrap_or(""); + by_module + .entry(module) + .or_default() + .insert(format_key(info), info); + } + + for (module, metrics) in &by_module { + let max_key_len = metrics.keys().map(|k| k.len()).max().unwrap_or(0); + println!("{module}"); + for (key, info) in metrics { + println!( + " {key:, + _desc: metrics::SharedString, + ) { + } + fn describe_gauge( + &self, + _key: metrics::KeyName, + _unit: Option, + _desc: metrics::SharedString, + ) { + } + fn describe_histogram( + &self, + _key: metrics::KeyName, + _unit: Option, + _desc: metrics::SharedString, + ) { + } + fn register_counter(&self, _key: &metrics::Key, _metadata: &metrics::Metadata<'_>) -> Counter { + Counter::noop() + } + fn register_gauge(&self, _key: &metrics::Key, _metadata: &metrics::Metadata<'_>) -> Gauge { + Gauge::noop() + } + fn register_histogram( + &self, + _key: &metrics::Key, + _metadata: 
&metrics::Metadata<'_>, + ) -> Histogram { + Histogram::noop() + } +} + +// --------------------------------------------------------------------------- +// Recorder setup — RECORDER env-var is mandatory. +// +// RECORDER=noop cargo bench --bench baseline # noop recorder +// RECORDER=prometheus cargo bench --bench baseline # prometheus +// --------------------------------------------------------------------------- + +static INSTALL_RECORDER: OnceLock<()> = OnceLock::new(); + +fn install_recorder() { + INSTALL_RECORDER.get_or_init(|| { + let recorder = std::env::var("RECORDER") + .expect("RECORDER env var is required (set to \"noop\" or \"prometheus\")"); + + match recorder.to_ascii_lowercase().as_str() { + "noop" => { + eprintln!("[bench] Using noop recorder"); + metrics::set_global_recorder(NoopRecorder) + .expect("failed to install noop recorder"); + } + "prometheus" => { + eprintln!("[bench] Using prometheus recorder"); + let _handle = metrics_exporter_prometheus::PrometheusBuilder::new() + .install_recorder() + .expect("failed to install prometheus recorder"); + } + other => { + panic!("unknown RECORDER value \"{other}\", expected \"noop\" or \"prometheus\"") + } + } + }); +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn make_labels(n: usize) -> Vec