Coverage for node / src / stigmem_node / observability / metrics.py: 62%
94 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-25 01:49 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-25 01:49 +0000
1"""Prometheus metrics for the Stigmem reference node — spec §22.4, Phase 13.
3If ``prometheus_client`` is installed, this module registers the full metric
4set and exposes it via ``/metrics``. If the package is absent, all operations
5are no-ops so the node runs without the dependency.
7Counters:
8 stigmem_fact_write_total{principal, tenant}
9 stigmem_fact_read_total{principal, tenant}
10 stigmem_quota_breach_total{principal, tenant, dimension}
11 stigmem_audit_event_total{event_type, tenant}
12 stigmem_contradiction_total{tenant}
13 stigmem_federation_ingress_total{peer_id, status}
14 stigmem_federation_egress_total{peer_id, status}
15 stigmem_peer_hlc_anomaly_total{peer_id, direction}
16 stigmem_subscription_event_total{delivery_type, status}
17 stigmem_plugin_hook_fire_total{hook}
18 stigmem_plugin_handler_invocation_total{hook, plugin}
19 stigmem_plugin_handler_error_total{hook, plugin, error_type}
20 stigmem_plugin_voting_decision_total{hook, decision}
21 stigmem_plugin_registration_total{outcome, reason}
23Histograms:
24 stigmem_request_latency_seconds{route, method, status_code}
25 stigmem_recall_ranker_duration_seconds{tenant}
26 stigmem_capability_verify_duration_seconds{result}
27 stigmem_plugin_hook_duration_seconds{hook}
28 stigmem_plugin_handler_duration_seconds{hook, plugin}
30Gauges:
31 stigmem_subscription_connections_active{tenant}
32 stigmem_replication_lag_seconds{peer_id}
33 stigmem_plugin_registered_count
34 stigmem_plugin_handlers_per_hook{hook}
35"""
37from __future__ import annotations
39import contextlib
40import logging
41import time
42from collections.abc import Generator
43from typing import TYPE_CHECKING, Any
45contextmanager = contextlib.contextmanager
47logger = logging.getLogger("stigmem.metrics")
49if TYPE_CHECKING:
50 from starlette.responses import Response
# Register the real Prometheus metrics when ``prometheus_client`` is importable;
# otherwise bind every metric name to a shared inert object so call sites do
# not need to check whether metrics are enabled.
try:
    import prometheus_client as _prom

    REGISTRY = _prom.REGISTRY
    Counter = _prom.Counter
    Gauge = _prom.Gauge
    Histogram = _prom.Histogram

    _ENABLED = True

    # ----- Counters -----
    FACT_WRITE = Counter(
        "stigmem_fact_write_total",
        "Total fact assertions",
        ["principal", "tenant"],
    )
    FACT_READ = Counter(
        "stigmem_fact_read_total",
        "Total fact read / recall queries",
        ["principal", "tenant"],
    )
    QUOTA_BREACH = Counter(
        "stigmem_quota_breach_total",
        "Total quota-exceeded (429) responses",
        ["principal", "tenant", "dimension"],
    )
    AUDIT_EVENT = Counter(
        "stigmem_audit_event_total",
        "Total audit events emitted (§22.3)",
        ["event_type", "tenant"],
    )
    CONTRADICTION = Counter(
        "stigmem_contradiction_total",
        "Total facts that triggered a contradiction on write",
        ["tenant"],
    )
    FEDERATION_INGRESS = Counter(
        "stigmem_federation_ingress_total",
        "Facts received via federation pull ingress",
        ["peer_id", "status"],
    )
    FEDERATION_EGRESS = Counter(
        "stigmem_federation_egress_total",
        "Facts served via federation pull egress",
        ["peer_id", "status"],
    )
    PEER_HLC_ANOMALY = Counter(
        "stigmem_peer_hlc_anomaly_total",
        "Rejected inbound federation facts with remote HLC skew outside configured bounds",
        ["peer_id", "direction"],
    )
    SUBSCRIPTION_EVENT = Counter(
        "stigmem_subscription_event_total",
        "Subscription delivery events",
        ["delivery_type", "status"],
    )
    PLUGIN_HOOK_FIRE = Counter(
        "stigmem_plugin_hook_fire_total",
        "Plugin hook dispatches",
        ["hook"],
    )
    PLUGIN_HANDLER_INVOCATION = Counter(
        "stigmem_plugin_handler_invocation_total",
        "Plugin handler invocations",
        ["hook", "plugin"],
    )
    PLUGIN_HANDLER_ERROR = Counter(
        "stigmem_plugin_handler_error_total",
        "Plugin handler errors",
        ["hook", "plugin", "error_type"],
    )
    PLUGIN_VOTING_DECISION = Counter(
        "stigmem_plugin_voting_decision_total",
        "Plugin voting hook decisions",
        ["hook", "decision"],
    )
    PLUGIN_REGISTRATION = Counter(
        "stigmem_plugin_registration_total",
        "Plugin registration attempts",
        ["outcome", "reason"],
    )

    # ----- Histograms -----
    REQUEST_LATENCY = Histogram(
        "stigmem_request_latency_seconds",
        "End-to-end HTTP request latency",
        ["route", "method", "status_code"],
        buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
    )
    RECALL_RANKER_DURATION = Histogram(
        "stigmem_recall_ranker_duration_seconds",
        "Time spent in the hybrid recall ranker",
        ["tenant"],
        buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
    )
    CAPABILITY_VERIFY_DURATION = Histogram(
        "stigmem_capability_verify_duration_seconds",
        "Time spent verifying capability tokens",
        ["result"],
        buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1),
    )
    PLUGIN_HOOK_DURATION = Histogram(
        "stigmem_plugin_hook_duration_seconds",
        "Plugin hook dispatch duration",
        ["hook"],
        buckets=(0.000005, 0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001),
    )
    PLUGIN_HANDLER_DURATION = Histogram(
        "stigmem_plugin_handler_duration_seconds",
        "Plugin handler invocation duration",
        ["hook", "plugin"],
        buckets=(0.000005, 0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001),
    )

    # ----- Gauges -----
    SUBSCRIPTION_CONNECTIONS = Gauge(
        "stigmem_subscription_connections_active",
        "Number of active (non-circuit-open) subscriptions",
        ["tenant"],
    )
    REPLICATION_LAG = Gauge(
        "stigmem_replication_lag_seconds",
        "Estimated replication lag to each federation peer (seconds)",
        ["peer_id"],
    )
    PLUGIN_REGISTERED_COUNT = Gauge(
        "stigmem_plugin_registered_count",
        "Number of registered plugins",
    )
    PLUGIN_HANDLERS_PER_HOOK = Gauge(
        "stigmem_plugin_handlers_per_hook",
        "Number of registered handlers per plugin hook",
        ["hook"],
    )

except ImportError:
    _ENABLED = False

    class _Noop:
        """Inert stand-in bound to every metric name when ``prometheus_client`` is absent.

        ``labels`` returns the same object so chained calls such as
        ``METRIC.labels(...).inc()`` keep working; every recording method
        accepts its argument and does nothing.
        """

        def labels(self, *_label_values: Any, **_label_kwargs: Any) -> _Noop:
            """Accept label values positionally or by keyword and return ``self``.

            ``prometheus_client`` metrics accept both calling styles, so the
            fallback must too; otherwise a call site using positional label
            values would raise ``TypeError`` only when the dependency is missing.
            """
            return self

        def inc(self, amount: float = 1) -> None:
            pass

        def observe(self, amount: float) -> None:
            pass

        def set(self, value: float) -> None:
            pass

        def inc_gauge(self, amount: float = 1) -> None:
            pass

        def dec(self, amount: float = 1) -> None:
            pass

    # One shared instance is enough: the object holds no state.
    _noop = _Noop()

    # Counters
    FACT_WRITE = _noop
    FACT_READ = _noop
    QUOTA_BREACH = _noop
    AUDIT_EVENT = _noop
    CONTRADICTION = _noop
    FEDERATION_INGRESS = _noop
    FEDERATION_EGRESS = _noop
    PEER_HLC_ANOMALY = _noop
    SUBSCRIPTION_EVENT = _noop
    PLUGIN_HOOK_FIRE = _noop
    PLUGIN_HANDLER_INVOCATION = _noop
    PLUGIN_HANDLER_ERROR = _noop
    PLUGIN_VOTING_DECISION = _noop
    PLUGIN_REGISTRATION = _noop

    # Histograms
    REQUEST_LATENCY = _noop
    RECALL_RANKER_DURATION = _noop
    CAPABILITY_VERIFY_DURATION = _noop
    PLUGIN_HOOK_DURATION = _noop
    PLUGIN_HANDLER_DURATION = _noop

    # Gauges
    SUBSCRIPTION_CONNECTIONS = _noop
    REPLICATION_LAG = _noop
    PLUGIN_REGISTERED_COUNT = _noop
    PLUGIN_HANDLERS_PER_HOOK = _noop
def metrics_enabled() -> bool:
    """Report whether ``prometheus_client`` was importable when this module loaded."""
    enabled: bool = _ENABLED
    return enabled
def make_metrics_response() -> Response | None:
    """Build the HTTP response carrying the Prometheus text exposition.

    Returns:
        ``None`` when metrics are disabled; otherwise a Starlette ``Response``
        whose body is ``prometheus_client.generate_latest()`` and whose media
        type is ``prometheus_client.CONTENT_TYPE_LATEST``.
    """
    if _ENABLED:
        # Deferred import: at module level ``Response`` is only imported
        # under ``TYPE_CHECKING``.
        from starlette.responses import Response

        payload = _prom.generate_latest()
        return Response(content=payload, media_type=_prom.CONTENT_TYPE_LATEST)
    return None
@contextmanager
def observe_duration(histogram: Any, labels: dict[str, str]) -> Generator[None, None, None]:
    """Time the wrapped block and record the elapsed seconds on ``histogram``.

    The observation is made in a ``finally`` clause, so it is recorded whether
    the block returns normally or raises. Any exception raised while recording
    is logged at debug level and swallowed; exceptions from the wrapped block
    itself still propagate.

    Args:
        histogram: Object exposing ``labels(**labels)`` whose result has ``observe``.
        labels: Label names mapped to values, passed to ``labels`` as keywords.
    """
    began_at = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - began_at
        try:
            child = histogram.labels(**labels)
            child.observe(duration)
        except Exception as exc:
            # Best effort: a metrics failure must not break the timed operation.
            logger.debug("failed to observe duration metric: %s", exc)