Coverage for node / src / stigmem_node / observability / metrics.py: 62%

94 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-25 01:49 +0000

1"""Prometheus metrics for the Stigmem reference node — spec §22.4, Phase 13. 

2 

3If ``prometheus_client`` is installed, this module registers the full metric 

4set and exposes it via ``/metrics``. If the package is absent, all operations 

5are no-ops so the node runs without the dependency. 

6 

7Counters: 

8 stigmem_fact_write_total{principal, tenant} 

9 stigmem_fact_read_total{principal, tenant} 

10 stigmem_quota_breach_total{principal, tenant, dimension} 

11 stigmem_audit_event_total{event_type, tenant} 

12 stigmem_contradiction_total{tenant} 

13 stigmem_federation_ingress_total{peer_id, status} 

14 stigmem_federation_egress_total{peer_id, status} 

15 stigmem_peer_hlc_anomaly_total{peer_id, direction} 

16 stigmem_subscription_event_total{delivery_type, status} 

17 stigmem_plugin_hook_fire_total{hook} 

18 stigmem_plugin_handler_invocation_total{hook, plugin} 

19 stigmem_plugin_handler_error_total{hook, plugin, error_type} 

20 stigmem_plugin_voting_decision_total{hook, decision} 

21 stigmem_plugin_registration_total{outcome, reason} 

22 

23Histograms: 

24 stigmem_request_latency_seconds{route, method, status_code} 

25 stigmem_recall_ranker_duration_seconds{tenant} 

26 stigmem_capability_verify_duration_seconds{result} 

27 stigmem_plugin_hook_duration_seconds{hook} 

28 stigmem_plugin_handler_duration_seconds{hook, plugin} 

29 

30Gauges: 

31 stigmem_subscription_connections_active{tenant} 

32 stigmem_replication_lag_seconds{peer_id} 

33 stigmem_plugin_registered_count 

34 stigmem_plugin_handlers_per_hook{hook} 

35""" 

36 

37from __future__ import annotations 

38 

39import contextlib 

40import logging 

41import time 

42from collections.abc import Generator 

43from typing import TYPE_CHECKING, Any 

44 

# Bind the decorator to a bare module-level name so it can be applied as
# ``@contextmanager`` further down in this module.
contextmanager = contextlib.contextmanager

# Logger used for metric-recording diagnostics in this module.
logger = logging.getLogger("stigmem.metrics")

if TYPE_CHECKING:
    # Needed only for annotations here; the runtime import is done lazily
    # inside ``make_metrics_response``.
    from starlette.responses import Response

51 

try:
    import prometheus_client as _prom

    REGISTRY = _prom.REGISTRY
    Counter = _prom.Counter
    Gauge = _prom.Gauge
    Histogram = _prom.Histogram

    _ENABLED = True

    # ----- Counters -----
    FACT_WRITE = Counter(
        "stigmem_fact_write_total",
        "Total fact assertions",
        ["principal", "tenant"],
    )
    FACT_READ = Counter(
        "stigmem_fact_read_total",
        "Total fact read / recall queries",
        ["principal", "tenant"],
    )
    QUOTA_BREACH = Counter(
        "stigmem_quota_breach_total",
        "Total quota-exceeded (429) responses",
        ["principal", "tenant", "dimension"],
    )
    AUDIT_EVENT = Counter(
        "stigmem_audit_event_total",
        "Total audit events emitted (§22.3)",
        ["event_type", "tenant"],
    )
    CONTRADICTION = Counter(
        "stigmem_contradiction_total",
        "Total facts that triggered a contradiction on write",
        ["tenant"],
    )
    FEDERATION_INGRESS = Counter(
        "stigmem_federation_ingress_total",
        "Facts received via federation pull ingress",
        ["peer_id", "status"],
    )
    FEDERATION_EGRESS = Counter(
        "stigmem_federation_egress_total",
        "Facts served via federation pull egress",
        ["peer_id", "status"],
    )
    PEER_HLC_ANOMALY = Counter(
        "stigmem_peer_hlc_anomaly_total",
        "Rejected inbound federation facts with remote HLC skew outside configured bounds",
        ["peer_id", "direction"],
    )
    SUBSCRIPTION_EVENT = Counter(
        "stigmem_subscription_event_total",
        "Subscription delivery events",
        ["delivery_type", "status"],
    )
    PLUGIN_HOOK_FIRE = Counter(
        "stigmem_plugin_hook_fire_total",
        "Plugin hook dispatches",
        ["hook"],
    )
    PLUGIN_HANDLER_INVOCATION = Counter(
        "stigmem_plugin_handler_invocation_total",
        "Plugin handler invocations",
        ["hook", "plugin"],
    )
    PLUGIN_HANDLER_ERROR = Counter(
        "stigmem_plugin_handler_error_total",
        "Plugin handler errors",
        ["hook", "plugin", "error_type"],
    )
    PLUGIN_VOTING_DECISION = Counter(
        "stigmem_plugin_voting_decision_total",
        "Plugin voting hook decisions",
        ["hook", "decision"],
    )
    PLUGIN_REGISTRATION = Counter(
        "stigmem_plugin_registration_total",
        "Plugin registration attempts",
        ["outcome", "reason"],
    )

    # ----- Histograms -----
    REQUEST_LATENCY = Histogram(
        "stigmem_request_latency_seconds",
        "End-to-end HTTP request latency",
        ["route", "method", "status_code"],
        buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
    )
    RECALL_RANKER_DURATION = Histogram(
        "stigmem_recall_ranker_duration_seconds",
        "Time spent in the hybrid recall ranker",
        ["tenant"],
        buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
    )
    CAPABILITY_VERIFY_DURATION = Histogram(
        "stigmem_capability_verify_duration_seconds",
        "Time spent verifying capability tokens",
        ["result"],
        buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1),
    )
    PLUGIN_HOOK_DURATION = Histogram(
        "stigmem_plugin_hook_duration_seconds",
        "Plugin hook dispatch duration",
        ["hook"],
        buckets=(0.000005, 0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001),
    )
    PLUGIN_HANDLER_DURATION = Histogram(
        "stigmem_plugin_handler_duration_seconds",
        "Plugin handler invocation duration",
        ["hook", "plugin"],
        buckets=(0.000005, 0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001),
    )

    # ----- Gauges -----
    SUBSCRIPTION_CONNECTIONS = Gauge(
        "stigmem_subscription_connections_active",
        "Number of active (non-circuit-open) subscriptions",
        ["tenant"],
    )
    REPLICATION_LAG = Gauge(
        "stigmem_replication_lag_seconds",
        "Estimated replication lag to each federation peer (seconds)",
        ["peer_id"],
    )
    PLUGIN_REGISTERED_COUNT = Gauge(
        "stigmem_plugin_registered_count",
        "Number of registered plugins",
    )
    PLUGIN_HANDLERS_PER_HOOK = Gauge(
        "stigmem_plugin_handlers_per_hook",
        "Number of registered handlers per plugin hook",
        ["hook"],
    )

except ImportError:
    # ``prometheus_client`` is absent: bind every metric name to one shared
    # do-nothing object so call sites need no feature checks.
    _ENABLED = False

    class _Noop:
        """Do-nothing stand-in for a Counter, Histogram or Gauge.

        Every method accepts its arguments, discards them and returns
        ``None``; ``labels`` returns the same object so calls can be chained.
        """

        def labels(self, *_values: Any, **_named: Any) -> "_Noop":
            """Accept label values positionally or by keyword and return ``self``.

            Both calling styles are accepted so that a call site written
            against the real client does not raise ``TypeError`` when the
            dependency is missing.
            """
            return self

        def inc(self, amount: float = 1) -> None:
            """Ignore an increment."""

        def observe(self, amount: float) -> None:
            """Ignore an observation."""

        def set(self, value: float) -> None:
            """Ignore an assignment of the current value."""

        def inc_gauge(self, amount: float = 1) -> None:
            """Ignore an increment; retained for existing callers."""

        def dec(self, amount: float = 1) -> None:
            """Ignore a decrement."""

    _noop = _Noop()

    # Counters
    FACT_WRITE = _noop
    FACT_READ = _noop
    QUOTA_BREACH = _noop
    AUDIT_EVENT = _noop
    CONTRADICTION = _noop
    FEDERATION_INGRESS = _noop
    FEDERATION_EGRESS = _noop
    PEER_HLC_ANOMALY = _noop
    SUBSCRIPTION_EVENT = _noop
    PLUGIN_HOOK_FIRE = _noop
    PLUGIN_HANDLER_INVOCATION = _noop
    PLUGIN_HANDLER_ERROR = _noop
    PLUGIN_VOTING_DECISION = _noop
    PLUGIN_REGISTRATION = _noop

    # Histograms
    REQUEST_LATENCY = _noop
    RECALL_RANKER_DURATION = _noop
    CAPABILITY_VERIFY_DURATION = _noop
    PLUGIN_HOOK_DURATION = _noop
    PLUGIN_HANDLER_DURATION = _noop

    # Gauges
    SUBSCRIPTION_CONNECTIONS = _noop
    REPLICATION_LAG = _noop
    PLUGIN_REGISTERED_COUNT = _noop
    PLUGIN_HANDLERS_PER_HOOK = _noop

239 

240 

def metrics_enabled() -> bool:
    """Return the module-level flag recording whether ``prometheus_client`` imported."""
    return bool(_ENABLED)

243 

244 

def make_metrics_response() -> Response | None:
    """Build the HTTP response carrying the Prometheus text exposition.

    Returns:
        A Starlette ``Response`` whose body is ``generate_latest()`` and whose
        media type is ``CONTENT_TYPE_LATEST``, or ``None`` when metrics are
        disabled because ``prometheus_client`` could not be imported.
    """
    if _ENABLED:
        # Imported lazily so Starlette is only required when a response is built.
        from starlette.responses import Response

        return Response(
            content=_prom.generate_latest(),
            media_type=_prom.CONTENT_TYPE_LATEST,
        )
    return None

255 

256 

@contextmanager
def observe_duration(histogram: Any, labels: dict[str, str]) -> Generator[None, None, None]:
    """Time the wrapped block and record the elapsed seconds on ``histogram``.

    The measurement is taken with ``time.perf_counter`` and recorded when the
    block exits, whether it returned normally or raised.

    Args:
        histogram: Object exposing ``labels(**kwargs)`` whose result exposes
            ``observe(seconds)``.
        labels: Keyword arguments forwarded to ``histogram.labels``.

    Yields:
        ``None``; the context manager exposes no value to the ``with`` body.
    """
    began_at = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - began_at
        # Recording is best-effort: a metrics failure is logged at debug level
        # and never propagates into the code being timed.
        try:
            series = histogram.labels(**labels)
            series.observe(duration)
        except Exception as exc:
            logger.debug("failed to observe duration metric: %s", exc)