# Copyright 2014-2016 OpenMarket Ltd
# Copyright 2018 New Vector Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import threading
import traceback
from typing import Dict, Mapping, Set, Tuple

from prometheus_client.core import Counter, Histogram

from synapse.logging.context import current_context
from synapse.metrics import LaterGauge

logger = logging.getLogger(__name__)


# total number of responses served, split by method/servlet/tag
response_count = Counter(
    "synapse_http_server_response_count", "", ["method", "servlet", "tag"]
)

requests_counter = Counter(
    "synapse_http_server_requests_received", "", ["method", "servlet"]
)

outgoing_responses_counter = Counter(
    "synapse_http_server_responses", "", ["method", "code"]
)

response_timer = Histogram(
    "synapse_http_server_response_time_seconds",
    "sec",
    ["method", "servlet", "tag", "code"],
)

response_ru_utime = Counter(
    "synapse_http_server_response_ru_utime_seconds", "sec", ["method", "servlet", "tag"]
)

response_ru_stime = Counter(
    "synapse_http_server_response_ru_stime_seconds", "sec", ["method", "servlet", "tag"]
)

response_db_txn_count = Counter(
    "synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"]
)

# seconds spent waiting for db txns, excluding scheduling time, when processing
# this request
response_db_txn_duration = Counter(
    "synapse_http_server_response_db_txn_duration_seconds",
    "",
    ["method", "servlet", "tag"],
)

# seconds spent waiting for a db connection, when processing this request
response_db_sched_duration = Counter(
    "synapse_http_server_response_db_sched_duration_seconds",
    "",
    ["method", "servlet", "tag"],
)

# size in bytes of the response written
response_size = Counter(
    "synapse_http_server_response_size", "", ["method", "servlet", "tag"]
)

# In flight metrics are incremented while the requests are in flight, rather
# than when the response was written.

in_flight_requests_ru_utime = Counter(
    "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"]
)

in_flight_requests_ru_stime = Counter(
    "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"]
)

in_flight_requests_db_txn_count = Counter(
    "synapse_http_server_in_flight_requests_db_txn_count", "", ["method", "servlet"]
)

# seconds spent waiting for db txns, excluding scheduling time, when processing
# this request
in_flight_requests_db_txn_duration = Counter(
    "synapse_http_server_in_flight_requests_db_txn_duration_seconds",
    "",
    ["method", "servlet"],
)

# seconds spent waiting for a db connection, when processing this request
in_flight_requests_db_sched_duration = Counter(
    "synapse_http_server_in_flight_requests_db_sched_duration_seconds",
    "",
    ["method", "servlet"],
)

# The set of all requests which are currently in flight.
_in_flight_requests: Set["RequestMetrics"] = set()

# Protects the _in_flight_requests set from concurrent access
_in_flight_requests_lock = threading.Lock()


def _get_in_flight_counts() -> Mapping[Tuple[str, ...], int]:
    """Returns a count of all in flight requests by (method, server_name)"""
    # Cast to a list to prevent it changing while the Prometheus
    # thread is collecting metrics
    with _in_flight_requests_lock:
        reqs = list(_in_flight_requests)

    # Flush the accumulated resource usage of each request into the
    # in-flight counters before reporting.
    for rm in reqs:
        rm.update_metrics()

    # Map from (method, name) -> int, the number of in flight requests of that
    # type. The key type is Tuple[str, str], but we leave the length unspecified
    # for compatibility with LaterGauge's annotations.
    counts: Dict[Tuple[str, ...], int] = {}
    for rm in reqs:
        key = (rm.method, rm.name)
        counts[key] = counts.get(key, 0) + 1

    return counts


LaterGauge(
    "synapse_http_server_in_flight_requests_count",
    "",
    ["method", "servlet"],
    _get_in_flight_counts,
)


class RequestMetrics:
    """Tracks the lifetime and resource usage of a single HTTP request.

    `start` must be called when the request begins and `stop` when the
    response has been written; between the two, `update_metrics` may be
    called (e.g. from the /metrics collector) to flush incremental resource
    usage into the in-flight counters.
    """

    def start(self, time_sec: float, name: str, method: str) -> None:
        """Record the start of a request.

        Args:
            time_sec: timestamp (in seconds) at which the request started.
            name: the name of the servlet handling the request.
            method: the HTTP method of the request.
        """
        self.start_ts = time_sec
        self.start_context = current_context()
        self.name = name
        self.method = method

        if self.start_context:
            # _request_stats records resource usage that we have already added
            # to the "in flight" metrics.
            self._request_stats = self.start_context.get_resource_usage()
        else:
            logger.error(
                "Tried to start a RequestMetric from the sentinel context.\n%s",
                "".join(traceback.format_stack()),
            )

        with _in_flight_requests_lock:
            _in_flight_requests.add(self)

    def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None:
        """Record the completion of a request and update the response metrics.

        Args:
            time_sec: timestamp (in seconds) at which the response finished.
            response_code: the HTTP status code of the response.
            sent_bytes: the number of bytes written as the response body.
        """
        with _in_flight_requests_lock:
            _in_flight_requests.discard(self)

        context = current_context()

        tag = ""
        if context:
            tag = context.tag

            if context != self.start_context:
                # We can't meaningfully attribute the resource usage if the
                # logcontext has changed under us, so bail out.
                logger.error(
                    "Context has unexpectedly changed %r, %r",
                    context,
                    self.start_context,
                )
                return
        else:
            logger.error(
                "Trying to stop RequestMetrics in the sentinel context.\n%s",
                "".join(traceback.format_stack()),
            )
            return

        response_code = str(response_code)

        outgoing_responses_counter.labels(self.method, response_code).inc()

        response_count.labels(self.method, self.name, tag).inc()

        response_timer.labels(self.method, self.name, tag, response_code).observe(
            time_sec - self.start_ts
        )

        resource_usage = context.get_resource_usage()

        response_ru_utime.labels(self.method, self.name, tag).inc(
            resource_usage.ru_utime
        )
        response_ru_stime.labels(self.method, self.name, tag).inc(
            resource_usage.ru_stime
        )
        response_db_txn_count.labels(self.method, self.name, tag).inc(
            resource_usage.db_txn_count
        )
        response_db_txn_duration.labels(self.method, self.name, tag).inc(
            resource_usage.db_txn_duration_sec
        )
        response_db_sched_duration.labels(self.method, self.name, tag).inc(
            resource_usage.db_sched_duration_sec
        )

        response_size.labels(self.method, self.name, tag).inc(sent_bytes)

        # We always call this at the end to ensure that we update the metrics
        # regardless of whether there was a call to /metrics while the request
        # was in flight.
        self.update_metrics()

    def update_metrics(self) -> None:
        """Updates the in flight metrics with values from this request."""
        if not self.start_context:
            logger.error(
                "Tried to update a RequestMetric from the sentinel context.\n%s",
                "".join(traceback.format_stack()),
            )
            return
        new_stats = self.start_context.get_resource_usage()

        # Only report the resource usage accrued since the last flush.
        diff = new_stats - self._request_stats
        self._request_stats = new_stats

        # max() is used since rapid use of ru_stime/ru_utime can end up with the
        # count going backwards due to NTP, time smearing, fine-grained
        # correction, or floating points. Who knows, really?
        in_flight_requests_ru_utime.labels(self.method, self.name).inc(
            max(diff.ru_utime, 0)
        )
        in_flight_requests_ru_stime.labels(self.method, self.name).inc(
            max(diff.ru_stime, 0)
        )

        in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
            diff.db_txn_count
        )

        in_flight_requests_db_txn_duration.labels(self.method, self.name).inc(
            diff.db_txn_duration_sec
        )

        in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
            diff.db_sched_duration_sec
        )