# Copyright 2014-2016 OpenMarket Ltd
# Copyright 2018 New Vector Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import threading
import traceback
from typing import Dict, Mapping, Set, Tuple

from prometheus_client.core import Counter, Histogram

from synapse.logging.context import current_context
from synapse.metrics import LaterGauge

logger = logging.getLogger(__name__)


# total number of responses served, split by method/servlet/tag
response_count = Counter(
    "synapse_http_server_response_count", "", ["method", "servlet", "tag"]
)

requests_counter = Counter(
    "synapse_http_server_requests_received", "", ["method", "servlet"]
)

outgoing_responses_counter = Counter(
    "synapse_http_server_responses", "", ["method", "code"]
)

response_timer = Histogram(
    "synapse_http_server_response_time_seconds",
    "sec",
    ["method", "servlet", "tag", "code"],
)

response_ru_utime = Counter(
    "synapse_http_server_response_ru_utime_seconds", "sec", ["method", "servlet", "tag"]
)

response_ru_stime = Counter(
    "synapse_http_server_response_ru_stime_seconds", "sec", ["method", "servlet", "tag"]
)

response_db_txn_count = Counter(
    "synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"]
)

# seconds spent waiting for db txns, excluding scheduling time, when processing
# this request
response_db_txn_duration = Counter(
    "synapse_http_server_response_db_txn_duration_seconds",
    "",
    ["method", "servlet", "tag"],
)

# seconds spent waiting for a db connection, when processing this request
response_db_sched_duration = Counter(
    "synapse_http_server_response_db_sched_duration_seconds",
    "",
    ["method", "servlet", "tag"],
)

# size in bytes of the response written
response_size = Counter(
    "synapse_http_server_response_size", "", ["method", "servlet", "tag"]
)

# In flight metrics are incremented while the requests are in flight, rather
# than when the response was written.
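# They are kept up to date by update_metrics(), which the in-flight gauge's
# collection callback (_get_in_flight_counts, below) invokes on every
# outstanding request.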

in_flight_requests_ru_utime = Counter(
    "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"]
)

in_flight_requests_ru_stime = Counter(
    "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"]
)

in_flight_requests_db_txn_count = Counter(
    "synapse_http_server_in_flight_requests_db_txn_count", "", ["method", "servlet"]
)

# seconds spent waiting for db txns, excluding scheduling time, when processing
# this request
in_flight_requests_db_txn_duration = Counter(
    "synapse_http_server_in_flight_requests_db_txn_duration_seconds",
    "",
    ["method", "servlet"],
)

# seconds spent waiting for a db connection, when processing this request
in_flight_requests_db_sched_duration = Counter(
    "synapse_http_server_in_flight_requests_db_sched_duration_seconds",
    "",
    ["method", "servlet"],
)

_in_flight_requests: Set["RequestMetrics"] = set()

# Protects the _in_flight_requests set from concurrent access
_in_flight_requests_lock = threading.Lock()


def _get_in_flight_counts() -> Mapping[Tuple[str, ...], int]:
    """Returns a count of all in flight requests by (method, servlet)"""
    # Cast to a list to prevent it changing while the Prometheus
    # thread is collecting metrics
    with _in_flight_requests_lock:
        reqs = list(_in_flight_requests)

    for rm in reqs:
        rm.update_metrics()

    # Map from (method, name) -> int, the number of in flight requests of that
    # type. The key type is Tuple[str, str], but we leave the length unspecified
    # for compatibility with LaterGauge's annotations.
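    # For example (illustrative servlet names only):
    #     {("GET", "SyncServlet"): 2, ("PUT", "SendServlet"): 1}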
    counts: Dict[Tuple[str, ...], int] = {}
    for rm in reqs:
        key = (rm.method, rm.name)
        counts[key] = counts.get(key, 0) + 1

    return counts


LaterGauge(
    "synapse_http_server_in_flight_requests_count",
    "",
    ["method", "servlet"],
    _get_in_flight_counts,
)


class RequestMetrics:
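    """Tracks the Prometheus metrics for a single HTTP request.

    Rough lifecycle (a sketch; the literal values below are illustrative only):

        metrics = RequestMetrics()
        metrics.start(time_sec, name="SomeServlet", method="GET")
        # ... while the request is in flight, update_metrics() may be called
        # (e.g. by _get_in_flight_counts) to flush the in-flight counters ...
        metrics.stop(time_sec, response_code=200, sent_bytes=1024)
    """
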
    def start(self, time_sec: float, name: str, method: str) -> None:
        self.start_ts = time_sec
        self.start_context = current_context()
        self.name = name
        self.method = method

        if self.start_context:
            # _request_stats records resource usage that we have already added
            # to the "in flight" metrics.
            self._request_stats = self.start_context.get_resource_usage()
        else:
            logger.error(
                "Tried to start a RequestMetric from the sentinel context.\n%s",
                "".join(traceback.format_stack()),
            )

        with _in_flight_requests_lock:
            _in_flight_requests.add(self)

    def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None:
        with _in_flight_requests_lock:
            _in_flight_requests.discard(self)

        context = current_context()

        tag = ""
        if context:
            tag = context.tag

            if context != self.start_context:
                logger.error(
                    "Context has unexpectedly changed %r, %r",
                    context,
                    self.start_context,
                )
                return
        else:
            logger.error(
                "Trying to stop RequestMetrics in the sentinel context.\n%s",
                "".join(traceback.format_stack()),
            )
            return

        response_code = str(response_code)

        outgoing_responses_counter.labels(self.method, response_code).inc()

        response_count.labels(self.method, self.name, tag).inc()

        response_timer.labels(self.method, self.name, tag, response_code).observe(
            time_sec - self.start_ts
        )

        resource_usage = context.get_resource_usage()

        response_ru_utime.labels(self.method, self.name, tag).inc(
            resource_usage.ru_utime
        )
        response_ru_stime.labels(self.method, self.name, tag).inc(
            resource_usage.ru_stime
        )
        response_db_txn_count.labels(self.method, self.name, tag).inc(
            resource_usage.db_txn_count
        )
        response_db_txn_duration.labels(self.method, self.name, tag).inc(
            resource_usage.db_txn_duration_sec
        )
        response_db_sched_duration.labels(self.method, self.name, tag).inc(
            resource_usage.db_sched_duration_sec
        )

        response_size.labels(self.method, self.name, tag).inc(sent_bytes)

        # We always call this at the end to ensure that we update the metrics
        # regardless of whether there was a call to /metrics while the request
        # was in flight.
        self.update_metrics()

    def update_metrics(self) -> None:
        """Updates the in flight metrics with values from this request."""
        if not self.start_context:
            logger.error(
                "Tried to update a RequestMetric from the sentinel context.\n%s",
                "".join(traceback.format_stack()),
            )
            return
        new_stats = self.start_context.get_resource_usage()

        diff = new_stats - self._request_stats
        self._request_stats = new_stats

        # max() is used since rapid use of ru_stime/ru_utime can end up with the
        # count going backwards due to NTP, time smearing, fine-grained
        # correction, or floating points. Who knows, really?
        in_flight_requests_ru_utime.labels(self.method, self.name).inc(
            max(diff.ru_utime, 0)
        )
        in_flight_requests_ru_stime.labels(self.method, self.name).inc(
            max(diff.ru_stime, 0)
        )

        in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
            diff.db_txn_count
        )

        in_flight_requests_db_txn_duration.labels(self.method, self.name).inc(
            diff.db_txn_duration_sec
        )

        in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
            diff.db_sched_duration_sec
        )