1 /*
2 * Copyright (c) 2015 DeNA Co., Ltd., Kazuho Oku, Justin Zhu
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to
6 * deal in the Software without restriction, including without limitation the
7 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 * sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <limits.h>
26 #include <netdb.h>
27 #include <netinet/in.h>
28 #include <netinet/tcp.h>
29 #include <string.h>
30 #include <sys/syscall.h>
31 #include <sys/un.h>
32 #include <unistd.h>
33 #include <openssl/err.h>
34 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
35 #include <sys/ioctl.h>
36 #endif
37 #include "picotls.h"
38 #include "quicly.h"
39 #include "h2o/socket.h"
40 #include "h2o/multithread.h"
41 #include "../probes_.h"
42
43 #if defined(__APPLE__) && defined(__clang__)
44 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
45 #endif
46
47 #ifndef IOV_MAX
48 #define IOV_MAX UIO_MAXIOV
49 #endif
50
51 /* kernel-headers bundled with Ubuntu 14.04 does not have the constant defined in netinet/tcp.h */
52 #if defined(__linux__) && !defined(TCP_NOTSENT_LOWAT)
53 #define TCP_NOTSENT_LOWAT 25
54 #endif
55
56 #if H2O_USE_DTRACE && defined(__linux__)
57 #define H2O_USE_EBPF_MAP 1
58 #endif
59
60 #define OPENSSL_HOSTNAME_VALIDATION_LINKAGE static
61 #pragma GCC diagnostic push
62 #pragma GCC diagnostic ignored "-Wpragmas"
63 #pragma GCC diagnostic ignored "-Wshorten-64-to-32"
64 #include "../../deps/ssl-conservatory/openssl/openssl_hostname_validation.c"
65 #pragma GCC diagnostic pop
66
/**
 * Fires the probe named `SOCKET_<label>` unless tracing is disabled for the socket. The `sock` argument is evaluated exactly once
 * (captured into `_sock`), so expressions with side effects are safe; the original expanded `sock` a second time inside
 * `H2O_PROBE`, re-evaluating the argument.
 */
#define SOCKET_PROBE(label, sock, ...)                                                                                             \
    do {                                                                                                                           \
        h2o_socket_t *_sock = (sock);                                                                                              \
        if (!_sock->_skip_tracing)                                                                                                 \
            H2O_PROBE(SOCKET_##label, _sock, __VA_ARGS__);                                                                         \
    } while (0)
73
/**
 * TLS state attached to an `h2o_socket_t`. Exactly one of `ossl` (OpenSSL) or `ptls` (picotls) is used for a given connection;
 * both may be NULL while the backend is still being selected.
 */
struct st_h2o_socket_ssl_t {
    SSL_CTX *ssl_ctx;
    SSL *ossl; /* OpenSSL connection state; NULL when picotls is in use */
    ptls_t *ptls; /* picotls connection state; NULL when OpenSSL is in use */
    int *did_write_in_read; /* used for detecting and closing the connection upon renegotiation (FIXME implement renegotiation) */
    size_t record_overhead; /* per-TLS-record overhead in bytes, subtracted when calculating suggested payload sizes */
    struct {
        h2o_socket_cb cb; /* handshake-completion callback; NULL once the handshake is done */
        union {
            struct {
                struct {
                    /* state of the asynchronous session-resumption lookup performed via `resumption_get_async` */
                    enum {
                        ASYNC_RESUMPTION_STATE_COMPLETE = 0, /* just pass thru */
                        ASYNC_RESUMPTION_STATE_RECORD,       /* record first input, restore SSL state if it changes to
                                                              * REQUEST_SENT */
                        ASYNC_RESUMPTION_STATE_REQUEST_SENT  /* async request has been sent, and is waiting for response */
                    } state;
                    SSL_SESSION *session_data;
                } async_resumption;
            } server;
            struct {
                char *server_name; /* SNI name being requested (owned; freed by `destroy_ssl`) */
                h2o_cache_t *session_cache;
                h2o_iovec_t session_cache_key; /* owned; freed by `destroy_ssl` */
                h2o_cache_hashcode_t session_cache_key_hash;
            } client;
        };
    } handshake;
    struct {
        h2o_buffer_t *encrypted; /* ciphertext received from the peer, not yet decrypted */
    } input;
    /**
     * Pending TLS data to be sent.
     */
    struct {
        /**
         * This buffer is initialized when and only when pending data is stored. Otherwise, all the members are zero-cleared; see
         * `has_pending_ssl_data`.
         * To reduce the cost of repeated memory allocation, expansion, and release, this buffer points to a chunk of memory being
         * allocated from `h2o_socket_ssl_buffer_allocator` when initialized. Upon disposal, the memory chunk being used by this
         * buffer is returned to that memory pool, unless the chunk has been expanded. It is designed as such because sometimes it
         * is hard to limit the amount of TLS records being generated at once (who knows how large the server's handshake messages
         * will be, or when it has to send a KeyUpdate message?). But for most of the case, handshake messages will be smaller than
         * the default size (H2O_SOCKET_DEFAULT_SSL_BUFFER_SIZE), and application traffic will not cause expansion (see
         * `generate_tls_records`). Therefore, the memory chunk will be recycled.
         */
        ptls_buffer_t buf;
        size_t pending_off; /* offset within `buf` up to which data has already been handed to the backend for writing */
    } output;
};
124
/**
 * Pairs an `SSL_CTX` with the ALPN/NPN protocol list advertised through it.
 */
struct st_h2o_ssl_context_t {
    SSL_CTX *ctx;
    const h2o_iovec_t *protocols; /* NULL-terminated list of protocols for ALPN/NPN */
    h2o_iovec_t _npn_list_of_protocols; /* serialized (length-prefixed) form of `protocols`, as required by the NPN callback */
};
130
131 /* backend functions */
132 static void init_write_buf(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, size_t first_buf_written);
133 static void dispose_write_buf(h2o_socket_t *sock);
134 static void dispose_ssl_output_buffer(struct st_h2o_socket_ssl_t *ssl);
135 static int has_pending_ssl_bytes(struct st_h2o_socket_ssl_t *ssl);
136 static size_t generate_tls_records(h2o_socket_t *sock, h2o_iovec_t **bufs, size_t *bufcnt, size_t first_buf_written);
137 static void do_dispose_socket(h2o_socket_t *sock);
138 static void do_write(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, h2o_socket_cb cb);
139 static void do_read_start(h2o_socket_t *sock);
140 static void do_read_stop(h2o_socket_t *sock);
141 static int do_export(h2o_socket_t *_sock, h2o_socket_export_t *info);
142 static h2o_socket_t *do_import(h2o_loop_t *loop, h2o_socket_export_t *info);
143 static socklen_t get_peername_uncached(h2o_socket_t *sock, struct sockaddr *sa);
144 static socklen_t get_sockname_uncached(h2o_socket_t *sock, struct sockaddr *sa);
145
146 /* internal functions called from the backend */
147 static const char *decode_ssl_input(h2o_socket_t *sock);
148 static void on_write_complete(h2o_socket_t *sock, const char *err);
149
150 #if H2O_USE_LIBUV
151 #include "socket/uv-binding.c.h"
152 #else
153 #include "socket/evloop.c.h"
154 #endif
155
/* settings for spilling large input buffers onto disk via mmap */
h2o_buffer_mmap_settings_t h2o_socket_buffer_mmap_settings = {
    32 * 1024 * 1024, /* 32MB, should better be greater than max frame size of HTTP2 for performance reasons */
    "/tmp/h2o.b.XXXXXX"};

h2o_buffer_prototype_t h2o_socket_buffer_prototype = {
    {H2O_SOCKET_INITIAL_INPUT_BUFFER_SIZE}, /* minimum initial capacity; actual initial size is ~8KB, see h2o_buffer_reserve */
    &h2o_socket_buffer_mmap_settings};

/* size of the chunks handed out by `h2o_socket_ssl_buffer_allocator`; see `init_ssl_output_buffer` */
size_t h2o_socket_ssl_buffer_size = H2O_SOCKET_DEFAULT_SSL_BUFFER_SIZE;
__thread h2o_mem_recycle_t h2o_socket_ssl_buffer_allocator = {1024};

/* error strings compared by address (not by content) throughout the codebase */
const char h2o_socket_error_out_of_memory[] = "out of memory";
const char h2o_socket_error_io[] = "I/O error";
const char h2o_socket_error_closed[] = "socket closed by peer";
const char h2o_socket_error_conn_fail[] = "connection failure";
const char h2o_socket_error_conn_refused[] = "connection refused";
const char h2o_socket_error_conn_timed_out[] = "connection timed out";
const char h2o_socket_error_network_unreachable[] = "network unreachable";
const char h2o_socket_error_host_unreachable[] = "host unreachable";
const char h2o_socket_error_socket_fail[] = "socket creation failed";
const char h2o_socket_error_ssl_no_cert[] = "no certificate";
const char h2o_socket_error_ssl_cert_invalid[] = "invalid certificate";
const char h2o_socket_error_ssl_cert_name_mismatch[] = "certificate name mismatch";
const char h2o_socket_error_ssl_decode[] = "SSL decode error";
const char h2o_socket_error_ssl_handshake[] = "ssl handshake failure";

/* hooks for asynchronous session resumption; installed by `h2o_socket_ssl_async_resumption_init` (not visible in this chunk) */
static void (*resumption_get_async)(h2o_socket_t *sock, h2o_iovec_t session_id);
static void (*resumption_new)(h2o_socket_t *sock, h2o_iovec_t session_id, h2o_iovec_t session_data);
184
/**
 * BIO read callback: moves up to `len` bytes of ciphertext from `sock->ssl->input.encrypted` into `out`. Returns the number of
 * bytes copied, or -1 with the retry flag set when no ciphertext is buffered.
 */
static int read_bio(BIO *b, char *out, int len)
{
    h2o_socket_t *sock = BIO_get_data(b);

    if (len == 0)
        return 0;

    h2o_buffer_t *encrypted = sock->ssl->input.encrypted;
    if (encrypted->size == 0) {
        /* nothing buffered yet; tell OpenSSL to retry once more input arrives */
        BIO_set_retry_read(b);
        return -1;
    }

    if (encrypted->size < len)
        len = (int)encrypted->size;
    memcpy(out, encrypted->bytes, len);
    h2o_buffer_consume(&sock->ssl->input.encrypted, len);

    return len;
}
205
/**
 * Stores the write vector in `sock->_write_buf`, using the inline small-buffer array when it fits, skipping the first
 * `first_buf_written` bytes of `bufs[0]` (the portion already sent).
 */
static void init_write_buf(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, size_t first_buf_written)
{
    /* choose inline storage for small vectors, heap otherwise */
    if (bufcnt < PTLS_ELEMENTSOF(sock->_write_buf.smallbufs)) {
        sock->_write_buf.bufs = sock->_write_buf.smallbufs;
    } else {
        sock->_write_buf.alloced_ptr = h2o_mem_alloc(sizeof(sock->_write_buf.bufs[0]) * bufcnt);
        sock->_write_buf.bufs = sock->_write_buf.alloced_ptr;
    }

    if (bufcnt != 0) {
        /* the head vector is stored with its already-written prefix stripped */
        sock->_write_buf.bufs[0].base = bufs[0].base + first_buf_written;
        sock->_write_buf.bufs[0].len = bufs[0].len - first_buf_written;
        size_t i;
        for (i = 1; i < bufcnt; ++i)
            sock->_write_buf.bufs[i] = bufs[i];
    }
    sock->_write_buf.cnt = bufcnt;
}
222
/**
 * Releases the write vector stored by `init_write_buf`; heap storage is freed, inline storage needs no cleanup.
 */
static void dispose_write_buf(h2o_socket_t *sock)
{
    h2o_iovec_t *inline_begin = sock->_write_buf.smallbufs;
    h2o_iovec_t *inline_end = inline_begin + sizeof(sock->_write_buf.smallbufs) / sizeof(sock->_write_buf.smallbufs[0]);

    if (inline_begin <= sock->_write_buf.bufs && sock->_write_buf.bufs <= inline_end)
        return; /* vector lives in the inline array; nothing to free */

    free(sock->_write_buf.alloced_ptr);
    sock->_write_buf.bufs = sock->_write_buf.smallbufs;
}
234
init_ssl_output_buffer(struct st_h2o_socket_ssl_t * ssl)235 static void init_ssl_output_buffer(struct st_h2o_socket_ssl_t *ssl)
236 {
237 ptls_buffer_init(&ssl->output.buf, h2o_mem_alloc_recycle(&h2o_socket_ssl_buffer_allocator, h2o_socket_ssl_buffer_size),
238 h2o_socket_ssl_buffer_size);
239 ssl->output.buf.is_allocated = 1; /* set to true, so that the allocated memory is freed when the buffer is expanded */
240 ssl->output.pending_off = 0;
241 }
242
dispose_ssl_output_buffer(struct st_h2o_socket_ssl_t * ssl)243 static void dispose_ssl_output_buffer(struct st_h2o_socket_ssl_t *ssl)
244 {
245 /* The destruction logic that we have here are different from `ptls_buffer_dispose` in following two aspects:
246 * - returns the allocated memory to the pool if possible
247 * - does not zero-clear the memory (there's no need to, because the content is something to be sent in clear) */
248
249 assert(ssl->output.buf.is_allocated);
250
251 if (ssl->output.buf.capacity == h2o_socket_ssl_buffer_size) {
252 h2o_mem_free_recycle(&h2o_socket_ssl_buffer_allocator, ssl->output.buf.base);
253 } else {
254 free(ssl->output.buf.base);
255 }
256 ssl->output.buf = (ptls_buffer_t){};
257 ssl->output.pending_off = 0;
258 }
259
has_pending_ssl_bytes(struct st_h2o_socket_ssl_t * ssl)260 static int has_pending_ssl_bytes(struct st_h2o_socket_ssl_t *ssl)
261 {
262 /* for convenience, this function can be invoked for non-TLS connections too, in which case ssl will be NULL */
263 if (ssl == NULL)
264 return 0;
265
266 /* the contract is that `dispose_ssl_output_buffer` is called immediately when all the data are written out */
267 return ssl->output.buf.base != NULL;
268 }
269
/**
 * Appends `len` bytes of ciphertext to the socket's pending TLS output buffer, initializing the buffer on first use. Aborts the
 * process on allocation failure.
 */
static void write_ssl_bytes(h2o_socket_t *sock, const void *in, size_t len)
{
    if (len == 0)
        return;

    struct st_h2o_socket_ssl_t *ssl = sock->ssl;
    if (!has_pending_ssl_bytes(ssl))
        init_ssl_output_buffer(ssl);
    if (ptls_buffer_reserve(&ssl->output.buf, len) != 0)
        h2o_fatal("no memory; tried to allocate %zu bytes", len);
    memcpy(ssl->output.buf.base + ssl->output.buf.off, in, len);
    ssl->output.buf.off += len;
}
281
/**
 * BIO write callback: buffers the ciphertext emitted by OpenSSL into the socket's pending output. Writes occurring while inside
 * `SSL_read` indicate renegotiation, which is unsupported and reported as failure.
 */
static int write_bio(BIO *b, const char *in, int len)
{
    h2o_socket_t *sock = BIO_get_data(b);
    int *renegotiation_flag = sock->ssl->did_write_in_read;

    /* FIXME no support for SSL renegotiation (yet) */
    if (renegotiation_flag != NULL) {
        *renegotiation_flag = 1;
        return -1;
    }

    write_ssl_bytes(sock, in, len);
    return len;
}
295
/**
 * BIO puts callback: delegates to `write_bio` with the string's length.
 */
static int puts_bio(BIO *b, const char *str)
{
    size_t len = strlen(str);
    return write_bio(b, str, (int)len);
}
300
/**
 * BIO ctrl callback: implements the minimum set of control commands (close-flag get/set, flush); everything else is reported as
 * unsupported by returning 0.
 */
static long ctrl_bio(BIO *b, int cmd, long num, void *ptr)
{
    if (cmd == BIO_CTRL_GET_CLOSE)
        return BIO_get_shutdown(b);
    if (cmd == BIO_CTRL_SET_CLOSE) {
        BIO_set_shutdown(b, (int)num);
        return 1;
    }
    if (cmd == BIO_CTRL_FLUSH)
        return 1; /* output is buffered in memory; nothing to flush here */
    return 0;
}
315
/**
 * Creates a memory-backed BIO wired to the socket's ciphertext buffers and attaches it to the OpenSSL connection. The BIO method
 * table is built once per process. Aborts the process on allocation failure (previously an OOM from `BIO_meth_new` would leave
 * `bio_methods` NULL and the subsequent `BIO_meth_set_*` calls would dereference it).
 */
static void setup_bio(h2o_socket_t *sock)
{
    static BIO_METHOD *volatile bio_methods = NULL;
    H2O_MULTITHREAD_ONCE({
        bio_methods = BIO_meth_new(BIO_TYPE_FD, "h2o_socket");
        if (bio_methods == NULL)
            h2o_fatal("no memory");
        BIO_meth_set_write(bio_methods, write_bio);
        BIO_meth_set_read(bio_methods, read_bio);
        BIO_meth_set_puts(bio_methods, puts_bio);
        BIO_meth_set_ctrl(bio_methods, ctrl_bio);
    });

    BIO *bio = BIO_new(bio_methods);
    if (bio == NULL)
        h2o_fatal("no memory");
    BIO_set_data(bio, sock);
    BIO_set_init(bio, 1);
    /* same BIO serves both directions; SSL_set_bio transfers ownership to the SSL object */
    SSL_set_bio(sock->ssl->ossl, bio, bio);
}
334
/**
 * Decrypts whatever ciphertext has accumulated in `sock->ssl->input.encrypted`, appending the cleartext to `sock->input`.
 * Returns NULL on success or one of the `h2o_socket_error_*` strings on failure. Must not be called while a handshake is in
 * flight (asserted). Fix: the OpenSSL path returned the integer literal `0` for a `const char *`; normalized to NULL, matching
 * the picotls path.
 */
const char *decode_ssl_input(h2o_socket_t *sock)
{
    assert(sock->ssl != NULL);
    assert(sock->ssl->handshake.cb == NULL);

    if (sock->ssl->ptls != NULL) {
        /* picotls path: feed all buffered ciphertext through ptls_receive */
        if (sock->ssl->input.encrypted->size != 0) {
            const char *src = sock->ssl->input.encrypted->bytes, *src_end = src + sock->ssl->input.encrypted->size;
            h2o_iovec_t reserved;
            ptls_buffer_t rbuf;
            int ret;
            /* reserve cleartext space up-front; ciphertext size is an upper bound in the common case */
            if ((reserved = h2o_buffer_try_reserve(&sock->input, sock->ssl->input.encrypted->size)).base == NULL)
                return h2o_socket_error_out_of_memory;
            ptls_buffer_init(&rbuf, reserved.base, reserved.len);
            do {
                size_t consumed = src_end - src;
                if ((ret = ptls_receive(sock->ssl->ptls, &rbuf, src, &consumed)) != 0)
                    break;
                src += consumed;
            } while (src != src_end);
            h2o_buffer_consume(&sock->ssl->input.encrypted, sock->ssl->input.encrypted->size - (src_end - src));
            if (rbuf.is_allocated) {
                /* cleartext outgrew the reserved region and picotls switched to its own allocation; copy it over */
                if ((reserved = h2o_buffer_try_reserve(&sock->input, rbuf.off)).base == NULL)
                    return h2o_socket_error_out_of_memory;
                memcpy(reserved.base, rbuf.base, rbuf.off);
                sock->input->size += rbuf.off;
                ptls_buffer_dispose(&rbuf);
            } else {
                /* decoded in place into the reserved region; just account for the bytes */
                sock->input->size += rbuf.off;
            }
            if (!(ret == 0 || ret == PTLS_ERROR_IN_PROGRESS))
                return h2o_socket_error_ssl_decode;
        }
        return NULL;
    }

    /* OpenSSL path: loop while there is buffered ciphertext or records already decoded inside the SSL object */
    while (sock->ssl->input.encrypted->size != 0 || SSL_pending(sock->ssl->ossl)) {
        int rlen;
        h2o_iovec_t buf = h2o_buffer_try_reserve(&sock->input, 4096);
        if (buf.base == NULL)
            return h2o_socket_error_out_of_memory;
        { /* call SSL_read (while detecting SSL renegotiation and reporting it as error) */
            int did_write_in_read = 0;
            sock->ssl->did_write_in_read = &did_write_in_read;
            ERR_clear_error();
            rlen = SSL_read(sock->ssl->ossl, buf.base, (int)buf.len);
            sock->ssl->did_write_in_read = NULL;
            if (did_write_in_read)
                return "ssl renegotiation not supported";
        }
        if (rlen == -1) {
            if (SSL_get_error(sock->ssl->ossl, rlen) != SSL_ERROR_WANT_READ) {
                return h2o_socket_error_ssl_decode;
            }
            break; /* need more ciphertext; not an error */
        } else if (rlen == 0) {
            break;
        } else {
            sock->input->size += rlen;
        }
    }

    return NULL;
}
399
/**
 * Asks the backend to flush the TLS records already sitting in `sock->ssl->output`; `cb` is invoked upon completion. The empty
 * vector tells `do_write` that there is no new application data to encrypt.
 */
static void flush_pending_ssl(h2o_socket_t *sock, h2o_socket_cb cb)
{
    do_write(sock, NULL, 0, cb);
}
404
/**
 * Frees all TLS state: the picotls / OpenSSL connection objects, client-side handshake allocations, buffered ciphertext, and any
 * pending output. `ssl` itself is freed last; callers must not use it afterwards.
 */
static void destroy_ssl(struct st_h2o_socket_ssl_t *ssl)
{
    if (ssl->ptls != NULL) {
        ptls_free(ssl->ptls);
        ssl->ptls = NULL;
    }
    if (ssl->ossl != NULL) {
        /* the client-side strings are owned by us; release them before dropping the SSL object they relate to */
        if (!SSL_is_server(ssl->ossl)) {
            free(ssl->handshake.client.server_name);
            free(ssl->handshake.client.session_cache_key.base);
        }
        SSL_free(ssl->ossl);
        ssl->ossl = NULL;
    }
    h2o_buffer_dispose(&ssl->input.encrypted);
    if (has_pending_ssl_bytes(ssl))
        dispose_ssl_output_buffer(ssl);
    free(ssl);
}
424
/**
 * Releases everything owned by the socket (TLS state, input buffer, cached addresses), lets the backend dispose of the handle,
 * then invokes the user's on-close callback. `err` is accepted for signature compatibility with `h2o_socket_cb` but unused.
 */
static void dispose_socket(h2o_socket_t *sock, const char *err)
{
    /* capture the close callback before the backend tears the socket down */
    void (*close_cb)(void *data) = sock->on_close.cb;
    void *close_cb_data = sock->on_close.data;

    if (sock->ssl != NULL) {
        destroy_ssl(sock->ssl);
        sock->ssl = NULL;
    }
    h2o_buffer_dispose(&sock->input);
    free(sock->_peername); /* free(NULL) is a no-op */
    sock->_peername = NULL;
    free(sock->_sockname);
    sock->_sockname = NULL;

    do_dispose_socket(sock);

    if (close_cb != NULL)
        close_cb(close_cb_data);
}
452
/**
 * Closes a TLS socket, attempting to send a close_notify alert first. When `err` is non-NULL (the connection already failed), or
 * when generating the alert fails, the socket is disposed of immediately. If alert bytes were generated, they are flushed
 * asynchronously and `dispose_socket` runs as the write-completion callback.
 */
static void shutdown_ssl(h2o_socket_t *sock, const char *err)
{
    if (err != NULL)
        goto Close;

    if (sock->_cb.write != NULL) {
        /* note: libuv calls the write callback after the socket is closed by uv_close (with status set to 0 if the write succeeded)
         */
        sock->_cb.write = NULL;
        goto Close;
    }

    if (sock->ssl->ptls != NULL) {
        /* picotls: generate the close_notify alert into a small stack buffer, then queue it as pending TLS output */
        ptls_buffer_t wbuf;
        uint8_t wbuf_small[32];
        ptls_buffer_init(&wbuf, wbuf_small, sizeof(wbuf_small));
        if (ptls_send_alert(sock->ssl->ptls, &wbuf, PTLS_ALERT_LEVEL_WARNING, PTLS_ALERT_CLOSE_NOTIFY) != 0)
            goto Close;
        write_ssl_bytes(sock, wbuf.base, wbuf.off);
        ptls_buffer_dispose(&wbuf);
    } else if (sock->ssl->ossl != NULL) {
        /* OpenSSL: SSL_shutdown emits close_notify through `write_bio`, which buffers it as pending TLS output */
        ERR_clear_error();
        if (SSL_shutdown(sock->ssl->ossl) == -1)
            goto Close;
    } else {
        goto Close;
    }

    if (has_pending_ssl_bytes(sock->ssl)) {
        /* flush the alert asynchronously; dispose of the socket when the write completes */
        h2o_socket_read_stop(sock);
        flush_pending_ssl(sock, dispose_socket);
        return;
    }

Close:
    dispose_socket(sock, err);
}
490
/**
 * Discards an exported socket without importing it: frees the TLS state and input buffer captured by `h2o_socket_export`, and
 * closes the file descriptor. `info->fd` is reset to -1 so the export cannot be reused.
 */
void h2o_socket_dispose_export(h2o_socket_export_t *info)
{
    assert(info->fd != -1);
    if (info->ssl != NULL) {
        destroy_ssl(info->ssl);
        info->ssl = NULL;
    }
    h2o_buffer_dispose(&info->input);
    close(info->fd);
    info->fd = -1;
}
502
/**
 * Detaches the socket's fd, TLS state, and buffered input into `info` so the connection can be migrated (e.g. to another
 * thread/loop), then closes the `h2o_socket_t` wrapper. Returns 0 on success, -1 if the backend refuses to export. Must not be
 * called while a write is in flight (asserted).
 */
int h2o_socket_export(h2o_socket_t *sock, h2o_socket_export_t *info)
{
    static h2o_buffer_prototype_t nonpooling_prototype;

    assert(!h2o_socket_is_writing(sock));

    if (do_export(sock, info) == -1)
        return -1;

    info->ssl = sock->ssl;
    if (info->ssl != NULL) {
        sock->ssl = NULL;
        /* detach the ciphertext buffer from the per-loop pool; the export may be imported on a different loop */
        h2o_buffer_set_prototype(&info->ssl->input.encrypted, &nonpooling_prototype);
    }
    info->input = sock->input;
    h2o_buffer_set_prototype(&info->input, &nonpooling_prototype);
    h2o_buffer_init(&sock->input, &h2o_socket_buffer_prototype);

    h2o_socket_close(sock);

    return 0;
}
524
h2o_socket_import(h2o_loop_t * loop,h2o_socket_export_t * info)525 h2o_socket_t *h2o_socket_import(h2o_loop_t *loop, h2o_socket_export_t *info)
526 {
527 h2o_socket_t *sock;
528
529 assert(info->fd != -1);
530
531 sock = do_import(loop, info);
532 info->fd = -1; /* just in case */
533 if ((sock->ssl = info->ssl) != NULL) {
534 setup_bio(sock);
535 h2o_buffer_set_prototype(&sock->ssl->input.encrypted, &h2o_socket_buffer_prototype);
536 }
537 sock->input = info->input;
538 h2o_buffer_set_prototype(&sock->input, &h2o_socket_buffer_prototype);
539 return sock;
540 }
541
/**
 * Closes the socket: cleartext sockets are disposed of immediately, TLS sockets first attempt to send a close_notify alert (see
 * `shutdown_ssl`). Fix: pass NULL rather than the integer literal 0 for the `const char *err` argument.
 */
void h2o_socket_close(h2o_socket_t *sock)
{
    if (sock->ssl == NULL) {
        dispose_socket(sock, NULL);
    } else {
        shutdown_ssl(sock, NULL);
    }
}
550
/**
 * Converts a suggested TLS record size into a payload size by subtracting the per-record overhead, when known and smaller than
 * the record size (otherwise the value is returned unchanged).
 */
static uint16_t calc_suggested_tls_payload_size(h2o_socket_t *sock, uint16_t suggested_tls_record_size)
{
    uint16_t payload_size = suggested_tls_record_size;
    if (sock->ssl != NULL && sock->ssl->record_overhead < payload_size)
        payload_size -= sock->ssl->record_overhead;
    return payload_size;
}
558
/**
 * Turns the latency optimization off: restores TCP_NOTSENT_LOWAT to the OS default if it had been minimized, and resets the
 * suggested sizes to "unlimited" (SIZE_MAX).
 */
static void disable_latency_optimized_write(h2o_socket_t *sock, int (*adjust_notsent_lowat)(h2o_socket_t *, unsigned))
{
    if (sock->_latency_optimization.notsent_is_minimized) {
        adjust_notsent_lowat(sock, 0); /* 0 restores the kernel default */
        sock->_latency_optimization.notsent_is_minimized = 0;
    }
    sock->_latency_optimization.state = H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DISABLED;
    sock->_latency_optimization.suggested_tls_payload_size = SIZE_MAX;
    sock->_latency_optimization.suggested_write_size = SIZE_MAX;
}
569
/**
 * Decides whether (and how) to latency-optimize writes based on TCP state: when the connection is slow enough to benefit, TLS
 * records are sized to the MSS and TCP_NOTSENT_LOWAT is minimized so that the event loop regains control per-packet; otherwise
 * the optimization is disabled.
 */
static inline void prepare_for_latency_optimized_write(h2o_socket_t *sock,
                                                       const h2o_socket_latency_optimization_conditions_t *conditions, uint32_t rtt,
                                                       uint32_t mss, uint32_t cwnd_size, uint32_t cwnd_avail, uint64_t loop_time,
                                                       int (*adjust_notsent_lowat)(h2o_socket_t *, unsigned))
{
    /* check RTT; `rtt` is in microseconds while `min_rtt` appears to be in milliseconds (hence the *1000) — TODO confirm units */
    if (rtt < conditions->min_rtt * (uint64_t)1000)
        goto Disable;
    /* NOTE(review): `max_additional_delay` looks like a percentage bound relating loop execution time to RTT — confirm */
    if (rtt * conditions->max_additional_delay < loop_time * 1000 * 100)
        goto Disable;

    /* latency-optimization is enabled */
    sock->_latency_optimization.state = H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DETERMINED;

    /* no need to:
     * 1) adjust the write size if single_write_size << cwnd_size
     * 2) align TLS record boundary to TCP packet boundary if packet loss-rate is low and BW isn't small (implied by cwnd size)
     */
    if (mss * cwnd_size < conditions->max_cwnd) {
        if (!sock->_latency_optimization.notsent_is_minimized) {
            if (adjust_notsent_lowat(sock, 1 /* cannot be set to zero on Linux */) != 0)
                goto Disable;
            sock->_latency_optimization.notsent_is_minimized = 1;
        }
        /* emit one MSS-sized TLS record per packet, and write no more than what CWND can take */
        sock->_latency_optimization.suggested_tls_payload_size = calc_suggested_tls_payload_size(sock, mss);
        sock->_latency_optimization.suggested_write_size =
            cwnd_avail * (size_t)sock->_latency_optimization.suggested_tls_payload_size;
    } else {
        /* CWND is large; stop minimizing NOTSENT_LOWAT and remove the size limits */
        if (sock->_latency_optimization.notsent_is_minimized) {
            if (adjust_notsent_lowat(sock, 0) != 0)
                goto Disable;
            sock->_latency_optimization.notsent_is_minimized = 0;
        }
        sock->_latency_optimization.suggested_tls_payload_size = SIZE_MAX;
        sock->_latency_optimization.suggested_write_size = SIZE_MAX;
    }
    return;

Disable:
    disable_latency_optimized_write(sock, adjust_notsent_lowat);
}
611
/**
 * Obtains RTT (microseconds), MSS, and size of CWND (in the number of packets) for the given TCP socket.
 * Also writes to cwnd_avail minimum number of packets (of MSS size) sufficient to shut up poll-for-write under the precondition
 * that TCP_NOTSENT_LOWAT is set to 1.
 * Returns 0 on success, -1 when the information is unavailable on this platform or the syscall fails.
 */
static int obtain_tcp_info(int fd, uint32_t *rtt, uint32_t *mss, uint32_t *cwnd_size, uint32_t *cwnd_avail)
{
/* converts a (cwnd-in-bytes, inflight-in-bytes) pair into the packet-count outputs; `+2` provides headroom so that
 * poll-for-write does not fire prematurely */
#define CALC_CWND_PAIR_FROM_BYTE_UNITS(cwnd_bytes, inflight_bytes)                                                                 \
    do {                                                                                                                           \
        *cwnd_size = (cwnd_bytes + *mss / 2) / *mss;                                                                               \
        *cwnd_avail = cwnd_bytes > inflight_bytes ? (cwnd_bytes - inflight_bytes) / *mss + 2 : 2;                                  \
    } while (0)

#if defined(__linux__) && defined(TCP_INFO)

    /* Linux reports cwnd/unacked in packets and rtt in microseconds already */
    struct tcp_info tcpi;
    socklen_t tcpisz = sizeof(tcpi);
    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &tcpi, &tcpisz) != 0)
        return -1;
    *rtt = tcpi.tcpi_rtt;
    *mss = tcpi.tcpi_snd_mss;
    *cwnd_size = tcpi.tcpi_snd_cwnd;
    *cwnd_avail = tcpi.tcpi_snd_cwnd > tcpi.tcpi_unacked ? tcpi.tcpi_snd_cwnd - tcpi.tcpi_unacked + 2 : 2;
    return 0;

#elif defined(__FreeBSD__) && defined(TCP_INFO) && 0 /* disabled since we wouldn't use it anyways; OS lacks TCP_NOTSENT_LOWAT */

    struct tcp_info tcpi;
    socklen_t tcpisz = sizeof(tcpi);
    int bytes_inflight;
    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &tcpi, &tcpisz) != 0 || ioctl(fd, FIONWRITE, &bytes_inflight) == -1)
        return -1;
    *rtt = tcpi.tcpi_rtt;
    *mss = tcpi.tcpi_snd_mss;
    CALC_CWND_PAIR_FROM_BYTE_UNITS(tcpi.tcpi_snd_cwnd, bytes_inflight);
    return 0;

#elif defined(__APPLE__) && defined(TCP_CONNECTION_INFO)

    /* macOS reports srtt in milliseconds and cwnd in bytes; convert accordingly */
    struct tcp_connection_info tcpi;
    socklen_t tcpisz = sizeof(tcpi);
    if (getsockopt(fd, IPPROTO_TCP, TCP_CONNECTION_INFO, &tcpi, &tcpisz) != 0 || tcpi.tcpi_maxseg == 0)
        return -1;
    *rtt = tcpi.tcpi_srtt * 1000;
    *mss = tcpi.tcpi_maxseg;
    CALC_CWND_PAIR_FROM_BYTE_UNITS(tcpi.tcpi_snd_cwnd, tcpi.tcpi_snd_sbbytes);
    return 0;

#else
    /* TODO add support for NetBSD; note that the OS returns the number of packets for tcpi_snd_cwnd; see
     * http://twitter.com/n_soda/status/740719125878575105
     */
    return -1;
#endif

#undef CALC_CWND_PAIR_FROM_BYTE_UNITS
}
669
670 #ifdef TCP_NOTSENT_LOWAT
/**
 * Sets TCP_NOTSENT_LOWAT on the socket; returns the `setsockopt` result (0 on success). Fix: the argument expression had been
 * mangled into the mojibake `¬sent_lowat` (an HTML-entity corruption of `&notsent_lowat`), which does not compile.
 */
static int adjust_notsent_lowat(h2o_socket_t *sock, unsigned notsent_lowat)
{
    return setsockopt(h2o_socket_get_fd(sock), IPPROTO_TCP, TCP_NOTSENT_LOWAT, &notsent_lowat, sizeof(notsent_lowat));
}
675 #else
676 #define adjust_notsent_lowat NULL
677 #endif
678
/**
 * Entry point of the latency optimization: gathers TCP state (when the platform and event loop permit) and either enables or
 * disables the optimization via `prepare_for_latency_optimized_write` / `disable_latency_optimized_write`. Returns the suggested
 * maximum write size (SIZE_MAX when unlimited). Cleanup: removed a stray `#undef CALC_CWND_PAIR_FROM_BYTE_UNITS` at the end of
 * the function; that macro is defined and undefined entirely within `obtain_tcp_info`.
 */
size_t h2o_socket_do_prepare_for_latency_optimized_write(h2o_socket_t *sock,
                                                         const h2o_socket_latency_optimization_conditions_t *conditions)
{
    uint32_t rtt = 0, mss = 0, cwnd_size = 0, cwnd_avail = 0;
    uint64_t loop_time = UINT64_MAX;
    int can_prepare = 1;

#if !defined(TCP_NOTSENT_LOWAT)
    /* the feature cannot be setup unless TCP_NOTSENT_LOWAT is available */
    can_prepare = 0;
#endif

#if H2O_USE_LIBUV
    /* poll-then-write is impossible with libuv */
    can_prepare = 0;
#else
    if (can_prepare)
        loop_time = h2o_evloop_get_execution_time_millisec(h2o_socket_get_loop(sock));
#endif

    /* obtain TCP states */
    if (can_prepare && obtain_tcp_info(h2o_socket_get_fd(sock), &rtt, &mss, &cwnd_size, &cwnd_avail) != 0)
        can_prepare = 0;

    /* determine suggested_write_size, suggested_tls_record_size and adjust TCP_NOTSENT_LOWAT based on the obtained information */
    if (can_prepare) {
        prepare_for_latency_optimized_write(sock, conditions, rtt, mss, cwnd_size, cwnd_avail, loop_time, adjust_notsent_lowat);
    } else {
        disable_latency_optimized_write(sock, adjust_notsent_lowat);
    }

    return sock->_latency_optimization.suggested_write_size;
}
714
/**
 * Returns the number of application bytes to encrypt into the next TLS record(s): the smaller of `bufsize` and the maximum
 * payload suggested either by the latency optimizer or, when the optimizer is inactive, by how much has been sent so far
 * (small ~MTU-sized records early in the connection, unlimited afterwards).
 */
static size_t calc_tls_write_size(h2o_socket_t *sock, size_t bufsize)
{
    size_t max_payload;

    switch (sock->_latency_optimization.state) {
    case H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_TBD:
    case H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DISABLED:
        /* until 64KB have been sent, emit ~MTU-sized records to minimize time-to-first-byte */
        max_payload = sock->bytes_written < 64 * 1024 ? calc_suggested_tls_payload_size(sock, 1400) : SIZE_MAX;
        break;
    case H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DETERMINED:
        sock->_latency_optimization.state = H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_NEEDS_UPDATE;
        /* fallthru */
    default:
        max_payload = sock->_latency_optimization.suggested_tls_payload_size;
        break;
    }

    return max_payload < bufsize ? max_payload : bufsize;
}
736
/**
 * Given a vector, generate at least one TLS record if there's enough space in the buffer, and return the size of application data
 * being encrypted. Returns zero when the output buffer lacks the space, and SIZE_MAX when OpenSSL reports a fatal error.
 */
static size_t generate_tls_records_from_one_vec(h2o_socket_t *sock, const void *input, size_t inlen)
{
    /* 16KB is the TLS maximum plaintext size; LARGE_RECORD_OVERHEAD over-estimates header+MAC+padding per record */
    static const size_t MAX_RECORD_PAYLOAD_SIZE = 16 * 1024, LARGE_RECORD_OVERHEAD = 5 + 32;

    size_t tls_write_size = calc_tls_write_size(sock, inlen);
    size_t space_left = sock->ssl->output.buf.capacity - sock->ssl->output.buf.off;

    if (tls_write_size < inlen) {
        /* Writing small TLS records, one by one. Bail out if we might fail to do so. */
        if (space_left < tls_write_size + LARGE_RECORD_OVERHEAD)
            return 0;
    } else {
        /* Writing full-sized records. Adjust tls_write_size to a multiple of full-sized TLS records, or bail out if we cannot
         * write one. */
        size_t rec_capacity = space_left / (MAX_RECORD_PAYLOAD_SIZE + LARGE_RECORD_OVERHEAD);
        if (rec_capacity == 0)
            return 0;
        tls_write_size = MAX_RECORD_PAYLOAD_SIZE * rec_capacity;
        if (tls_write_size > inlen)
            tls_write_size = inlen;
    }

    /* Generate TLS record(s). */
    if (sock->ssl->ptls != NULL) {
        /* picotls cannot fail here: space was reserved above, and the send state is known-good */
        int ret = ptls_send(sock->ssl->ptls, &sock->ssl->output.buf, input, tls_write_size);
        assert(ret == 0);
    } else {
        int ret = SSL_write(sock->ssl->ossl, input, (int)tls_write_size);
        /* The error happens if SSL_write is called after SSL_read returns a fatal error (e.g. due to corrupt TCP packet being
         * received). We might be converting more and more TLS records on this side as read errors occur. */
        if (ret <= 0)
            return SIZE_MAX;
        /* NOTE(review): int-vs-size_t comparison; safe here because ret > 0 and tls_write_size fits in int (<= inlen passed to
         * SSL_write) */
        assert(ret == tls_write_size);
    }

    SOCKET_PROBE(WRITE_TLS_RECORD, sock, tls_write_size, sock->ssl->output.buf.off);
    return tls_write_size;
}
779
/**
 * Generate as many TLS records as possible, given a list of vectors. Upon return, `*bufs` and `*bufcnt` will be updated to point
 * the buffers that still have pending data, and the number of bytes being already written within `(*buf)[0]` will be returned
 * (or SIZE_MAX on a fatal TLS error).
 */
static size_t generate_tls_records(h2o_socket_t *sock, h2o_iovec_t **bufs, size_t *bufcnt, size_t first_buf_written)
{
    assert(!has_pending_ssl_bytes(sock->ssl) && "we are filling encrypted bytes from the front, with no existing buffer, always");

    while (*bufcnt != 0) {
        /* skip exhausted vectors */
        if ((*bufs)->len == 0) {
            ++*bufs;
            --*bufcnt;
            continue;
        }
        if (!has_pending_ssl_bytes(sock->ssl))
            init_ssl_output_buffer(sock->ssl);
        size_t bytes_newly_written =
            generate_tls_records_from_one_vec(sock, (*bufs)->base + first_buf_written, (*bufs)->len - first_buf_written);
        if (bytes_newly_written == SIZE_MAX) {
            return SIZE_MAX; /* fatal TLS error */
        } else if (bytes_newly_written == 0) {
            break; /* output buffer full; flush before generating more records */
        }
        first_buf_written += bytes_newly_written;
        /* advance to the next vector once the current one is fully consumed */
        if ((*bufs)->len == first_buf_written) {
            first_buf_written = 0;
            ++*bufs;
            --*bufcnt;
        }
    }

    return first_buf_written;
}
813
/**
 * Queues the vectors for writing and arranges for `cb` to be invoked upon completion. Only one write may be in flight at a time
 * (asserted). Accounts the bytes into `sock->bytes_written` before handing off to the backend.
 */
void h2o_socket_write(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, h2o_socket_cb cb)
{
    SOCKET_PROBE(WRITE, sock, bufs, bufcnt, cb);

    assert(sock->_cb.write == NULL);

    size_t i;
    for (i = 0; i != bufcnt; ++i) {
        sock->bytes_written += bufs[i].len;
#if H2O_SOCKET_DUMP_WRITE
        h2o_error_printf("writing %zu bytes to fd:%d\n", bufs[i].len, h2o_socket_get_fd(sock));
        h2o_dump_memory(stderr, bufs[i].base, bufs[i].len);
#endif
    }

    do_write(sock, bufs, bufcnt, cb);
}
830
/**
 * Invoked by the backend once a write (including any pending TLS records) has been flushed; releases the TLS output buffer and
 * fires the user callback.
 */
void on_write_complete(h2o_socket_t *sock, const char *err)
{
    if (has_pending_ssl_bytes(sock->ssl))
        dispose_ssl_output_buffer(sock->ssl);

    /* clear the slot before invoking, so that the callback may schedule the next write */
    h2o_socket_cb write_cb = sock->_cb.write;
    sock->_cb.write = NULL;
    write_cb(sock, err);
}
842
void h2o_socket_read_start(h2o_socket_t *sock, h2o_socket_cb cb)
{
    /* register the read callback, then ask the backend to start polling for input */
    sock->_cb.read = cb;
    do_read_start(sock);
}
848
void h2o_socket_read_stop(h2o_socket_t *sock)
{
    /* drop the read callback and stop the backend from polling for input */
    sock->_cb.read = NULL;
    do_read_stop(sock);
}
854
void h2o_socket_setpeername(h2o_socket_t *sock, struct sockaddr *sa, socklen_t len)
{
    /* replace any previously cached peer address with a fresh copy of `sa` */
    free(sock->_peername);
    struct st_h2o_socket_addr_t *cached = h2o_mem_alloc(offsetof(struct st_h2o_socket_addr_t, addr) + len);
    cached->len = len;
    memcpy(&cached->addr, sa, len);
    sock->_peername = cached;
}
862
h2o_socket_getpeername(h2o_socket_t * sock,struct sockaddr * sa)863 socklen_t h2o_socket_getpeername(h2o_socket_t *sock, struct sockaddr *sa)
864 {
865 /* return cached, if exists */
866 if (sock->_peername != NULL) {
867 memcpy(sa, &sock->_peername->addr, sock->_peername->len);
868 return sock->_peername->len;
869 }
870 /* call, copy to cache, and return */
871 socklen_t len = get_peername_uncached(sock, sa);
872 h2o_socket_setpeername(sock, sa, len);
873 return len;
874 }
875
h2o_socket_getsockname(h2o_socket_t * sock,struct sockaddr * sa)876 socklen_t h2o_socket_getsockname(h2o_socket_t *sock, struct sockaddr *sa)
877 {
878 /* return cached, if exists */
879 if (sock->_sockname != NULL) {
880 memcpy(sa, &sock->_sockname->addr, sock->_sockname->len);
881 return sock->_sockname->len;
882 }
883 /* call, copy to cache, and return */
884 socklen_t len = get_sockname_uncached(sock, sa);
885 sock->_sockname = h2o_mem_alloc(offsetof(struct st_h2o_socket_addr_t, addr) + len);
886 sock->_sockname->len = len;
887 memcpy(&sock->_sockname->addr, sa, len);
888 return len;
889 }
890
h2o_socket_get_ptls(h2o_socket_t * sock)891 ptls_t *h2o_socket_get_ptls(h2o_socket_t *sock)
892 {
893 return sock->ssl != NULL ? sock->ssl->ptls : NULL;
894 }
895
const char *h2o_socket_get_ssl_protocol_version(h2o_socket_t *sock)
{
    /* returns the negotiated protocol version string, or NULL when TLS is not in use */
    if (sock->ssl == NULL)
        return NULL;
    if (sock->ssl->ptls != NULL)
        return "TLSv1.3"; /* picotls connections are always TLS 1.3 */
    if (sock->ssl->ossl != NULL)
        return SSL_get_version(sock->ssl->ossl);
    return NULL;
}
906
int h2o_socket_get_ssl_session_reused(h2o_socket_t *sock)
{
    /* returns 1 when the session was resumed, 0 when not, -1 when indeterminate (no TLS) */
    if (sock->ssl == NULL)
        return -1;
    if (sock->ssl->ptls != NULL)
        return ptls_is_psk_handshake(sock->ssl->ptls);
    if (sock->ssl->ossl != NULL)
        return (int)SSL_session_reused(sock->ssl->ossl);
    return -1;
}
917
const char *h2o_socket_get_ssl_cipher(h2o_socket_t *sock)
{
    /* returns the name of the negotiated cipher(suite), or NULL if unavailable */
    if (sock->ssl == NULL)
        return NULL;
    if (sock->ssl->ptls != NULL) {
        ptls_cipher_suite_t *suite = ptls_get_cipher(sock->ssl->ptls);
        return suite != NULL ? suite->aead->name : NULL;
    }
    if (sock->ssl->ossl != NULL)
        return SSL_get_cipher_name(sock->ssl->ossl);
    return NULL;
}
931
int h2o_socket_get_ssl_cipher_bits(h2o_socket_t *sock)
{
    /* returns the key strength of the negotiated cipher, or 0 if unavailable */
    if (sock->ssl == NULL)
        return 0;
    if (sock->ssl->ptls != NULL) {
        ptls_cipher_suite_t *suite = ptls_get_cipher(sock->ssl->ptls);
        return suite != NULL ? (int)suite->aead->key_size : 0;
    }
    if (sock->ssl->ossl != NULL)
        return SSL_get_cipher_bits(sock->ssl->ossl, NULL);
    return 0;
}
946
h2o_socket_get_ssl_session_id(h2o_socket_t * sock)947 h2o_iovec_t h2o_socket_get_ssl_session_id(h2o_socket_t *sock)
948 {
949 if (sock->ssl != NULL) {
950 if (sock->ssl->ptls != NULL) {
951 /* FIXME */
952 } else if (sock->ssl->ossl != NULL) {
953 SSL_SESSION *session;
954 if (sock->ssl->handshake.server.async_resumption.state == ASYNC_RESUMPTION_STATE_COMPLETE &&
955 (session = SSL_get_session(sock->ssl->ossl)) != NULL) {
956 unsigned id_len;
957 const unsigned char *id = SSL_SESSION_get_id(session, &id_len);
958 return h2o_iovec_init(id, id_len);
959 }
960 }
961 }
962
963 return h2o_iovec_init(NULL, 0);
964 }
965
h2o_socket_get_ssl_server_name(const h2o_socket_t * sock)966 const char *h2o_socket_get_ssl_server_name(const h2o_socket_t *sock)
967 {
968 if (sock->ssl != NULL) {
969 if (sock->ssl->ptls != NULL) {
970 return ptls_get_server_name(sock->ssl->ptls);
971 } else if (sock->ssl->ossl != NULL) {
972 return SSL_get_servername(sock->ssl->ossl, TLSEXT_NAMETYPE_host_name);
973 }
974 }
975 return NULL;
976 }
977
/* Returns the name of the TCP congestion controller in use (e.g. as reported by TCP_CONGESTION), or an empty vector when
 * unavailable. Memory is taken from `pool` when non-NULL, otherwise from the heap (caller owns it). */
h2o_iovec_t h2o_socket_log_tcp_congestion_controller(h2o_socket_t *sock, h2o_mem_pool_t *pool)
{
#if defined(TCP_CONGESTION)
    int fd;
    if ((fd = h2o_socket_get_fd(sock)) >= 0) {
#define CC_BUFSIZE 32
        socklen_t buflen = CC_BUFSIZE;
        /* note: h2o_mem_alloc_pool takes a type token; `*buf` names `char` here */
        char *buf = pool != NULL ? h2o_mem_alloc_pool(pool, *buf, buflen) : h2o_mem_alloc(buflen);
        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, &buflen) == 0) {
            /* Upon return, linux sets `buflen` to some value greater than the size of the string. Therefore, we apply strlen after
             * making sure that the result does not overrun the buffer. */
            buf[CC_BUFSIZE - 1] = '\0';
            return h2o_iovec_init(buf, strlen(buf));
        }
#undef CC_BUFSIZE
    }
#endif
    return h2o_iovec_init(NULL, 0);
}
997
/* Returns the kernel-reported TCP delivery rate rendered as a decimal string, or an empty vector when unavailable (non-Linux,
 * no fd, or getsockopt failure). Memory is taken from `pool` when non-NULL, otherwise from the heap. */
h2o_iovec_t h2o_socket_log_tcp_delivery_rate(h2o_socket_t *sock, h2o_mem_pool_t *pool)
{
#if defined(__linux__) && defined(TCP_INFO)
    int fd;
    if ((fd = h2o_socket_get_fd(sock)) >= 0) {
        /* A copy of `struct tcp_info` found in linux/tcp.h, up to `tcpi_delivery_rate`. Rest of the codebase uses netinet/tcp.h,
         * which does not provide access to `tcpi_delivery_rate`. */
        struct {
            uint8_t tcpi_state;
            uint8_t tcpi_ca_state;
            uint8_t tcpi_retransmits;
            uint8_t tcpi_probes;
            uint8_t tcpi_backoff;
            uint8_t tcpi_options;
            uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
            uint8_t tcpi_delivery_rate_app_limited : 1;

            uint32_t tcpi_rto;
            uint32_t tcpi_ato;
            uint32_t tcpi_snd_mss;
            uint32_t tcpi_rcv_mss;

            uint32_t tcpi_unacked;
            uint32_t tcpi_sacked;
            uint32_t tcpi_lost;
            uint32_t tcpi_retrans;
            uint32_t tcpi_fackets;

            /* Times. */
            uint32_t tcpi_last_data_sent;
            uint32_t tcpi_last_ack_sent; /* Not remembered, sorry. */
            uint32_t tcpi_last_data_recv;
            uint32_t tcpi_last_ack_recv;

            /* Metrics. */
            uint32_t tcpi_pmtu;
            uint32_t tcpi_rcv_ssthresh;
            uint32_t tcpi_rtt;
            uint32_t tcpi_rttvar;
            uint32_t tcpi_snd_ssthresh;
            uint32_t tcpi_snd_cwnd;
            uint32_t tcpi_advmss;
            uint32_t tcpi_reordering;

            uint32_t tcpi_rcv_rtt;
            uint32_t tcpi_rcv_space;

            uint32_t tcpi_total_retrans;

            uint64_t tcpi_pacing_rate;
            uint64_t tcpi_max_pacing_rate;
            uint64_t tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
            uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
            uint32_t tcpi_segs_out;       /* RFC4898 tcpEStatsPerfSegsOut */
            uint32_t tcpi_segs_in;        /* RFC4898 tcpEStatsPerfSegsIn */

            uint32_t tcpi_notsent_bytes;
            uint32_t tcpi_min_rtt;
            uint32_t tcpi_data_segs_in;  /* RFC4898 tcpEStatsDataSegsIn */
            uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */

            uint64_t tcpi_delivery_rate;
        } tcpi;
        socklen_t tcpisz = sizeof(tcpi);
        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &tcpi, &tcpisz) == 0) {
            /* H2O_UINT64_LONGEST_STR sizes the buffer to hold any uint64 in decimal */
            char *buf = (char *)(pool != NULL ? h2o_mem_alloc_pool(pool, char, sizeof(H2O_UINT64_LONGEST_STR))
                                              : h2o_mem_alloc(sizeof(H2O_UINT64_LONGEST_STR)));
            size_t len = sprintf(buf, "%" PRIu64, (uint64_t)tcpi.tcpi_delivery_rate);
            return h2o_iovec_init(buf, len);
        }
    }
#endif
    return h2o_iovec_init(NULL, 0);
}
1072
h2o_socket_log_ssl_session_id(h2o_socket_t * sock,h2o_mem_pool_t * pool)1073 h2o_iovec_t h2o_socket_log_ssl_session_id(h2o_socket_t *sock, h2o_mem_pool_t *pool)
1074 {
1075 h2o_iovec_t base64id, rawid = h2o_socket_get_ssl_session_id(sock);
1076
1077 if (rawid.base == NULL)
1078 return h2o_iovec_init(NULL, 0);
1079
1080 base64id.base = pool != NULL ? h2o_mem_alloc_pool(pool, char, h2o_base64_encode_capacity(rawid.len))
1081 : h2o_mem_alloc(h2o_base64_encode_capacity(rawid.len));
1082 base64id.len = h2o_base64_encode(base64id.base, rawid.base, rawid.len, 1);
1083 return base64id;
1084 }
1085
h2o_socket_log_ssl_cipher_bits(h2o_socket_t * sock,h2o_mem_pool_t * pool)1086 h2o_iovec_t h2o_socket_log_ssl_cipher_bits(h2o_socket_t *sock, h2o_mem_pool_t *pool)
1087 {
1088 int bits = h2o_socket_get_ssl_cipher_bits(sock);
1089 if (bits != 0) {
1090 char *s = (char *)(pool != NULL ? h2o_mem_alloc_pool(pool, char, sizeof(H2O_INT16_LONGEST_STR))
1091 : h2o_mem_alloc(sizeof(H2O_INT16_LONGEST_STR)));
1092 size_t len = sprintf(s, "%" PRId16, (int16_t)bits);
1093 return h2o_iovec_init(s, len);
1094 } else {
1095 return h2o_iovec_init(NULL, 0);
1096 }
1097 }
1098
/* Totally orders two socket addresses (AF_UNIX, AF_INET, AF_INET6), returning negative / zero / positive. Ports are compared
 * only when `check_port` is non-zero. Aborts on an unknown address family. */
int h2o_socket_compare_address(struct sockaddr *x, struct sockaddr *y, int check_port)
{
/* Compare two scalars, returning from the enclosing function on inequality. Arguments are parenthesized so that compound
 * expressions expand correctly; note that they may be evaluated more than once, so side-effect-free arguments only. */
#define CMP(a, b)                                                                                                                  \
    do {                                                                                                                           \
        if ((a) != (b))                                                                                                            \
            return (a) < (b) ? -1 : 1;                                                                                             \
    } while (0)

    CMP(x->sa_family, y->sa_family);

    if (x->sa_family == AF_UNIX) {
        struct sockaddr_un *xun = (void *)x, *yun = (void *)y;
        int r = strcmp(xun->sun_path, yun->sun_path);
        if (r != 0)
            return r;
    } else if (x->sa_family == AF_INET) {
        struct sockaddr_in *xin = (void *)x, *yin = (void *)y;
        /* compare in host byte order so that the ordering is numeric */
        CMP(ntohl(xin->sin_addr.s_addr), ntohl(yin->sin_addr.s_addr));
        if (check_port)
            CMP(ntohs(xin->sin_port), ntohs(yin->sin_port));
    } else if (x->sa_family == AF_INET6) {
        struct sockaddr_in6 *xin6 = (void *)x, *yin6 = (void *)y;
        int r = memcmp(xin6->sin6_addr.s6_addr, yin6->sin6_addr.s6_addr, sizeof(xin6->sin6_addr.s6_addr));
        if (r != 0)
            return r;
        if (check_port)
            CMP(ntohs(xin6->sin6_port), ntohs(yin6->sin6_port));
        CMP(xin6->sin6_flowinfo, yin6->sin6_flowinfo);
        CMP(xin6->sin6_scope_id, yin6->sin6_scope_id);
    } else {
        assert(!"unknown sa_family");
    }

#undef CMP
    return 0;
}
1135
h2o_socket_getnumerichost(const struct sockaddr * sa,socklen_t salen,char * buf)1136 size_t h2o_socket_getnumerichost(const struct sockaddr *sa, socklen_t salen, char *buf)
1137 {
1138 if (sa->sa_family == AF_INET) {
1139 /* fast path for IPv4 addresses */
1140 struct sockaddr_in *sin = (void *)sa;
1141 uint32_t addr;
1142 addr = htonl(sin->sin_addr.s_addr);
1143 return sprintf(buf, "%d.%d.%d.%d", addr >> 24, (addr >> 16) & 255, (addr >> 8) & 255, addr & 255);
1144 }
1145
1146 if (getnameinfo(sa, salen, buf, NI_MAXHOST, NULL, 0, NI_NUMERICHOST) != 0)
1147 return SIZE_MAX;
1148 return strlen(buf);
1149 }
1150
/* Returns the port number of `sa` in host byte order, or -1 when the address family carries no port. */
int32_t h2o_socket_getport(const struct sockaddr *sa)
{
    switch (sa->sa_family) {
    case AF_INET:
        /* ntohs (not htons, which happens to perform the same byte swap) is the semantically correct conversion */
        return ntohs(((struct sockaddr_in *)sa)->sin_port);
    case AF_INET6:
        return ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
    default:
        return -1;
    }
}
1162
h2o_socket_get_error_string(int errnum,const char * default_err)1163 const char *h2o_socket_get_error_string(int errnum, const char *default_err)
1164 {
1165 switch (errnum) {
1166 case ECONNREFUSED:
1167 return h2o_socket_error_conn_refused;
1168 case ETIMEDOUT:
1169 return h2o_socket_error_conn_timed_out;
1170 case ENETUNREACH:
1171 return h2o_socket_error_network_unreachable;
1172 case EHOSTUNREACH:
1173 return h2o_socket_error_host_unreachable;
1174 default:
1175 return default_err;
1176 }
1177 }
1178
static void create_ossl(h2o_socket_t *sock)
{
    /* instantiate the OpenSSL connection object for this socket */
    SSL *ossl = SSL_new(sock->ssl->ssl_ctx);
    /* stash the socket as app data; retrieved by h2o_socket_ssl_new_session_cb */
    SSL_set_app_data(ossl, sock);
    sock->ssl->ossl = ossl;
    setup_bio(sock);
}
1186
/* OpenSSL session-lookup callback implementing asynchronous resumption: the first invocation kicks off an async fetch and
 * reports "no session"; once the fetch completes and the handshake is replayed, the fetched session is returned. */
static SSL_SESSION *on_async_resumption_get(SSL *ssl,
#if !defined(LIBRESSL_VERSION_NUMBER) ? OPENSSL_VERSION_NUMBER >= 0x1010000fL : LIBRESSL_VERSION_NUMBER > 0x2070000f
                                            const
#endif
                                            unsigned char *data,
                                            int len, int *copy)
{
    h2o_socket_t *sock = BIO_get_data(SSL_get_rbio(ssl));

    switch (sock->ssl->handshake.server.async_resumption.state) {
    case ASYNC_RESUMPTION_STATE_RECORD:
        /* first pass: dispatch the asynchronous lookup keyed by the session ID, and return "not found" for now */
        sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_REQUEST_SENT;
        resumption_get_async(sock, h2o_iovec_init(data, len));
        return NULL;
    case ASYNC_RESUMPTION_STATE_COMPLETE:
        /* replay pass: hand the fetched session to OpenSSL; *copy = 1 makes OpenSSL take its own reference */
        *copy = 1;
        return sock->ssl->handshake.server.async_resumption.session_data;
    default:
        assert(!"FIXME");
        return NULL;
    }
}
1209
/* OpenSSL new-session callback for client connections: stores the freshly established session in the per-context session
 * cache so that subsequent connections to the same host:port can resume. Returns 1 when a reference to `sess` is retained. */
int h2o_socket_ssl_new_session_cb(SSL *s, SSL_SESSION *sess)
{
    h2o_socket_t *sock = (h2o_socket_t *)SSL_get_app_data(s);
    assert(sock != NULL);
    assert(sock->ssl != NULL);

    /* cache only on the client side, when a cache is configured (and, on OpenSSL 1.1.1+, when the session is resumable) */
    if (!SSL_is_server(s) && sock->ssl->handshake.client.session_cache != NULL
#if !defined(LIBRESSL_VERSION_NUMBER) && OPENSSL_VERSION_NUMBER >= 0x1010100fL
        && SSL_SESSION_is_resumable(sess)
#endif
    ) {
        h2o_cache_set(sock->ssl->handshake.client.session_cache, h2o_now(h2o_socket_get_loop(sock)),
                      sock->ssl->handshake.client.session_cache_key, sock->ssl->handshake.client.session_cache_key_hash,
                      h2o_iovec_init(sess, 1));
        return 1; /* retain ref count */
    }

    return 0; /* drop ref count */
}
1229
/* OpenSSL new-session callback for async resumption on the server side: serializes the session and passes it, along with the
 * session ID, to the application-supplied `resumption_new` callback for external storage. */
static int on_async_resumption_new(SSL *ssl, SSL_SESSION *session)
{
    h2o_socket_t *sock = BIO_get_data(SSL_get_rbio(ssl));

    h2o_iovec_t data;
    const unsigned char *id;
    unsigned id_len;
    unsigned char *p;

    /* build data */
    data.len = i2d_SSL_SESSION(session, NULL);
    /* NOTE(review): on-stack copy via alloca assumes serialized sessions stay small — appears safe for TLS sessions, but
     * worth confirming */
    data.base = alloca(data.len);
    p = (void *)data.base;
    i2d_SSL_SESSION(session, &p);

    id = SSL_SESSION_get_id(session, &id_len);
    resumption_new(sock, h2o_iovec_init(id, id_len), data);
    return 0; /* returning zero: no reference to `session` is retained here */
}
1249
/* Invoked when the TLS handshake finishes (successfully or not). On success, records the per-record overhead of the
 * negotiated cipher (used when sizing writes), then decodes any application data already buffered and invokes the
 * user-supplied handshake callback. */
static void on_handshake_complete(h2o_socket_t *sock, const char *err)
{
    if (err == NULL) {
        if (sock->ssl->ptls != NULL) {
            sock->ssl->record_overhead = ptls_get_record_overhead(sock->ssl->ptls);
        } else {
            /* derive the overhead from the cipher negotiated by OpenSSL */
            const SSL_CIPHER *cipher = SSL_get_current_cipher(sock->ssl->ossl);
            switch (SSL_CIPHER_get_id(cipher)) {
            case TLS1_CK_RSA_WITH_AES_128_GCM_SHA256:
            case TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256:
            case TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256:
            case TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:
            case TLS1_CK_RSA_WITH_AES_256_GCM_SHA384:
            case TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384:
            case TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384:
            case TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:
                sock->ssl->record_overhead = 5 /* header */ + 8 /* record_iv_length (RFC 5288 3) */ + 16 /* tag (RFC 5116 5.1) */;
                break;
#if defined(TLS1_CK_DHE_RSA_CHACHA20_POLY1305)
            case TLS1_CK_DHE_RSA_CHACHA20_POLY1305:
            case TLS1_CK_ECDHE_RSA_CHACHA20_POLY1305:
            case TLS1_CK_ECDHE_ECDSA_CHACHA20_POLY1305:
                sock->ssl->record_overhead = 5 /* header */ + 16 /* tag */;
                break;
#endif
            default:
                sock->ssl->record_overhead = 32; /* sufficiently large number that can hold most payloads */
                break;
            }
        }
    }

    /* fetch and clear the callback slots before invoking, so that the callback may schedule new I/O on the socket */
    h2o_socket_cb handshake_cb = sock->ssl->handshake.cb;
    sock->_cb.write = NULL;
    sock->ssl->handshake.cb = NULL;
    /* decode application data that arrived alongside (or was buffered during) the handshake */
    if (err == NULL)
        err = decode_ssl_input(sock);
    handshake_cb(sock, err);
}
1289
get_handshake_error(struct st_h2o_socket_ssl_t * ssl)1290 const char *get_handshake_error(struct st_h2o_socket_ssl_t *ssl)
1291 {
1292 const char *err = h2o_socket_error_ssl_handshake;
1293 if (ssl->ossl != NULL) {
1294 long verify_result = SSL_get_verify_result(ssl->ossl);
1295 if (verify_result != X509_V_OK) {
1296 err = X509_verify_cert_error_string(verify_result);
1297 assert(err != NULL);
1298 }
1299 }
1300 return err;
1301 }
1302
static void on_handshake_fail_complete(h2o_socket_t *sock, const char *err)
{
    /* `err` (the I/O-level reason) is intentionally ignored; report the TLS-level handshake error instead */
    on_handshake_complete(sock, get_handshake_error(sock->ssl));
}
1307
1308 static void proceed_handshake(h2o_socket_t *sock, const char *err);
1309
/* Advances a handshake that is being driven by picotls: feeds buffered input to ptls_handshake, flushes any generated output,
 * and schedules the next step (completion, failure, or waiting for more input). */
static void proceed_handshake_picotls(h2o_socket_t *sock)
{
    size_t consumed = sock->ssl->input.encrypted->size;
    ptls_buffer_t wbuf;
    ptls_buffer_init(&wbuf, "", 0);

    /* `consumed` is updated by ptls_handshake to the number of bytes it actually processed */
    int ret = ptls_handshake(sock->ssl->ptls, &wbuf, sock->ssl->input.encrypted->bytes, &consumed, NULL);
    h2o_buffer_consume(&sock->ssl->input.encrypted, consumed);

    /* determine the next action */
    h2o_socket_cb next_cb;
    switch (ret) {
    case 0:
        next_cb = on_handshake_complete;
        break;
    case PTLS_ERROR_IN_PROGRESS:
        next_cb = proceed_handshake;
        break;
    default:
        next_cb = on_handshake_fail_complete;
        break;
    }

    /* When something is to be sent, send it and then take the next action. If there's nothing to be sent and the handshake is still
     * in progress, wait for more bytes to arrive; otherwise, take the action immediately. */
    if (wbuf.off != 0) {
        h2o_socket_read_stop(sock);
        write_ssl_bytes(sock, wbuf.base, wbuf.off);
        flush_pending_ssl(sock, next_cb);
    } else if (ret == PTLS_ERROR_IN_PROGRESS) {
        h2o_socket_read_start(sock, next_cb);
    } else {
        next_cb(sock, NULL);
    }

    ptls_buffer_dispose(&wbuf);
}
1347
/* Advances a handshake driven by OpenSSL. On the server side this also implements the async-resumption dance: the initial
 * input is retained so that the handshake can be replayed from scratch once the asynchronously-fetched session arrives. */
static void proceed_handshake_openssl(h2o_socket_t *sock)
{
    h2o_iovec_t first_input = {NULL};
    int ret = 0;
    const char *err = NULL;

    assert(sock->ssl->ossl != NULL);

    if (SSL_is_server(sock->ssl->ossl) && sock->ssl->handshake.server.async_resumption.state == ASYNC_RESUMPTION_STATE_RECORD) {
        if (sock->ssl->input.encrypted->size <= 1024) {
            /* retain a copy of input if performing async resumption */
            first_input = h2o_iovec_init(alloca(sock->ssl->input.encrypted->size), sock->ssl->input.encrypted->size);
            memcpy(first_input.base, sock->ssl->input.encrypted->bytes, first_input.len);
        } else {
            /* input too large to replay; give up on async resumption for this connection */
            sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
        }
    }

Redo:
    ERR_clear_error();
    if (SSL_is_server(sock->ssl->ossl)) {
        ret = SSL_accept(sock->ssl->ossl);
        switch (sock->ssl->handshake.server.async_resumption.state) {
        case ASYNC_RESUMPTION_STATE_COMPLETE:
            break;
        case ASYNC_RESUMPTION_STATE_RECORD:
            /* async resumption has not been triggered; proceed the state to complete */
            sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
            break;
        case ASYNC_RESUMPTION_STATE_REQUEST_SENT: {
            /* sent async request, reset the ssl state, and wait for async response */
            assert(ret < 0);
            SSL_free(sock->ssl->ossl);
            create_ossl(sock);
            if (has_pending_ssl_bytes(sock->ssl))
                dispose_ssl_output_buffer(sock->ssl);
            /* restore the retained initial input so the handshake can be replayed from the beginning */
            h2o_buffer_consume(&sock->ssl->input.encrypted, sock->ssl->input.encrypted->size);
            h2o_buffer_reserve(&sock->ssl->input.encrypted, first_input.len);
            memcpy(sock->ssl->input.encrypted->bytes, first_input.base, first_input.len);
            sock->ssl->input.encrypted->size = first_input.len;
            h2o_socket_read_stop(sock);
            return;
        }
        default:
            h2o_fatal("unexpected async resumption state");
            break;
        }
    } else {
        ret = SSL_connect(sock->ssl->ossl);
    }

    if (ret == 0 || (ret < 0 && SSL_get_error(sock->ssl->ossl, ret) != SSL_ERROR_WANT_READ)) {
        /* OpenSSL 1.1.0 emits an alert immediately, we send it now. 1.0.2 emits the error when SSL_shutdown is called in
         * shutdown_ssl. */
        if (has_pending_ssl_bytes(sock->ssl)) {
            h2o_socket_read_stop(sock);
            flush_pending_ssl(sock, on_handshake_fail_complete);
            return;
        }
        err = get_handshake_error(sock->ssl);
        goto Complete;
    }

    if (has_pending_ssl_bytes(sock->ssl)) {
        /* handshake output pending; flush it, then either complete or continue */
        h2o_socket_read_stop(sock);
        flush_pending_ssl(sock, ret == 1 ? on_handshake_complete : proceed_handshake);
    } else {
        if (ret == 1) {
            /* handshake complete with nothing left to send; on the client side, verify the peer certificate's hostname */
            if (!SSL_is_server(sock->ssl->ossl)) {
                X509 *cert = SSL_get_peer_certificate(sock->ssl->ossl);
                if (cert != NULL) {
                    switch (validate_hostname(sock->ssl->handshake.client.server_name, cert)) {
                    case MatchFound:
                        /* ok */
                        break;
                    case MatchNotFound:
                        err = h2o_socket_error_ssl_cert_name_mismatch;
                        break;
                    default:
                        err = h2o_socket_error_ssl_cert_invalid;
                        break;
                    }
                    X509_free(cert);
                } else {
                    err = h2o_socket_error_ssl_no_cert;
                }
            }
            goto Complete;
        }
        /* more buffered input remains; feed it to OpenSSL before waiting on the socket */
        if (sock->ssl->input.encrypted->size != 0) {
            goto Redo;
        }
        h2o_socket_read_start(sock, proceed_handshake);
    }
    return;

Complete:
    h2o_socket_read_stop(sock);
    on_handshake_complete(sock, err);
}
1448
1449 /**
1450 * Called when it is still uncertain which of the two TLS stacks (picotls or OpenSSL) should handle the handshake.
1451 * The function first tries picotls without consuming the socket input buffer. Then, if picotls returns PTLS_ALERT_PROTOCOL_VERSION
1452 * indicating that the client is using TLS 1.2 or below, switches to using OpenSSL.
1453 */
static void proceed_handshake_undetermined(h2o_socket_t *sock)
{
    assert(sock->ssl->ossl == NULL && sock->ssl->ptls == NULL);

    ptls_context_t *ptls_ctx = h2o_socket_ssl_get_picotls_context(sock->ssl->ssl_ctx);
    assert(ptls_ctx != NULL);

    size_t consumed = sock->ssl->input.encrypted->size;
    ptls_buffer_t wbuf;
    ptls_buffer_init(&wbuf, "", 0);

#if PICOTLS_USE_DTRACE
    /* temporarily propagate this socket's tracing preference to the (global) picotls flag while creating the object */
    unsigned ptls_skip_tracing_backup = ptls_default_skip_tracing;
    ptls_default_skip_tracing = sock->_skip_tracing;
#endif
    ptls_t *ptls = ptls_new(ptls_ctx, 1);
#if PICOTLS_USE_DTRACE
    ptls_default_skip_tracing = ptls_skip_tracing_backup;
#endif
    if (ptls == NULL)
        h2o_fatal("no memory");
    *ptls_get_data_ptr(ptls) = sock;
    int ret = ptls_handshake(ptls, &wbuf, sock->ssl->input.encrypted->bytes, &consumed, NULL);

    if (ret == PTLS_ERROR_IN_PROGRESS && wbuf.off == 0) {
        /* we aren't sure if the picotls can process the handshake, retain handshake transcript and replay on next occasion */
        ptls_free(ptls);
    } else if (ret == PTLS_ALERT_PROTOCOL_VERSION) {
        /* the client cannot use tls1.3, fallback to openssl */
        ptls_free(ptls);
        create_ossl(sock);
        proceed_handshake_openssl(sock);
    } else {
        /* picotls is responsible for handling the handshake */
        sock->ssl->ptls = ptls;
        sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
        h2o_buffer_consume(&sock->ssl->input.encrypted, consumed);
        /* stop reading, send response */
        h2o_socket_read_stop(sock);
        write_ssl_bytes(sock, wbuf.base, wbuf.off);
        /* pick the continuation to run once the response has been flushed */
        h2o_socket_cb cb;
        switch (ret) {
        case 0:
            cb = on_handshake_complete;
            break;
        case PTLS_ERROR_IN_PROGRESS:
            cb = proceed_handshake;
            break;
        default:
            assert(ret != PTLS_ERROR_STATELESS_RETRY && "stateless retry is never turned on by us for TCP");
            cb = on_handshake_fail_complete;
            break;
        }
        flush_pending_ssl(sock, cb);
    }
    ptls_buffer_dispose(&wbuf);
}
1511
static void proceed_handshake(h2o_socket_t *sock, const char *err)
{
    sock->_cb.write = NULL;

    /* an I/O error during the handshake terminates it immediately */
    if (err != NULL) {
        h2o_socket_read_stop(sock);
        on_handshake_complete(sock, err);
        return;
    }

    /* dispatch to whichever TLS stack is (or will become) responsible */
    if (sock->ssl->ptls != NULL) {
        proceed_handshake_picotls(sock);
        return;
    }
    if (sock->ssl->ossl != NULL) {
        proceed_handshake_openssl(sock);
        return;
    }
    if (h2o_socket_ssl_get_picotls_context(sock->ssl->ssl_ctx) == NULL) {
        /* picotls is not registered for this context; the handshake is driven by OpenSSL */
        create_ossl(sock);
        proceed_handshake_openssl(sock);
        return;
    }
    proceed_handshake_undetermined(sock);
}
1533
/* Initiates a TLS handshake on `sock`. A NULL `server_name` selects the server role; otherwise a client handshake is started
 * towards `server_name` (driving SNI, session caching, and certificate-name validation). `handshake_cb` is invoked when the
 * handshake completes or fails. */
void h2o_socket_ssl_handshake(h2o_socket_t *sock, SSL_CTX *ssl_ctx, const char *server_name, h2o_iovec_t alpn_protos,
                              h2o_socket_cb handshake_cb)
{
    sock->ssl = h2o_mem_alloc(sizeof(*sock->ssl));
    *sock->ssl = (struct st_h2o_socket_ssl_t){};

    sock->ssl->ssl_ctx = ssl_ctx;

    /* setup the buffers; sock->input should be empty, sock->ssl->input.encrypted should contain the initial input, if any */
    h2o_buffer_init(&sock->ssl->input.encrypted, &h2o_socket_buffer_prototype);
    if (sock->input->size != 0) {
        /* bytes already read into sock->input are in fact TLS records; swap the buffers to reflect that */
        h2o_buffer_t *tmp = sock->input;
        sock->input = sock->ssl->input.encrypted;
        sock->ssl->input.encrypted = tmp;
    }

    sock->ssl->handshake.cb = handshake_cb;
    if (server_name == NULL) {
        /* is server */
        if (SSL_CTX_sess_get_get_cb(sock->ssl->ssl_ctx) != NULL)
            sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_RECORD;
        if (sock->ssl->input.encrypted->size != 0)
            proceed_handshake(sock, 0);
        else
            h2o_socket_read_start(sock, proceed_handshake);
    } else {
        /* is client; the client-side handshake is driven by OpenSSL */
        create_ossl(sock);
        if (alpn_protos.base != NULL)
            SSL_set_alpn_protos(sock->ssl->ossl, (const unsigned char *)alpn_protos.base, (unsigned)alpn_protos.len);
        h2o_cache_t *session_cache = h2o_socket_ssl_get_session_cache(sock->ssl->ssl_ctx);
        if (session_cache != NULL) {
            struct sockaddr_storage sa;
            int32_t port;
            if (h2o_socket_getpeername(sock, (struct sockaddr *)&sa) != 0 &&
                (port = h2o_socket_getport((struct sockaddr *)&sa)) != -1) {
                /* session cache is available; the key is "<server_name>:<port>" */
                h2o_iovec_t session_cache_key;
                session_cache_key.base = h2o_mem_alloc(strlen(server_name) + sizeof(":" H2O_UINT16_LONGEST_STR));
                session_cache_key.len = sprintf(session_cache_key.base, "%s:%" PRIu16, server_name, (uint16_t)port);
                sock->ssl->handshake.client.session_cache = session_cache;
                sock->ssl->handshake.client.session_cache_key = session_cache_key;
                sock->ssl->handshake.client.session_cache_key_hash =
                    h2o_cache_calchash(session_cache_key.base, session_cache_key.len);

                /* fetch from session cache */
                h2o_cache_ref_t *cacheref = h2o_cache_fetch(session_cache, h2o_now(h2o_socket_get_loop(sock)),
                                                            sock->ssl->handshake.client.session_cache_key,
                                                            sock->ssl->handshake.client.session_cache_key_hash);
                if (cacheref != NULL) {
                    /* offer the cached session for resumption */
                    SSL_set_session(sock->ssl->ossl, (SSL_SESSION *)cacheref->value.base);
                    h2o_cache_release(session_cache, cacheref);
                }
            }
        }
        /* retain the server name for SNI and for post-handshake certificate-name validation */
        sock->ssl->handshake.client.server_name = h2o_strdup(NULL, server_name, SIZE_MAX).base;
        SSL_set_tlsext_host_name(sock->ssl->ossl, sock->ssl->handshake.client.server_name);
        proceed_handshake(sock, 0);
    }
}
1593
void h2o_socket_ssl_resume_server_handshake(h2o_socket_t *sock, h2o_iovec_t session_data)
{
    /* deserialize the session fetched by the async lookup (an empty vector means "no session found") */
    if (session_data.len != 0) {
        const unsigned char *src = (void *)session_data.base;
        sock->ssl->handshake.server.async_resumption.session_data = d2i_SSL_SESSION(NULL, &src, (long)session_data.len);
        /* FIXME warn on failure */
    }

    /* mark the lookup as settled and replay the handshake */
    sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
    proceed_handshake(sock, 0);

    /* drop our reference to the session; OpenSSL took its own copy during the replay if it needed one */
    if (sock->ssl->handshake.server.async_resumption.session_data != NULL) {
        SSL_SESSION_free(sock->ssl->handshake.server.async_resumption.session_data);
        sock->ssl->handshake.server.async_resumption.session_data = NULL;
    }
}
1610
void h2o_socket_ssl_async_resumption_init(h2o_socket_ssl_resumption_get_async_cb get_async_cb,
                                          h2o_socket_ssl_resumption_new_cb new_cb)
{
    /* register the process-global callbacks used by the async session-resumption machinery */
    resumption_get_async = get_async_cb;
    resumption_new = new_cb;
}
1617
void h2o_socket_ssl_async_resumption_setup_ctx(SSL_CTX *ctx)
{
    /* hook the async-resumption callbacks into the context */
    SSL_CTX_sess_set_new_cb(ctx, on_async_resumption_new);
    SSL_CTX_sess_set_get_cb(ctx, on_async_resumption_get);
    /* if necessary, it is the responsibility of the caller to disable the internal cache */
}
1624
/* Returns the ex_data slot (allocated once per process) used to associate a ptls_context_t with an SSL_CTX. */
static int get_ptls_index(void)
{
    static volatile int index;
    H2O_MULTITHREAD_ONCE({ index = SSL_CTX_get_ex_new_index(0, NULL, NULL, NULL, NULL); });
    return index;
}
1631
h2o_socket_ssl_get_picotls_context(SSL_CTX * ossl)1632 ptls_context_t *h2o_socket_ssl_get_picotls_context(SSL_CTX *ossl)
1633 {
1634 return SSL_CTX_get_ex_data(ossl, get_ptls_index());
1635 }
1636
void h2o_socket_ssl_set_picotls_context(SSL_CTX *ossl, ptls_context_t *ptls)
{
    /* associate `ptls` with the SSL_CTX, enabling the picotls fast path for TLS 1.3 clients */
    SSL_CTX_set_ex_data(ossl, get_ptls_index(), ptls);
}
1641
static void on_dispose_ssl_ctx_session_cache(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp)
{
    /* ex_data destructor: release the session cache attached to the SSL_CTX, if one was set */
    h2o_cache_t *cache = (h2o_cache_t *)ptr;
    if (cache != NULL)
        h2o_cache_destroy(cache);
}
1648
/* Returns the ex_data slot (allocated once per process) used to attach an h2o_cache_t of sessions to an SSL_CTX; the cache is
 * destroyed alongside the context via on_dispose_ssl_ctx_session_cache. */
static int get_ssl_session_cache_index(void)
{
    static volatile int index;
    H2O_MULTITHREAD_ONCE({ index = SSL_CTX_get_ex_new_index(0, NULL, NULL, NULL, on_dispose_ssl_ctx_session_cache); });
    return index;
}
1655
h2o_socket_ssl_get_session_cache(SSL_CTX * ctx)1656 h2o_cache_t *h2o_socket_ssl_get_session_cache(SSL_CTX *ctx)
1657 {
1658 return (h2o_cache_t *)SSL_CTX_get_ex_data(ctx, get_ssl_session_cache_index());
1659 }
1660
void h2o_socket_ssl_set_session_cache(SSL_CTX *ctx, h2o_cache_t *cache)
{
    /* attach `cache` to the context; ownership transfers to the context (freed by its ex_data destructor) */
    SSL_CTX_set_ex_data(ctx, get_ssl_session_cache_index(), cache);
}
1665
void h2o_socket_ssl_destroy_session_cache_entry(h2o_iovec_t value)
{
    /* a cache entry holds one reference to the SSL_SESSION (stored in `base`); drop it */
    SSL_SESSION_free((SSL_SESSION *)value.base);
}
1671
/* Returns the application protocol negotiated via ALPN (or NPN as a fallback on OpenSSL), or an empty vector when TLS is not
 * in use or no protocol was negotiated. */
h2o_iovec_t h2o_socket_ssl_get_selected_protocol(h2o_socket_t *sock)
{
    const unsigned char *data = NULL;
    unsigned len = 0;

    if (sock->ssl == NULL)
        return h2o_iovec_init(NULL, 0);

    if (sock->ssl->ptls != NULL) {
        const char *proto = ptls_get_negotiated_protocol(sock->ssl->ptls);
        return proto != NULL ? h2o_iovec_init(proto, strlen(proto)) : h2o_iovec_init(NULL, 0);
    }

    /* OpenSSL: prefer ALPN, fall back to NPN */
#if H2O_USE_ALPN
    if (len == 0)
        SSL_get0_alpn_selected(sock->ssl->ossl, &data, &len);
#endif
#if H2O_USE_NPN
    if (len == 0)
        SSL_get0_next_proto_negotiated(sock->ssl->ossl, &data, &len);
#endif

    return h2o_iovec_init(data, len);
}
1696
/* returns non-zero while a picotls handshake is still in progress (i.e., data being exchanged is early data) */
int h2o_socket_ssl_is_early_data(h2o_socket_t *sock)
{
    assert(sock->ssl != NULL);
    return sock->ssl->ptls != NULL && !ptls_handshake_is_complete(sock->ssl->ptls);
}
1705
/* ALPN selection callback: `_protocols` is a zero-len-terminated array of h2o_iovec_t ordered by server preference; the first
 * entry that also appears in the client-supplied list (`_in`, length-prefixed wire format) is selected */
static int on_alpn_select(SSL *ssl, const unsigned char **out, unsigned char *outlen, const unsigned char *_in, unsigned int inlen,
                          void *_protocols)
{
    const h2o_iovec_t *protocols = _protocols;

    for (size_t i = 0; protocols[i].len != 0; ++i) {
        const unsigned char *p = _in, *end = _in + inlen;
        while (p != end) {
            size_t cand_len = *p++;
            if (cand_len > (size_t)(end - p)) {
                /* length prefix runs past the end of the extension; broken request */
                return SSL_TLSEXT_ERR_NOACK;
            }
            if (cand_len == protocols[i].len && memcmp(p, protocols[i].base, cand_len) == 0) {
                /* report the match using our own (stable) copy of the protocol name */
                *out = (const unsigned char *)protocols[i].base;
                *outlen = (unsigned char)protocols[i].len;
                return SSL_TLSEXT_ERR_OK;
            }
            p += cand_len;
        }
    }

    /* no protocol in common */
    return SSL_TLSEXT_ERR_NOACK;
}
1734
1735 #if H2O_USE_ALPN
1736
/* registers the ALPN selection callback; `protocols` is retained by pointer (not copied) and must outlive the SSL_CTX */
void h2o_ssl_register_alpn_protocols(SSL_CTX *ctx, const h2o_iovec_t *protocols)
{
    SSL_CTX_set_alpn_select_cb(ctx, on_alpn_select, (void *)protocols);
}
1741
1742 #endif
1743
1744 #if H2O_USE_NPN
1745
/* NPN advertise callback: `protocols` is a NUL-terminated string in NPN wire format, handed back to OpenSSL as-is */
static int on_npn_advertise(SSL *ssl, const unsigned char **out, unsigned *outlen, void *protocols)
{
    const char *list = protocols;
    *out = (const unsigned char *)list;
    *outlen = (unsigned)strlen(list);
    return SSL_TLSEXT_ERR_OK;
}
1752
/* registers the NPN advertise callback; `protocols` is retained by pointer (not copied) and must outlive the SSL_CTX */
void h2o_ssl_register_npn_protocols(SSL_CTX *ctx, const char *protocols)
{
    SSL_CTX_set_next_protos_advertised_cb(ctx, on_npn_advertise, (void *)protocols);
}
1757
1758 #endif
1759
/* Sets the Don't-Fragment bit (enables path-MTU discovery) on the socket, using whichever option the platform provides.
 * Returns 1 on success or when the platform lacks a suitable option for the address family, 0 if setsockopt fails. */
int h2o_socket_set_df_bit(int fd, int domain)
{
/* expands to a setsockopt call that *returns from the enclosing function*: 1 on success, 0 (after perror) on failure */
#define SETSOCKOPT(ip, optname, _optvar) \
    do { \
        int optvar = _optvar; \
        if (setsockopt(fd, ip, optname, &optvar, sizeof(optvar)) != 0) { \
            perror("failed to set the DF bit through setsockopt(" H2O_TO_STR(ip) ", " H2O_TO_STR(optname) ")"); \
            return 0; \
        } \
        return 1; \
    } while (0)

    switch (domain) {
    case AF_INET:
#if defined(IP_PMTUDISC_DO)
        SETSOCKOPT(IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_DO); /* Linux */
#elif defined(IP_DONTFRAG)
        SETSOCKOPT(IPPROTO_IP, IP_DONTFRAG, 1); /* BSDs */
#endif
        break;
    case AF_INET6:
#if defined(IPV6_PMTUDISC_DO)
        SETSOCKOPT(IPPROTO_IPV6, IPV6_MTU_DISCOVER, IPV6_PMTUDISC_DO); /* Linux */
#elif defined(IPV6_DONTFRAG)
        SETSOCKOPT(IPPROTO_IPV6, IPV6_DONTFRAG, 1); /* RFC 3542 */
#endif
        break;
    default:
        break;
    }

    /* reached when the address family is unknown or no DF option exists on this platform; treated as success */
    return 1;

#undef SETSOCKOPT
}
1795
/* marks the socket (and its picotls connection, if any) as excluded from USDT tracing */
void h2o_socket_set_skip_tracing(h2o_socket_t *sock, int skip_tracing)
{
    sock->_skip_tracing = skip_tracing;
    if (sock->ssl != NULL && sock->ssl->ptls != NULL)
        ptls_set_skip_tracing(sock->ssl->ptls, skip_tracing);
}
1802
/* Stops the currently running measurement (started elsewhere by setting `cur.start_at`), records the elapsed time in the
 * fixed-size ring of recent samples, and recomputes the moving average over all slots. */
void h2o_sliding_counter_stop(h2o_sliding_counter_t *counter, uint64_t now)
{
    const size_t num_slots = sizeof(counter->prev.slots) / sizeof(counter->prev.slots[0]);

    assert(counter->cur.start_at != 0);

    /* measure elapsed time, guarding against a clock that did not advance, then mark the measurement as stopped */
    uint64_t elapsed = now > counter->cur.start_at ? now - counter->cur.start_at : 0;
    counter->cur.start_at = 0;

    /* overwrite the oldest slot with the new sample, keeping the running sum in sync, and advance the ring index */
    counter->prev.sum += elapsed;
    counter->prev.sum -= counter->prev.slots[counter->prev.index];
    counter->prev.slots[counter->prev.index] = elapsed;
    counter->prev.index = (counter->prev.index + 1) % num_slots;

    /* refresh the published average */
    counter->average = counter->prev.sum / num_slots;
}
1826
1827 #if H2O_USE_EBPF_MAP
1828 #include <linux/bpf.h>
1829 #include <linux/unistd.h>
1830 #include <sys/stat.h>
1831 #include "h2o/multithread.h"
1832 #include "h2o-probes.h"
1833
/* Thin wrapper around bpf(BPF_MAP_CREATE); returns the new map FD, or a negative value with errno set on failure. */
static int ebpf_map_create(uint32_t map_type, uint32_t key_size, uint32_t value_size, uint32_t max_entries, const char *map_name)
{
    union bpf_attr attr = {
        .map_type = map_type,
        .key_size = key_size,
        .value_size = value_size,
        .max_entries = max_entries,
    };
    /* Copy at most sizeof - 1 bytes so that `map_name` is always NUL-terminated (the kernel rejects names that are not); the
     * designated initializer above has already zero-filled the rest of the struct, including `map_name`. */
    strncpy(attr.map_name, map_name, sizeof(attr.map_name) - 1);
    return syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
1845
/* pins the eBPF object referred to by `bpf_fd` at `pathname` on a BPF filesystem; returns 0 on success, -1 with errno set */
static int ebpf_obj_pin(int bpf_fd, const char *pathname)
{
    union bpf_attr attr = {
        .bpf_fd = (uint32_t)bpf_fd,
        .pathname = (uint64_t)pathname, /* bpf(2) ABI passes pointers as u64 */
    };
    return syscall(SYS_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}
1854
/* opens an eBPF object pinned at `pathname`; returns a new FD, or a negative value with errno set (ENOENT if not pinned) */
static int ebpf_obj_get(const char *pathname)
{
    union bpf_attr attr = {
        .pathname = (uint64_t)pathname, /* bpf(2) ABI passes pointers as u64 */
    };
    return syscall(SYS_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
}
1862
/* fills `*info` with the kernel's description of the eBPF map behind `fd`; returns 0 on success, -1 with errno set */
static int ebpf_obj_get_info_by_fd(int fd, struct bpf_map_info *info)
{
    union bpf_attr attr = {
        .info =
            {
                .bpf_fd = fd,
                .info = (uint64_t)info,
                .info_len = sizeof(*info),
            },
    };
    return syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
}
1875
/* looks up `key` in the eBPF map behind `fd`, storing the result into `value`; returns 0 on hit, -1 with errno set (ENOENT on
 * miss). `value` must point at a buffer of at least the map's value_size. */
static int ebpf_map_lookup(int fd, const void *key, void *value)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key = (uint64_t)key,
        .value = (uint64_t)value,
    };
    return syscall(SYS_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
1885
/* deletes `key` from the eBPF map behind `fd`; returns 0 on success, -1 with errno set (ENOENT if the key was absent) */
static int ebpf_map_delete(int fd, const void *key)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key = (uint64_t)key,
    };
    return syscall(SYS_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
1894
static int return_map_fd = -1; // FD of the pinned "h2o_return" map used by tracers to pass flags back; -1 until setup succeeds
1896
/* Opens — or creates and pins — the "h2o_return" eBPF map used for usdt-selective-tracing, storing its FD in `return_map_fd`.
 * Returns 1 on success, 0 on failure (with diagnostics printed). Must run as root, since bpf(2) requires it here. */
int h2o_socket_ebpf_setup(void)
{
    /* the attributes the pinned map must have for the tracer handshake to work */
    const struct {
        int type;
        uint32_t key_size;
        uint32_t value_size;
    } map_attr = {
        .type = BPF_MAP_TYPE_LRU_HASH,
        .key_size = sizeof(pid_t),
        .value_size = sizeof(uint64_t),
    };

    int fd = -1;
    if (getuid() != 0) {
        h2o_error_printf("failed to set up eBPF maps because bpf(2) requires root privileges\n");
        goto Error;
    }

    fd = ebpf_obj_get(H2O_EBPF_RETURN_MAP_PATH);
    if (fd < 0) {
        if (errno != ENOENT) {
            h2o_perror("BPF_OBJ_GET failed");
            goto Error;
        }
        /* Pinned eBPF map does not exist. Create one and pin it to the BPF filesystem. */
        fd = ebpf_map_create(map_attr.type, map_attr.key_size, map_attr.value_size, H2O_EBPF_RETURN_MAP_SIZE,
                             H2O_EBPF_RETURN_MAP_NAME);
        if (fd < 0) {
            if (errno == EPERM) {
                h2o_error_printf("BPF_MAP_CREATE failed with EPERM, maybe because RLIMIT_MEMLOCK is too small.\n");
            } else {
                h2o_perror("BPF_MAP_CREATE failed");
            }
            goto Error;
        }
        if (ebpf_obj_pin(fd, H2O_EBPF_RETURN_MAP_PATH) != 0) {
            if (errno == ENOENT) {
                h2o_error_printf("BPF_OBJ_PIN failed with ENOENT, because /sys/fs/bpf is not mounted as a BPF filesystem.\n");
            } else {
                h2o_perror("BPF_OBJ_PIN failed");
            }
            goto Error;
        }
    } else {
        /* BPF_OBJ_GET successfully opened a pinned eBPF map. Make sure the critical attributes (type, key size, value size) are
         * correct, otherwise usdt-selective-tracing does not work. */
        struct bpf_map_info m;
        if (ebpf_obj_get_info_by_fd(fd, &m) != 0) {
            h2o_perror("BPF_OBJ_GET_INFO_BY_FD failed");
            goto Error;
        }
        if (m.type != map_attr.type) {
            h2o_error_printf(H2O_EBPF_RETURN_MAP_PATH " has an unexpected map type: expected %d but got %d\n", map_attr.type,
                             m.type);
            goto Error;
        }
        if (m.key_size != map_attr.key_size) {
            h2o_error_printf(H2O_EBPF_RETURN_MAP_PATH " has an unexpected map key size: expected %" PRIu32 " but got %" PRIu32 "\n",
                             map_attr.key_size, m.key_size);
            goto Error;
        }
        if (m.value_size != map_attr.value_size) {
            h2o_error_printf(H2O_EBPF_RETURN_MAP_PATH " has an unexpected map value size: expected %" PRIu32 " but got %" PRIu32
                             "\n",
                             map_attr.value_size, m.value_size);
            goto Error;
        }
    }

    /* success */
    return_map_fd = fd;
    return 1;

Error:
    if (fd >= 0)
        close(fd);
    return 0;
}
1975
/* Keeps `*fd` in sync with the eBPF map pinned at `map_path`: closes the FD when the pin disappears, (re)opens it when the pin
 * (re)appears. Probes the filesystem at most once per second, tracked via `*last_attempt` (h2o_now() milliseconds). */
static void get_map_fd(h2o_loop_t *loop, const char *map_path, int *fd, uint64_t *last_attempt)
{
    // only check every second; note the operand order: `*last_attempt - now` would underflow once the clock advances,
    // silently disabling the throttle
    uint64_t now = h2o_now(loop);
    if (now - *last_attempt < 1000)
        return;

    *last_attempt = now;

    struct stat s;
    if (stat(map_path, &s) != 0) {
        // map path unavailable, cleanup fd if needed and leave
        if (*fd >= 0) {
            close(*fd);
            *fd = -1;
        }
        return;
    }

    if (*fd >= 0)
        return; // map still exists and we have a fd

    // map exists, try connect
    *fd = ebpf_obj_get(map_path);
    if (*fd < 0)
        h2o_perror("BPF_OBJ_GET failed");
}
2003
/* returns the per-thread FD for the tracing map pinned at H2O_EBPF_MAP_PATH (-1 if unavailable), refreshing it lazily */
static int get_tracing_map_fd(h2o_loop_t *loop)
{
    static __thread int fd = -1;                // cached map FD, one per thread
    static __thread uint64_t last_attempt = 0;  // timestamp of the last refresh attempt (see get_map_fd)
    get_map_fd(loop, H2O_EBPF_MAP_PATH, &fd, &last_attempt);
    return fd;
}
2011
/* copies the address and port of `sa` into `ea`; returns 1 on success, 0 for address families other than AF_INET/AF_INET6 */
static inline int set_ebpf_map_key_tuples(const struct sockaddr *sa, h2o_ebpf_address_t *ea)
{
    switch (sa->sa_family) {
    case AF_INET: {
        const struct sockaddr_in *sin = (const void *)sa;
        memcpy(ea->ip, &sin->sin_addr, sizeof(sin->sin_addr));
        ea->port = sin->sin_port;
        return 1;
    }
    case AF_INET6: {
        const struct sockaddr_in6 *sin6 = (const void *)sa;
        memcpy(ea->ip, &sin6->sin6_addr, sizeof(sin6->sin6_addr));
        ea->port = sin6->sin6_port;
        return 1;
    }
    default:
        return 0;
    }
}
2028
/* Builds the eBPF map key from the connection 4-tuple plus socket type; returns 1 on success, 0 if either address has an
 * unsupported family. The key is zeroed first so that padding bytes compare equal inside the kernel's hash map. */
int h2o_socket_ebpf_init_key_raw(h2o_ebpf_map_key_t *key, int sock_type, struct sockaddr *local, struct sockaddr *remote)
{
    memset(key, 0, sizeof(*key));
    if (!set_ebpf_map_key_tuples(local, &key->local))
        return 0;
    if (!set_ebpf_map_key_tuples(remote, &key->remote))
        return 0;
    key->family = local->sa_family == AF_INET6 ? 6 : 4;
    key->protocol = sock_type;
    return 1;
}
2040
/* init_key callback for h2o_socket_ebpf_lookup_flags: derives the eBPF map key from the socket's local/remote addresses and
 * socket type; returns 1 on success, 0 if any of the lookups fail */
int h2o_socket_ebpf_init_key(h2o_ebpf_map_key_t *key, void *_sock)
{
    h2o_socket_t *sock = _sock;
    struct sockaddr_storage local, remote;
    unsigned int sock_type, sock_type_len = sizeof(sock_type); /* was `sizeof(sock_type_len)`; same value only by coincidence */

    /* fetch info */
    if (h2o_socket_getsockname(sock, (void *)&local) == 0)
        return 0;
    if (h2o_socket_getpeername(sock, (void *)&remote) == 0)
        return 0;
    if (getsockopt(h2o_socket_get_fd(sock), SOL_SOCKET, SO_TYPE, &sock_type, &sock_type_len) != 0) /* can't the info be cached? */
        return 0;

    return h2o_socket_ebpf_init_key_raw(key, sock_type, (void *)&local, (void *)&remote);
}
2057
/* h2o_error_reporter_t callback: prints a once-per-minute summary of BPF_MAP_LOOKUP_ELEM ENOENT failures vs. successes.
 * `total_successes` is part of the callback signature but intentionally unused; only per-interval counts are reported. */
static void report_ebpf_lookup_errors(h2o_error_reporter_t *reporter, uint64_t total_successes, uint64_t cur_successes)
{
    fprintf(stderr,
            "BPF_MAP_LOOKUP_ELEM failed with ENOENT %" PRIu64 " time%s, succeeded: %" PRIu64 " time%s, over the last minute.\n",
            reporter->cur_errors, reporter->cur_errors > 1 ? "s" : "", cur_successes, cur_successes > 1 ? "s" : "");
}
2064
/* rate-limited reporter for eBPF lookup failures; shared by the DO_EBPF_RETURN_LOOKUP expansions below */
static h2o_error_reporter_t track_ebpf_lookup = H2O_ERROR_REPORTER_INITIALIZER(report_ebpf_lookup_errors);
2066
/* Runs `func` (which fires a USDT probe carrying `tid`) and then reads back the flags that an attached tracer may have written
 * for this thread into the pinned h2o_return map. Expects `flags` (uint64_t) and `loop` (h2o_loop_t *) to be in scope at the
 * expansion site; lookup failures are reported through `track_ebpf_lookup` at most once per minute. */
#define DO_EBPF_RETURN_LOOKUP(func) \
    do { \
        if (return_map_fd >= 0) { \
            pid_t tid = (pid_t)syscall(SYS_gettid); /* gettid() was not available until glibc 2.30 (2019) */ \
            /* Make sure old flags do not exist, otherwise the subsequent logic will be unreliable. */ \
            if (ebpf_map_delete(return_map_fd, &tid) == 0 || errno == ENOENT) { \
                do { \
                    func \
                } while (0); \
                if (ebpf_map_lookup(return_map_fd, &tid, &flags) == 0) { \
                    h2o_error_reporter_record_success(&track_ebpf_lookup); \
                } else { \
                    if (errno == ENOENT) { \
                        /* ENOENT could be issued in some reasons even if BPF tries to insert the entry, for example: \
                         * * the entry in LRU hash was evicted \
                         * * the insert operation in BPF program failed with ENOMEM \
                         * We don't know the frequency for this ENOENT, so cap the number of logs. \
                         * \
                         * Other than the above reasons, ENOENT is issued when the tracer does not set the flags via h2o_return \
                         * map, See h2o:_private_socket_lookup_flags handler in h2olog for details. */ \
                        h2o_error_reporter_record_error(loop, &track_ebpf_lookup, 60000, 0); \
                    } else { \
                        h2o_perror("BPF_MAP_LOOKUP failed"); \
                    } \
                } \
            } else { \
                h2o_perror("BPF_MAP_DELETE failed"); \
            } \
        } \
    } while (0)
2097
/* Returns the tracing flags for the connection described by `init_key`/`cbdata`: flags found in the tracing map are OR-base,
 * and — when the _private_socket_lookup_flags USDT probe is enabled — a tracer may override them via the h2o_return map. */
uint64_t h2o_socket_ebpf_lookup_flags(h2o_loop_t *loop, int (*init_key)(h2o_ebpf_map_key_t *key, void *cbdata), void *cbdata)
{
    uint64_t flags = 0;

    int tracing_map_fd = get_tracing_map_fd(loop);
    h2o_ebpf_map_key_t key;
    /* build the key only when someone (map or probe) is actually listening */
    if ((tracing_map_fd >= 0 || H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_ENABLED()) && init_key(&key, cbdata)) {
        if (tracing_map_fd >= 0)
            ebpf_map_lookup(tracing_map_fd, &key, &flags); /* a miss leaves `flags` at 0 */

        if (H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_ENABLED())
            DO_EBPF_RETURN_LOOKUP({ H2O__PRIVATE_SOCKET_LOOKUP_FLAGS(tid, flags, &key); });
    }

    return flags;
}
2114
/* gives an attached tracer a chance to adjust `flags` based on the SNI server name; returns the (possibly updated) flags */
uint64_t h2o_socket_ebpf_lookup_flags_sni(h2o_loop_t *loop, uint64_t flags, const char *server_name, size_t server_name_len)
{
    if (H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_SNI_ENABLED())
        DO_EBPF_RETURN_LOOKUP({ H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_SNI(tid, flags, server_name, server_name_len); });
    return flags;
}
2121
2122 #undef DO_EBPF_RETURN_LOOKUP
2123
2124 #else
2125
/* stub for builds without eBPF support: reports setup as unavailable */
int h2o_socket_ebpf_setup(void)
{
    return 0;
}
2130
/* stub for builds without eBPF support; must never be reached (h2o_fatal aborts, hence no return statement) */
int h2o_socket_ebpf_init_key_raw(h2o_ebpf_map_key_t *key, int sock_type, struct sockaddr *local, struct sockaddr *remote)
{
    h2o_fatal("unimplemented");
}
2135
/* stub for builds without eBPF support; must never be reached (h2o_fatal aborts, hence no return statement) */
int h2o_socket_ebpf_init_key(h2o_ebpf_map_key_t *key, void *sock)
{
    h2o_fatal("unimplemented");
}
2140
/* stub for builds without eBPF support: no tracer can set flags, so always returns 0 */
uint64_t h2o_socket_ebpf_lookup_flags(h2o_loop_t *loop, int (*init_key)(h2o_ebpf_map_key_t *key, void *cbdata), void *cbdata)
{
    return 0;
}
2145
/* stub for builds without eBPF support: returns the flags unmodified */
uint64_t h2o_socket_ebpf_lookup_flags_sni(h2o_loop_t *loop, uint64_t flags, const char *server_name, size_t server_name_len)
{
    return flags;
}
2150
2151 #endif
2152