1 /*
2  * Copyright (c) 2015 DeNA Co., Ltd., Kazuho Oku, Justin Zhu
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to
6  * deal in the Software without restriction, including without limitation the
7  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8  * sell copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20  * IN THE SOFTWARE.
21  */
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <limits.h>
26 #include <netdb.h>
27 #include <netinet/in.h>
28 #include <netinet/tcp.h>
29 #include <string.h>
30 #include <sys/syscall.h>
31 #include <sys/un.h>
32 #include <unistd.h>
33 #include <openssl/err.h>
34 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
35 #include <sys/ioctl.h>
36 #endif
37 #include "picotls.h"
38 #include "quicly.h"
39 #include "h2o/socket.h"
40 #include "h2o/multithread.h"
41 #include "../probes_.h"
42 
43 #if defined(__APPLE__) && defined(__clang__)
44 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
45 #endif
46 
47 #ifndef IOV_MAX
48 #define IOV_MAX UIO_MAXIOV
49 #endif
50 
/* the kernel headers bundled with Ubuntu 14.04 do not define this constant in netinet/tcp.h */
52 #if defined(__linux__) && !defined(TCP_NOTSENT_LOWAT)
53 #define TCP_NOTSENT_LOWAT 25
54 #endif
55 
56 #if H2O_USE_DTRACE && defined(__linux__)
57 #define H2O_USE_EBPF_MAP 1
58 #endif
59 
60 #define OPENSSL_HOSTNAME_VALIDATION_LINKAGE static
61 #pragma GCC diagnostic push
62 #pragma GCC diagnostic ignored "-Wpragmas"
63 #pragma GCC diagnostic ignored "-Wshorten-64-to-32"
64 #include "../../deps/ssl-conservatory/openssl/openssl_hostname_validation.c"
65 #pragma GCC diagnostic pop
66 
/**
 * Fires the `SOCKET_<label>` probe for `sock`, unless the socket has its `_skip_tracing` flag set.
 * The `sock` argument is evaluated exactly once; the captured pointer is what gets forwarded to `H2O_PROBE`.
 */
#define SOCKET_PROBE(label, sock, ...)                                                                                             \
    do {                                                                                                                           \
        h2o_socket_t *_sock = (sock);                                                                                              \
        if (!_sock->_skip_tracing)                                                                                                 \
            H2O_PROBE(SOCKET_##label, _sock, __VA_ARGS__);                                                                         \
    } while (0)
73 
/**
 * Per-socket TLS state. A socket uses either picotls (`ptls != NULL`) or OpenSSL (`ossl`) for the record layer; see
 * `decode_ssl_input`, which checks `ptls` first and falls back to `ossl`.
 */
struct st_h2o_socket_ssl_t {
    SSL_CTX *ssl_ctx;
    /**
     * OpenSSL connection object; released by `destroy_ssl` when non-NULL
     */
    SSL *ossl;
    /**
     * picotls connection object; released by `destroy_ssl` when non-NULL
     */
    ptls_t *ptls;
    /**
     * Used for detecting and closing the connection upon renegotiation (FIXME implement renegotiation). While `SSL_read` is being
     * called, this points to a flag that `write_bio` sets (and then fails the write); otherwise it is NULL.
     */
    int *did_write_in_read;
    /**
     * number of bytes that `calc_suggested_tls_payload_size` subtracts from the suggested record size to obtain the payload size
     */
    size_t record_overhead;
    struct {
        /**
         * `decode_ssl_input` asserts that this is NULL; NOTE(review): presumably non-NULL only while the handshake is in progress
         */
        h2o_socket_cb cb;
        union {
            struct {
                struct {
                    enum {
                        ASYNC_RESUMPTION_STATE_COMPLETE = 0, /* just pass thru */
                        ASYNC_RESUMPTION_STATE_RECORD,       /* record first input, restore SSL state if it changes to REQUEST_SENT
                                                              */
                        ASYNC_RESUMPTION_STATE_REQUEST_SENT  /* async request has been sent, and is waiting for response */
                    } state;
                    SSL_SESSION *session_data;
                } async_resumption;
            } server;
            /* client-side members; `server_name` and `session_cache_key.base` are freed by `destroy_ssl` */
            struct {
                char *server_name;
                h2o_cache_t *session_cache;
                h2o_iovec_t session_cache_key;
                h2o_cache_hashcode_t session_cache_key_hash;
            } client;
        };
    } handshake;
    struct {
        /**
         * Encrypted bytes that have not been decoded yet; consumed by `read_bio` (OpenSSL) and `decode_ssl_input` (picotls).
         */
        h2o_buffer_t *encrypted;
    } input;
    /**
     * Pending TLS data to be sent.
     */
    struct {
        /**
         * This buffer is initialized when and only when pending data is stored. Otherwise, all the members are zero-cleared; see
         * `has_pending_ssl_bytes`.
         * To reduce the cost of repeated memory allocation, expansion, and release, this buffer points to a chunk of memory being
         * allocated from `h2o_socket_ssl_buffer_allocator` when initialized. Upon disposal, the memory chunk being used by this
         * buffer is returned to that memory pool, unless the chunk has been expanded. It is designed as such because sometimes it
         * is hard to limit the amount of TLS records being generated at once (who knows how large the server's handshake messages
         * will be, or when it has to send a KeyUpdate message?). But for most of the cases, handshake messages will be smaller
         * than the default size (H2O_SOCKET_DEFAULT_SSL_BUFFER_SIZE), and application traffic will not cause expansion (see
         * `generate_tls_records`). Therefore, the memory chunk will be recycled.
         */
        ptls_buffer_t buf;
        /**
         * Reset to zero whenever `buf` is initialized or disposed. NOTE(review): presumably the offset within `buf` of the first
         * byte that has not been written to the socket yet -- confirm against the backend.
         */
        size_t pending_off;
    } output;
};
124 
/**
 * Bundles an OpenSSL context with a list of protocols.
 * NOTE(review): judging by the member names, `protocols` is presumably the ALPN protocol list and `_npn_list_of_protocols` its
 * NPN wire-format encoding -- confirm against the code that populates this structure.
 */
struct st_h2o_ssl_context_t {
    SSL_CTX *ctx;
    const h2o_iovec_t *protocols;
    h2o_iovec_t _npn_list_of_protocols;
};
130 
/* backend functions
 * These are declared before the backend (`socket/uv-binding.c.h` or `socket/evloop.c.h`) is included below. Of these,
 * `init_write_buf`, `dispose_write_buf`, `dispose_ssl_output_buffer`, and `has_pending_ssl_bytes` are defined in this file.
 * NOTE(review): the `do_*` and `get_*_uncached` functions are presumably defined by the included backend -- confirm. */
static void init_write_buf(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, size_t first_buf_written);
static void dispose_write_buf(h2o_socket_t *sock);
static void dispose_ssl_output_buffer(struct st_h2o_socket_ssl_t *ssl);
static int has_pending_ssl_bytes(struct st_h2o_socket_ssl_t *ssl);
static size_t generate_tls_records(h2o_socket_t *sock, h2o_iovec_t **bufs, size_t *bufcnt, size_t first_buf_written);
static void do_dispose_socket(h2o_socket_t *sock);
static void do_write(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, h2o_socket_cb cb);
static void do_read_start(h2o_socket_t *sock);
static void do_read_stop(h2o_socket_t *sock);
static int do_export(h2o_socket_t *_sock, h2o_socket_export_t *info);
static h2o_socket_t *do_import(h2o_loop_t *loop, h2o_socket_export_t *info);
static socklen_t get_peername_uncached(h2o_socket_t *sock, struct sockaddr *sa);
static socklen_t get_sockname_uncached(h2o_socket_t *sock, struct sockaddr *sa);

/* internal functions called from the backend */
static const char *decode_ssl_input(h2o_socket_t *sock);
static void on_write_complete(h2o_socket_t *sock, const char *err);
149 
150 #if H2O_USE_LIBUV
151 #include "socket/uv-binding.c.h"
152 #else
153 #include "socket/evloop.c.h"
154 #endif
155 
/* mmap settings referenced by `h2o_socket_buffer_prototype` */
h2o_buffer_mmap_settings_t h2o_socket_buffer_mmap_settings = {
    32 * 1024 * 1024, /* 32MB, should better be greater than max frame size of HTTP2 for performance reasons */
    "/tmp/h2o.b.XXXXXX"};

/* buffer prototype used for the input buffers of sockets (see `h2o_socket_export` and `h2o_socket_import`) */
h2o_buffer_prototype_t h2o_socket_buffer_prototype = {
    {H2O_SOCKET_INITIAL_INPUT_BUFFER_SIZE}, /* minimum initial capacity; actual initial size is ~8KB, see h2o_buffer_reserve */
    &h2o_socket_buffer_mmap_settings};

/* size of the memory chunks that `init_ssl_output_buffer` obtains from `h2o_socket_ssl_buffer_allocator` */
size_t h2o_socket_ssl_buffer_size = H2O_SOCKET_DEFAULT_SSL_BUFFER_SIZE;
/* per-thread pool from which the TLS output buffers are allocated, and to which unexpanded chunks are returned */
__thread h2o_mem_recycle_t h2o_socket_ssl_buffer_allocator = {1024};
166 
/* Error strings reported by the socket layer; functions such as `decode_ssl_input` return pointers to these arrays. */
const char h2o_socket_error_out_of_memory[] = "out of memory";
const char h2o_socket_error_io[] = "I/O error";
const char h2o_socket_error_closed[] = "socket closed by peer";
const char h2o_socket_error_conn_fail[] = "connection failure";
const char h2o_socket_error_conn_refused[] = "connection refused";
const char h2o_socket_error_conn_timed_out[] = "connection timed out";
const char h2o_socket_error_network_unreachable[] = "network unreachable";
const char h2o_socket_error_host_unreachable[] = "host unreachable";
const char h2o_socket_error_socket_fail[] = "socket creation failed";
const char h2o_socket_error_ssl_no_cert[] = "no certificate";
const char h2o_socket_error_ssl_cert_invalid[] = "invalid certificate";
const char h2o_socket_error_ssl_cert_name_mismatch[] = "certificate name mismatch";
const char h2o_socket_error_ssl_decode[] = "SSL decode error";
const char h2o_socket_error_ssl_handshake[] = "ssl handshake failure";
181 
/* Callbacks for session resumption; both are NULL until set.
 * NOTE(review): presumably `resumption_get_async` looks up a session by ID asynchronously and `resumption_new` stores a newly
 * created session -- confirm against the code that registers and invokes them. */
static void (*resumption_get_async)(h2o_socket_t *sock, h2o_iovec_t session_id);
static void (*resumption_new)(h2o_socket_t *sock, h2o_iovec_t session_id, h2o_iovec_t session_data);
184 
/**
 * BIO read callback. Moves up to `len` bytes of buffered ciphertext (`ssl->input.encrypted`) into `out`.
 *
 * @return the number of bytes copied, 0 if `len` is zero, or -1 (with the retry-read flag set on the BIO) if no ciphertext is
 *         buffered
 */
static int read_bio(BIO *b, char *out, int len)
{
    h2o_socket_t *sock = BIO_get_data(b);

    if (len == 0)
        return 0;

    h2o_buffer_t **encrypted = &sock->ssl->input.encrypted;

    /* nothing buffered; ask the caller to retry once more input has arrived */
    if ((*encrypted)->size == 0) {
        BIO_set_retry_read(b);
        return -1;
    }

    /* hand out as much as is available, capped by the size requested */
    int ncopy = (*encrypted)->size < len ? (int)(*encrypted)->size : len;
    memcpy(out, (*encrypted)->bytes, ncopy);
    h2o_buffer_consume(encrypted, ncopy);

    return ncopy;
}
205 
/**
 * Copies the vector `bufs` (of `bufcnt` elements) into `sock->_write_buf`, skipping the first `first_buf_written` bytes of the
 * first element. The embedded `smallbufs` array is used when `bufcnt` is smaller than its element count; otherwise an array
 * is allocated and remembered in `alloced_ptr`.
 */
static void init_write_buf(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, size_t first_buf_written)
{
    /* choose the storage */
    if (bufcnt >= PTLS_ELEMENTSOF(sock->_write_buf.smallbufs)) {
        sock->_write_buf.bufs = h2o_mem_alloc(sizeof(sock->_write_buf.bufs[0]) * bufcnt);
        sock->_write_buf.alloced_ptr = sock->_write_buf.bufs;
    } else {
        sock->_write_buf.bufs = sock->_write_buf.smallbufs;
    }
    sock->_write_buf.cnt = bufcnt;

    if (bufcnt == 0)
        return;

    /* the first element is trimmed by the amount already written, the rest are copied as-is */
    sock->_write_buf.bufs[0].base = bufs[0].base + first_buf_written;
    sock->_write_buf.bufs[0].len = bufs[0].len - first_buf_written;
    for (size_t idx = 1; idx != bufcnt; ++idx)
        sock->_write_buf.bufs[idx] = bufs[idx];
}
222 
/**
 * Releases the vector set up by `init_write_buf`. Nothing is freed when `bufs` points into (or one past the end of) the
 * embedded `smallbufs` array; otherwise `alloced_ptr` is freed and `bufs` is reset to `smallbufs`.
 */
static void dispose_write_buf(h2o_socket_t *sock)
{
    int points_into_smallbufs =
        sock->_write_buf.smallbufs <= sock->_write_buf.bufs &&
        sock->_write_buf.bufs <= sock->_write_buf.smallbufs + PTLS_ELEMENTSOF(sock->_write_buf.smallbufs);

    if (points_into_smallbufs)
        return; /* no need to free */

    free(sock->_write_buf.alloced_ptr);
    sock->_write_buf.bufs = sock->_write_buf.smallbufs;
}
234 
init_ssl_output_buffer(struct st_h2o_socket_ssl_t * ssl)235 static void init_ssl_output_buffer(struct st_h2o_socket_ssl_t *ssl)
236 {
237     ptls_buffer_init(&ssl->output.buf, h2o_mem_alloc_recycle(&h2o_socket_ssl_buffer_allocator, h2o_socket_ssl_buffer_size),
238                      h2o_socket_ssl_buffer_size);
239     ssl->output.buf.is_allocated = 1; /* set to true, so that the allocated memory is freed when the buffer is expanded */
240     ssl->output.pending_off = 0;
241 }
242 
dispose_ssl_output_buffer(struct st_h2o_socket_ssl_t * ssl)243 static void dispose_ssl_output_buffer(struct st_h2o_socket_ssl_t *ssl)
244 {
245     /* The destruction logic that we have here are different from `ptls_buffer_dispose` in following two aspects:
246      * - returns the allocated memory to the pool if possible
247      * - does not zero-clear the memory (there's no need to, because the content is something to be sent in clear) */
248 
249     assert(ssl->output.buf.is_allocated);
250 
251     if (ssl->output.buf.capacity == h2o_socket_ssl_buffer_size) {
252         h2o_mem_free_recycle(&h2o_socket_ssl_buffer_allocator, ssl->output.buf.base);
253     } else {
254         free(ssl->output.buf.base);
255     }
256     ssl->output.buf = (ptls_buffer_t){};
257     ssl->output.pending_off = 0;
258 }
259 
has_pending_ssl_bytes(struct st_h2o_socket_ssl_t * ssl)260 static int has_pending_ssl_bytes(struct st_h2o_socket_ssl_t *ssl)
261 {
262     /* for convenience, this function can be invoked for non-TLS connections too, in which case ssl will be NULL */
263     if (ssl == NULL)
264         return 0;
265 
266     /* the contract is that `dispose_ssl_output_buffer` is called immediately when all the data are written out */
267     return ssl->output.buf.base != NULL;
268 }
269 
/**
 * Appends `len` bytes starting at `in` to the pending TLS output of `sock`, initializing the output buffer if it is not in
 * use. Does nothing when `len` is zero. Aborts via `h2o_fatal` if the buffer cannot be grown.
 */
static void write_ssl_bytes(h2o_socket_t *sock, const void *in, size_t len)
{
    if (len == 0)
        return;

    if (!has_pending_ssl_bytes(sock->ssl))
        init_ssl_output_buffer(sock->ssl);

    ptls_buffer_t *outbuf = &sock->ssl->output.buf;
    if (ptls_buffer_reserve(outbuf, len) != 0)
        h2o_fatal("no memory; tried to allocate %zu bytes", len);
    memcpy(outbuf->base + outbuf->off, in, len);
    outbuf->off += len;
}
281 
/**
 * BIO write callback. Buffers the bytes as pending TLS output and returns `len`. If invoked while `did_write_in_read` is set
 * (i.e. from within `SSL_read`), raises that flag and fails with -1 instead of buffering anything.
 */
static int write_bio(BIO *b, const char *in, int len)
{
    h2o_socket_t *sock = BIO_get_data(b);
    int *write_in_read_flag = sock->ssl->did_write_in_read;

    /* FIXME no support for SSL renegotiation (yet) */
    if (write_in_read_flag != NULL) {
        *write_in_read_flag = 1;
        return -1;
    }

    write_ssl_bytes(sock, in, len);
    return len;
}
295 
/**
 * BIO puts callback; writes the NUL-terminated string `str` (without the terminator) through `write_bio`.
 */
static int puts_bio(BIO *b, const char *str)
{
    int len = (int)strlen(str);
    return write_bio(b, str, len);
}
300 
/**
 * BIO ctrl callback. Supports getting and setting the shutdown (close) flag, accepts flush as a no-op, and returns 0 for every
 * other command.
 */
static long ctrl_bio(BIO *b, int cmd, long num, void *ptr)
{
    if (cmd == BIO_CTRL_GET_CLOSE)
        return BIO_get_shutdown(b);

    if (cmd == BIO_CTRL_SET_CLOSE) {
        BIO_set_shutdown(b, (int)num);
        return 1;
    }

    if (cmd == BIO_CTRL_FLUSH)
        return 1;

    return 0;
}
315 
/**
 * Creates a BIO backed by `sock` (using the `*_bio` callbacks of this file) and attaches it to `sock->ssl->ossl` as both the
 * read and the write BIO. The BIO method table is created once and shared by all sockets. Aborts via `h2o_fatal` on
 * allocation failure.
 */
static void setup_bio(h2o_socket_t *sock)
{
    static BIO_METHOD *volatile bio_methods = NULL;
    H2O_MULTITHREAD_ONCE({
        bio_methods = BIO_meth_new(BIO_TYPE_FD, "h2o_socket");
        /* fail fast rather than passing NULL to the setters below */
        if (bio_methods == NULL)
            h2o_fatal("no memory");
        BIO_meth_set_write(bio_methods, write_bio);
        BIO_meth_set_read(bio_methods, read_bio);
        BIO_meth_set_puts(bio_methods, puts_bio);
        BIO_meth_set_ctrl(bio_methods, ctrl_bio);
    });

    BIO *bio = BIO_new(bio_methods);
    if (bio == NULL)
        h2o_fatal("no memory");
    BIO_set_data(bio, sock);
    BIO_set_init(bio, 1);
    SSL_set_bio(sock->ssl->ossl, bio, bio);
}
334 
/**
 * Decrypts the ciphertext buffered in `sock->ssl->input.encrypted` and appends the plaintext to `sock->input`. Must only be
 * called on a TLS socket that has no handshake callback set.
 *
 * @return NULL on success (including the case where more input is needed), or an error string otherwise
 */
static const char *decode_ssl_input(h2o_socket_t *sock)
{
    assert(sock->ssl != NULL);
    assert(sock->ssl->handshake.cb == NULL);

    if (sock->ssl->ptls != NULL) {
        if (sock->ssl->input.encrypted->size != 0) {
            const char *src = sock->ssl->input.encrypted->bytes, *src_end = src + sock->ssl->input.encrypted->size;
            h2o_iovec_t reserved;
            ptls_buffer_t rbuf;
            int ret;
            /* decrypt directly into the input buffer, reserving as much space as there is ciphertext */
            if ((reserved = h2o_buffer_try_reserve(&sock->input, sock->ssl->input.encrypted->size)).base == NULL)
                return h2o_socket_error_out_of_memory;
            ptls_buffer_init(&rbuf, reserved.base, reserved.len);
            do {
                size_t consumed = src_end - src;
                if ((ret = ptls_receive(sock->ssl->ptls, &rbuf, src, &consumed)) != 0)
                    break;
                src += consumed;
            } while (src != src_end);
            /* drop the ciphertext that has been processed */
            h2o_buffer_consume(&sock->ssl->input.encrypted, sock->ssl->input.encrypted->size - (src_end - src));
            if (rbuf.is_allocated) {
                /* picotls has moved the output to memory of its own; copy the plaintext back into the input buffer */
                if ((reserved = h2o_buffer_try_reserve(&sock->input, rbuf.off)).base == NULL) {
                    /* release (and clear) the plaintext held by picotls, which would otherwise be leaked */
                    ptls_buffer_dispose(&rbuf);
                    return h2o_socket_error_out_of_memory;
                }
                memcpy(reserved.base, rbuf.base, rbuf.off);
                sock->input->size += rbuf.off;
                ptls_buffer_dispose(&rbuf);
            } else {
                sock->input->size += rbuf.off;
            }
            if (!(ret == 0 || ret == PTLS_ERROR_IN_PROGRESS))
                return h2o_socket_error_ssl_decode;
        }
        return NULL;
    }

    while (sock->ssl->input.encrypted->size != 0 || SSL_pending(sock->ssl->ossl)) {
        int rlen;
        h2o_iovec_t buf = h2o_buffer_try_reserve(&sock->input, 4096);
        if (buf.base == NULL)
            return h2o_socket_error_out_of_memory;
        { /* call SSL_read (while detecting SSL renegotiation and reporting it as error) */
            int did_write_in_read = 0;
            sock->ssl->did_write_in_read = &did_write_in_read;
            ERR_clear_error();
            rlen = SSL_read(sock->ssl->ossl, buf.base, (int)buf.len);
            sock->ssl->did_write_in_read = NULL;
            if (did_write_in_read)
                return "ssl renegotiation not supported";
        }
        if (rlen < 0) {
            /* any negative value is a failure; never let it be added to the size of the input buffer */
            if (SSL_get_error(sock->ssl->ossl, rlen) != SSL_ERROR_WANT_READ) {
                return h2o_socket_error_ssl_decode;
            }
            break;
        } else if (rlen == 0) {
            break;
        } else {
            sock->input->size += rlen;
        }
    }

    return NULL;
}
399 
/**
 * Starts a write carrying no application data (`bufs` is NULL, `bufcnt` is zero), with `cb` as the completion callback.
 * NOTE(review): this presumably makes the backend send the pending TLS output held in `sock->ssl->output` -- confirm against
 * `do_write`.
 */
static void flush_pending_ssl(h2o_socket_t *sock, h2o_socket_cb cb)
{
    do_write(sock, NULL, 0, cb);
}
404 
destroy_ssl(struct st_h2o_socket_ssl_t * ssl)405 static void destroy_ssl(struct st_h2o_socket_ssl_t *ssl)
406 {
407     if (ssl->ptls != NULL) {
408         ptls_free(ssl->ptls);
409         ssl->ptls = NULL;
410     }
411     if (ssl->ossl != NULL) {
412         if (!SSL_is_server(ssl->ossl)) {
413             free(ssl->handshake.client.server_name);
414             free(ssl->handshake.client.session_cache_key.base);
415         }
416         SSL_free(ssl->ossl);
417         ssl->ossl = NULL;
418     }
419     h2o_buffer_dispose(&ssl->input.encrypted);
420     if (has_pending_ssl_bytes(ssl))
421         dispose_ssl_output_buffer(ssl);
422     free(ssl);
423 }
424 
/**
 * Releases the TLS state, the input buffer, and the cached addresses of `sock`, hands the socket to the backend for disposal,
 * and finally invokes the `on_close` callback if one was registered. `err` is not used.
 */
static void dispose_socket(h2o_socket_t *sock, const char *err)
{
    if (sock->ssl != NULL) {
        destroy_ssl(sock->ssl);
        sock->ssl = NULL;
    }
    h2o_buffer_dispose(&sock->input);

    /* free(NULL) is a no-op, hence no need to test the pointers */
    free(sock->_peername);
    sock->_peername = NULL;
    free(sock->_sockname);
    sock->_sockname = NULL;

    /* the callback is read before the backend disposes the socket, and invoked after that */
    void (*on_close_cb)(void *data) = sock->on_close.cb;
    void *on_close_data = sock->on_close.data;

    do_dispose_socket(sock);

    if (on_close_cb != NULL)
        on_close_cb(on_close_data);
}
452 
/**
 * Closes a TLS socket. When `err` is NULL and no write is in flight, a close_notify alert is generated first and the socket is
 * disposed once that alert has been flushed. In all other cases (an error was given, a write is in flight, the alert cannot
 * be generated, or there is nothing to flush) the socket is disposed immediately.
 */
static void shutdown_ssl(h2o_socket_t *sock, const char *err)
{
    if (err != NULL)
        goto Close;

    if (sock->_cb.write != NULL) {
        /* note: libuv calls the write callback after the socket is closed by uv_close (with status set to 0 if the write succeeded)
         */
        sock->_cb.write = NULL;
        goto Close;
    }

    if (sock->ssl->ptls != NULL) {
        ptls_buffer_t wbuf;
        uint8_t wbuf_small[32];
        ptls_buffer_init(&wbuf, wbuf_small, sizeof(wbuf_small));
        if (ptls_send_alert(sock->ssl->ptls, &wbuf, PTLS_ALERT_LEVEL_WARNING, PTLS_ALERT_CLOSE_NOTIFY) != 0) {
            /* the buffer might have been expanded onto the heap before the failure; release it */
            ptls_buffer_dispose(&wbuf);
            goto Close;
        }
        write_ssl_bytes(sock, wbuf.base, wbuf.off);
        ptls_buffer_dispose(&wbuf);
    } else if (sock->ssl->ossl != NULL) {
        ERR_clear_error();
        if (SSL_shutdown(sock->ssl->ossl) == -1)
            goto Close;
    } else {
        goto Close;
    }

    if (has_pending_ssl_bytes(sock->ssl)) {
        /* send the alert, then dispose the socket from the write-completion callback */
        h2o_socket_read_stop(sock);
        flush_pending_ssl(sock, dispose_socket);
        return;
    }

Close:
    dispose_socket(sock, err);
}
490 
/**
 * Discards an exported socket: destroys the TLS state (if any), disposes the input buffer, and closes the file descriptor.
 * `info->fd` must be valid on entry and is set to -1 on return.
 */
void h2o_socket_dispose_export(h2o_socket_export_t *info)
{
    assert(info->fd != -1);

    struct st_h2o_socket_ssl_t *ssl = info->ssl;
    if (ssl != NULL) {
        destroy_ssl(ssl);
        info->ssl = NULL;
    }

    h2o_buffer_dispose(&info->input);

    close(info->fd);
    info->fd = -1;
}
502 
/**
 * Detaches the state of `sock` (backend state, TLS state, input buffer) into `info` and closes `sock`. The buffers moved to
 * `info` are switched to a zero-initialized, function-local prototype. The socket must not be writing.
 *
 * @return 0 on success, or -1 if the backend failed to export the socket (in which case `sock` is left untouched)
 */
int h2o_socket_export(h2o_socket_t *sock, h2o_socket_export_t *info)
{
    static h2o_buffer_prototype_t nonpooling_prototype;

    assert(!h2o_socket_is_writing(sock));

    if (do_export(sock, info) == -1)
        return -1;

    /* move the TLS state, if any */
    info->ssl = sock->ssl;
    if (info->ssl != NULL) {
        sock->ssl = NULL;
        h2o_buffer_set_prototype(&info->ssl->input.encrypted, &nonpooling_prototype);
    }

    /* move the input buffer, leaving a fresh one on the socket */
    info->input = sock->input;
    h2o_buffer_set_prototype(&info->input, &nonpooling_prototype);
    h2o_buffer_init(&sock->input, &h2o_socket_buffer_prototype);

    h2o_socket_close(sock);

    return 0;
}
524 
h2o_socket_import(h2o_loop_t * loop,h2o_socket_export_t * info)525 h2o_socket_t *h2o_socket_import(h2o_loop_t *loop, h2o_socket_export_t *info)
526 {
527     h2o_socket_t *sock;
528 
529     assert(info->fd != -1);
530 
531     sock = do_import(loop, info);
532     info->fd = -1; /* just in case */
533     if ((sock->ssl = info->ssl) != NULL) {
534         setup_bio(sock);
535         h2o_buffer_set_prototype(&sock->ssl->input.encrypted, &h2o_socket_buffer_prototype);
536     }
537     sock->input = info->input;
538     h2o_buffer_set_prototype(&sock->input, &h2o_socket_buffer_prototype);
539     return sock;
540 }
541 
/**
 * Closes the socket. TLS sockets go through `shutdown_ssl`; plain sockets are disposed directly.
 */
void h2o_socket_close(h2o_socket_t *sock)
{
    if (sock->ssl != NULL) {
        shutdown_ssl(sock, NULL);
        return;
    }
    dispose_socket(sock, NULL);
}
550 
/**
 * Converts a suggested TLS record size into a payload size by subtracting the per-record overhead of the connection. The
 * value is returned unchanged for non-TLS sockets, and when the overhead is not smaller than the suggested size.
 */
static uint16_t calc_suggested_tls_payload_size(h2o_socket_t *sock, uint16_t suggested_tls_record_size)
{
    if (sock->ssl == NULL)
        return suggested_tls_record_size;

    size_t overhead = sock->ssl->record_overhead;
    if (overhead >= suggested_tls_record_size)
        return suggested_tls_record_size;

    return (uint16_t)(suggested_tls_record_size - overhead);
}
558 
/**
 * Turns the latency optimization off for `sock`: resets the not-sent low-water mark through `adjust_notsent_lowat` if it had
 * been minimized, marks the state as disabled, and lifts the suggested payload and write sizes to SIZE_MAX.
 */
static void disable_latency_optimized_write(h2o_socket_t *sock, int (*adjust_notsent_lowat)(h2o_socket_t *, unsigned))
{
    int was_minimized = sock->_latency_optimization.notsent_is_minimized;

    if (was_minimized) {
        adjust_notsent_lowat(sock, 0);
        sock->_latency_optimization.notsent_is_minimized = 0;
    }

    sock->_latency_optimization.state = H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DISABLED;
    sock->_latency_optimization.suggested_write_size = SIZE_MAX;
    sock->_latency_optimization.suggested_tls_payload_size = SIZE_MAX;
}
569 
/**
 * Decides, from the TCP state given, whether the latency optimization is to be used for `sock`, and updates
 * `sock->_latency_optimization` accordingly.
 *
 * The optimization is disabled when the RTT is below `conditions->min_rtt * 1000`, when
 * `rtt * conditions->max_additional_delay` is below `loop_time * 1000 * 100`, or when `adjust_notsent_lowat` fails. Otherwise
 * the state becomes DETERMINED, and:
 * - if `mss * cwnd_size` is below `conditions->max_cwnd`, the not-sent low-water mark is minimized, the suggested TLS payload
 *   size is derived from `mss`, and the suggested write size is set to `cwnd_avail` times that payload size;
 * - if not, the low-water mark is reset and both suggestions are lifted to SIZE_MAX.
 *
 * The products are calculated in 64 bits, so that large RTT or CWND values do not wrap around.
 */
static inline void prepare_for_latency_optimized_write(h2o_socket_t *sock,
                                                       const h2o_socket_latency_optimization_conditions_t *conditions, uint32_t rtt,
                                                       uint32_t mss, uint32_t cwnd_size, uint32_t cwnd_avail, uint64_t loop_time,
                                                       int (*adjust_notsent_lowat)(h2o_socket_t *, unsigned))
{
    /* check RTT */
    if (rtt < conditions->min_rtt * (uint64_t)1000)
        goto Disable;
    if ((uint64_t)rtt * conditions->max_additional_delay < loop_time * 1000 * 100)
        goto Disable;

    /* latency-optimization is enabled */
    sock->_latency_optimization.state = H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DETERMINED;

    /* no need to:
     *   1) adjust the write size if single_write_size << cwnd_size
     *   2) align TLS record boundary to TCP packet boundary if packet loss-rate is low and BW isn't small (implied by cwnd size)
     */
    if ((uint64_t)mss * cwnd_size < conditions->max_cwnd) {
        if (!sock->_latency_optimization.notsent_is_minimized) {
            if (adjust_notsent_lowat(sock, 1 /* cannot be set to zero on Linux */) != 0)
                goto Disable;
            sock->_latency_optimization.notsent_is_minimized = 1;
        }
        sock->_latency_optimization.suggested_tls_payload_size = calc_suggested_tls_payload_size(sock, mss);
        sock->_latency_optimization.suggested_write_size =
            cwnd_avail * (size_t)sock->_latency_optimization.suggested_tls_payload_size;
    } else {
        if (sock->_latency_optimization.notsent_is_minimized) {
            if (adjust_notsent_lowat(sock, 0) != 0)
                goto Disable;
            sock->_latency_optimization.notsent_is_minimized = 0;
        }
        sock->_latency_optimization.suggested_tls_payload_size = SIZE_MAX;
        sock->_latency_optimization.suggested_write_size = SIZE_MAX;
    }
    return;

Disable:
    disable_latency_optimized_write(sock, adjust_notsent_lowat);
}
611 
/**
 * Obtains RTT, MSS, size of CWND (in the number of packets).
 * Also writes to cwnd_avail minimum number of packets (of MSS size) sufficient to shut up poll-for-write under the precondition
 * that TCP_NOTSENT_LOWAT is set to 1.
 *
 * Returns 0 on success. Returns -1 if the information cannot be obtained from the socket, or if the platform is not supported;
 * the output arguments must not be relied upon in that case.
 */
static int obtain_tcp_info(int fd, uint32_t *rtt, uint32_t *mss, uint32_t *cwnd_size, uint32_t *cwnd_avail)
{
/* Converts a CWND and the amount of inflight data, both given in bytes, to the number of packets. `*mss` has to be set (to a
 * non-zero value) before this macro is used. At least two packets are always reported as being available. */
#define CALC_CWND_PAIR_FROM_BYTE_UNITS(cwnd_bytes, inflight_bytes)                                                                 \
    do {                                                                                                                           \
        *cwnd_size = (cwnd_bytes + *mss / 2) / *mss;                                                                               \
        *cwnd_avail = cwnd_bytes > inflight_bytes ? (cwnd_bytes - inflight_bytes) / *mss + 2 : 2;                                  \
    } while (0)

#if defined(__linux__) && defined(TCP_INFO)

    /* here, the values of `tcpi_snd_cwnd` and `tcpi_unacked` are used as packet counts, without being divided by MSS */
    struct tcp_info tcpi;
    socklen_t tcpisz = sizeof(tcpi);
    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &tcpi, &tcpisz) != 0)
        return -1;
    *rtt = tcpi.tcpi_rtt;
    *mss = tcpi.tcpi_snd_mss;
    *cwnd_size = tcpi.tcpi_snd_cwnd;
    *cwnd_avail = tcpi.tcpi_snd_cwnd > tcpi.tcpi_unacked ? tcpi.tcpi_snd_cwnd - tcpi.tcpi_unacked + 2 : 2;
    return 0;

#elif defined(__FreeBSD__) && defined(TCP_INFO) && 0 /* disabled since we wouldn't use it anyways; OS lacks TCP_NOTSENT_LOWAT */

    struct tcp_info tcpi;
    socklen_t tcpisz = sizeof(tcpi);
    int bytes_inflight;
    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &tcpi, &tcpisz) != 0 || ioctl(fd, FIONWRITE, &bytes_inflight) == -1)
        return -1;
    *rtt = tcpi.tcpi_rtt;
    *mss = tcpi.tcpi_snd_mss;
    CALC_CWND_PAIR_FROM_BYTE_UNITS(tcpi.tcpi_snd_cwnd, bytes_inflight);
    return 0;

#elif defined(__APPLE__) && defined(TCP_CONNECTION_INFO)

    /* a zero MSS is treated as failure, as the macro below divides by it */
    struct tcp_connection_info tcpi;
    socklen_t tcpisz = sizeof(tcpi);
    if (getsockopt(fd, IPPROTO_TCP, TCP_CONNECTION_INFO, &tcpi, &tcpisz) != 0 || tcpi.tcpi_maxseg == 0)
        return -1;
    /* NOTE(review): presumably converts milliseconds to the unit used by the other platforms -- confirm */
    *rtt = tcpi.tcpi_srtt * 1000;
    *mss = tcpi.tcpi_maxseg;
    CALC_CWND_PAIR_FROM_BYTE_UNITS(tcpi.tcpi_snd_cwnd, tcpi.tcpi_snd_sbbytes);
    return 0;

#else
    /* TODO add support for NetBSD; note that the OS returns the number of packets for tcpi_snd_cwnd; see
     * http://twitter.com/n_soda/status/740719125878575105
     */
    return -1;
#endif

#undef CALC_CWND_PAIR_FROM_BYTE_UNITS
}
669 
670 #ifdef TCP_NOTSENT_LOWAT
adjust_notsent_lowat(h2o_socket_t * sock,unsigned notsent_lowat)671 static int adjust_notsent_lowat(h2o_socket_t *sock, unsigned notsent_lowat)
672 {
673     return setsockopt(h2o_socket_get_fd(sock), IPPROTO_TCP, TCP_NOTSENT_LOWAT, &notsent_lowat, sizeof(notsent_lowat));
674 }
675 #else
676 #define adjust_notsent_lowat NULL
677 #endif
678 
/**
 * Re-evaluates whether the latency optimization can be applied to `sock` under the given `conditions`, updating
 * `sock->_latency_optimization` (and possibly the TCP_NOTSENT_LOWAT option of the socket) as a side effect.
 *
 * The optimization is disabled when TCP_NOTSENT_LOWAT is unavailable, when libuv is used, or when the TCP state cannot be
 * obtained from the socket.
 *
 * @return the suggested write size; this is the value that `disable_latency_optimized_write` or
 *         `prepare_for_latency_optimized_write` has stored in `sock->_latency_optimization.suggested_write_size`
 */
size_t h2o_socket_do_prepare_for_latency_optimized_write(h2o_socket_t *sock,
                                                         const h2o_socket_latency_optimization_conditions_t *conditions)
{
    uint32_t rtt = 0, mss = 0, cwnd_size = 0, cwnd_avail = 0;
    uint64_t loop_time = UINT64_MAX;
    int can_prepare = 1;

#if !defined(TCP_NOTSENT_LOWAT)
    /* the feature cannot be setup unless TCP_NOTSENT_LOWAT is available */
    can_prepare = 0;
#endif

#if H2O_USE_LIBUV
    /* poll-then-write is impossible with libuv */
    can_prepare = 0;
#else
    if (can_prepare)
        loop_time = h2o_evloop_get_execution_time_millisec(h2o_socket_get_loop(sock));
#endif

    /* obtain TCP states */
    if (can_prepare && obtain_tcp_info(h2o_socket_get_fd(sock), &rtt, &mss, &cwnd_size, &cwnd_avail) != 0)
        can_prepare = 0;

    /* determine suggested_write_size, suggested_tls_record_size and adjust TCP_NOTSENT_LOWAT based on the obtained information */
    if (can_prepare) {
        prepare_for_latency_optimized_write(sock, conditions, rtt, mss, cwnd_size, cwnd_avail, loop_time, adjust_notsent_lowat);
    } else {
        disable_latency_optimized_write(sock, adjust_notsent_lowat);
    }

    return sock->_latency_optimization.suggested_write_size;
}
714 
/**
 * Returns the number of bytes to be put into the next TLS record, given that `bufsize` bytes are available; i.e. the smaller
 * of `bufsize` and the maximum record payload size currently in effect.
 *
 * While the latency optimizer is not in action (state TBD or DISABLED), the maximum is derived from a 1400-byte record until
 * 64KB have been written, and is unlimited afterwards. Otherwise, the payload size suggested by the optimizer is used; as a
 * side effect, state DETERMINED is changed to NEEDS_UPDATE.
 */
static size_t calc_tls_write_size(h2o_socket_t *sock, size_t bufsize)
{
    size_t max_payload;
    int optimizer_inactive = sock->_latency_optimization.state == H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_TBD ||
                             sock->_latency_optimization.state == H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DISABLED;

    if (optimizer_inactive) {
        /* base the size on the number of bytes that have already been sent */
        if (sock->bytes_written < 64 * 1024) {
            max_payload = calc_suggested_tls_payload_size(sock, 1400);
        } else {
            max_payload = SIZE_MAX;
        }
    } else {
        if (sock->_latency_optimization.state == H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_DETERMINED)
            sock->_latency_optimization.state = H2O_SOCKET_LATENCY_OPTIMIZATION_STATE_NEEDS_UPDATE;
        max_payload = sock->_latency_optimization.suggested_tls_payload_size;
    }

    if (max_payload < bufsize)
        return max_payload;
    return bufsize;
}
736 
737 /**
738  * Given a vector, generate at least one TLS record if there's enough space in the buffer, and return the size of application data
739  * being encrypted. Otherwise, returns zero.
740  */
generate_tls_records_from_one_vec(h2o_socket_t * sock,const void * input,size_t inlen)741 static size_t generate_tls_records_from_one_vec(h2o_socket_t *sock, const void *input, size_t inlen)
742 {
743     static const size_t MAX_RECORD_PAYLOAD_SIZE = 16 * 1024, LARGE_RECORD_OVERHEAD = 5 + 32;
744 
745     size_t tls_write_size = calc_tls_write_size(sock, inlen);
746     size_t space_left = sock->ssl->output.buf.capacity - sock->ssl->output.buf.off;
747 
748     if (tls_write_size < inlen) {
749         /* Writing small TLS records, one by one. Bail out if we might fail to do so. */
750         if (space_left < tls_write_size + LARGE_RECORD_OVERHEAD)
751             return 0;
752     } else {
753         /* Writing full-sized records. Adjust tls_write_size to a multiple of full-sized TLS records, or bail out if we cannot
754          * write one. */
755         size_t rec_capacity = space_left / (MAX_RECORD_PAYLOAD_SIZE + LARGE_RECORD_OVERHEAD);
756         if (rec_capacity == 0)
757             return 0;
758         tls_write_size = MAX_RECORD_PAYLOAD_SIZE * rec_capacity;
759         if (tls_write_size > inlen)
760             tls_write_size = inlen;
761     }
762 
763     /* Generate TLS record(s). */
764     if (sock->ssl->ptls != NULL) {
765         int ret = ptls_send(sock->ssl->ptls, &sock->ssl->output.buf, input, tls_write_size);
766         assert(ret == 0);
767     } else {
768         int ret = SSL_write(sock->ssl->ossl, input, (int)tls_write_size);
769         /* The error happens if SSL_write is called after SSL_read returns a fatal error (e.g. due to corrupt TCP packet being
770          * received). We might be converting more and more TLS records on this side as read errors occur. */
771         if (ret <= 0)
772             return SIZE_MAX;
773         assert(ret == tls_write_size);
774     }
775 
776     SOCKET_PROBE(WRITE_TLS_RECORD, sock, tls_write_size, sock->ssl->output.buf.off);
777     return tls_write_size;
778 }
779 
780 /**
781  * Generate as many TLS records as possible, given a list of vectors. Upon return, `*bufs` and `*bufcnt` will be updated to point
782  * the buffers that still have pending data, and the number of bytes being already written within `(*buf)[0]` will be returned.
783  */
generate_tls_records(h2o_socket_t * sock,h2o_iovec_t ** bufs,size_t * bufcnt,size_t first_buf_written)784 static size_t generate_tls_records(h2o_socket_t *sock, h2o_iovec_t **bufs, size_t *bufcnt, size_t first_buf_written)
785 {
786     assert(!has_pending_ssl_bytes(sock->ssl) && "we are filling encrypted bytes from the front, with no existing buffer, always");
787 
788     while (*bufcnt != 0) {
789         if ((*bufs)->len == 0) {
790             ++*bufs;
791             --*bufcnt;
792             continue;
793         }
794         if (!has_pending_ssl_bytes(sock->ssl))
795             init_ssl_output_buffer(sock->ssl);
796         size_t bytes_newly_written =
797             generate_tls_records_from_one_vec(sock, (*bufs)->base + first_buf_written, (*bufs)->len - first_buf_written);
798         if (bytes_newly_written == SIZE_MAX) {
799             return SIZE_MAX;
800         } else if (bytes_newly_written == 0) {
801             break;
802         }
803         first_buf_written += bytes_newly_written;
804         if ((*bufs)->len == first_buf_written) {
805             first_buf_written = 0;
806             ++*bufs;
807             --*bufcnt;
808         }
809     }
810 
811     return first_buf_written;
812 }
813 
/**
 * Writes the given vectors to the socket. The total length of the vectors is added to `sock->bytes_written`, then the actual
 * write is delegated to `do_write`. A write callback must not be registered at the time of the call.
 */
void h2o_socket_write(h2o_socket_t *sock, h2o_iovec_t *bufs, size_t bufcnt, h2o_socket_cb cb)
{
    SOCKET_PROBE(WRITE, sock, bufs, bufcnt, cb);

    assert(sock->_cb.write == NULL);

    size_t idx = 0;
    while (idx < bufcnt) {
        sock->bytes_written += bufs[idx].len;
#if H2O_SOCKET_DUMP_WRITE
        h2o_error_printf("writing %zu bytes to fd:%d\n", bufs[idx].len, h2o_socket_get_fd(sock));
        h2o_dump_memory(stderr, bufs[idx].base, bufs[idx].len);
#endif
        ++idx;
    }

    do_write(sock, bufs, bufcnt, cb);
}
830 
/**
 * Finishes a write: releases the TLS output buffer if one is in use, unregisters the write callback, and invokes that callback
 * with `err`.
 */
void on_write_complete(h2o_socket_t *sock, const char *err)
{
    if (has_pending_ssl_bytes(sock->ssl))
        dispose_ssl_output_buffer(sock->ssl);

    /* unregister before invoking, so that the callback sees no write callback registered */
    h2o_socket_cb write_cb = sock->_cb.write;
    sock->_cb.write = NULL;
    write_cb(sock, err);
}
842 
/**
 * Registers `cb` as the read callback of the socket, then calls `do_read_start`.
 */
void h2o_socket_read_start(h2o_socket_t *sock, h2o_socket_cb cb)
{
    sock->_cb.read = cb;
    do_read_start(sock);
}
848 
/**
 * Clears the read callback of the socket, then calls `do_read_stop`.
 */
void h2o_socket_read_stop(h2o_socket_t *sock)
{
    sock->_cb.read = NULL;
    do_read_stop(sock);
}
854 
/**
 * Replaces the cached peer address of the socket with a copy of `sa` (`len` bytes). The previously cached entry is freed.
 */
void h2o_socket_setpeername(h2o_socket_t *sock, struct sockaddr *sa, socklen_t len)
{
    free(sock->_peername);

    /* the entry is a header followed by `len` bytes of address */
    size_t entry_size = offsetof(struct st_h2o_socket_addr_t, addr) + len;
    sock->_peername = h2o_mem_alloc(entry_size);
    memcpy(&sock->_peername->addr, sa, len);
    sock->_peername->len = len;
}
862 
/**
 * Copies the peer address of the socket to `sa` and returns its length. The address is served from the cache when one exists;
 * otherwise it is obtained through `get_peername_uncached` and stored in the cache.
 */
socklen_t h2o_socket_getpeername(h2o_socket_t *sock, struct sockaddr *sa)
{
    if (sock->_peername == NULL) {
        /* cache miss; fetch the address and remember it */
        socklen_t fetched = get_peername_uncached(sock, sa);
        h2o_socket_setpeername(sock, sa, fetched);
        return fetched;
    }

    memcpy(sa, &sock->_peername->addr, sock->_peername->len);
    return sock->_peername->len;
}
875 
/**
 * Copies the local address of the socket to `sa` and returns its length. The address is served from the cache when one exists;
 * otherwise it is obtained through `get_sockname_uncached` and stored in the cache.
 */
socklen_t h2o_socket_getsockname(h2o_socket_t *sock, struct sockaddr *sa)
{
    if (sock->_sockname == NULL) {
        /* cache miss; fetch the address and remember it (header followed by the address bytes) */
        socklen_t fetched = get_sockname_uncached(sock, sa);
        sock->_sockname = h2o_mem_alloc(offsetof(struct st_h2o_socket_addr_t, addr) + fetched);
        memcpy(&sock->_sockname->addr, sa, fetched);
        sock->_sockname->len = fetched;
        return fetched;
    }

    memcpy(sa, &sock->_sockname->addr, sock->_sockname->len);
    return sock->_sockname->len;
}
890 
h2o_socket_get_ptls(h2o_socket_t * sock)891 ptls_t *h2o_socket_get_ptls(h2o_socket_t *sock)
892 {
893     return sock->ssl != NULL ? sock->ssl->ptls : NULL;
894 }
895 
/**
 * Returns the name of the TLS protocol version being used, or NULL if the socket is not using TLS. Connections handled by
 * picotls are reported as "TLSv1.3"; for those handled by OpenSSL, the value of `SSL_get_version` is returned.
 */
const char *h2o_socket_get_ssl_protocol_version(h2o_socket_t *sock)
{
    if (sock->ssl == NULL)
        return NULL;
    if (sock->ssl->ptls != NULL)
        return "TLSv1.3";
    if (sock->ssl->ossl != NULL)
        return SSL_get_version(sock->ssl->ossl);
    return NULL;
}
906 
/**
 * Returns the result of `ptls_is_psk_handshake` (picotls) or `SSL_session_reused` (OpenSSL) for the connection, or -1 if the
 * socket has neither TLS stack attached.
 */
int h2o_socket_get_ssl_session_reused(h2o_socket_t *sock)
{
    if (sock->ssl == NULL)
        return -1;
    if (sock->ssl->ptls != NULL)
        return ptls_is_psk_handshake(sock->ssl->ptls);
    if (sock->ssl->ossl != NULL)
        return (int)SSL_session_reused(sock->ssl->ossl);
    return -1;
}
917 
/**
 * Returns the name of the cipher being used, or NULL if it is unavailable. For picotls the name of the AEAD algorithm is
 * returned; for OpenSSL, the value of `SSL_get_cipher_name`.
 */
const char *h2o_socket_get_ssl_cipher(h2o_socket_t *sock)
{
    if (sock->ssl == NULL)
        return NULL;

    if (sock->ssl->ptls != NULL) {
        ptls_cipher_suite_t *suite = ptls_get_cipher(sock->ssl->ptls);
        return suite != NULL ? suite->aead->name : NULL;
    }

    if (sock->ssl->ossl != NULL)
        return SSL_get_cipher_name(sock->ssl->ossl);

    return NULL;
}
931 
/**
 * Returns the strength of the cipher being used in bits, or zero if it is unavailable (no TLS, or cipher not yet negotiated).
 */
int h2o_socket_get_ssl_cipher_bits(h2o_socket_t *sock)
{
    if (sock->ssl != NULL) {
        if (sock->ssl->ptls != NULL) {
            ptls_cipher_suite_t *cipher = ptls_get_cipher(sock->ssl->ptls);
            if (cipher == NULL)
                return 0;
            /* picotls expresses the AEAD key size in bytes, whereas this function (like SSL_get_cipher_bits) reports bits */
            return (int)(cipher->aead->key_size * 8);
        } else if (sock->ssl->ossl != NULL) {
            return SSL_get_cipher_bits(sock->ssl->ossl, NULL);
        }
    }
    return 0;
}
946 
h2o_socket_get_ssl_session_id(h2o_socket_t * sock)947 h2o_iovec_t h2o_socket_get_ssl_session_id(h2o_socket_t *sock)
948 {
949     if (sock->ssl != NULL) {
950         if (sock->ssl->ptls != NULL) {
951             /* FIXME */
952         } else if (sock->ssl->ossl != NULL) {
953             SSL_SESSION *session;
954             if (sock->ssl->handshake.server.async_resumption.state == ASYNC_RESUMPTION_STATE_COMPLETE &&
955                 (session = SSL_get_session(sock->ssl->ossl)) != NULL) {
956                 unsigned id_len;
957                 const unsigned char *id = SSL_SESSION_get_id(session, &id_len);
958                 return h2o_iovec_init(id, id_len);
959             }
960         }
961     }
962 
963     return h2o_iovec_init(NULL, 0);
964 }
965 
h2o_socket_get_ssl_server_name(const h2o_socket_t * sock)966 const char *h2o_socket_get_ssl_server_name(const h2o_socket_t *sock)
967 {
968     if (sock->ssl != NULL) {
969         if (sock->ssl->ptls != NULL) {
970             return ptls_get_server_name(sock->ssl->ptls);
971         } else if (sock->ssl->ossl != NULL) {
972             return SSL_get_servername(sock->ssl->ossl, TLSEXT_NAMETYPE_host_name);
973         }
974     }
975     return NULL;
976 }
977 
h2o_socket_log_tcp_congestion_controller(h2o_socket_t * sock,h2o_mem_pool_t * pool)978 h2o_iovec_t h2o_socket_log_tcp_congestion_controller(h2o_socket_t *sock, h2o_mem_pool_t *pool)
979 {
980 #if defined(TCP_CONGESTION)
981     int fd;
982     if ((fd = h2o_socket_get_fd(sock)) >= 0) {
983 #define CC_BUFSIZE 32
984         socklen_t buflen = CC_BUFSIZE;
985         char *buf = pool != NULL ? h2o_mem_alloc_pool(pool, *buf, buflen) : h2o_mem_alloc(buflen);
986         if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, &buflen) == 0) {
987             /* Upon return, linux sets `buflen` to some value greater than the size of the string. Therefore, we apply strlen after
988              * making sure that the result does not overrun the buffer. */
989             buf[CC_BUFSIZE - 1] = '\0';
990             return h2o_iovec_init(buf, strlen(buf));
991         }
992 #undef CC_BUFSIZE
993     }
994 #endif
995     return h2o_iovec_init(NULL, 0);
996 }
997 
h2o_socket_log_tcp_delivery_rate(h2o_socket_t * sock,h2o_mem_pool_t * pool)998 h2o_iovec_t h2o_socket_log_tcp_delivery_rate(h2o_socket_t *sock, h2o_mem_pool_t *pool)
999 {
1000 #if defined(__linux__) && defined(TCP_INFO)
1001     int fd;
1002     if ((fd = h2o_socket_get_fd(sock)) >= 0) {
1003         /* A copy of `struct tcp_info` found in linux/tcp.h, up to `tcpi_delivery_rate`. Rest of the codebase uses netinet/tcp.h,
1004          * which does not provide access to `tcpi_delivery_rate`. */
1005         struct {
1006             uint8_t tcpi_state;
1007             uint8_t tcpi_ca_state;
1008             uint8_t tcpi_retransmits;
1009             uint8_t tcpi_probes;
1010             uint8_t tcpi_backoff;
1011             uint8_t tcpi_options;
1012             uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
1013             uint8_t tcpi_delivery_rate_app_limited : 1;
1014 
1015             uint32_t tcpi_rto;
1016             uint32_t tcpi_ato;
1017             uint32_t tcpi_snd_mss;
1018             uint32_t tcpi_rcv_mss;
1019 
1020             uint32_t tcpi_unacked;
1021             uint32_t tcpi_sacked;
1022             uint32_t tcpi_lost;
1023             uint32_t tcpi_retrans;
1024             uint32_t tcpi_fackets;
1025 
1026             /* Times. */
1027             uint32_t tcpi_last_data_sent;
1028             uint32_t tcpi_last_ack_sent; /* Not remembered, sorry. */
1029             uint32_t tcpi_last_data_recv;
1030             uint32_t tcpi_last_ack_recv;
1031 
1032             /* Metrics. */
1033             uint32_t tcpi_pmtu;
1034             uint32_t tcpi_rcv_ssthresh;
1035             uint32_t tcpi_rtt;
1036             uint32_t tcpi_rttvar;
1037             uint32_t tcpi_snd_ssthresh;
1038             uint32_t tcpi_snd_cwnd;
1039             uint32_t tcpi_advmss;
1040             uint32_t tcpi_reordering;
1041 
1042             uint32_t tcpi_rcv_rtt;
1043             uint32_t tcpi_rcv_space;
1044 
1045             uint32_t tcpi_total_retrans;
1046 
1047             uint64_t tcpi_pacing_rate;
1048             uint64_t tcpi_max_pacing_rate;
1049             uint64_t tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
1050             uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
1051             uint32_t tcpi_segs_out;       /* RFC4898 tcpEStatsPerfSegsOut */
1052             uint32_t tcpi_segs_in;        /* RFC4898 tcpEStatsPerfSegsIn */
1053 
1054             uint32_t tcpi_notsent_bytes;
1055             uint32_t tcpi_min_rtt;
1056             uint32_t tcpi_data_segs_in;  /* RFC4898 tcpEStatsDataSegsIn */
1057             uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
1058 
1059             uint64_t tcpi_delivery_rate;
1060         } tcpi;
1061         socklen_t tcpisz = sizeof(tcpi);
1062         if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &tcpi, &tcpisz) == 0) {
1063             char *buf = (char *)(pool != NULL ? h2o_mem_alloc_pool(pool, char, sizeof(H2O_UINT64_LONGEST_STR))
1064                                               : h2o_mem_alloc(sizeof(H2O_UINT64_LONGEST_STR)));
1065             size_t len = sprintf(buf, "%" PRIu64, (uint64_t)tcpi.tcpi_delivery_rate);
1066             return h2o_iovec_init(buf, len);
1067         }
1068     }
1069 #endif
1070     return h2o_iovec_init(NULL, 0);
1071 }
1072 
h2o_socket_log_ssl_session_id(h2o_socket_t * sock,h2o_mem_pool_t * pool)1073 h2o_iovec_t h2o_socket_log_ssl_session_id(h2o_socket_t *sock, h2o_mem_pool_t *pool)
1074 {
1075     h2o_iovec_t base64id, rawid = h2o_socket_get_ssl_session_id(sock);
1076 
1077     if (rawid.base == NULL)
1078         return h2o_iovec_init(NULL, 0);
1079 
1080     base64id.base = pool != NULL ? h2o_mem_alloc_pool(pool, char, h2o_base64_encode_capacity(rawid.len))
1081                                  : h2o_mem_alloc(h2o_base64_encode_capacity(rawid.len));
1082     base64id.len = h2o_base64_encode(base64id.base, rawid.base, rawid.len, 1);
1083     return base64id;
1084 }
1085 
h2o_socket_log_ssl_cipher_bits(h2o_socket_t * sock,h2o_mem_pool_t * pool)1086 h2o_iovec_t h2o_socket_log_ssl_cipher_bits(h2o_socket_t *sock, h2o_mem_pool_t *pool)
1087 {
1088     int bits = h2o_socket_get_ssl_cipher_bits(sock);
1089     if (bits != 0) {
1090         char *s = (char *)(pool != NULL ? h2o_mem_alloc_pool(pool, char, sizeof(H2O_INT16_LONGEST_STR))
1091                                         : h2o_mem_alloc(sizeof(H2O_INT16_LONGEST_STR)));
1092         size_t len = sprintf(s, "%" PRId16, (int16_t)bits);
1093         return h2o_iovec_init(s, len);
1094     } else {
1095         return h2o_iovec_init(NULL, 0);
1096     }
1097 }
1098 
/**
 * Compares two socket addresses and returns a negative value, zero, or a positive value, depending on `x` being less than,
 * equal to, or greater than `y`. Addresses are ordered by address family first. Ports of IPv4 and IPv6 addresses are taken into
 * account only when `check_port` is non-zero. Only AF_UNIX, AF_INET and AF_INET6 are supported.
 */
int h2o_socket_compare_address(struct sockaddr *x, struct sockaddr *y, int check_port)
{
#define RETURN_IF_DIFFERENT(lhs, rhs)                                                                                              \
    do {                                                                                                                           \
        if ((lhs) != (rhs))                                                                                                        \
            return (lhs) < (rhs) ? -1 : 1;                                                                                         \
    } while (0)

    RETURN_IF_DIFFERENT(x->sa_family, y->sa_family);

    switch (x->sa_family) {
    case AF_UNIX: {
        struct sockaddr_un *xun = (void *)x, *yun = (void *)y;
        int diff = strcmp(xun->sun_path, yun->sun_path);
        if (diff != 0)
            return diff;
    } break;
    case AF_INET: {
        struct sockaddr_in *xin = (void *)x, *yin = (void *)y;
        /* compare in host byte order so that the ordering follows the numerical value */
        RETURN_IF_DIFFERENT(ntohl(xin->sin_addr.s_addr), ntohl(yin->sin_addr.s_addr));
        if (check_port)
            RETURN_IF_DIFFERENT(ntohs(xin->sin_port), ntohs(yin->sin_port));
    } break;
    case AF_INET6: {
        struct sockaddr_in6 *xin6 = (void *)x, *yin6 = (void *)y;
        int diff = memcmp(xin6->sin6_addr.s6_addr, yin6->sin6_addr.s6_addr, sizeof(xin6->sin6_addr.s6_addr));
        if (diff != 0)
            return diff;
        if (check_port)
            RETURN_IF_DIFFERENT(ntohs(xin6->sin6_port), ntohs(yin6->sin6_port));
        RETURN_IF_DIFFERENT(xin6->sin6_flowinfo, yin6->sin6_flowinfo);
        RETURN_IF_DIFFERENT(xin6->sin6_scope_id, yin6->sin6_scope_id);
    } break;
    default:
        assert(!"unknown sa_family");
        break;
    }

#undef RETURN_IF_DIFFERENT
    return 0;
}
1135 
/**
 * Writes the numeric representation of the host part of `sa` to `buf` as a NUL-terminated string, and returns the length of
 * the string. Returns SIZE_MAX if the conversion fails. For address families other than AF_INET, `getnameinfo` is asked to
 * write up to NI_MAXHOST bytes, so `buf` must be at least that large.
 */
size_t h2o_socket_getnumerichost(const struct sockaddr *sa, socklen_t salen, char *buf)
{
    if (sa->sa_family == AF_INET) {
        /* fast path for IPv4 addresses */
        const struct sockaddr_in *sin = (const void *)sa;
        /* convert to host byte order, so that the octets can be extracted by shifting */
        uint32_t addr = ntohl(sin->sin_addr.s_addr);
        return (size_t)sprintf(buf, "%u.%u.%u.%u", (unsigned)(addr >> 24), (unsigned)((addr >> 16) & 255),
                               (unsigned)((addr >> 8) & 255), (unsigned)(addr & 255));
    }

    if (getnameinfo(sa, salen, buf, NI_MAXHOST, NULL, 0, NI_NUMERICHOST) != 0)
        return SIZE_MAX;
    return strlen(buf);
}
1150 
/**
 * Returns the port number of an IPv4 or IPv6 address in host byte order, or -1 if `sa` belongs to any other address family.
 */
int32_t h2o_socket_getport(const struct sockaddr *sa)
{
    switch (sa->sa_family) {
    case AF_INET:
        /* the port is stored in network byte order */
        return ntohs(((const struct sockaddr_in *)sa)->sin_port);
    case AF_INET6:
        return ntohs(((const struct sockaddr_in6 *)sa)->sin6_port);
    default:
        return -1;
    }
}
1162 
h2o_socket_get_error_string(int errnum,const char * default_err)1163 const char *h2o_socket_get_error_string(int errnum, const char *default_err)
1164 {
1165     switch (errnum) {
1166     case ECONNREFUSED:
1167         return h2o_socket_error_conn_refused;
1168     case ETIMEDOUT:
1169         return h2o_socket_error_conn_timed_out;
1170     case ENETUNREACH:
1171         return h2o_socket_error_network_unreachable;
1172     case EHOSTUNREACH:
1173         return h2o_socket_error_host_unreachable;
1174     default:
1175         return default_err;
1176     }
1177 }
1178 
/**
 * Creates an OpenSSL object from `sock->ssl->ssl_ctx`, stores it in `sock->ssl->ossl`, registers the socket as the app data of
 * the object, and then calls `setup_bio`.
 */
static void create_ossl(h2o_socket_t *sock)
{
    sock->ssl->ossl = SSL_new(sock->ssl->ssl_ctx);
    /* set app data to be used in h2o_socket_ssl_new_session_cb */
    SSL_set_app_data(sock->ssl->ossl, sock);
    setup_bio(sock);
}
1186 
on_async_resumption_get(SSL * ssl,const unsigned char * data,int len,int * copy)1187 static SSL_SESSION *on_async_resumption_get(SSL *ssl,
1188 #if !defined(LIBRESSL_VERSION_NUMBER) ? OPENSSL_VERSION_NUMBER >= 0x1010000fL : LIBRESSL_VERSION_NUMBER > 0x2070000f
1189                                             const
1190 #endif
1191                                             unsigned char *data,
1192                                             int len, int *copy)
1193 {
1194     h2o_socket_t *sock = BIO_get_data(SSL_get_rbio(ssl));
1195 
1196     switch (sock->ssl->handshake.server.async_resumption.state) {
1197     case ASYNC_RESUMPTION_STATE_RECORD:
1198         sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_REQUEST_SENT;
1199         resumption_get_async(sock, h2o_iovec_init(data, len));
1200         return NULL;
1201     case ASYNC_RESUMPTION_STATE_COMPLETE:
1202         *copy = 1;
1203         return sock->ssl->handshake.server.async_resumption.session_data;
1204     default:
1205         assert(!"FIXME");
1206         return NULL;
1207     }
1208 }
1209 
/**
 * New-session callback. On the client side, when a session cache is associated to the socket, the session is stored in the
 * cache under the cache key of the socket. Returns 1 if the session has been stored (the reference is retained), or 0
 * otherwise (the reference is dropped).
 */
int h2o_socket_ssl_new_session_cb(SSL *s, SSL_SESSION *sess)
{
    h2o_socket_t *sock = (h2o_socket_t *)SSL_get_app_data(s);
    assert(sock != NULL);
    assert(sock->ssl != NULL);

    /* only client-side sockets that have a session cache store their sessions */
    if (SSL_is_server(s) || sock->ssl->handshake.client.session_cache == NULL)
        return 0; /* drop ref count */
#if !defined(LIBRESSL_VERSION_NUMBER) && OPENSSL_VERSION_NUMBER >= 0x1010100fL
    if (!SSL_SESSION_is_resumable(sess))
        return 0; /* drop ref count */
#endif

    h2o_cache_set(sock->ssl->handshake.client.session_cache, h2o_now(h2o_socket_get_loop(sock)),
                  sock->ssl->handshake.client.session_cache_key, sock->ssl->handshake.client.session_cache_key_hash,
                  h2o_iovec_init(sess, 1));
    return 1; /* retain ref count */
}
1229 
/**
 * New-session callback used for asynchronous session resumption. The session is serialized onto the stack and handed to
 * `resumption_new` together with its session ID. Always returns 0.
 */
static int on_async_resumption_new(SSL *ssl, SSL_SESSION *session)
{
    h2o_socket_t *sock = BIO_get_data(SSL_get_rbio(ssl));

    /* serialize the session; the first call determines the size, the second one writes the bytes */
    h2o_iovec_t serialized;
    serialized.len = i2d_SSL_SESSION(session, NULL);
    serialized.base = alloca(serialized.len);
    unsigned char *cursor = (void *)serialized.base;
    i2d_SSL_SESSION(session, &cursor);

    /* the session ID is used as the key */
    unsigned id_len;
    const unsigned char *id = SSL_SESSION_get_id(session, &id_len);
    resumption_new(sock, h2o_iovec_init(id, id_len), serialized);

    return 0;
}
1249 
/**
 * Concludes the TLS handshake. On success (`err` being NULL), the per-record overhead of the negotiated cipher is recorded and
 * the input that has already been received is decoded, the result of which becomes the error value being reported. The
 * handshake callback is unregistered and then invoked with the outcome.
 */
static void on_handshake_complete(h2o_socket_t *sock, const char *err)
{
    if (err == NULL && sock->ssl->ptls != NULL) {
        sock->ssl->record_overhead = ptls_get_record_overhead(sock->ssl->ptls);
    } else if (err == NULL) {
        switch (SSL_CIPHER_get_id(SSL_get_current_cipher(sock->ssl->ossl))) {
        /* AES-GCM */
        case TLS1_CK_RSA_WITH_AES_128_GCM_SHA256:
        case TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256:
        case TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256:
        case TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:
        case TLS1_CK_RSA_WITH_AES_256_GCM_SHA384:
        case TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384:
        case TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384:
        case TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:
            sock->ssl->record_overhead = 5 /* header */ + 8 /* record_iv_length (RFC 5288 3) */ + 16 /* tag (RFC 5116 5.1) */;
            break;
#if defined(TLS1_CK_DHE_RSA_CHACHA20_POLY1305)
        /* ChaCha20-Poly1305 */
        case TLS1_CK_DHE_RSA_CHACHA20_POLY1305:
        case TLS1_CK_ECDHE_RSA_CHACHA20_POLY1305:
        case TLS1_CK_ECDHE_ECDSA_CHACHA20_POLY1305:
            sock->ssl->record_overhead = 5 /* header */ + 16 /* tag */;
            break;
#endif
        /* anything else */
        default:
            sock->ssl->record_overhead = 32; /* sufficiently large number that can hold most payloads */
            break;
        }
    }

    /* unregister the callbacks before handing control to the application */
    h2o_socket_cb handshake_cb = sock->ssl->handshake.cb;
    sock->ssl->handshake.cb = NULL;
    sock->_cb.write = NULL;

    /* on success, process the input already received; its result is what gets reported */
    const char *outcome = err != NULL ? err : decode_ssl_input(sock);
    handshake_cb(sock, outcome);
}
1289 
get_handshake_error(struct st_h2o_socket_ssl_t * ssl)1290 const char *get_handshake_error(struct st_h2o_socket_ssl_t *ssl)
1291 {
1292     const char *err = h2o_socket_error_ssl_handshake;
1293     if (ssl->ossl != NULL) {
1294         long verify_result = SSL_get_verify_result(ssl->ossl);
1295         if (verify_result != X509_V_OK) {
1296             err = X509_verify_cert_error_string(verify_result);
1297             assert(err != NULL);
1298         }
1299     }
1300     return err;
1301 }
1302 
/**
 * Callback that concludes a failed handshake. The `err` argument is ignored; the error being reported is the one returned by
 * `get_handshake_error`.
 */
static void on_handshake_fail_complete(h2o_socket_t *sock, const char *err)
{
    on_handshake_complete(sock, get_handshake_error(sock->ssl));
}
1307 
1308 static void proceed_handshake(h2o_socket_t *sock, const char *err);
1309 
/**
 * Advances the handshake of a connection handled by picotls, by feeding the encrypted input that has been received so far and
 * by sending the bytes that picotls emits, if any.
 */
static void proceed_handshake_picotls(h2o_socket_t *sock)
{
    ptls_buffer_t wbuf;
    ptls_buffer_init(&wbuf, "", 0);

    /* feed the input to picotls, and discard the bytes being consumed */
    size_t consumed = sock->ssl->input.encrypted->size;
    int ret = ptls_handshake(sock->ssl->ptls, &wbuf, sock->ssl->input.encrypted->bytes, &consumed, NULL);
    h2o_buffer_consume(&sock->ssl->input.encrypted, consumed);

    /* determine the next action */
    h2o_socket_cb next_cb;
    if (ret == 0) {
        next_cb = on_handshake_complete;
    } else if (ret == PTLS_ERROR_IN_PROGRESS) {
        next_cb = proceed_handshake;
    } else {
        next_cb = on_handshake_fail_complete;
    }

    if (wbuf.off == 0) {
        /* Nothing to send. Wait for more bytes to arrive if the handshake is still in progress; otherwise, take the action
         * immediately. */
        if (ret == PTLS_ERROR_IN_PROGRESS) {
            h2o_socket_read_start(sock, next_cb);
        } else {
            next_cb(sock, NULL);
        }
    } else {
        /* Something is to be sent. Send it, and take the next action once the write completes. */
        h2o_socket_read_stop(sock);
        write_ssl_bytes(sock, wbuf.base, wbuf.off);
        flush_pending_ssl(sock, next_cb);
    }

    ptls_buffer_dispose(&wbuf);
}
1347 
/**
 * Advances the handshake of a connection handled by OpenSSL, by calling `SSL_accept` (server) or `SSL_connect` (client).
 * Depending on the outcome, the function flushes the bytes emitted by OpenSSL, waits for more input, restarts the handshake
 * for async session resumption, or concludes the handshake by calling `on_handshake_complete`.
 */
static void proceed_handshake_openssl(h2o_socket_t *sock)
{
    /* copy of the input seen before the first SSL_accept call; set only when async resumption might get triggered */
    h2o_iovec_t first_input = {NULL};
    int ret = 0;
    const char *err = NULL;

    assert(sock->ssl->ossl != NULL);

    if (SSL_is_server(sock->ssl->ossl) && sock->ssl->handshake.server.async_resumption.state == ASYNC_RESUMPTION_STATE_RECORD) {
        if (sock->ssl->input.encrypted->size <= 1024) {
            /* retain a copy of input if performing async resumption */
            first_input = h2o_iovec_init(alloca(sock->ssl->input.encrypted->size), sock->ssl->input.encrypted->size);
            memcpy(first_input.base, sock->ssl->input.encrypted->bytes, first_input.len);
        } else {
            /* input is larger than what we are willing to copy; give up on async resumption */
            sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
        }
    }

Redo:
    ERR_clear_error();
    if (SSL_is_server(sock->ssl->ossl)) {
        ret = SSL_accept(sock->ssl->ossl);
        switch (sock->ssl->handshake.server.async_resumption.state) {
        case ASYNC_RESUMPTION_STATE_COMPLETE:
            break;
        case ASYNC_RESUMPTION_STATE_RECORD:
            /* async resumption has not been triggered; proceed the state to complete */
            sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
            break;
        case ASYNC_RESUMPTION_STATE_REQUEST_SENT: {
            /* sent async request, reset the ssl state, and wait for async response */
            assert(ret < 0);
            SSL_free(sock->ssl->ossl);
            create_ossl(sock);
            if (has_pending_ssl_bytes(sock->ssl))
                dispose_ssl_output_buffer(sock->ssl);
            /* restore the input buffer to the bytes retained in `first_input`, so that they can be fed again */
            h2o_buffer_consume(&sock->ssl->input.encrypted, sock->ssl->input.encrypted->size);
            h2o_buffer_reserve(&sock->ssl->input.encrypted, first_input.len);
            memcpy(sock->ssl->input.encrypted->bytes, first_input.base, first_input.len);
            sock->ssl->input.encrypted->size = first_input.len;
            h2o_socket_read_stop(sock);
            return;
        }
        default:
            h2o_fatal("unexpected async resumption state");
            break;
        }
    } else {
        ret = SSL_connect(sock->ssl->ossl);
    }

    /* handshake failure; anything other than "want read" is considered fatal */
    if (ret == 0 || (ret < 0 && SSL_get_error(sock->ssl->ossl, ret) != SSL_ERROR_WANT_READ)) {
        /* OpenSSL 1.1.0 emits an alert immediately, we send it now. 1.0.2 emits the error when SSL_shutdown is called in
         * shutdown_ssl. */
        if (has_pending_ssl_bytes(sock->ssl)) {
            h2o_socket_read_stop(sock);
            flush_pending_ssl(sock, on_handshake_fail_complete);
            return;
        }
        err = get_handshake_error(sock->ssl);
        goto Complete;
    }

    if (has_pending_ssl_bytes(sock->ssl)) {
        /* send the bytes emitted by OpenSSL, then either conclude (ret == 1) or continue the handshake */
        h2o_socket_read_stop(sock);
        flush_pending_ssl(sock, ret == 1 ? on_handshake_complete : proceed_handshake);
    } else {
        if (ret == 1) {
            /* handshake succeeded with nothing left to send; on the client side, validate the certificate against the server
             * name before reporting the result */
            if (!SSL_is_server(sock->ssl->ossl)) {
                X509 *cert = SSL_get_peer_certificate(sock->ssl->ossl);
                if (cert != NULL) {
                    switch (validate_hostname(sock->ssl->handshake.client.server_name, cert)) {
                    case MatchFound:
                        /* ok */
                        break;
                    case MatchNotFound:
                        err = h2o_socket_error_ssl_cert_name_mismatch;
                        break;
                    default:
                        err = h2o_socket_error_ssl_cert_invalid;
                        break;
                    }
                    X509_free(cert);
                } else {
                    err = h2o_socket_error_ssl_no_cert;
                }
            }
            goto Complete;
        }
        /* handshake is still in progress; retry immediately if there is unprocessed input, otherwise wait for more */
        if (sock->ssl->input.encrypted->size != 0) {
            goto Redo;
        }
        h2o_socket_read_start(sock, proceed_handshake);
    }
    return;

Complete:
    h2o_socket_read_stop(sock);
    on_handshake_complete(sock, err);
}
1448 
/**
 * Called when it is still uncertain which of the two TLS stacks (picotls or OpenSSL) should handle the handshake.
 * The function first tries picotls without consuming the socket input buffer. Then, if picotls returns PTLS_ALERT_PROTOCOL_VERSION
 * indicating that the client is using TLS 1.2 or below, switches to using OpenSSL.
 *
 * Outcomes, keyed on the value returned by `ptls_handshake`:
 *  - PTLS_ERROR_IN_PROGRESS with no output generated: the trial picotls object is discarded and the buffered input is left
 *    untouched, so that the same bytes are replayed into a fresh picotls object on the next invocation.
 *  - PTLS_ALERT_PROTOCOL_VERSION: the trial object is discarded, an OpenSSL object is created, and OpenSSL continues the handshake.
 *  - anything else: picotls owns the connection from here on; the consumed input is dropped and the generated output is sent.
 */
static void proceed_handshake_undetermined(h2o_socket_t *sock)
{
    /* neither stack may have been chosen yet */
    assert(sock->ssl->ossl == NULL && sock->ssl->ptls == NULL);

    ptls_context_t *ptls_ctx = h2o_socket_ssl_get_picotls_context(sock->ssl->ssl_ctx);
    assert(ptls_ctx != NULL);

    /* offer everything that has been buffered; after `ptls_handshake` the value is used as the number of bytes to drop */
    size_t consumed = sock->ssl->input.encrypted->size;
    ptls_buffer_t wbuf;
    ptls_buffer_init(&wbuf, "", 0);

    /* when dtrace support is built in, the global default is temporarily replaced by this socket's setting while the picotls
     * object is being created, then restored */
#if PICOTLS_USE_DTRACE
    unsigned ptls_skip_tracing_backup = ptls_default_skip_tracing;
    ptls_default_skip_tracing = sock->_skip_tracing;
#endif
    ptls_t *ptls = ptls_new(ptls_ctx, 1);
#if PICOTLS_USE_DTRACE
    ptls_default_skip_tracing = ptls_skip_tracing_backup;
#endif
    if (ptls == NULL)
        h2o_fatal("no memory");
    /* let picotls callbacks find their way back to the socket */
    *ptls_get_data_ptr(ptls) = sock;
    int ret = ptls_handshake(ptls, &wbuf, sock->ssl->input.encrypted->bytes, &consumed, NULL);

    if (ret == PTLS_ERROR_IN_PROGRESS && wbuf.off == 0) {
        /* we aren't sure if the picotls can process the handshake, retain handshake transcript and replay on next occasion */
        /* NOTE(review): this branch does not start reading by itself; presumably the read watcher is already active or the
         * caller arranges for more input to arrive - confirm */
        ptls_free(ptls);
    } else if (ret == PTLS_ALERT_PROTOCOL_VERSION) {
        /* the client cannot use tls1.3, fallback to openssl */
        ptls_free(ptls);
        create_ossl(sock);
        proceed_handshake_openssl(sock);
    } else {
        /* picotls is responsible for handling the handshake */
        sock->ssl->ptls = ptls;
        sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
        h2o_buffer_consume(&sock->ssl->input.encrypted, consumed);
        /* stop reading, send response */
        h2o_socket_read_stop(sock);
        write_ssl_bytes(sock, wbuf.base, wbuf.off);
        /* choose what runs once the pending output has been flushed */
        h2o_socket_cb cb;
        switch (ret) {
        case 0:
            cb = on_handshake_complete;
            break;
        case PTLS_ERROR_IN_PROGRESS:
            cb = proceed_handshake;
            break;
        default:
            assert(ret != PTLS_ERROR_STATELESS_RETRY && "stateless retry is never turned on by us for TCP");
            cb = on_handshake_fail_complete;
            break;
        }
        flush_pending_ssl(sock, cb);
    }
    /* the output buffer is released on every path */
    ptls_buffer_dispose(&wbuf);
}
1511 
/**
 * Drives the TLS handshake one step forward by dispatching to whichever TLS stack is in charge of the socket.
 * A non-NULL `err` terminates the handshake: reading is stopped and the error is passed to `on_handshake_complete`.
 */
static void proceed_handshake(h2o_socket_t *sock, const char *err)
{
    sock->_cb.write = NULL;

    /* an error was reported; give up */
    if (err != NULL) {
        h2o_socket_read_stop(sock);
        on_handshake_complete(sock, err);
        return;
    }

    /* a TLS stack has already been chosen */
    if (sock->ssl->ptls != NULL) {
        proceed_handshake_picotls(sock);
        return;
    }
    if (sock->ssl->ossl != NULL) {
        proceed_handshake_openssl(sock);
        return;
    }

    /* nothing chosen yet; picotls is only tried when a picotls context is attached to the SSL_CTX */
    if (h2o_socket_ssl_get_picotls_context(sock->ssl->ssl_ctx) != NULL) {
        proceed_handshake_undetermined(sock);
        return;
    }
    create_ossl(sock);
    proceed_handshake_openssl(sock);
}
1533 
/**
 * Attaches TLS state to `sock` and starts the handshake.
 *
 * @param sock          socket to run the handshake on; any bytes already sitting in `sock->input` become the initial TLS input
 * @param ssl_ctx       OpenSSL context, stored in the newly allocated TLS state
 * @param server_name   NULL selects the server role; otherwise the client role is taken and the name is used for SNI as well as
 *                      for building the session cache key
 * @param alpn_protos   ALPN protocol list handed to OpenSSL in the client role (ignored when `base` is NULL)
 * @param handshake_cb  stored as the handshake callback of the socket
 */
void h2o_socket_ssl_handshake(h2o_socket_t *sock, SSL_CTX *ssl_ctx, const char *server_name, h2o_iovec_t alpn_protos,
                              h2o_socket_cb handshake_cb)
{
    /* allocate zero-filled TLS state */
    sock->ssl = h2o_mem_alloc(sizeof(*sock->ssl));
    *sock->ssl = (struct st_h2o_socket_ssl_t){};
    sock->ssl->ssl_ctx = ssl_ctx;
    sock->ssl->handshake.cb = handshake_cb;

    /* setup the buffers; sock->input should be empty, sock->ssl->input.encrypted should contain the initial input, if any */
    h2o_buffer_init(&sock->ssl->input.encrypted, &h2o_socket_buffer_prototype);
    if (sock->input->size != 0) {
        /* swap the two buffers so that the bytes already received are treated as encrypted input */
        h2o_buffer_t *received = sock->input;
        sock->input = sock->ssl->input.encrypted;
        sock->ssl->input.encrypted = received;
    }

    if (server_name == NULL) {
        /* server role */
        if (SSL_CTX_sess_get_get_cb(sock->ssl->ssl_ctx) != NULL)
            sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_RECORD;
        if (sock->ssl->input.encrypted->size == 0) {
            /* nothing to process yet; wait for input */
            h2o_socket_read_start(sock, proceed_handshake);
        } else {
            proceed_handshake(sock, NULL);
        }
        return;
    }

    /* client role; always handled by OpenSSL */
    create_ossl(sock);
    if (alpn_protos.base != NULL)
        SSL_set_alpn_protos(sock->ssl->ossl, (const unsigned char *)alpn_protos.base, (unsigned)alpn_protos.len);

    h2o_cache_t *cache = h2o_socket_ssl_get_session_cache(sock->ssl->ssl_ctx);
    if (cache != NULL) {
        struct sockaddr_storage peer;
        int32_t port;
        if (h2o_socket_getpeername(sock, (struct sockaddr *)&peer) != 0 &&
            (port = h2o_socket_getport((struct sockaddr *)&peer)) != -1) {
            /* session cache is available; the key is "<server_name>:<port>" */
            h2o_iovec_t key;
            key.base = h2o_mem_alloc(strlen(server_name) + sizeof(":" H2O_UINT16_LONGEST_STR));
            key.len = sprintf(key.base, "%s:%" PRIu16, server_name, (uint16_t)port);
            sock->ssl->handshake.client.session_cache = cache;
            sock->ssl->handshake.client.session_cache_key = key;
            sock->ssl->handshake.client.session_cache_key_hash = h2o_cache_calchash(key.base, key.len);

            /* fetch from session cache, and offer the session for resumption if there is one */
            h2o_cache_ref_t *hit = h2o_cache_fetch(cache, h2o_now(h2o_socket_get_loop(sock)),
                                                   sock->ssl->handshake.client.session_cache_key,
                                                   sock->ssl->handshake.client.session_cache_key_hash);
            if (hit != NULL) {
                SSL_set_session(sock->ssl->ossl, (SSL_SESSION *)hit->value.base);
                h2o_cache_release(cache, hit);
            }
        }
    }

    /* retain a private copy of the server name and send it as SNI */
    sock->ssl->handshake.client.server_name = h2o_strdup(NULL, server_name, SIZE_MAX).base;
    SSL_set_tlsext_host_name(sock->ssl->ossl, sock->ssl->handshake.client.server_name);
    proceed_handshake(sock, NULL);
}
1593 
/**
 * Resumes a server-side handshake after the asynchronous session lookup has finished.
 * `session_data` carries the serialized session; a zero length means there is nothing to decode.
 */
void h2o_socket_ssl_resume_server_handshake(h2o_socket_t *sock, h2o_iovec_t session_data)
{
    /* decode the serialized session, if any */
    if (session_data.len != 0) {
        const unsigned char *cursor = (void *)session_data.base;
        sock->ssl->handshake.server.async_resumption.session_data = d2i_SSL_SESSION(NULL, &cursor, (long)session_data.len);
        /* FIXME warn on failure */
    }

    /* mark the lookup as done and let the handshake continue */
    sock->ssl->handshake.server.async_resumption.state = ASYNC_RESUMPTION_STATE_COMPLETE;
    proceed_handshake(sock, NULL);

    /* release whatever session object is still attached after the handshake step */
    /* NOTE(review): this touches `sock` after proceed_handshake returned; presumably the socket is still alive here - confirm */
    SSL_SESSION *leftover = sock->ssl->handshake.server.async_resumption.session_data;
    if (leftover != NULL) {
        SSL_SESSION_free(leftover);
        sock->ssl->handshake.server.async_resumption.session_data = NULL;
    }
}
1610 
/**
 * Registers the two file-level callbacks used for asynchronous session resumption.
 */
void h2o_socket_ssl_async_resumption_init(h2o_socket_ssl_resumption_get_async_cb get_async_cb,
                                          h2o_socket_ssl_resumption_new_cb new_cb)
{
    resumption_new = new_cb;
    resumption_get_async = get_async_cb;
}
1617 
/**
 * Installs the session "new" and "get" callbacks of this file on `ctx`.
 */
void h2o_socket_ssl_async_resumption_setup_ctx(SSL_CTX *ctx)
{
    /* if necessary, it is the responsibility of the caller to disable the internal cache */
    SSL_CTX_sess_set_new_cb(ctx, on_async_resumption_new);
    SSL_CTX_sess_set_get_cb(ctx, on_async_resumption_get);
}
1624 
get_ptls_index(void)1625 static int get_ptls_index(void)
1626 {
1627     static volatile int index;
1628     H2O_MULTITHREAD_ONCE({ index = SSL_CTX_get_ex_new_index(0, NULL, NULL, NULL, NULL); });
1629     return index;
1630 }
1631 
h2o_socket_ssl_get_picotls_context(SSL_CTX * ossl)1632 ptls_context_t *h2o_socket_ssl_get_picotls_context(SSL_CTX *ossl)
1633 {
1634     return SSL_CTX_get_ex_data(ossl, get_ptls_index());
1635 }
1636 
/**
 * Stores `ptls` in the ex-data of `ossl`, in the slot read back by h2o_socket_ssl_get_picotls_context.
 */
void h2o_socket_ssl_set_picotls_context(SSL_CTX *ossl, ptls_context_t *ptls)
{
    int slot = get_ptls_index();
    SSL_CTX_set_ex_data(ossl, slot, ptls);
}
1641 
/**
 * Ex-data free callback: destroys the session cache stored in the slot, if one is present. Only `ptr` is used.
 */
static void on_dispose_ssl_ctx_session_cache(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp)
{
    if (ptr == NULL)
        return;
    h2o_cache_destroy((h2o_cache_t *)ptr);
}
1648 
get_ssl_session_cache_index(void)1649 static int get_ssl_session_cache_index(void)
1650 {
1651     static volatile int index;
1652     H2O_MULTITHREAD_ONCE({ index = SSL_CTX_get_ex_new_index(0, NULL, NULL, NULL, on_dispose_ssl_ctx_session_cache); });
1653     return index;
1654 }
1655 
h2o_socket_ssl_get_session_cache(SSL_CTX * ctx)1656 h2o_cache_t *h2o_socket_ssl_get_session_cache(SSL_CTX *ctx)
1657 {
1658     return (h2o_cache_t *)SSL_CTX_get_ex_data(ctx, get_ssl_session_cache_index());
1659 }
1660 
/**
 * Stores `cache` in the ex-data of `ctx`, in the slot read back by h2o_socket_ssl_get_session_cache.
 */
void h2o_socket_ssl_set_session_cache(SSL_CTX *ctx, h2o_cache_t *cache)
{
    int slot = get_ssl_session_cache_index();
    SSL_CTX_set_ex_data(ctx, slot, cache);
}
1665 
/**
 * Frees a session cache entry; `value.base` is treated as a pointer to an SSL_SESSION.
 */
void h2o_socket_ssl_destroy_session_cache_entry(h2o_iovec_t value)
{
    SSL_SESSION_free((SSL_SESSION *)value.base);
}
1671 
h2o_socket_ssl_get_selected_protocol(h2o_socket_t * sock)1672 h2o_iovec_t h2o_socket_ssl_get_selected_protocol(h2o_socket_t *sock)
1673 {
1674     const unsigned char *data = NULL;
1675     unsigned len = 0;
1676 
1677     if (sock->ssl == NULL)
1678         return h2o_iovec_init(NULL, 0);
1679 
1680     if (sock->ssl->ptls != NULL) {
1681         const char *proto = ptls_get_negotiated_protocol(sock->ssl->ptls);
1682         return proto != NULL ? h2o_iovec_init(proto, strlen(proto)) : h2o_iovec_init(NULL, 0);
1683     }
1684 
1685 #if H2O_USE_ALPN
1686     if (len == 0)
1687         SSL_get0_alpn_selected(sock->ssl->ossl, &data, &len);
1688 #endif
1689 #if H2O_USE_NPN
1690     if (len == 0)
1691         SSL_get0_next_proto_negotiated(sock->ssl->ossl, &data, &len);
1692 #endif
1693 
1694     return h2o_iovec_init(data, len);
1695 }
1696 
/**
 * Returns 1 if the connection is handled by picotls and its handshake has not completed yet, 0 otherwise.
 * The socket must have TLS state attached.
 */
int h2o_socket_ssl_is_early_data(h2o_socket_t *sock)
{
    assert(sock->ssl != NULL);

    ptls_t *tls = sock->ssl->ptls;
    if (tls == NULL)
        return 0;
    return ptls_handshake_is_complete(tls) ? 0 : 1;
}
1705 
/**
 * ALPN selection callback. `_protocols` points to an array of h2o_iovec_t terminated by an entry of zero length, and `_in` is the
 * list offered by the peer, encoded as a sequence of <one-byte length><bytes> entries.
 * The first entry of our array that also appears in the peer's list is selected (i.e. our order decides the preference).
 * Returns SSL_TLSEXT_ERR_OK with `*out` / `*outlen` set on a match, SSL_TLSEXT_ERR_NOACK when nothing matches or when the peer's
 * list is malformed.
 */
static int on_alpn_select(SSL *ssl, const unsigned char **out, unsigned char *outlen, const unsigned char *_in, unsigned int inlen,
                          void *_protocols)
{
    const h2o_iovec_t *ours;

    for (ours = _protocols; ours->len != 0; ++ours) {
        const unsigned char *offered = _in, *offered_end = offered + inlen;
        while (offered != offered_end) {
            size_t cand_len = *offered++;
            /* broken request: the entry claims more bytes than what remains */
            if (offered_end - offered < cand_len)
                return SSL_TLSEXT_ERR_NOACK;
            if (cand_len == ours->len && memcmp(offered, ours->base, cand_len) == 0) {
                /* found */
                *out = (const unsigned char *)ours->base;
                *outlen = (unsigned char)ours->len;
                return SSL_TLSEXT_ERR_OK;
            }
            offered += cand_len;
        }
    }

    /* not found */
    return SSL_TLSEXT_ERR_NOACK;
}
1734 
1735 #if H2O_USE_ALPN
1736 
/**
 * Registers on_alpn_select as the ALPN selection callback of `ctx`. The `protocols` pointer itself is handed over as the callback
 * argument (no copy is made here).
 */
void h2o_ssl_register_alpn_protocols(SSL_CTX *ctx, const h2o_iovec_t *protocols)
{
    void *cb_arg = (void *)protocols;
    SSL_CTX_set_alpn_select_cb(ctx, on_alpn_select, cb_arg);
}
1741 
1742 #endif
1743 
1744 #if H2O_USE_NPN
1745 
/**
 * NPN advertisement callback: reports `protocols` as the list to advertise, its length being determined with strlen.
 */
static int on_npn_advertise(SSL *ssl, const unsigned char **out, unsigned *outlen, void *protocols)
{
    const char *advertised = protocols;

    *outlen = (unsigned)strlen(advertised);
    *out = protocols;
    return SSL_TLSEXT_ERR_OK;
}
1752 
/**
 * Registers on_npn_advertise as the NPN advertisement callback of `ctx`. The `protocols` pointer itself is handed over as the
 * callback argument (no copy is made here).
 */
void h2o_ssl_register_npn_protocols(SSL_CTX *ctx, const char *protocols)
{
    void *cb_arg = (void *)protocols;
    SSL_CTX_set_next_protos_advertised_cb(ctx, on_npn_advertise, cb_arg);
}
1757 
1758 #endif
1759 
h2o_socket_set_df_bit(int fd,int domain)1760 int h2o_socket_set_df_bit(int fd, int domain)
1761 {
1762 #define SETSOCKOPT(ip, optname, _optvar)                                                                                           \
1763     do {                                                                                                                           \
1764         int optvar = _optvar;                                                                                                      \
1765         if (setsockopt(fd, ip, optname, &optvar, sizeof(optvar)) != 0) {                                                           \
1766             perror("failed to set the DF bit through setsockopt(" H2O_TO_STR(ip) ", " H2O_TO_STR(optname) ")");                    \
1767             return 0;                                                                                                              \
1768         }                                                                                                                          \
1769         return 1;                                                                                                                  \
1770     } while (0)
1771 
1772     switch (domain) {
1773     case AF_INET:
1774 #if defined(IP_PMTUDISC_DO)
1775         SETSOCKOPT(IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_DO);
1776 #elif defined(IP_DONTFRAG)
1777         SETSOCKOPT(IPPROTO_IP, IP_DONTFRAG, 1);
1778 #endif
1779         break;
1780     case AF_INET6:
1781 #if defined(IPV6_PMTUDISC_DO)
1782         SETSOCKOPT(IPPROTO_IPV6, IPV6_MTU_DISCOVER, IPV6_PMTUDISC_DO);
1783 #elif defined(IPV6_DONTFRAG)
1784         SETSOCKOPT(IPPROTO_IPV6, IPV6_DONTFRAG, 1);
1785 #endif
1786         break;
1787     default:
1788         break;
1789     }
1790 
1791     return 1;
1792 
1793 #undef SETSOCKOPT
1794 }
1795 
/**
 * Records the skip-tracing flag on the socket and, when the connection is handled by picotls, forwards it to picotls as well.
 */
void h2o_socket_set_skip_tracing(h2o_socket_t *sock, int skip_tracing)
{
    sock->_skip_tracing = skip_tracing;

    /* nothing more to do unless a picotls object exists */
    if (sock->ssl == NULL || sock->ssl->ptls == NULL)
        return;
    ptls_set_skip_tracing(sock->ssl->ptls, skip_tracing);
}
1802 
/**
 * Ends the measurement currently in progress (`cur.start_at` must be non-zero) and folds the elapsed time into the sliding window:
 * the oldest slot is replaced, the running sum is adjusted, and the average over all slots is recomputed.
 * A `now` that is not later than the start time counts as zero elapsed time.
 */
void h2o_sliding_counter_stop(h2o_sliding_counter_t *counter, uint64_t now)
{
    const size_t num_slots = sizeof(counter->prev.slots) / sizeof(counter->prev.slots[0]);

    assert(counter->cur.start_at != 0);

    /* calculate the time used, and reset cur */
    uint64_t elapsed = now > counter->cur.start_at ? now - counter->cur.start_at : 0;
    counter->cur.start_at = 0;

    /* replace the slot at the current index, keeping the sum in sync */
    counter->prev.sum += elapsed;
    counter->prev.sum -= counter->prev.slots[counter->prev.index];
    counter->prev.slots[counter->prev.index] = elapsed;

    /* advance the index, wrapping around at the end of the window */
    counter->prev.index += 1;
    if (counter->prev.index >= num_slots)
        counter->prev.index = 0;

    /* recalc average */
    counter->average = counter->prev.sum / num_slots;
}
1826 
1827 #if H2O_USE_EBPF_MAP
1828 #include <linux/bpf.h>
1829 #include <linux/unistd.h>
1830 #include <sys/stat.h>
1831 #include "h2o/multithread.h"
1832 #include "h2o-probes.h"
1833 
/**
 * Creates an eBPF map through bpf(BPF_MAP_CREATE).
 *
 * @param map_name  name given to the map; it is truncated if it does not fit into `attr.map_name` together with the terminator
 * @return the value returned by the syscall (a file descriptor on success, negative with errno set on failure)
 */
static int ebpf_map_create(uint32_t map_type, uint32_t key_size, uint32_t value_size, uint32_t max_entries, const char *map_name)
{
    union bpf_attr attr = {
        .map_type = map_type,
        .key_size = key_size,
        .value_size = value_size,
        .max_entries = max_entries,
    };
    /* copy at most size-1 bytes and terminate explicitly, so that the name handed to the kernel is always NUL-terminated */
    strncpy(attr.map_name, map_name, sizeof(attr.map_name) - 1);
    attr.map_name[sizeof(attr.map_name) - 1] = '\0';
    return (int)syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
1845 
/**
 * Pins the eBPF object referred to by `bpf_fd` at `pathname` through bpf(BPF_OBJ_PIN).
 * Returns the value returned by the syscall (0 on success, negative with errno set on failure).
 */
static int ebpf_obj_pin(int bpf_fd, const char *pathname)
{
    union bpf_attr attr = {
        .bpf_fd = (uint32_t)bpf_fd,
        /* go through uintptr_t so that the pointer is widened without sign-extension where pointers are narrower than 64 bits */
        .pathname = (uint64_t)(uintptr_t)pathname,
    };
    return (int)syscall(SYS_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}
1854 
/**
 * Opens the eBPF object pinned at `pathname` through bpf(BPF_OBJ_GET).
 * Returns the value returned by the syscall (a file descriptor on success, negative with errno set on failure).
 */
static int ebpf_obj_get(const char *pathname)
{
    union bpf_attr attr = {
        /* go through uintptr_t so that the pointer is widened without sign-extension where pointers are narrower than 64 bits */
        .pathname = (uint64_t)(uintptr_t)pathname,
    };
    return (int)syscall(SYS_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
}
1862 
/**
 * Retrieves information about the eBPF map referred to by `fd` into `*info` through bpf(BPF_OBJ_GET_INFO_BY_FD).
 * Returns the value returned by the syscall (0 on success, negative with errno set on failure).
 */
static int ebpf_obj_get_info_by_fd(int fd, struct bpf_map_info *info)
{
    union bpf_attr attr = {
        .info =
            {
                .bpf_fd = fd,
                /* go through uintptr_t so that the pointer is widened without sign-extension on 32-bit targets */
                .info = (uint64_t)(uintptr_t)info,
                .info_len = sizeof(*info),
            },
    };
    return (int)syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
}
1875 
/**
 * Looks up `key` in the eBPF map `fd` through bpf(BPF_MAP_LOOKUP_ELEM); the kernel writes the element to `value`.
 * Returns the value returned by the syscall (0 on success, negative with errno set on failure).
 */
static int ebpf_map_lookup(int fd, const void *key, void *value)
{
    union bpf_attr attr = {
        .map_fd = fd,
        /* go through uintptr_t so that the pointers are widened without sign-extension on 32-bit targets */
        .key = (uint64_t)(uintptr_t)key,
        .value = (uint64_t)(uintptr_t)value,
    };
    return (int)syscall(SYS_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
1885 
/**
 * Deletes the element stored under `key` from the eBPF map `fd` through bpf(BPF_MAP_DELETE_ELEM).
 * Returns the value returned by the syscall (0 on success, negative with errno set on failure).
 */
static int ebpf_map_delete(int fd, const void *key)
{
    union bpf_attr attr = {
        .map_fd = fd,
        /* go through uintptr_t so that the pointer is widened without sign-extension where pointers are narrower than 64 bits */
        .key = (uint64_t)(uintptr_t)key,
    };
    return (int)syscall(SYS_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
1894 
1895 static int return_map_fd = -1; // for h2o_return
1896 
h2o_socket_ebpf_setup(void)1897 int h2o_socket_ebpf_setup(void)
1898 {
1899     const struct {
1900         int type;
1901         uint32_t key_size;
1902         uint32_t value_size;
1903     } map_attr = {
1904         .type = BPF_MAP_TYPE_LRU_HASH,
1905         .key_size = sizeof(pid_t),
1906         .value_size = sizeof(uint64_t),
1907     };
1908 
1909     int fd = -1;
1910     if (getuid() != 0) {
1911         h2o_error_printf("failed to set up eBPF maps because bpf(2) requires root privileges\n");
1912         goto Error;
1913     }
1914 
1915     fd = ebpf_obj_get(H2O_EBPF_RETURN_MAP_PATH);
1916     if (fd < 0) {
1917         if (errno != ENOENT) {
1918             h2o_perror("BPF_OBJ_GET failed");
1919             goto Error;
1920         }
1921         /* Pinned eBPF map does not exist. Create one and pin it to the BPF filesystem. */
1922         fd = ebpf_map_create(map_attr.type, map_attr.key_size, map_attr.value_size, H2O_EBPF_RETURN_MAP_SIZE,
1923                              H2O_EBPF_RETURN_MAP_NAME);
1924         if (fd < 0) {
1925             if (errno == EPERM) {
1926                 h2o_error_printf("BPF_MAP_CREATE failed with EPERM, maybe because RLIMIT_MEMLOCK is too small.\n");
1927             } else {
1928                 h2o_perror("BPF_MAP_CREATE failed");
1929             }
1930             goto Error;
1931         }
1932         if (ebpf_obj_pin(fd, H2O_EBPF_RETURN_MAP_PATH) != 0) {
1933             if (errno == ENOENT) {
1934                 h2o_error_printf("BPF_OBJ_PIN failed with ENOENT, because /sys/fs/bpf is not mounted as a BPF filesystem.\n");
1935             } else {
1936                 h2o_perror("BPF_OBJ_PIN failed");
1937             }
1938             goto Error;
1939         }
1940     } else {
1941         /* BPF_OBJ_GET successfully opened a pinned eBPF map. Make sure the critical attributes (type, key size, value size) are
1942          * correct, otherwise usdt-selective-tracing does not work. */
1943         struct bpf_map_info m;
1944         if (ebpf_obj_get_info_by_fd(fd, &m) != 0) {
1945             h2o_perror("BPF_OBJ_GET_INFO_BY_FD failed");
1946             goto Error;
1947         }
1948         if (m.type != map_attr.type) {
1949             h2o_error_printf(H2O_EBPF_RETURN_MAP_PATH " has an unexpected map type: expected %d but got %d\n", map_attr.type,
1950                              m.type);
1951             goto Error;
1952         }
1953         if (m.key_size != map_attr.key_size) {
1954             h2o_error_printf(H2O_EBPF_RETURN_MAP_PATH " has an unexpected map key size: expected %" PRIu32 " but got %" PRIu32 "\n",
1955                              map_attr.key_size, m.key_size);
1956             goto Error;
1957         }
1958         if (m.value_size != map_attr.value_size) {
1959             h2o_error_printf(H2O_EBPF_RETURN_MAP_PATH " has an unexpected map value size: expected %" PRIu32 " but got %" PRIu32
1960                                                       "\n",
1961                              map_attr.value_size, m.value_size);
1962             goto Error;
1963         }
1964     }
1965 
1966     /* success */
1967     return_map_fd = fd;
1968     return 1;
1969 
1970 Error:
1971     if (fd >= 0)
1972         close(fd);
1973     return 0;
1974 }
1975 
/**
 * Keeps `*fd` in sync with the eBPF map pinned at `map_path`, doing the actual check at most once per second.
 *
 * @param loop          event loop providing the current time in milliseconds (through h2o_now)
 * @param map_path      path of the pinned map
 * @param fd            in/out: descriptor of the map, or a negative value when not open; it is closed and reset to -1 when the
 *                      path has disappeared, and (re)opened when the path exists and no descriptor is held
 * @param last_attempt  in/out: time of the most recent check
 */
static void get_map_fd(h2o_loop_t *loop, const char *map_path, int *fd, uint64_t *last_attempt)
{
    // only check every second; the subtraction has to be `now - last` because both values are unsigned and the reverse order
    // wraps around to a huge number whenever time has advanced, which would defeat the rate limit
    uint64_t now = h2o_now(loop);
    if (now - *last_attempt < 1000)
        return;

    *last_attempt = now;

    struct stat s;
    if (stat(map_path, &s) != 0) {
        // map path unavailable, cleanup fd if needed and leave
        if (*fd >= 0) {
            close(*fd);
            *fd = -1;
        }
        return;
    }

    if (*fd >= 0)
        return; // map still exists and we have a fd

    // map exists, try connect
    *fd = ebpf_obj_get(map_path);
    if (*fd < 0)
        h2o_perror("BPF_OBJ_GET failed");
}
2003 
/**
 * Returns the per-thread descriptor of the map pinned at H2O_EBPF_MAP_PATH after letting get_map_fd refresh it; the value is
 * negative when the map is not open.
 */
static int get_tracing_map_fd(h2o_loop_t *loop)
{
    static __thread uint64_t last_checked_at = 0;
    static __thread int tracing_fd = -1;

    get_map_fd(loop, H2O_EBPF_MAP_PATH, &tracing_fd, &last_checked_at);
    return tracing_fd;
}
2011 
/**
 * Copies the IP address and the port (as stored in the sockaddr, without byte-order conversion) of `sa` into `ea`.
 * Returns 1 for AF_INET and AF_INET6 addresses, 0 for any other address family (in which case `ea` is left untouched).
 */
static inline int set_ebpf_map_key_tuples(const struct sockaddr *sa, h2o_ebpf_address_t *ea)
{
    switch (sa->sa_family) {
    case AF_INET: {
        struct sockaddr_in *v4 = (void *)sa;
        memcpy(ea->ip, &v4->sin_addr, sizeof(v4->sin_addr));
        ea->port = v4->sin_port;
        return 1;
    }
    case AF_INET6: {
        struct sockaddr_in6 *v6 = (void *)sa;
        memcpy(ea->ip, &v6->sin6_addr, sizeof(v6->sin6_addr));
        ea->port = v6->sin6_port;
        return 1;
    }
    default:
        return 0;
    }
}
2028 
/**
 * Builds an eBPF map key from a socket type and a pair of addresses.
 * The key is zero-filled first. Returns 0 if either address is neither AF_INET nor AF_INET6, otherwise 1 with `family` set to 6 or
 * 4 (based on the local address) and `protocol` set to `sock_type`.
 */
int h2o_socket_ebpf_init_key_raw(h2o_ebpf_map_key_t *key, int sock_type, struct sockaddr *local, struct sockaddr *remote)
{
    memset(key, 0, sizeof(*key));

    /* fill in both endpoints; bail out as soon as one of them cannot be represented */
    if (!(set_ebpf_map_key_tuples(local, &key->local) && set_ebpf_map_key_tuples(remote, &key->remote)))
        return 0;

    if (local->sa_family == AF_INET6) {
        key->family = 6;
    } else {
        key->family = 4;
    }
    key->protocol = sock_type;
    return 1;
}
2040 
/**
 * Builds an eBPF map key for a socket, using its local address, its peer address and its SO_TYPE.
 *
 * @param key    key to be filled
 * @param _sock  the socket (an `h2o_socket_t *`)
 * @return 1 on success, 0 if any of the three pieces of information could not be obtained or the addresses are of an
 *         unsupported family
 */
int h2o_socket_ebpf_init_key(h2o_ebpf_map_key_t *key, void *_sock)
{
    h2o_socket_t *sock = _sock;
    struct sockaddr_storage local, remote;
    /* SO_TYPE yields an int, and the length argument of getsockopt is a socklen_t describing that int */
    int sock_type;
    socklen_t sock_type_len = sizeof(sock_type);

    /* fetch info */
    if (h2o_socket_getsockname(sock, (void *)&local) == 0)
        return 0;
    if (h2o_socket_getpeername(sock, (void *)&remote) == 0)
        return 0;
    if (getsockopt(h2o_socket_get_fd(sock), SOL_SOCKET, SO_TYPE, &sock_type, &sock_type_len) != 0) /* can't the info be cached? */
        return 0;

    return h2o_socket_ebpf_init_key_raw(key, sock_type, (void *)&local, (void *)&remote);
}
2057 
report_ebpf_lookup_errors(h2o_error_reporter_t * reporter,uint64_t total_successes,uint64_t cur_successes)2058 static void report_ebpf_lookup_errors(h2o_error_reporter_t *reporter, uint64_t total_successes, uint64_t cur_successes)
2059 {
2060     fprintf(stderr,
2061             "BPF_MAP_LOOKUP_ELEM failed with ENOENT %" PRIu64 " time%s, succeeded: %" PRIu64 " time%s, over the last minute.\n",
2062             reporter->cur_errors, reporter->cur_errors > 1 ? "s" : "", cur_successes, cur_successes > 1 ? "s" : "");
2063 }
2064 
2065 static h2o_error_reporter_t track_ebpf_lookup = H2O_ERROR_REPORTER_INITIALIZER(report_ebpf_lookup_errors);
2066 
/**
 * Runs `func` and then reads back, from the map referred to by `return_map_fd`, the value stored under the calling thread's id.
 *
 * The macro is a no-op when `return_map_fd` is negative. Otherwise it:
 *  1. deletes any existing entry keyed by the thread id (a failure other than ENOENT is reported and ends the sequence),
 *  2. executes `func`, which may refer to the local variable `tid` declared by the macro,
 *  3. looks the thread id up and stores the value found into `flags`; the outcome is recorded in `track_ebpf_lookup`, with ENOENT
 *     counted as an error to be reported in aggregate and any other failure reported immediately.
 *
 * The expansion refers to the identifiers `flags` and `loop`, which must therefore exist in the scope where the macro is used.
 */
#define DO_EBPF_RETURN_LOOKUP(func)                                                                                                \
    do {                                                                                                                           \
        if (return_map_fd >= 0) {                                                                                                  \
            pid_t tid = (pid_t)syscall(SYS_gettid); /* gettid() was not available until glibc 2.30 (2019) */                       \
            /* Make sure old flags do not exist, otherwise the subsequent logic will be unreliable. */                             \
            if (ebpf_map_delete(return_map_fd, &tid) == 0 || errno == ENOENT) {                                                    \
                do {                                                                                                               \
                    func                                                                                                           \
                } while (0);                                                                                                       \
                if (ebpf_map_lookup(return_map_fd, &tid, &flags) == 0) {                                                           \
                    h2o_error_reporter_record_success(&track_ebpf_lookup);                                                         \
                } else {                                                                                                           \
                    if (errno == ENOENT) {                                                                                         \
                        /* ENOENT could be issued in some reasons even if BPF tries to insert the entry, for example:              \
                         *  * the entry in LRU hash was evicted                                                                    \
                         *  * the insert operation in BPF program failed with ENOMEM                                               \
                         * We don't know the frequency for this ENOENT, so cap the number of logs.                                 \
                         *                                                                                                         \
                         * Other than the above reasons, ENOENT is issued when the tracer does not set the flags via h2o_return    \
                         * map, See h2o:_private_socket_lookup_flags handler in h2olog for details. */                             \
                        h2o_error_reporter_record_error(loop, &track_ebpf_lookup, 60000, 0);                                       \
                    } else {                                                                                                       \
                        h2o_perror("BPF_MAP_LOOKUP failed");                                                                       \
                    }                                                                                                              \
                }                                                                                                                  \
            } else {                                                                                                               \
                h2o_perror("BPF_MAP_DELETE failed");                                                                               \
            }                                                                                                                      \
        }                                                                                                                          \
    } while (0)
2097 
/**
 * Determines the tracing flags of a connection.
 *
 * `init_key` is invoked with `cbdata` to build the map key, but only when the tracing map is open or the lookup probe is enabled.
 * The flags are first looked up in the tracing map (if open), and then - if the probe is enabled - the probe is fired and the
 * flags are read back through DO_EBPF_RETURN_LOOKUP.
 *
 * @return the flags, or 0 when nothing was looked up or `init_key` failed
 */
uint64_t h2o_socket_ebpf_lookup_flags(h2o_loop_t *loop, int (*init_key)(h2o_ebpf_map_key_t *key, void *cbdata), void *cbdata)
{
    uint64_t flags = 0;
    h2o_ebpf_map_key_t key;
    int tracing_map_fd = get_tracing_map_fd(loop);

    /* nobody to consult */
    if (tracing_map_fd < 0 && !H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_ENABLED())
        return flags;
    /* the key could not be built */
    if (!init_key(&key, cbdata))
        return flags;

    /* best-effort lookup; `flags` is used as-is whatever the outcome */
    if (tracing_map_fd >= 0)
        ebpf_map_lookup(tracing_map_fd, &key, &flags);

    if (H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_ENABLED())
        DO_EBPF_RETURN_LOOKUP({ H2O__PRIVATE_SOCKET_LOOKUP_FLAGS(tid, flags, &key); });

    return flags;
}
2114 
/**
 * Gives the tracer a chance to adjust `flags` based on the server name.
 * When the SNI lookup probe is not enabled, `flags` is returned unchanged; otherwise the probe is fired and the flags are read
 * back through DO_EBPF_RETURN_LOOKUP.
 */
uint64_t h2o_socket_ebpf_lookup_flags_sni(h2o_loop_t *loop, uint64_t flags, const char *server_name, size_t server_name_len)
{
    if (!H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_SNI_ENABLED())
        return flags;

    DO_EBPF_RETURN_LOOKUP({ H2O__PRIVATE_SOCKET_LOOKUP_FLAGS_SNI(tid, flags, server_name, server_name_len); });
    return flags;
}
2121 
2122 #undef DO_EBPF_RETURN_LOOKUP
2123 
2124 #else
2125 
/**
 * Variant used when eBPF map support is compiled out (H2O_USE_EBPF_MAP is off): there is nothing to set up, and 0 is returned.
 */
int h2o_socket_ebpf_setup(void)
{
    const int not_set_up = 0;

    return not_set_up;
}
2130 
/**
 * Variant used when eBPF map support is compiled out: calling it is a fatal error.
 */
int h2o_socket_ebpf_init_key_raw(h2o_ebpf_map_key_t *key, int sock_type, struct sockaddr *local, struct sockaddr *remote)
{
    /* none of the arguments is used */
    (void)key;
    (void)sock_type;
    (void)local;
    (void)remote;

    h2o_fatal("unimplemented");
}
2135 
/**
 * Variant used when eBPF map support is compiled out: calling it is a fatal error.
 */
int h2o_socket_ebpf_init_key(h2o_ebpf_map_key_t *key, void *sock)
{
    /* none of the arguments is used */
    (void)key;
    (void)sock;

    h2o_fatal("unimplemented");
}
2140 
/**
 * Variant used when eBPF map support is compiled out: no lookup takes place, `init_key` is never called, and the flags are 0.
 */
uint64_t h2o_socket_ebpf_lookup_flags(h2o_loop_t *loop, int (*init_key)(h2o_ebpf_map_key_t *key, void *cbdata), void *cbdata)
{
    /* none of the arguments is used */
    (void)loop;
    (void)init_key;
    (void)cbdata;

    return 0;
}
2145 
/**
 * Variant used when eBPF map support is compiled out: `flags` is returned unchanged.
 */
uint64_t h2o_socket_ebpf_lookup_flags_sni(h2o_loop_t *loop, uint64_t flags, const char *server_name, size_t server_name_len)
{
    /* the remaining arguments are not used */
    (void)loop;
    (void)server_name;
    (void)server_name_len;

    return flags;
}
2150 
2151 #endif
2152