1 /*
2  * Copyright (c) 2018 Fastly, Kazuho Oku
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to
6  * deal in the Software without restriction, including without limitation the
7  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8  * sell copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20  * IN THE SOFTWARE.
21  */
22 #ifdef __APPLE__
23 #define __APPLE_USE_RFC_3542 /* to use IPV6_PKTINFO */
24 #endif
25 #include <errno.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <netinet/udp.h>
29 #include <pthread.h>
30 #include <stdio.h>
31 #include <sys/socket.h>
32 #include "picotls/openssl.h"
33 #include "h2o/string_.h"
34 #include "h2o/http3_common.h"
35 #include "h2o/http3_internal.h"
36 #include "h2o/multithread.h"
37 #include "../probes_.h"
38 
39 struct st_h2o_http3_ingress_unistream_t {
40     /**
41      * back pointer
42      */
43     quicly_stream_t *quic;
44     /**
45      *
46      */
47     h2o_buffer_t *recvbuf;
48     /**
49      * A callback that passes unparsed input to be handled. `src` is set to NULL when receiving a reset.
50      */
51     void (*handle_input)(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream, const uint8_t **src,
52                          const uint8_t *src_end, int is_eos);
53 };
54 
55 const ptls_iovec_t h2o_http3_alpn[3] = {{(void *)H2O_STRLIT("h3")}, {(void *)H2O_STRLIT("h3-29")}, {(void *)H2O_STRLIT("h3-27")}};
56 
report_sendmsg_errors(h2o_error_reporter_t * reporter,uint64_t total_successes,uint64_t cur_successes)57 static void report_sendmsg_errors(h2o_error_reporter_t *reporter, uint64_t total_successes, uint64_t cur_successes)
58 {
59     char errstr[256];
60     fprintf(stderr, "sendmsg failed %" PRIu64 " time%s, succeeded: %" PRIu64 " time%s, over the last minute: %s\n",
61             reporter->cur_errors, reporter->cur_errors > 1 ? "s" : "", cur_successes, cur_successes > 1 ? "s" : "",
62             h2o_strerror_r((int)reporter->data, errstr, sizeof(errstr)));
63 }
64 
65 static h2o_error_reporter_t track_sendmsg = H2O_ERROR_REPORTER_INITIALIZER(report_sendmsg_errors);
66 
/**
 * Sends `num_datagrams` datagrams to `dest` using the socket of `ctx`.
 *
 * When the address family of `src` is not AF_UNSPEC, the source address is supplied to the kernel as ancillary data (if the
 * platform provides an option for that), and nothing is sent if the port of `src` differs from that of the socket. When GSO
 * can be used (`UDP_SEGMENT` is defined, `ctx->use_gso` is set, and there is more than one datagram), all the datagrams are
 * passed to a single `sendmsg` call; otherwise, `sendmsg` is called once per datagram.
 *
 * Returns 0 if the designated source address could not be used (port mismatch, or `sendmsg` failing with EINVAL or
 * EADDRNOTAVAIL), so that the caller can close the connection. Returns 1 otherwise; that includes other `sendmsg` failures,
 * which are considered temporary and are only recorded to `track_sendmsg`.
 */
int h2o_quic_send_datagrams(h2o_quic_ctx_t *ctx, quicly_address_t *dest, quicly_address_t *src, struct iovec *datagrams,
                            size_t num_datagrams)
{
    /* buffer large enough to hold one source-address CMSG plus (if available) one UDP_SEGMENT CMSG */
    union {
        struct cmsghdr hdr;
        char buf[
#ifdef IPV6_PKTINFO
            CMSG_SPACE(sizeof(struct in6_pktinfo))
#elif defined(IP_PKTINFO)
            CMSG_SPACE(sizeof(struct in_pktinfo))
#elif defined(IP_SENDSRCADDR)
            CMSG_SPACE(sizeof(struct in_addr))
#else
            CMSG_SPACE(1)
#endif
            +
#ifdef UDP_SEGMENT
            CMSG_SPACE(sizeof(uint16_t))
#else
            0
#endif
        ];
    } cmsgbuf = {.buf = {}};
    /* points to where the next CMSG is to be built; stays at the head of the buffer if no CMSG is generated */
    struct cmsghdr *cmsg = &cmsgbuf.hdr;

    /* first CMSG is the source address */
    if (src->sa.sa_family != AF_UNSPEC) {
        size_t cmsg_bodylen = 0;
        switch (src->sa.sa_family) {
        case AF_INET: {
#if defined(IP_PKTINFO)
            if (*ctx->sock.port != src->sin.sin_port)
                return 0;
            cmsg->cmsg_level = IPPROTO_IP;
            cmsg->cmsg_type = IP_PKTINFO;
            cmsg_bodylen = sizeof(struct in_pktinfo);
            memcpy(&((struct in_pktinfo *)CMSG_DATA(cmsg))->ipi_spec_dst, &src->sin.sin_addr, sizeof(struct in_addr));
#elif defined(IP_SENDSRCADDR)
            if (*ctx->sock.port != src->sin.sin_port)
                return 0;
            struct sockaddr_in *fdaddr = (struct sockaddr_in *)&ctx->sock.addr;
            assert(fdaddr->sin_family == AF_INET);
            /* the option is set only when the socket is bound to the wildcard address */
            if (fdaddr->sin_addr.s_addr == INADDR_ANY) {
                cmsg->cmsg_level = IPPROTO_IP;
                cmsg->cmsg_type = IP_SENDSRCADDR;
                cmsg_bodylen = sizeof(struct in_addr);
                memcpy(CMSG_DATA(cmsg), &src->sin.sin_addr, sizeof(struct in_addr));
            }
#else
            h2o_fatal("IP_PKTINFO not available");
#endif
        } break;
        case AF_INET6:
#ifdef IPV6_PKTINFO
            if (*ctx->sock.port != src->sin6.sin6_port)
                return 0;
            cmsg->cmsg_level = IPPROTO_IPV6;
            cmsg->cmsg_type = IPV6_PKTINFO;
            cmsg_bodylen = sizeof(struct in6_pktinfo);
            memcpy(&((struct in6_pktinfo *)CMSG_DATA(cmsg))->ipi6_addr, &src->sin6.sin6_addr, sizeof(struct in6_addr));
#else
            h2o_fatal("IPV6_PKTINFO not available");
#endif
            break;
        default:
            h2o_fatal("unexpected address family");
            break;
        }
        /* `cmsg_bodylen` remains zero when no source-address option has been written (i.e., IP_SENDSRCADDR build with the
         * socket bound to a specific address). In that case, do not emit a CMSG at all, rather than passing to the kernel an
         * empty one with zeroed level and type. */
        if (cmsg_bodylen != 0) {
            cmsg->cmsg_len = (socklen_t)CMSG_LEN(cmsg_bodylen);
            cmsg = (struct cmsghdr *)((char *)cmsg + CMSG_SPACE(cmsg_bodylen));
        }
    }

    /* next CMSG is UDP_SEGMENT size (for GSO) */
    int using_gso = 0;
#ifdef UDP_SEGMENT
    if (num_datagrams > 1 && ctx->use_gso) {
        /* all the datagrams except the last one must be of the same size */
        for (size_t i = 1; i < num_datagrams - 1; ++i)
            assert(datagrams[i].iov_len == datagrams[0].iov_len);
        uint16_t segsize = (uint16_t)datagrams[0].iov_len;
        cmsg->cmsg_level = SOL_UDP;
        cmsg->cmsg_type = UDP_SEGMENT;
        cmsg->cmsg_len = CMSG_LEN(sizeof(segsize));
        memcpy(CMSG_DATA(cmsg), &segsize, sizeof(segsize));
        cmsg = (struct cmsghdr *)((char *)cmsg + CMSG_SPACE(sizeof(segsize)));
        using_gso = 1;
    }
#endif

    /* send datagrams */
    struct msghdr mess = {
        .msg_name = &dest->sa,
        .msg_namelen = quicly_get_socklen(&dest->sa),
    };
    /* attach the control buffer only if at least one CMSG has been built */
    if (cmsg != &cmsgbuf.hdr) {
        mess.msg_control = &cmsgbuf.buf;
        mess.msg_controllen = (socklen_t)((char *)cmsg - cmsgbuf.buf);
    }
    int ret;

    if (using_gso) {
        /* one call carrying all the datagrams; retried when interrupted by a signal */
        mess.msg_iov = datagrams;
        mess.msg_iovlen = (int)num_datagrams;
        while ((ret = (int)sendmsg(h2o_socket_get_fd(ctx->sock.sock), &mess, 0)) == -1 && errno == EINTR)
            ;
        if (ret == -1)
            goto SendmsgError;
    } else {
        /* one call per datagram; the remaining datagrams are not sent once a call fails */
        for (size_t i = 0; i < num_datagrams; ++i) {
            mess.msg_iov = datagrams + i;
            mess.msg_iovlen = 1;
            while ((ret = (int)sendmsg(h2o_socket_get_fd(ctx->sock.sock), &mess, 0)) == -1 && errno == EINTR)
                ;
            if (ret == -1)
                goto SendmsgError;
        }
    }

    h2o_error_reporter_record_success(&track_sendmsg);

    return 1;

SendmsgError:
    /* The UDP stack returns EINVAL (linux) or EADDRNOTAVAIL (darwin, and presumably other BSD) when it was unable to use the
     * designated source address.  We communicate that back to the caller so that the connection can be closed immediately. */
    if (src->sa.sa_family != AF_UNSPEC && (errno == EINVAL || errno == EADDRNOTAVAIL))
        return 0;

    /* Temporary failure to send a packet is not a permanent error for the connection. (TODO do we want to do something more
     * specific?) */

    /* Log the number of failed invocations once per minute, if there has been such a failure. */
    h2o_error_reporter_record_error(ctx->loop, &track_sendmsg, 60000, errno);

    return 1;
}
202 
get_callbacks(h2o_http3_conn_t * conn)203 static inline const h2o_http3_conn_callbacks_t *get_callbacks(h2o_http3_conn_t *conn)
204 {
205     return (const h2o_http3_conn_callbacks_t *)conn->super.callbacks;
206 }
207 
/* destroy callback of ingress unidirectional streams; releases the receive buffer and the per-stream state */
static void ingress_unistream_on_destroy(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_ingress_unistream_t *ingress = qs->data;

    h2o_buffer_dispose(&ingress->recvbuf);
    free(ingress);
}
214 
/**
 * Receive callback of ingress unidirectional streams. Stores the received bytes, passes the bytes that are available to
 * `handle_input`, then discards what the handler has consumed.
 */
static void ingress_unistream_on_receive(quicly_stream_t *qs, size_t off, const void *input, size_t len)
{
    struct st_h2o_http3_ingress_unistream_t *ingress = qs->data;
    h2o_http3_conn_t *h3conn = *quicly_get_data(qs->conn);

    /* store the bytes being received */
    h2o_http3_update_recvbuf(&ingress->recvbuf, off, input, len);

    /* determine the range that can be handed to the handler, bailing out if there is nothing to report */
    const uint8_t *cursor = (const uint8_t *)ingress->recvbuf->bytes;
    const uint8_t *const avail_end = cursor + quicly_recvstate_bytes_available(&ingress->quic->recvstate);
    int is_eos = quicly_recvstate_transfer_complete(&ingress->quic->recvstate);
    if (cursor == avail_end && !is_eos)
        return;

    /* let the handler process the input; the buffer is left untouched if the connection is being closed */
    ingress->handle_input(h3conn, ingress, &cursor, avail_end, is_eos);
    if (quicly_get_state(h3conn->super.quic) >= QUICLY_STATE_CLOSING)
        return;

    /* drop the bytes that the handler has consumed */
    size_t consumed = cursor - (const uint8_t *)ingress->recvbuf->bytes;
    if (consumed == 0)
        return;
    h2o_buffer_consume(&ingress->recvbuf, consumed);
    quicly_stream_sync_recvbuf(ingress->quic, consumed);
}
241 
/* reset callback of ingress unidirectional streams; notifies the handler by invoking it with `src` set to NULL */
static void ingress_unistream_on_receive_reset(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_ingress_unistream_t *ingress = qs->data;
    h2o_http3_conn_t *h3conn = *quicly_get_data(qs->conn);

    ingress->handle_input(h3conn, ingress, NULL, NULL, 1);
}
249 
/**
 * Input handler of the QPACK encoder stream sent by the peer. The input is fed to the QPACK decoder of the connection. The
 * connection is closed when the stream is reset or closed, or when the decoder reports an error.
 */
static void qpack_encoder_stream_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream,
                                              const uint8_t **src, const uint8_t *src_end, int is_eos)
{
    /* reset or closure of this stream is reported as H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM */
    if (src == NULL || is_eos) {
        h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
        return;
    }

    /* feed the decoder until all the input is consumed */
    for (; *src != src_end;) {
        const char *err_desc = NULL;
        int64_t *unblocked_stream_ids;
        size_t num_unblocked;
        int ret = h2o_qpack_decoder_handle_input(conn->qpack.dec, &unblocked_stream_ids, &num_unblocked, src, src_end,
                                                 &err_desc);
        if (ret != 0) {
            h2o_quic_close_connection(&conn->super, ret, err_desc);
            return;
        }
        /* TODO handle unblocked streams */
    }
}
271 
/**
 * Input handler of the QPACK decoder stream sent by the peer. The input is fed to the QPACK encoder of the connection. The
 * connection is closed when the stream is reset or closed, or when the encoder reports an error.
 */
static void qpack_decoder_stream_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream,
                                              const uint8_t **src, const uint8_t *src_end, int is_eos)
{
    /* reset or closure of this stream is reported as H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM */
    if (src == NULL || is_eos) {
        h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
        return;
    }

    /* feed the encoder until all the input is consumed */
    for (; *src != src_end;) {
        const char *err_desc = NULL;
        int ret = h2o_qpack_encoder_handle_input(conn->qpack.enc, src, src_end, &err_desc);
        if (ret != 0) {
            h2o_quic_close_connection(&conn->super, ret, err_desc);
            return;
        }
    }
}
289 
/**
 * Input handler of the control stream sent by the peer. Frames are read one by one and passed to the
 * `handle_control_stream_frame` callback of the connection. The connection is closed when the stream is reset or closed, when
 * a frame cannot be parsed, or when a frame is received at an unexpected moment.
 */
static void control_stream_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream,
                                        const uint8_t **src, const uint8_t *src_end, int is_eos)
{
    /* reset or closure of this stream is reported as H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM */
    if (src == NULL || is_eos) {
        h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
        return;
    }

    /* at least one attempt to read a frame is made, even when the input is empty */
    for (;;) {
        h2o_http3_read_frame_t frame;
        const char *err_desc = NULL;
        int ret = h2o_http3_read_frame(&frame, quicly_is_client(conn->super.quic), H2O_HTTP3_STREAM_TYPE_CONTROL, src, src_end,
                                       &err_desc);

        if (ret != 0) {
            /* a partial frame is not an error; processing resumes when more input arrives */
            if (ret != H2O_HTTP3_ERROR_INCOMPLETE)
                h2o_quic_close_connection(&conn->super, ret, err_desc);
            return;
        }

        /* SETTINGS is accepted only when it has not been received yet, and other frames only after SETTINGS has been received.
         * DATA frames are always rejected. */
        int is_settings = frame.type == H2O_HTTP3_FRAME_TYPE_SETTINGS;
        if (h2o_http3_has_received_settings(conn) == is_settings || frame.type == H2O_HTTP3_FRAME_TYPE_DATA) {
            h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_FRAME_UNEXPECTED, NULL);
            return;
        }

        get_callbacks(conn)->handle_control_stream_frame(conn, frame.type, frame.payload, frame.length);

        /* stop if the callback has started closing the connection, or if there is no more input */
        if (quicly_get_state(conn->super.quic) >= QUICLY_STATE_CLOSING)
            return;
        if (*src == src_end)
            return;
    }
}
319 
/* input handler that throws away everything it is given, by marking all the input as consumed; resets are ignored */
static void discard_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream, const uint8_t **src,
                                 const uint8_t *src_end, int is_eos)
{
    if (src != NULL)
        *src = src_end;
}
327 
/**
 * Input handler used for an ingress unidirectional stream until its type becomes known. Once the type is read from the input,
 * `stream->handle_input` is switched to the handler of that type, the stream is registered to the connection (for the control
 * and QPACK streams), and the remaining input is passed to the new handler. For streams of other types, the peer is requested
 * to stop sending (H2O_HTTP3_ERROR_STREAM_CREATION) and the input is discarded.
 */
static void unknown_type_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream, const uint8_t **src,
                                      const uint8_t *src_end, int is_eos)
{
    uint64_t type;

    /* resets are allowed at least until the type is being determined */
    if (src == NULL)
        return;

    /* read the type, or just return if incomplete */
    if ((type = quicly_decodev(src, src_end)) == UINT64_MAX)
        return;

    switch (type) {
    case H2O_HTTP3_STREAM_TYPE_CONTROL:
        conn->_control_streams.ingress.control = stream;
        stream->handle_input = control_stream_handle_input;
        break;
    case H2O_HTTP3_STREAM_TYPE_QPACK_ENCODER:
        conn->_control_streams.ingress.qpack_encoder = stream;
        stream->handle_input = qpack_encoder_stream_handle_input;
        break;
    case H2O_HTTP3_STREAM_TYPE_QPACK_DECODER:
        conn->_control_streams.ingress.qpack_decoder = stream;
        stream->handle_input = qpack_decoder_stream_handle_input;
        break;
    default:
        quicly_request_stop(stream->quic, H2O_HTTP3_ERROR_STREAM_CREATION);
        stream->handle_input = discard_handle_input;
        break;
    }

    /* hand the rest of the input to the handler that has just been selected (a plain call; ISO C does not permit `return` with
     * an expression in a function returning void) */
    stream->handle_input(conn, stream, src, src_end, is_eos);
}
362 
/* destroy callback of egress unidirectional streams; releases the send buffer and the per-stream state */
static void egress_unistream_on_destroy(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_egress_unistream_t *egress = qs->data;

    h2o_buffer_dispose(&egress->sendbuf);
    free(egress);
}
369 
/* send-shift callback of egress unidirectional streams; removes the first `delta` bytes from the send buffer */
static void egress_unistream_on_send_shift(quicly_stream_t *qs, size_t delta)
{
    struct st_h2o_http3_egress_unistream_t *egress = qs->data;

    h2o_buffer_consume(&egress->sendbuf, delta);
}
375 
/**
 * Send-emit callback of egress unidirectional streams. Copies up to `*len` bytes of the send buffer, starting at `off`, to
 * `dst`. `*len` is shrunk to the number of bytes remaining in the buffer if that is smaller, and `*wrote_all` is set to whether
 * the copy reaches the end of the buffer.
 */
static void egress_unistream_on_send_emit(quicly_stream_t *qs, size_t off, void *dst, size_t *len, int *wrote_all)
{
    struct st_h2o_http3_egress_unistream_t *egress = qs->data;
    size_t remaining = egress->sendbuf->size - off;

    *wrote_all = *len >= remaining;
    if (*wrote_all)
        *len = remaining;
    memcpy(dst, egress->sendbuf->bytes + off, *len);
}
388 
/* send-stop callback of egress unidirectional streams; closes the connection with H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM */
static void egress_unistream_on_send_stop(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_conn_t *h3conn = *quicly_get_data(qs->conn);

    h2o_quic_close_connection(&h3conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
}
394 
/**
 * Attaches per-stream state and callbacks to a unidirectional stream that has just been created. Streams initiated by the peer
 * become ingress streams, starting with the handler that determines the stream type; streams initiated by this endpoint become
 * egress streams with an empty send buffer.
 */
void h2o_http3_on_create_unidirectional_stream(quicly_stream_t *qs)
{
    if (!quicly_stream_is_self_initiated(qs)) {
        /* peer-initiated; set up as an ingress unistream */
        static const quicly_stream_callbacks_t ingress_callbacks = {
            ingress_unistream_on_destroy, NULL, NULL, NULL, ingress_unistream_on_receive, ingress_unistream_on_receive_reset};
        struct st_h2o_http3_ingress_unistream_t *ingress = h2o_mem_alloc(sizeof(*ingress));
        ingress->quic = qs;
        h2o_buffer_init(&ingress->recvbuf, &h2o_socket_buffer_prototype);
        ingress->handle_input = unknown_type_handle_input;
        qs->data = ingress;
        qs->callbacks = &ingress_callbacks;
        return;
    }

    /* self-initiated; set up as an egress unistream */
    static const quicly_stream_callbacks_t egress_callbacks = {egress_unistream_on_destroy, egress_unistream_on_send_shift,
                                                               egress_unistream_on_send_emit, egress_unistream_on_send_stop};
    struct st_h2o_http3_egress_unistream_t *egress = h2o_mem_alloc(sizeof(*egress));
    egress->quic = qs;
    h2o_buffer_init(&egress->sendbuf, &h2o_socket_buffer_prototype);
    qs->data = egress;
    qs->callbacks = &egress_callbacks;
}
418 
/**
 * Opens a unidirectional stream on the connection, stores the egress state of the new stream in `*stream`, and queues
 * `initial_bytes` for transmission. Returns zero if successful, or the error code returned by quicly.
 */
static int open_egress_unistream(h2o_http3_conn_t *conn, struct st_h2o_http3_egress_unistream_t **stream, h2o_iovec_t initial_bytes)
{
    quicly_stream_t *qs;
    int ret = quicly_open_stream(conn->super.quic, &qs, 1);

    if (ret != 0)
        return ret;

    /* the per-stream state has been attached to `qs` while the stream was being opened */
    struct st_h2o_http3_egress_unistream_t *egress = qs->data;
    *stream = egress;
    assert(egress->quic == qs);

    /* buffer the initial bytes and notify quicly that there is data to be sent */
    h2o_buffer_append(&egress->sendbuf, initial_bytes.base, initial_bytes.len);
    return quicly_stream_sync_sendbuf(egress->quic, 1);
}
432 
/**
 * Serializes `addr` to `p`, and returns the position following the bytes being written. The output is a one-byte tag (4 for
 * AF_INET, 6 for AF_INET6, 0 for AF_UNSPEC), followed by the address and the port as they are stored in the sockaddr (omitted
 * for AF_UNSPEC). At most 1 + 16 + 2 bytes are written. `h2o_fatal` is called for other address families.
 */
static uint8_t *accept_hashkey_flatten_address(uint8_t *p, quicly_address_t *addr)
{
    switch (addr->sa.sa_family) {
    case AF_INET:
        *p++ = 4;
        memcpy(p, &addr->sin.sin_addr.s_addr, 4);
        p += 4;
        memcpy(p, &addr->sin.sin_port, 2);
        p += 2;
        break;
    case AF_INET6:
        *p++ = 6;
        memcpy(p, addr->sin6.sin6_addr.s6_addr, 16);
        p += 16;
        /* read the port through the IPv6 member of the union, i.e., the member that matches the address family */
        memcpy(p, &addr->sin6.sin6_port, 2);
        p += 2;
        break;
    case AF_UNSPEC:
        *p++ = 0;
        break;
    default:
        h2o_fatal("unknown protocol family");
        break;
    }
    return p;
}
459 
/**
 * Calculates the non-zero 64-bit key being used for looking up the connections that are being accepted. The key is derived
 * from the destination address, the source address, and the source CID, by encrypting them with AES-128-CBC under a randomly
 * generated key and taking the tail of the ciphertext.
 */
static uint64_t calc_accept_hashkey(quicly_address_t *destaddr, quicly_address_t *srcaddr, ptls_iovec_t src_cid)
{
    /* Prepare the cipher context; it is per-thread, whereas the AES key is a static being filled with random bytes inside
     * H2O_MULTITHREAD_ONCE. */
    static __thread EVP_CIPHER_CTX *cipher = NULL;
    if (cipher == NULL) {
        static uint8_t key[PTLS_AES128_KEY_SIZE];
        H2O_MULTITHREAD_ONCE({ ptls_openssl_random_bytes(key, sizeof(key)); });
        cipher = EVP_CIPHER_CTX_new();
        EVP_EncryptInit_ex(cipher, EVP_aes_128_cbc(), NULL, key, NULL);
    }

    /* build the plaintext (two flattened addresses followed by the CID) in a zero-filled buffer that has room for padding */
    uint8_t buf[(1 + 16 + 2) * 2 + QUICLY_MAX_CID_LEN_V1 + PTLS_AES_BLOCK_SIZE] = {0};
    uint8_t *end = accept_hashkey_flatten_address(buf, destaddr);
    end = accept_hashkey_flatten_address(end, srcaddr);
    memcpy(end, src_cid.base, src_cid.len);
    end += src_cid.len;
    assert(end <= buf + sizeof(buf));

    /* round the length up to a multiple of the AES block size; the padding bytes are the zeroes already in the buffer */
    size_t bytes_to_encrypt = ((end - buf) + PTLS_AES_BLOCK_SIZE - 1) / PTLS_AES_BLOCK_SIZE * PTLS_AES_BLOCK_SIZE;
    assert(bytes_to_encrypt <= sizeof(buf));

    /* encrypt in place, after re-initializing the context while retaining the cipher and the key */
    EVP_EncryptInit_ex(cipher, NULL, NULL, NULL, NULL);
    int bytes_encrypted = 0;
    int ret = EVP_EncryptUpdate(cipher, buf, &bytes_encrypted, buf, (int)bytes_to_encrypt);
    assert(ret);
    assert(bytes_encrypted == bytes_to_encrypt);

    /* the last `sizeof(uint64_t)` bytes of the CBC output become the key, with 0 being avoided (used as nonexist) */
    uint64_t hashkey;
    memcpy(&hashkey, buf + bytes_to_encrypt - sizeof(hashkey), sizeof(hashkey));
    return hashkey != 0 ? hashkey : 1;
}
498 
/**
 * Removes the connection from the map of the connections being accepted, and clears the accept hashkey of the connection.
 * Does nothing if the hashkey is already zero.
 */
static void drop_from_acceptmap(h2o_quic_ctx_t *ctx, h2o_quic_conn_t *conn)
{
    if (conn->_accept_hashkey == 0)
        return;

    khint_t iter = kh_get_h2o_quic_acceptmap(ctx->conns_accepting, conn->_accept_hashkey);
    if (iter != kh_end(ctx->conns_accepting))
        kh_del_h2o_quic_acceptmap(ctx->conns_accepting, iter);
    conn->_accept_hashkey = 0;
}
508 
/**
 * Builds a Version Negotiation packet that lists `versions`, and sends it to `destaddr` using `srcaddr` as the source address.
 * The outcome of the send operation is not checked.
 */
static void send_version_negotiation(h2o_quic_ctx_t *ctx, quicly_address_t *destaddr, ptls_iovec_t dest_cid,
                                     quicly_address_t *srcaddr, ptls_iovec_t src_cid, const uint32_t *versions)
{
    uint8_t packet[QUICLY_MIN_CLIENT_INITIAL_SIZE];
    size_t packet_len = quicly_send_version_negotiation(ctx->quic, dest_cid, src_cid, versions, packet);
    assert(packet_len != SIZE_MAX);

    struct iovec datagram = {.iov_base = packet, .iov_len = packet_len};
    h2o_quic_send_datagrams(ctx, destaddr, srcaddr, &datagram, 1);
}
519 
process_packets(h2o_quic_ctx_t * ctx,quicly_address_t * destaddr,quicly_address_t * srcaddr,uint8_t ttl,quicly_decoded_packet_t * packets,size_t num_packets)520 static void process_packets(h2o_quic_ctx_t *ctx, quicly_address_t *destaddr, quicly_address_t *srcaddr, uint8_t ttl,
521                             quicly_decoded_packet_t *packets, size_t num_packets)
522 {
523     h2o_quic_conn_t *conn = NULL;
524     size_t accepted_packet_index = SIZE_MAX;
525 
526     assert(num_packets != 0);
527 
528     if (ctx->quic_stats != NULL) {
529         ctx->quic_stats->packet_received += num_packets;
530     }
531 
532 #if H2O_USE_DTRACE
533     if (PTLS_UNLIKELY(H2O_H3_PACKET_RECEIVE_ENABLED())) {
534         for (size_t i = 0; i != num_packets; ++i)
535             H2O_H3_PACKET_RECEIVE(&destaddr->sa, &srcaddr->sa, packets[i].octets.base, packets[i].octets.len);
536     }
537 #endif
538 
539     if (packets[0].cid.src.len > QUICLY_MAX_CID_LEN_V1)
540         return;
541 
542     /* find the matching connection, by first looking at the CID (all packets as client, or Handshake, 1-RTT packets as server) */
543     if (packets[0].cid.dest.plaintext.node_id == ctx->next_cid.node_id &&
544         packets[0].cid.dest.plaintext.thread_id == ctx->next_cid.thread_id) {
545         khiter_t iter = kh_get_h2o_quic_idmap(ctx->conns_by_id, packets[0].cid.dest.plaintext.master_id);
546         if (iter != kh_end(ctx->conns_by_id)) {
547             conn = kh_val(ctx->conns_by_id, iter);
548             /* CID-based matching on Initial and 0-RTT packets should only be applied for clients */
549             if (!quicly_is_client(conn->quic) && packets[0].cid.dest.might_be_client_generated)
550                 conn = NULL;
551         } else if (!packets[0].cid.dest.might_be_client_generated) {
552             /* send stateless reset when we could not find a matching connection for a 1 RTT packet */
553             if (packets[0].octets.len >= QUICLY_STATELESS_RESET_PACKET_MIN_LEN) {
554                 uint8_t payload[QUICLY_MIN_CLIENT_INITIAL_SIZE];
555                 size_t payload_size = quicly_send_stateless_reset(ctx->quic, packets[0].cid.dest.encrypted.base, payload);
556                 if (payload_size != SIZE_MAX) {
557                     struct iovec vec = {.iov_base = payload, .iov_len = payload_size};
558                     h2o_quic_send_datagrams(ctx, srcaddr, destaddr, &vec, 1);
559                 }
560             }
561             return;
562         }
563     } else if (!packets[0].cid.dest.might_be_client_generated) {
564         /* forward 1-RTT packets belonging to different nodes, threads */
565         if (ttl == 0)
566             return;
567         uint64_t offending_node_id = packets[0].cid.dest.plaintext.node_id;
568         if (ctx->forward_packets != NULL && ctx->forward_packets(ctx, &offending_node_id, packets[0].cid.dest.plaintext.thread_id,
569                                                                  destaddr, srcaddr, ttl, packets, num_packets))
570             return;
571         /* non-authenticating 1-RTT packets are potentially stateless resets (FIXME handle them, note that we need to use a hashdos-
572          * resistant hash map that also meets constant-time comparison requirements) */
573         return;
574     }
575 
576     if (conn == NULL) {
577         /* Initial or 0-RTT packet, use 4-tuple to match the thread and the connection */
578         assert(packets[0].cid.dest.might_be_client_generated);
579         uint64_t accept_hashkey = calc_accept_hashkey(destaddr, srcaddr, packets[0].cid.src);
580         if (ctx->accept_thread_divisor != 0) {
581             uint32_t offending_thread = accept_hashkey % ctx->accept_thread_divisor;
582             if (offending_thread != ctx->next_cid.thread_id) {
583                 if (ctx->forward_packets != NULL)
584                     ctx->forward_packets(ctx, NULL, offending_thread, destaddr, srcaddr, ttl, packets, num_packets);
585                 return;
586             }
587         }
588         khiter_t iter = kh_get_h2o_quic_acceptmap(ctx->conns_accepting, accept_hashkey);
589         if (iter == kh_end(ctx->conns_accepting)) {
590             /* a new connection for this thread (at least on this process); accept or delegate to newer process */
591             if (ctx->acceptor != NULL) {
592                 if (packets[0].version != 0 && !quicly_is_supported_version(packets[0].version)) {
593                     send_version_negotiation(ctx, srcaddr, packets[0].cid.src, destaddr, packets[0].cid.dest.encrypted,
594                                              quicly_supported_versions);
595                     return;
596                 }
597             } else {
598                 /* This is the offending thread but it is not accepting, which means that the process (or the thread) is not acting
599                  * as a server (likely gracefully shutting down). Let the application process forward the packet to the next
600                  * generation. */
601                 if (ctx->forward_packets != NULL &&
602                     ctx->forward_packets(ctx, NULL, ctx->next_cid.thread_id, destaddr, srcaddr, ttl, packets, num_packets))
603                     return;
604                 /* If not forwarded, send rejection to the peer. A Version Negotiation packet that carries only a greasing version
605                  * number is used for the purpose, hoping that that signal will trigger immediate downgrade to HTTP/2, across the
606                  * broad spectrum of the client implementations than if CONNECTION_REFUSED is being used. */
607                 if (packets[0].version != 0) {
608                     static const uint32_t no_versions[] = {0};
609                     send_version_negotiation(ctx, srcaddr, packets[0].cid.src, destaddr, packets[0].cid.dest.encrypted,
610                                              no_versions);
611                 }
612                 return;
613             }
614             /* try to accept any of the Initial packets being received */
615             size_t i;
616             for (i = 0; i != num_packets; ++i)
617                 if ((packets[i].octets.base[0] & QUICLY_PACKET_TYPE_BITMASK) == QUICLY_PACKET_TYPE_INITIAL)
618                     if ((conn = ctx->acceptor(ctx, destaddr, srcaddr, packets + i)) != NULL) {
619                         /* non-null generally means success, except for H2O_QUIC_ACCEPT_CONN_DECRYPTION_FAILED */
620                         if (conn == (h2o_quic_conn_t *)H2O_QUIC_ACCEPT_CONN_DECRYPTION_FAILED) {
621                             /* failed to decrypt Initial packet <=> it could belong to a connection on a different node; forward it
622                              * to the destination being claimed by the DCID */
623                             uint64_t offending_node_id = packets[i].cid.dest.plaintext.node_id;
624                             uint32_t offending_thread_id = packets[i].cid.dest.plaintext.thread_id;
625                             if (ctx->forward_packets != NULL && ttl > 0 &&
626                                 (offending_node_id != ctx->next_cid.node_id || offending_thread_id != ctx->next_cid.thread_id))
627                                 ctx->forward_packets(ctx, &offending_node_id, offending_thread_id, destaddr, srcaddr, ttl, packets,
628                                                      num_packets);
629                             return;
630                         }
631                         break;
632                     }
633             if (conn == NULL)
634                 return;
635             accepted_packet_index = i;
636             conn->_accept_hashkey = accept_hashkey;
637             int r;
638             iter = kh_put_h2o_quic_acceptmap(conn->ctx->conns_accepting, accept_hashkey, &r);
639             assert(iter != kh_end(conn->ctx->conns_accepting));
640             kh_val(conn->ctx->conns_accepting, iter) = conn;
641         } else {
642             /* existing connection */
643             conn = kh_val(ctx->conns_accepting, iter);
644             assert(conn != NULL);
645             assert(!quicly_is_client(conn->quic));
646             if (quicly_is_destination(conn->quic, &destaddr->sa, &srcaddr->sa, packets))
647                 goto Receive;
648             uint64_t offending_node_id = packets[0].cid.dest.plaintext.node_id;
649             uint32_t offending_thread_id = packets[0].cid.dest.plaintext.thread_id;
650             if (offending_node_id != ctx->next_cid.node_id || offending_thread_id != ctx->next_cid.thread_id) {
651                 /* accept key matches to a connection being established, but DCID doesn't -- likely a second (or later) Initial that
652                  * is supposed to be handled by another node. forward it. */
653                 if (ttl == 0)
654                     return;
655                 if (ctx->forward_packets != NULL)
656                     ctx->forward_packets(ctx, &offending_node_id, offending_thread_id, destaddr, srcaddr, ttl, packets,
657                                          num_packets);
658             }
659             /* regardless of forwarding outcome, we need to drop this packet as it is not for us */
660             return;
661         }
662     }
663 
664     { /* receive packets to the found connection */
665         if (!quicly_is_destination(conn->quic, &destaddr->sa, &srcaddr->sa, packets))
666             return;
667         size_t i;
668     Receive:
669         for (i = 0; i != num_packets; ++i) {
670             if (i != accepted_packet_index) {
671                 int ret = quicly_receive(conn->quic, &destaddr->sa, &srcaddr->sa, packets + i);
672                 switch (ret) {
673                 case QUICLY_ERROR_STATE_EXHAUSTION:
674                 case PTLS_ERROR_NO_MEMORY:
675                     fprintf(stderr, "%s: `quicly_receive()` returned ret:%d\n", __func__, ret);
676                     conn->callbacks->destroy_connection(conn);
677                     return;
678                 }
679                 if (ret != QUICLY_ERROR_PACKET_IGNORED && ret != QUICLY_ERROR_DECRYPTION_FAILED) {
680                     if (ctx->quic_stats != NULL) {
681                         ++ctx->quic_stats->packet_processed;
682                     }
683                 }
684             }
685         }
686     }
687 
688     h2o_quic_schedule_timer(conn);
689     if (ctx->notify_conn_update != NULL)
690         ctx->notify_conn_update(ctx, conn);
691 }
692 
/**
 * Reads a batch of UDP datagrams from `sock`, decodes them into QUIC packets, and hands the packets to `process_packets` in groups
 * that share the same path (source address, destination address, TTL) and the same destination connection ID.
 *
 * At most `PTLS_ELEMENTSOF(dgrams)` datagrams are read per invocation (using `recvmmsg` on Linux, repeated `recvmsg` elsewhere).
 * Each datagram is passed through `ctx->preprocess_packet` if that callback is set; datagrams rejected by the callback, as well as
 * zero-length datagrams, are skipped.
 *
 * @param ctx   the QUIC context that owns the socket
 * @param sock  the UDP socket to read from
 */
void h2o_quic_read_socket(h2o_quic_ctx_t *ctx, h2o_socket_t *sock)
{
    struct {
        quicly_address_t destaddr, srcaddr;
        struct iovec vec;
        uint8_t ttl;
        char controlbuf[
#ifdef IPV6_PKTINFO
            CMSG_SPACE(sizeof(struct in6_pktinfo))
#elif defined(IP_PKTINFO)
            CMSG_SPACE(sizeof(struct in_pktinfo))
#elif defined(IP_RECVDSTADDR)
            CMSG_SPACE(sizeof(struct in_addr))
#else
            CMSG_SPACE(1)
#endif
        ];
        uint8_t buf[1600];
    } dgrams[10];
#ifdef __linux__
    struct mmsghdr mess[PTLS_ELEMENTSOF(dgrams)];
#else
    struct {
        struct msghdr msg_hdr;
    } mess[PTLS_ELEMENTSOF(dgrams)];
#endif

/* (re)initializes the i-th message header and its backing storage so that it is ready to receive one datagram */
#define INIT_DGRAMS(i)                                                                                                             \
    do {                                                                                                                           \
        mess[i].msg_hdr = (struct msghdr){                                                                                         \
            .msg_name = &dgrams[i].srcaddr,                                                                                        \
            .msg_namelen = sizeof(dgrams[i].srcaddr),                                                                              \
            .msg_iov = &dgrams[i].vec,                                                                                             \
            .msg_iovlen = 1,                                                                                                       \
            .msg_control = &dgrams[i].controlbuf,                                                                                  \
            .msg_controllen = sizeof(dgrams[i].controlbuf),                                                                        \
        };                                                                                                                         \
        memset(&dgrams[i].destaddr, 0, sizeof(dgrams[i].destaddr));                                                                \
        dgrams[i].vec.iov_base = dgrams[i].buf;                                                                                    \
        dgrams[i].vec.iov_len = sizeof(dgrams[i].buf);                                                                             \
    } while (0)

    int fd = h2o_socket_get_fd(sock);
    size_t dgram_index, num_dgrams;

    /* Read datagrams. Sender should be provided an ACK every fraction of RTT, otherwise its behavior becomes bursty (assuming that
     * pacing is not used), rather than packets being spread across entire round-trip. To minimize the chance of us entering such
     * situation, number of datagrams being read at once is limited to `PTLS_ELEMENTSOF(dgrams)`. In other words, one ack is
     * generated for no more than every 10 ack-eliciting packets being received, unless the ack-frequency extension is used. */
#ifdef __linux__
    {
        int rret;
        /* the headers are re-initialized on every retry, as an interrupted call may have left them partially updated */
        do {
            for (dgram_index = 0; dgram_index < PTLS_ELEMENTSOF(dgrams); ++dgram_index)
                INIT_DGRAMS(dgram_index);
        } while ((rret = recvmmsg(fd, mess, PTLS_ELEMENTSOF(mess), 0, NULL)) < 0 && errno == EINTR);
        if (rret <= 0)
            goto Exit;
        num_dgrams = (size_t)rret;
        for (dgram_index = 0; dgram_index < num_dgrams; ++dgram_index)
            dgrams[dgram_index].vec.iov_len = (size_t)mess[dgram_index].msg_len;
    }
#else
    for (dgram_index = 0; dgram_index < PTLS_ELEMENTSOF(dgrams); ++dgram_index) {
        ssize_t rret;
        do {
            INIT_DGRAMS(dgram_index);
        } while ((rret = recvmsg(fd, &mess[dgram_index].msg_hdr, 0)) < 0 && errno == EINTR);
        if (rret < 0)
            break;
        dgrams[dgram_index].vec.iov_len = rret;
    }
    num_dgrams = dgram_index;
    if (num_dgrams == 0)
        goto Exit;
#endif

    /* normalize and store the obtained data into `dgrams` */
    for (dgram_index = 0; dgram_index < num_dgrams; ++dgram_index) {
        { /* fetch destination address */
            struct cmsghdr *cmsg;
            for (cmsg = CMSG_FIRSTHDR(&mess[dgram_index].msg_hdr); cmsg != NULL;
                 cmsg = CMSG_NXTHDR(&mess[dgram_index].msg_hdr, cmsg)) {
#ifdef IP_PKTINFO
                if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) {
                    dgrams[dgram_index].destaddr.sin.sin_family = AF_INET;
                    dgrams[dgram_index].destaddr.sin.sin_addr = ((struct in_pktinfo *)CMSG_DATA(cmsg))->ipi_addr;
                    dgrams[dgram_index].destaddr.sin.sin_port = *ctx->sock.port;
                    goto DestAddrFound;
                }
#endif
#ifdef IP_RECVDSTADDR
                if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVDSTADDR) {
                    dgrams[dgram_index].destaddr.sin.sin_family = AF_INET;
                    dgrams[dgram_index].destaddr.sin.sin_addr = *(struct in_addr *)CMSG_DATA(cmsg);
                    dgrams[dgram_index].destaddr.sin.sin_port = *ctx->sock.port;
                    goto DestAddrFound;
                }
#endif
#ifdef IPV6_PKTINFO
                if (cmsg->cmsg_level == IPPROTO_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) {
                    dgrams[dgram_index].destaddr.sin6.sin6_family = AF_INET6;
                    dgrams[dgram_index].destaddr.sin6.sin6_addr = ((struct in6_pktinfo *)CMSG_DATA(cmsg))->ipi6_addr;
                    dgrams[dgram_index].destaddr.sin6.sin6_port = *ctx->sock.port;
                    goto DestAddrFound;
                }
#endif
            }
            /* no ancillary data carrying the destination address; mark it as unknown */
            dgrams[dgram_index].destaddr.sa.sa_family = AF_UNSPEC;
        DestAddrFound:;
        }
        dgrams[dgram_index].ttl = ctx->default_ttl;
        /* preprocess (and drop the packet if it failed) */
        if (ctx->preprocess_packet != NULL &&
            !ctx->preprocess_packet(ctx, &mess[dgram_index].msg_hdr, &dgrams[dgram_index].destaddr,
                                    &dgrams[dgram_index].srcaddr, &dgrams[dgram_index].ttl)) {
            dgrams[dgram_index].vec.iov_len = 0; /* mark as unused */
        } else {
            assert(dgrams[dgram_index].srcaddr.sa.sa_family == AF_INET || dgrams[dgram_index].srcaddr.sa.sa_family == AF_INET6);
        }
    }

    /* convert dgrams to decoded packets and process them in group of (4-tuple, dcid) */
    quicly_decoded_packet_t packets[64];
    size_t packet_index = 0;
    /* Index of the datagram that supplied the packets currently buffered in `packets`; meaningful only while `packet_index != 0`.
     * This is tracked explicitly rather than assumed to be `dgram_index - 1`, because datagrams that are skipped (zero-sized or
     * rejected by preprocessing) may sit in between, and their addresses / TTL must never be attributed to the buffered packets. */
    size_t group_dgram_index = 0;
    dgram_index = 0;
    while (dgram_index < num_dgrams) {
        int has_decoded = 0; /* indicates if a decoded packet belonging to a different connection is stored at
                              * `packets[packet_index]` */
        /* skip zero-sized datagrams (or the ones for which preprocessing failed) */
        if (dgrams[dgram_index].vec.iov_len == 0) {
            ++dgram_index;
            continue;
        }
        /* dispatch packets in `packets`, if the datagram at dgram_index is from a different path */
        if (packet_index != 0) {
            assert(dgram_index != 0);
            /* check source address */
            if (h2o_socket_compare_address(&dgrams[group_dgram_index].srcaddr.sa, &dgrams[dgram_index].srcaddr.sa, 1) != 0)
                goto ProcessPackets;
            /* check destination address, if available */
            if (dgrams[group_dgram_index].destaddr.sa.sa_family == AF_UNSPEC &&
                dgrams[dgram_index].destaddr.sa.sa_family == AF_UNSPEC) {
                /* ok */
            } else if (h2o_socket_compare_address(&dgrams[group_dgram_index].destaddr.sa, &dgrams[dgram_index].destaddr.sa, 1) ==
                       0) {
                /* ok */
            } else {
                goto ProcessPackets;
            }
            /* TTL should be same for dispatched packets */
            if (dgrams[group_dgram_index].ttl != dgrams[dgram_index].ttl)
                goto ProcessPackets;
        }
        /* decode the first packet */
        size_t payload_off = 0;
        if (quicly_decode_packet(ctx->quic, packets + packet_index, dgrams[dgram_index].vec.iov_base,
                                 dgrams[dgram_index].vec.iov_len, &payload_off) == SIZE_MAX) {
            ++dgram_index;
            goto ProcessPackets;
        }
        /* dispatch packets in `packets` if the DCID is different, setting the `has_decoded` flag */
        if (packet_index != 0) {
            const ptls_iovec_t *prev_dcid = &packets[packet_index - 1].cid.dest.encrypted,
                               *cur_dcid = &packets[packet_index].cid.dest.encrypted;
            if (!(prev_dcid->len == cur_dcid->len && memcmp(prev_dcid->base, cur_dcid->base, prev_dcid->len) == 0)) {
                has_decoded = 1;
                ++dgram_index;
                goto ProcessPackets;
            }
        }
        ++packet_index;
        group_dgram_index = dgram_index; /* the buffered packets now (also) originate from this datagram */
        /* add rest of the packets */
        while (payload_off < dgrams[dgram_index].vec.iov_len && packet_index < PTLS_ELEMENTSOF(packets)) {
            if (quicly_decode_packet(ctx->quic, packets + packet_index, dgrams[dgram_index].vec.iov_base,
                                     dgrams[dgram_index].vec.iov_len, &payload_off) == SIZE_MAX)
                break;
            ++packet_index;
        }
        ++dgram_index;
        /* if we have enough room for the next datagram, that is, the expected worst case of 4 packets in a coalesced datagram,
         * continue */
        if (packet_index + 4 < PTLS_ELEMENTSOF(packets))
            continue;

    ProcessPackets:
        if (packet_index != 0) {
            process_packets(ctx, &dgrams[group_dgram_index].destaddr, &dgrams[group_dgram_index].srcaddr,
                            dgrams[group_dgram_index].ttl, packets, packet_index);
            if (has_decoded) {
                /* carry over the packet that started a new group; it was decoded from the datagram just consumed */
                packets[0] = packets[packet_index];
                packet_index = 1;
                group_dgram_index = dgram_index - 1;
            } else {
                packet_index = 0;
            }
        }
    }
    /* flush the last group, if any */
    if (packet_index != 0)
        process_packets(ctx, &dgrams[group_dgram_index].destaddr, &dgrams[group_dgram_index].srcaddr,
                        dgrams[group_dgram_index].ttl, packets, packet_index);

Exit:;

#undef INIT_DGRAMS
}
898 
/**
 * Socket read callback: drains the socket through `h2o_quic_read_socket`, using the QUIC context stored in `sock->data`.
 * The `err` argument is not inspected.
 */
static void on_read(h2o_socket_t *sock, const char *err)
{
    h2o_quic_read_socket(sock->data, sock);
}
904 
/**
 * Timer callback: recovers the connection that embeds `timeout` as its `_timeout` member and lets it send.
 */
static void on_timeout(h2o_timer_t *timeout)
{
    h2o_quic_send(H2O_STRUCT_FROM_MEMBER(h2o_quic_conn_t, _timeout, timeout));
}
910 
/**
 * Parses one HTTP/3 frame header (and, for non-DATA frames, its payload) from the byte range [`*_src`, `src_end`), and checks
 * that the frame type is permitted on the given stream type for the given role.
 *
 * @param frame        receives `type`, `length`, `_header_size` and `payload`; `payload` is NULL for DATA frames
 * @param is_client    non-zero selects the "server" columns of the table below, zero selects the "client" columns
 * @param stream_type  H2O_HTTP3_STREAM_TYPE_REQUEST or H2O_HTTP3_STREAM_TYPE_CONTROL; any other value is fatal when the frame
 *                     type is one of the known types
 * @param _src         in: start of input; out: advanced past the consumed bytes, only when the function returns 0. For DATA
 *                     frames only the header is consumed.
 * @param src_end      end of input
 * @param err_desc     set to a static description when the frame is rejected as too large
 * @return 0 on success, H2O_HTTP3_ERROR_INCOMPLETE when more input is needed, H2O_HTTP3_ERROR_GENERAL_PROTOCOL when a non-DATA
 *         frame exceeds H2O_HTTP3_MAX_FRAME_PAYLOAD_SIZE, or H2O_HTTP3_ERROR_FRAME_UNEXPECTED when a known frame type is not
 *         allowed in this context
 */
int h2o_http3_read_frame(h2o_http3_read_frame_t *frame, int is_client, uint64_t stream_type, const uint8_t **_src,
                         const uint8_t *src_end, const char **err_desc)
{
    const uint8_t *src = *_src;

    /* frame header: type and length, each decoded by `quicly_decodev`, which signals failure by returning UINT64_MAX */
    if ((frame->type = quicly_decodev(&src, src_end)) == UINT64_MAX)
        return H2O_HTTP3_ERROR_INCOMPLETE;
    if ((frame->length = quicly_decodev(&src, src_end)) == UINT64_MAX)
        return H2O_HTTP3_ERROR_INCOMPLETE;
    frame->_header_size = (uint8_t)(src - *_src);

    /* read the content of the frame (unless it's a DATA frame) */
    frame->payload = NULL;
    if (frame->type != H2O_HTTP3_FRAME_TYPE_DATA) {
        if (frame->length > H2O_HTTP3_MAX_FRAME_PAYLOAD_SIZE) {
            H2O_PROBE(H3_FRAME_RECEIVE, frame->type, NULL, frame->length);
            *err_desc = "H3 frame too large";
            return H2O_HTTP3_ERROR_GENERAL_PROTOCOL; /* FIXME is this the correct code? */
        }
        /* the entire payload must be available before the frame is reported */
        if (src_end - src < frame->length)
            return H2O_HTTP3_ERROR_INCOMPLETE;
        frame->payload = src;
        src += frame->length;
    }

    H2O_PROBE(H3_FRAME_RECEIVE, frame->type, frame->payload, frame->length);

    /* validate frame type; each FRAME() row expands to a `case` that jumps to Validation_Success when the frame is allowed for
     * the (stream type, role) combination, and otherwise falls out of the switch to the FRAME_UNEXPECTED return below */
    switch (frame->type) {
#define FRAME(id, req_clnt, req_srvr, ctl_clnt, ctl_srvr)                                                                          \
    case H2O_HTTP3_FRAME_TYPE_##id:                                                                                                \
        switch (stream_type) {                                                                                                     \
        case H2O_HTTP3_STREAM_TYPE_REQUEST:                                                                                        \
            if (req_clnt && !is_client)                                                                                            \
                goto Validation_Success;                                                                                           \
            if (req_srvr && is_client)                                                                                             \
                goto Validation_Success;                                                                                           \
            break;                                                                                                                 \
        case H2O_HTTP3_STREAM_TYPE_CONTROL:                                                                                        \
            if (ctl_clnt && !is_client)                                                                                            \
                goto Validation_Success;                                                                                           \
            if (ctl_srvr && is_client)                                                                                             \
                goto Validation_Success;                                                                                           \
            break;                                                                                                                 \
        default:                                                                                                                   \
            h2o_fatal("unexpected stream type");                                                                                   \
            break;                                                                                                                 \
        }                                                                                                                          \
        break
        /* clang-format off */
        /*   +-----------------+-------------+-------------+
         *   |                 | req-stream  | ctrl-stream |
         *   |      frame      +------+------+------+------+
         *   |                 |client|server|client|server|
         *   +-----------------+------+------+------+------+ */
        FRAME( DATA            ,    1 ,    1 ,    0 ,    0 );
        FRAME( HEADERS         ,    1 ,    1 ,    0 ,    0 );
        FRAME( CANCEL_PUSH     ,    0 ,    0 ,    1 ,    1 );
        FRAME( SETTINGS        ,    0 ,    0 ,    1 ,    1 );
        FRAME( PUSH_PROMISE    ,    0 ,    1 ,    0 ,    0 );
        FRAME( GOAWAY          ,    0 ,    0 ,    1 ,    1 );
        FRAME( MAX_PUSH_ID     ,    0 ,    0 ,    1 ,    0 );
        FRAME( PRIORITY_UPDATE ,    0 ,    0 ,    1 ,    0 );
        /*   +-----------------+------+------+------+------+ */
        /* clang-format on */
#undef FRAME
    default:
        /* ignore extension frames that we do not handle */
        goto Validation_Success;
    }
    return H2O_HTTP3_ERROR_FRAME_UNEXPECTED;
Validation_Success:;

    /* commit the consumed bytes only after the frame has been accepted */
    *_src = src;
    return 0;
}
987 
/**
 * Initializes `ctx` for use with the given loop, UDP socket and quicly context, records the locally bound address of the socket,
 * and starts reading from the socket. All members not listed below start out zeroed.
 *
 * @param ctx                 context to initialize (previous contents are overwritten)
 * @param loop                event loop
 * @param sock                UDP socket; its `data` member is pointed back at `ctx`
 * @param quic                quicly context; `stream_open` must be set
 * @param acceptor            stored as `ctx->acceptor`
 * @param notify_conn_update  stored as `ctx->notify_conn_update`
 * @param use_gso             stored as `ctx->use_gso`
 * @param quic_stats          stored as `ctx->quic_stats`
 */
void h2o_quic_init_context(h2o_quic_ctx_t *ctx, h2o_loop_t *loop, h2o_socket_t *sock, quicly_context_t *quic,
                           h2o_quic_accept_cb acceptor, h2o_quic_notify_connection_update_cb notify_conn_update, uint8_t use_gso,
                           h2o_quic_stats_t *quic_stats)
{
    assert(quic->stream_open != NULL);

    /* build the initial state in one go so that every unspecified member is zero-initialized */
    h2o_quic_ctx_t initial = {
        .quic = quic,
        .loop = loop,
        .sock = {.sock = sock},
        .acceptor = acceptor,
        .notify_conn_update = notify_conn_update,
        .quic_stats = quic_stats,
        .use_gso = use_gso,
        .next_cid = {0} /* thread_id, node_id are set by h2o_http3_set_context_identifier */,
        .conns_accepting = kh_init_h2o_quic_acceptmap(),
        .conns_by_id = kh_init_h2o_quic_idmap(),
    };
    *ctx = initial;

    /* let the socket callbacks find their way back to the context */
    sock->data = ctx;

    /* remember the bound address, and where within it the port lives */
    ctx->sock.addrlen = h2o_socket_getsockname(sock, (void *)&ctx->sock.addr);
    assert(ctx->sock.addrlen != 0);
    if (ctx->sock.addr.ss_family == AF_INET) {
        ctx->sock.port = &((struct sockaddr_in *)&ctx->sock.addr)->sin_port;
    } else if (ctx->sock.addr.ss_family == AF_INET6) {
        ctx->sock.port = &((struct sockaddr_in6 *)&ctx->sock.addr)->sin6_port;
    } else {
        assert(!"unexpected address family");
    }

    h2o_socket_read_start(sock, on_read);
}
1023 
/**
 * Releases the resources held by `ctx`: closes its socket and destroys both connection maps. Both maps must already be empty.
 */
void h2o_quic_dispose_context(h2o_quic_ctx_t *ctx)
{
    /* no connection may outlive the context */
    assert(kh_size(ctx->conns_by_id) == 0);
    assert(kh_size(ctx->conns_accepting) == 0);

    h2o_socket_close(ctx->sock.sock);

    /* the two maps are independent of each other */
    kh_destroy_h2o_quic_acceptmap(ctx->conns_accepting);
    kh_destroy_h2o_quic_idmap(ctx->conns_by_id);
}
1033 
/**
 * Stores the identity of this context (node / thread), the accept-thread divisor, the default TTL, and the packet forwarding and
 * preprocessing callbacks.
 */
void h2o_quic_set_context_identifier(h2o_quic_ctx_t *ctx, uint32_t accept_thread_divisor, uint32_t thread_id, uint64_t node_id,
                                     uint8_t ttl, h2o_quic_forward_packets_cb forward_cb,
                                     h2o_quic_preprocess_packet_cb preprocess_cb)
{
    /* identity used for connection IDs issued by this context */
    ctx->next_cid.node_id = node_id;
    ctx->next_cid.thread_id = thread_id;
    ctx->accept_thread_divisor = accept_thread_divisor;

    /* packet handling hooks and their parameters */
    ctx->default_ttl = ttl;
    ctx->preprocess_packet = preprocess_cb;
    ctx->forward_packets = forward_cb;
}
1045 
/**
 * Initiates closure of `conn`, depending on the state of the underlying quicly connection:
 *  - FIRSTFLIGHT: the connection is destroyed immediately through its `destroy_connection` callback
 *  - CONNECTED: `quicly_close` is called with `err` / `reason_phrase`, and the timer is rescheduled
 *  - any other state: nothing is done
 */
void h2o_quic_close_connection(h2o_quic_conn_t *conn, int err, const char *reason_phrase)
{
    int state = quicly_get_state(conn->quic);

    if (state == QUICLY_STATE_FIRSTFLIGHT) { /* FIXME why is this separate? */
        conn->callbacks->destroy_connection(conn);
    } else if (state == QUICLY_STATE_CONNECTED) {
        quicly_close(conn->quic, err, reason_phrase);
        h2o_quic_schedule_timer(conn);
    } else {
        /* only need to wait for the socket close */
    }
}
1061 
/**
 * Calls `h2o_quic_close_connection(conn, 0, NULL)` for every connection registered in `ctx->conns_by_id`.
 */
void h2o_quic_close_all_connections(h2o_quic_ctx_t *ctx)
{
    khiter_t slot;

    /* walk every bucket of the id map, visiting only the occupied ones */
    for (slot = kh_begin(ctx->conns_by_id); slot != kh_end(ctx->conns_by_id); ++slot) {
        if (!kh_exist(ctx->conns_by_id, slot))
            continue;
        h2o_quic_conn_t *conn = kh_val(ctx->conns_by_id, slot);
        h2o_quic_close_connection(conn, 0, NULL);
    }

    /* closing a connection should also remove an entry from conns_accepting */
    assert(kh_size(ctx->conns_accepting) == 0);
}
1070 
/**
 * Returns the number of connections registered to `ctx`.
 */
size_t h2o_quic_num_connections(h2o_quic_ctx_t *ctx)
{
    /* throughout its lifetime, a connection is always registered to both conns_by_id and conns_accepting,
       thus counting conns_by_id is enough */
    size_t count = kh_size(ctx->conns_by_id);
    return count;
}
1077 
/**
 * Resets `conn` to a fresh state bound to `ctx` and `callbacks`, with no quicly connection attached yet, and initializes its
 * timer so that expiry invokes `on_timeout`.
 */
void h2o_quic_init_conn(h2o_quic_conn_t *conn, h2o_quic_ctx_t *ctx, const h2o_quic_conn_callbacks_t *callbacks)
{
    /* positional initializers; all remaining members are zeroed */
    h2o_quic_conn_t fresh = {ctx, NULL, callbacks};

    *conn = fresh;
    h2o_timer_init(&conn->_timeout, on_timeout);
}
1083 
/**
 * Tears down `conn`: if a quicly connection is attached, unregisters it from the maps of the owning context and frees it; then
 * unlinks the timer.
 */
void h2o_quic_dispose_conn(h2o_quic_conn_t *conn)
{
    if (conn->quic != NULL) {
        h2o_quic_ctx_t *owner = conn->ctx;
        /* unregister from maps */
        khiter_t iter = kh_get_h2o_quic_idmap(owner->conns_by_id, quicly_get_master_id(conn->quic)->master_id);
        if (iter != kh_end(owner->conns_by_id))
            kh_del_h2o_quic_idmap(owner->conns_by_id, iter);
        drop_from_acceptmap(owner, conn);
        /* the transport object goes last, once nothing refers to it anymore */
        quicly_free(conn->quic);
    }

    h2o_timer_unlink(&conn->_timeout);
}
1097 
/**
 * Attaches the quicly connection `quic` to `conn` (in both directions) and registers `conn` in the id map of its context, keyed
 * by the master id of `quic`.
 */
void h2o_quic_setup(h2o_quic_conn_t *conn, quicly_conn_t *quic)
{
    int r;
    khiter_t iter;

    /* link the two objects to each other */
    conn->quic = quic;
    *quicly_get_data(quic) = conn;

    /* register to the idmap */
    iter = kh_put_h2o_quic_idmap(conn->ctx->conns_by_id, quicly_get_master_id(quic)->master_id, &r);
    assert(iter != kh_end(conn->ctx->conns_by_id));
    kh_val(conn->ctx->conns_by_id, iter) = conn;
}
1109 
/**
 * Initializes an HTTP/3 connection object: the embedded QUIC connection (`super`) is initialized via `h2o_quic_init_conn`, every
 * byte following `super` is zeroed, and the QPACK context is recorded.
 */
void h2o_http3_init_conn(h2o_http3_conn_t *conn, h2o_quic_ctx_t *ctx, const h2o_http3_conn_callbacks_t *callbacks,
                         const h2o_http3_qpack_context_t *qpack_ctx)
{
    /* the HTTP/3-specific part of the object is everything located after `super` */
    char *h3_part = (char *)conn + sizeof(conn->super);
    size_t h3_part_size = sizeof(*conn) - sizeof(conn->super);

    h2o_quic_init_conn(&conn->super, ctx, &callbacks->super);
    memset(h3_part, 0, h3_part_size);
    conn->qpack.ctx = qpack_ctx;
}
1117 
/**
 * Disposes an HTTP/3 connection: destroys the QPACK decoder and encoder if they exist, then disposes the embedded QUIC
 * connection.
 */
void h2o_http3_dispose_conn(h2o_http3_conn_t *conn)
{
    /* either QPACK object may be absent */
    if (conn->qpack.dec != NULL) {
        h2o_qpack_destroy_decoder(conn->qpack.dec);
    }
    if (conn->qpack.enc != NULL) {
        h2o_qpack_destroy_encoder(conn->qpack.enc);
    }

    h2o_quic_dispose_conn(&conn->super);
}
1126 
/**
 * Serializes the first bytes to be sent on the control stream into `bytebuf`: the control stream type, followed by a SETTINGS
 * frame. The SETTINGS frame carries H3_DATAGRAM=1 when the local transport parameter `max_datagram_frame_size` is non-zero, and
 * is otherwise empty.
 *
 * @param conn      connection whose quicly context is consulted
 * @param bytebuf   caller-supplied output buffer
 * @param capacity  size of `bytebuf`; the function asserts that the output fit without the buffer being heap-allocated
 * @return number of bytes written to `bytebuf`
 */
static size_t build_firstflight(h2o_http3_conn_t *conn, uint8_t *bytebuf, size_t capacity)
{
    ptls_buffer_t buf;
    /* NOTE(review): `ret` and the `Exit` label are not referenced directly in this function; presumably they are used by the
     * `ptls_buffer_push_*` macros for error handling -- confirm against picotls */
    int ret = 0;

    ptls_buffer_init(&buf, bytebuf, capacity);

    /* push stream type */
    ptls_buffer_push_quicint(&buf, H2O_HTTP3_STREAM_TYPE_CONTROL);

    /* push SETTINGS frame */
    ptls_buffer_push_quicint(&buf, H2O_HTTP3_FRAME_TYPE_SETTINGS);
    ptls_buffer_push_block(&buf, -1, {
        quicly_context_t *qctx = quicly_get_context(conn->super.quic);
        /* advertise H3_DATAGRAM only when datagram frames are enabled locally */
        if (qctx->transport_params.max_datagram_frame_size != 0) {
            ptls_buffer_push_quicint(&buf, H2O_HTTP3_SETTINGS_H3_DATAGRAM);
            ptls_buffer_push_quicint(&buf, 1);
        };
    });

    /* the caller's buffer must have been large enough */
    assert(!buf.is_allocated);
    return buf.off;

Exit:
    h2o_fatal("unreachable");
}
1153 
/**
 * Binds `quic` to the HTTP/3 connection and marks the connection open. Unless the quicly connection is already past the
 * CONNECTED state, this also creates the QPACK decoder and opens the three egress unidirectional streams (control, QPACK
 * encoder, QPACK decoder), each primed with its initial bytes.
 *
 * @return 0 on success, or the non-zero value returned by `open_egress_unistream` on failure (in which case the timer is not
 *         scheduled)
 */
int h2o_http3_setup(h2o_http3_conn_t *conn, quicly_conn_t *quic)
{
    static const uint8_t encoder_first_flight[] = {H2O_HTTP3_STREAM_TYPE_QPACK_ENCODER};
    static const uint8_t decoder_first_flight[] = {H2O_HTTP3_STREAM_TYPE_QPACK_DECODER};
    int ret;

    h2o_quic_setup(&conn->super, quic);
    conn->state = H2O_HTTP3_CONN_STATE_OPEN;

    /* setup h3 objects, only when the connection state has been created */
    if (quicly_get_state(quic) <= QUICLY_STATE_CONNECTED) {
        uint8_t firstflight[32];
        size_t firstflight_len;

        /* create decoder with the table size set to zero; see SETTINGS sent below. */
        conn->qpack.dec = h2o_qpack_create_decoder(0, 100 /* FIXME */);

        /* open control stream, send SETTINGS */
        firstflight_len = build_firstflight(conn, firstflight, sizeof(firstflight));
        ret = open_egress_unistream(conn, &conn->_control_streams.egress.control, h2o_iovec_init(firstflight, firstflight_len));
        if (ret != 0)
            return ret;

        /* open QPACK encoder stream */
        ret = open_egress_unistream(conn, &conn->_control_streams.egress.qpack_encoder,
                                    h2o_iovec_init(encoder_first_flight, sizeof(encoder_first_flight)));
        if (ret != 0)
            return ret;

        /* open QPACK decoder stream */
        ret = open_egress_unistream(conn, &conn->_control_streams.egress.qpack_decoder,
                                    h2o_iovec_init(decoder_first_flight, sizeof(decoder_first_flight)));
        if (ret != 0)
            return ret;
    }

    h2o_quic_schedule_timer(&conn->super);
    return 0;
}
1190 
/**
 * Asks quicly for up to 10 datagrams to send on `conn`, transmits them, and reschedules the connection timer.
 *
 * @return 1 if the connection is still alive, 0 if it has been destroyed because quicly reported state exhaustion or requested
 *         that the connection be freed. Any other error from `quicly_send` aborts the process.
 */
int h2o_quic_send(h2o_quic_conn_t *conn)
{
    quicly_address_t dest, src;
    struct iovec datagrams[10];
    uint8_t datagram_buf[1500 * PTLS_ELEMENTSOF(datagrams)];
    size_t num_datagrams = PTLS_ELEMENTSOF(datagrams);

    int ret = quicly_send(conn->quic, &dest, &src, datagrams, &num_datagrams, datagram_buf, sizeof(datagram_buf));

    /* errors that end the connection */
    if (ret == QUICLY_ERROR_STATE_EXHAUSTION || ret == QUICLY_ERROR_FREE_CONNECTION) {
        conn->callbacks->destroy_connection(conn);
        return 0;
    }
    /* anything else but success is unexpected */
    if (ret != 0) {
        fprintf(stderr, "quicly_send returned %d\n", ret);
        abort();
    }

    if (num_datagrams != 0) {
        /* FIXME close the connection immediately (when sending fails); for now, the outcome is ignored */
        (void)h2o_quic_send_datagrams(conn->ctx, &dest, &src, datagrams, num_datagrams);
    }

    h2o_quic_schedule_timer(conn);

    return 1;
}
1219 
/**
 * Copies `len` bytes from `src` into `*buf` at offset `off`, growing the buffer (and its recorded size) first when the write
 * extends beyond the current size.
 */
void h2o_http3_update_recvbuf(h2o_buffer_t **buf, size_t off, const void *src, size_t len)
{
    size_t write_end = off + len;

    /* extend the buffer so that it covers [0, write_end) */
    if (write_end > (*buf)->size) {
        size_t shortfall = write_end - (*buf)->size;
        h2o_buffer_reserve(buf, shortfall);
        (*buf)->size = write_end;
    }

    /* `*buf` is re-read here, as reserving may have replaced the buffer */
    memcpy((*buf)->bytes + off, src, len);
}
1230 
/**
 * (Re)arms the timer of `conn` so that it fires at the time reported by `quicly_get_first_timeout`, or immediately if that time
 * is not in the future.
 */
void h2o_quic_schedule_timer(h2o_quic_conn_t *conn)
{
    int64_t fire_at = quicly_get_first_timeout(conn->quic);

    if (h2o_timer_is_linked(&conn->_timeout)) {
#if !H2O_USE_LIBUV /* optimization to skip registering a timer specifying the same time */
        if (fire_at == conn->_timeout.expire_at)
            return;
#endif
        h2o_timer_unlink(&conn->_timeout);
    }

    /* convert the absolute time into a delay relative to the loop's clock, clamped at zero */
    uint64_t now = h2o_now(conn->ctx->loop);
    uint64_t delay = 0;
    if (now < fire_at)
        delay = fire_at - now;

    h2o_timer_link(conn->ctx->loop, delay, &conn->_timeout);
}
1244 
/**
 * Parses the payload of a SETTINGS frame received from the peer, records the recognized settings, and creates the QPACK encoder
 * using the negotiated table capacity and blocked-streams limit. Unrecognized setting identifiers are ignored. Must not be
 * called after settings have already been received.
 *
 * @param conn      the connection
 * @param payload   start of the SETTINGS payload
 * @param length    length of the payload in bytes
 * @param err_desc  set to a static description when the frame is malformed
 * @return 0 on success, H2O_HTTP3_ERROR_FRAME when the payload is malformed
 */
int h2o_http3_handle_settings_frame(h2o_http3_conn_t *conn, const uint8_t *payload, size_t length, const char **err_desc)
{
    const uint8_t *cursor = payload, *const end = payload + length;
    uint64_t max_blocked_streams = 0;
    uint32_t encoder_table_size = 0;

    assert(!h2o_http3_has_received_settings(conn));

    /* the payload is a sequence of (identifier, value) pairs */
    while (cursor != end) {
        uint64_t id = quicly_decodev(&cursor, end);
        if (id == UINT64_MAX)
            goto Malformed;
        uint64_t value = quicly_decodev(&cursor, end);
        if (value == UINT64_MAX)
            goto Malformed;

        if (id == H2O_HTTP3_SETTINGS_MAX_FIELD_SECTION_SIZE) {
            conn->peer_settings.max_field_section_size = value;
        } else if (id == H2O_HTTP3_SETTINGS_QPACK_MAX_TABLE_CAPACITY) {
            /* use the smaller of what the peer permits and what we are configured to use */
            if (value < conn->qpack.ctx->encoder_table_capacity) {
                encoder_table_size = (uint32_t)value;
            } else {
                encoder_table_size = conn->qpack.ctx->encoder_table_capacity;
            }
        } else if (id == H2O_HTTP3_SETTINGS_QPACK_BLOCKED_STREAMS) {
            max_blocked_streams = value;
        } else if (id == H2O_HTTP3_SETTINGS_H3_DATAGRAM) {
            if (value == 1) {
                /* the setting may be enabled only if the peer also permits datagram frames at the transport level */
                const quicly_transport_parameters_t *remote_tp = quicly_get_remote_transport_parameters(conn->super.quic);
                if (remote_tp->max_datagram_frame_size == 0)
                    goto Malformed;
                conn->peer_settings.h3_datagram = 1;
            } else if (value != 0) {
                /* only 0 and 1 are valid */
                goto Malformed;
            }
        }
    }

    conn->qpack.enc = h2o_qpack_create_encoder(encoder_table_size, max_blocked_streams);
    return 0;

Malformed:
    *err_desc = "malformed SETTINGS frame";
    return H2O_HTTP3_ERROR_FRAME;
}
1296 
/**
 * Appends a QPACK stream-cancel instruction for `stream_id` to the send buffer of the egress QPACK decoder stream, then asks
 * quicly to pick up the newly buffered data.
 */
void h2o_http3_send_qpack_stream_cancel(h2o_http3_conn_t *conn, quicly_stream_id_t stream_id)
{
    struct st_h2o_http3_egress_unistream_t *stream = conn->_control_streams.egress.qpack_decoder;

    /* make room for the instruction */
    /* NOTE(review): the requested amount includes the current buffer size; presumably larger than strictly needed - confirm */
    size_t reserve_len = stream->sendbuf->size + H2O_HPACK_ENCODE_INT_MAX_LENGTH;
    h2o_iovec_t dst = h2o_buffer_reserve(&stream->sendbuf, reserve_len);
    assert(dst.base != NULL);

    /* emit the instruction and commit the bytes that were written */
    size_t written = h2o_qpack_decoder_send_stream_cancel(conn->qpack.dec, (uint8_t *)dst.base, stream_id);
    stream->sendbuf->size += written;

    /* let the transport know that there is something to send */
    H2O_HTTP3_CHECK_SUCCESS(quicly_stream_sync_sendbuf(stream->quic, 1) == 0);
}
1309 
/**
 * Queues an already-encoded QPACK header acknowledgment (`bytes`, `len`) for transmission and asks quicly to pick up the newly
 * buffered data.
 *
 * @param conn  the HTTP/3 connection
 * @param bytes the encoded acknowledgment
 * @param len   size of `bytes`
 */
void h2o_http3_send_qpack_header_ack(h2o_http3_conn_t *conn, const void *bytes, size_t len)
{
    /* A header (section) acknowledgment is a decoder instruction (RFC 9204 section 4.4.1), hence it has to be sent on the decoder
     * stream, the same stream that `h2o_http3_send_qpack_stream_cancel` writes to. */
    struct st_h2o_http3_egress_unistream_t *stream = conn->_control_streams.egress.qpack_decoder;

    assert(stream != NULL);
    h2o_buffer_append(&stream->sendbuf, bytes, len);

    /* `quicly_stream_sync_sendbuf` returns zero on success, therefore the result has to be compared against zero before being
     * handed to the check macro */
    H2O_HTTP3_CHECK_SUCCESS(quicly_stream_sync_sendbuf(stream->quic, 1) == 0);
}
1318 
/**
 * Encodes a GOAWAY frame carrying `stream_or_push_id` into the send buffer of the egress control stream, then asks quicly to pick
 * up the newly buffered data.
 *
 * @param conn              the HTTP/3 connection
 * @param stream_or_push_id the identifier to be carried by the frame
 */
void h2o_http3_send_goaway_frame(h2o_http3_conn_t *conn, uint64_t stream_or_push_id)
{
    struct st_h2o_http3_egress_unistream_t *stream = conn->_control_streams.egress.control;
    size_t cap = h2o_http3_goaway_frame_capacity(stream_or_push_id);

    /* allocate and write; as with the other control-stream writers in this file, do not write through an unchecked pointer */
    h2o_iovec_t alloced = h2o_buffer_reserve(&stream->sendbuf, cap);
    assert(alloced.base != NULL);
    h2o_http3_encode_goaway_frame((uint8_t *)alloced.base, stream_or_push_id);
    stream->sendbuf->size += cap;

    /* notify the transport; a failure is no longer ignored silently */
    H2O_HTTP3_CHECK_SUCCESS(quicly_stream_sync_sendbuf(stream->quic, 1) == 0);
}
1327 
/**
 * Returns non-zero if H3 datagrams can be used on the connection; i.e., the peer has enabled them in its settings and the
 * transport parameters of the quicly context of this connection specify a non-zero `max_datagram_frame_size`.
 */
int h2o_http3_can_use_h3_datagram(h2o_http3_conn_t *conn)
{
    /* the context is consulted only when the peer has opted in */
    return conn->peer_settings.h3_datagram && quicly_get_context(conn->super.quic)->transport_params.max_datagram_frame_size != 0;
}
1335 
/**
 * Sends each of the given datagrams as a QUIC datagram frame whose payload is `flow_id` (encoded as a varint) followed by the
 * datagram bytes, then reschedules the connection timer.
 *
 * @param conn          the HTTP/3 connection
 * @param flow_id       identifier prepended to every datagram
 * @param datagrams     array of payloads to send
 * @param num_datagrams number of elements in `datagrams`
 */
void h2o_http3_send_h3_datagrams(h2o_http3_conn_t *conn, uint64_t flow_id, h2o_iovec_t *datagrams, size_t num_datagrams)
{
    /* the size of the encoded flow id is the same for every datagram */
    const size_t prefix_len = quicly_encodev_capacity(flow_id);
    size_t index = 0;

    while (index < num_datagrams) {
        h2o_iovec_t *dgram = &datagrams[index];
        /* build the frame payload on the stack: flow id, then the datagram as-is */
        uint8_t frame[prefix_len + dgram->len];
        uint8_t *body = quicly_encodev(frame, flow_id);
        memcpy(body, dgram->base, dgram->len);
        ptls_iovec_t payload = ptls_iovec_init(frame, (body - frame) + dgram->len);
        quicly_send_datagram_frames(conn->super.quic, &payload, 1);
        ++index;
    }

    h2o_quic_schedule_timer(&conn->super);
}
1350 
/**
 * Splits an H3 datagram into its leading varint-encoded flow id and the bytes that follow.
 *
 * @param payload set to the bytes following the flow id; left untouched when decoding fails
 * @param _src    start of the datagram
 * @param len     size of the datagram
 * @return the flow id, or `UINT64_MAX` if the flow id could not be decoded
 */
uint64_t h2o_http3_decode_h3_datagram(h2o_iovec_t *payload, const void *_src, size_t len)
{
    const uint8_t *cursor = _src;
    const uint8_t *const tail = cursor + len;

    uint64_t flow_id = ptls_decode_quicint(&cursor, tail);
    if (flow_id == UINT64_MAX)
        return flow_id;

    /* the decoder has advanced `cursor` past the flow id; everything up to `tail` is the payload */
    *payload = h2o_iovec_init(cursor, tail - cursor);
    return flow_id;
}
1360