1 /*
2 * Copyright (c) 2018 Fastly, Kazuho Oku
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to
6 * deal in the Software without restriction, including without limitation the
7 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 * sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22 #ifdef __APPLE__
23 #define __APPLE_USE_RFC_3542 /* to use IPV6_PKTINFO */
24 #endif
25 #include <errno.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <netinet/udp.h>
29 #include <pthread.h>
30 #include <stdio.h>
31 #include <sys/socket.h>
32 #include "picotls/openssl.h"
33 #include "h2o/string_.h"
34 #include "h2o/http3_common.h"
35 #include "h2o/http3_internal.h"
36 #include "h2o/multithread.h"
37 #include "../probes_.h"
38
struct st_h2o_http3_ingress_unistream_t {
    /**
     * back pointer
     */
    quicly_stream_t *quic;
    /**
     * buffer accumulating received bytes until they are consumed by `handle_input`
     */
    h2o_buffer_t *recvbuf;
    /**
     * A callback that passes unparsed input to be handled. `src` is set to NULL when receiving a reset.
     */
    void (*handle_input)(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream, const uint8_t **src,
                         const uint8_t *src_end, int is_eos);
};
54
/* ALPN identifiers offered for HTTP/3, in order of preference: final "h3", then drafts 29 and 27 */
const ptls_iovec_t h2o_http3_alpn[3] = {{(void *)H2O_STRLIT("h3")}, {(void *)H2O_STRLIT("h3-29")}, {(void *)H2O_STRLIT("h3-27")}};
56
/**
 * Error-reporter callback that logs the number of failed / succeeded `sendmsg` calls aggregated over the reporting window.
 * `reporter->data` carries the `errno` of the last failure; `total_successes` is part of the callback signature but unused here.
 */
static void report_sendmsg_errors(h2o_error_reporter_t *reporter, uint64_t total_successes, uint64_t cur_successes)
{
    char errstr[256];
    /* `!= 1` (rather than `> 1`) so that a count of zero is printed as "0 times", not "0 time" */
    fprintf(stderr, "sendmsg failed %" PRIu64 " time%s, succeeded: %" PRIu64 " time%s, over the last minute: %s\n",
            reporter->cur_errors, reporter->cur_errors != 1 ? "s" : "", cur_successes, cur_successes != 1 ? "s" : "",
            h2o_strerror_r((int)reporter->data, errstr, sizeof(errstr)));
}
64
/* aggregates sendmsg(2) failures so that they are logged at most once per interval instead of per packet */
static h2o_error_reporter_t track_sendmsg = H2O_ERROR_REPORTER_INITIALIZER(report_sendmsg_errors);
66
/**
 * Sends `num_datagrams` UDP datagrams to `dest`, optionally pinning the source address via ancillary data, and using
 * UDP GSO (UDP_SEGMENT) when available and enabled. Returns 0 when the connection should be closed immediately
 * (source address no longer usable), otherwise 1 — transient send failures are logged but reported as success.
 */
int h2o_quic_send_datagrams(h2o_quic_ctx_t *ctx, quicly_address_t *dest, quicly_address_t *src, struct iovec *datagrams,
                            size_t num_datagrams)
{
    /* cmsg buffer sized for one source-address control message plus (when the platform supports GSO) one UDP_SEGMENT
     * control message; the union guarantees alignment suitable for `struct cmsghdr` */
    union {
        struct cmsghdr hdr;
        char buf[
#ifdef IPV6_PKTINFO
            CMSG_SPACE(sizeof(struct in6_pktinfo))
#elif defined(IP_PKTINFO)
            CMSG_SPACE(sizeof(struct in_pktinfo))
#elif defined(IP_SENDSRCADDR)
            CMSG_SPACE(sizeof(struct in_addr))
#else
            CMSG_SPACE(1)
#endif
            +
#ifdef UDP_SEGMENT
            CMSG_SPACE(sizeof(uint16_t))
#else
            0
#endif
        ];
    } cmsgbuf = {.buf = {}};
    struct cmsghdr *cmsg = &cmsgbuf.hdr;

    /* first CMSG is the source address */
    if (src->sa.sa_family != AF_UNSPEC) {
        size_t cmsg_bodylen = 0;
        switch (src->sa.sa_family) {
        case AF_INET: {
#if defined(IP_PKTINFO)
            /* if the requested source port no longer matches the socket's port, the address is stale; tell the caller
             * to close the connection */
            if (*ctx->sock.port != src->sin.sin_port)
                return 0;
            cmsg->cmsg_level = IPPROTO_IP;
            cmsg->cmsg_type = IP_PKTINFO;
            cmsg_bodylen = sizeof(struct in_pktinfo);
            memcpy(&((struct in_pktinfo *)CMSG_DATA(cmsg))->ipi_spec_dst, &src->sin.sin_addr, sizeof(struct in_addr));
#elif defined(IP_SENDSRCADDR)
            if (*ctx->sock.port != src->sin.sin_port)
                return 0;
            /* IP_SENDSRCADDR can only be used when the socket is bound to the wildcard address */
            struct sockaddr_in *fdaddr = (struct sockaddr_in *)&ctx->sock.addr;
            assert(fdaddr->sin_family == AF_INET);
            if (fdaddr->sin_addr.s_addr == INADDR_ANY) {
                cmsg->cmsg_level = IPPROTO_IP;
                cmsg->cmsg_type = IP_SENDSRCADDR;
                cmsg_bodylen = sizeof(struct in_addr);
                memcpy(CMSG_DATA(cmsg), &src->sin.sin_addr, sizeof(struct in_addr));
            }
#else
            h2o_fatal("IP_PKTINFO not available");
#endif
        } break;
        case AF_INET6:
#ifdef IPV6_PKTINFO
            if (*ctx->sock.port != src->sin6.sin6_port)
                return 0;
            cmsg->cmsg_level = IPPROTO_IPV6;
            cmsg->cmsg_type = IPV6_PKTINFO;
            cmsg_bodylen = sizeof(struct in6_pktinfo);
            memcpy(&((struct in6_pktinfo *)CMSG_DATA(cmsg))->ipi6_addr, &src->sin6.sin6_addr, sizeof(struct in6_addr));
#else
            h2o_fatal("IPV6_PKTINFO not available");
#endif
            break;
        default:
            h2o_fatal("unexpected address family");
            break;
        }
        cmsg->cmsg_len = (socklen_t)CMSG_LEN(cmsg_bodylen);
        /* advance to where the next control message (if any) would start */
        cmsg = (struct cmsghdr *)((char *)cmsg + CMSG_SPACE(cmsg_bodylen));
    }

    /* next CMSG is UDP_SEGMENT size (for GSO) */
    int using_gso = 0;
#ifdef UDP_SEGMENT
    if (num_datagrams > 1 && ctx->use_gso) {
        /* GSO requires all segments except the last to be of equal size */
        for (size_t i = 1; i < num_datagrams - 1; ++i)
            assert(datagrams[i].iov_len == datagrams[0].iov_len);
        uint16_t segsize = (uint16_t)datagrams[0].iov_len;
        cmsg->cmsg_level = SOL_UDP;
        cmsg->cmsg_type = UDP_SEGMENT;
        cmsg->cmsg_len = CMSG_LEN(sizeof(segsize));
        memcpy(CMSG_DATA(cmsg), &segsize, sizeof(segsize));
        cmsg = (struct cmsghdr *)((char *)cmsg + CMSG_SPACE(sizeof(segsize)));
        using_gso = 1;
    }
#endif

    /* send datagrams */
    struct msghdr mess = {
        .msg_name = &dest->sa,
        .msg_namelen = quicly_get_socklen(&dest->sa),
    };
    /* attach the control buffer only if at least one cmsg was built above */
    if (cmsg != &cmsgbuf.hdr) {
        mess.msg_control = &cmsgbuf.buf;
        mess.msg_controllen = (socklen_t)((char *)cmsg - cmsgbuf.buf);
    }
    int ret;

    if (using_gso) {
        /* one sendmsg call; the kernel splits the payload into `segsize`-sized datagrams */
        mess.msg_iov = datagrams;
        mess.msg_iovlen = (int)num_datagrams;
        while ((ret = (int)sendmsg(h2o_socket_get_fd(ctx->sock.sock), &mess, 0)) == -1 && errno == EINTR)
            ;
        if (ret == -1)
            goto SendmsgError;
    } else {
        /* one sendmsg call per datagram */
        for (size_t i = 0; i < num_datagrams; ++i) {
            mess.msg_iov = datagrams + i;
            mess.msg_iovlen = 1;
            while ((ret = (int)sendmsg(h2o_socket_get_fd(ctx->sock.sock), &mess, 0)) == -1 && errno == EINTR)
                ;
            if (ret == -1)
                goto SendmsgError;
        }
    }

    h2o_error_reporter_record_success(&track_sendmsg);

    return 1;

SendmsgError:
    /* The UDP stack returns EINVAL (linux) or EADDRNOTAVAIL (darwin, and presumably other BSD) when it was unable to use the
     * designated source address. We communicate that back to the caller so that the connection can be closed immediately. */
    if (src->sa.sa_family != AF_UNSPEC && (errno == EINVAL || errno == EADDRNOTAVAIL))
        return 0;

    /* Temporary failure to send a packet is not a permanent error fo the connection. (TODO do we want do something more
     * specific?) */

    /* Log the number of failed invocations once per minute, if there has been such a failure. */
    h2o_error_reporter_record_error(ctx->loop, &track_sendmsg, 60000, errno);

    return 1;
}
202
get_callbacks(h2o_http3_conn_t * conn)203 static inline const h2o_http3_conn_callbacks_t *get_callbacks(h2o_http3_conn_t *conn)
204 {
205 return (const h2o_http3_conn_callbacks_t *)conn->super.callbacks;
206 }
207
/* quicly on_destroy callback for ingress unidirectional streams; frees the receive buffer and the stream object */
static void ingress_unistream_on_destroy(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_ingress_unistream_t *self = qs->data;

    h2o_buffer_dispose(&self->recvbuf);
    free(self);
}
214
/**
 * quicly on_receive callback for ingress unidirectional streams. Buffers the received bytes, feeds the contiguous
 * prefix to `handle_input`, then releases whatever the handler consumed back to quicly's flow controller.
 */
static void ingress_unistream_on_receive(quicly_stream_t *qs, size_t off, const void *input, size_t len)
{
    h2o_http3_conn_t *conn = *quicly_get_data(qs->conn);
    struct st_h2o_http3_ingress_unistream_t *stream = qs->data;

    /* save received data */
    h2o_http3_update_recvbuf(&stream->recvbuf, off, input, len);

    /* determine bytes that can be handled; only the contiguous prefix reported by quicly is eligible */
    const uint8_t *src = (const uint8_t *)stream->recvbuf->bytes,
                  *src_end = src + quicly_recvstate_bytes_available(&stream->quic->recvstate);
    if (src == src_end && !quicly_recvstate_transfer_complete(&stream->quic->recvstate))
        return;

    /* handle the bytes; is_eos is set when the peer has finished sending */
    stream->handle_input(conn, stream, &src, src_end, quicly_recvstate_transfer_complete(&stream->quic->recvstate));
    /* the handler may have initiated connection close; if so, do not touch the stream any further */
    if (quicly_get_state(conn->super.quic) >= QUICLY_STATE_CLOSING)
        return;

    /* remove bytes that have been consumed (the handler advanced `src` past them) */
    size_t bytes_consumed = src - (const uint8_t *)stream->recvbuf->bytes;
    if (bytes_consumed != 0) {
        h2o_buffer_consume(&stream->recvbuf, bytes_consumed);
        quicly_stream_sync_recvbuf(stream->quic, bytes_consumed);
    }
}
241
/* quicly on_receive_reset callback; a reset is signalled to the input handler by passing NULL as `src` */
static void ingress_unistream_on_receive_reset(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_ingress_unistream_t *self = qs->data;
    h2o_http3_conn_t *conn = *quicly_get_data(qs->conn);

    self->handle_input(conn, self, NULL, NULL, 1);
}
249
/**
 * Input handler for the peer's QPACK encoder stream. The stream is critical: a reset (src == NULL) or EOS closes the
 * connection. Otherwise the bytes are fed to the local QPACK decoder.
 */
static void qpack_encoder_stream_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream,
                                              const uint8_t **src, const uint8_t *src_end, int is_eos)
{
    if (src == NULL || is_eos) {
        h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
        return;
    }

    for (; *src != src_end;) {
        int64_t *unblocked_stream_ids;
        size_t num_unblocked;
        const char *err_desc = NULL;
        int ret = h2o_qpack_decoder_handle_input(conn->qpack.dec, &unblocked_stream_ids, &num_unblocked, src, src_end, &err_desc);
        if (ret != 0) {
            h2o_quic_close_connection(&conn->super, ret, err_desc);
            return;
        }
        /* TODO handle unblocked streams */
    }
}
271
/**
 * Input handler for the peer's QPACK decoder stream. The stream is critical: a reset (src == NULL) or EOS closes the
 * connection. Otherwise the bytes (acks / cancellations) are fed to the local QPACK encoder.
 */
static void qpack_decoder_stream_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream,
                                              const uint8_t **src, const uint8_t *src_end, int is_eos)
{
    if (src == NULL || is_eos) {
        h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
        return;
    }

    for (; *src != src_end;) {
        const char *err_desc = NULL;
        int ret = h2o_qpack_encoder_handle_input(conn->qpack.enc, src, src_end, &err_desc);
        if (ret != 0) {
            h2o_quic_close_connection(&conn->super, ret, err_desc);
            return;
        }
    }
}
289
/**
 * Input handler for the peer's control stream. Parses frames one by one and dispatches them to the HTTP/3-level
 * callback. The stream is critical: a reset (src == NULL) or EOS closes the connection.
 */
static void control_stream_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream,
                                        const uint8_t **src, const uint8_t *src_end, int is_eos)
{
    if (src == NULL || is_eos) {
        h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
        return;
    }

    do {
        h2o_http3_read_frame_t frame;
        int ret;
        const char *err_desc = NULL;

        if ((ret = h2o_http3_read_frame(&frame, quicly_is_client(conn->super.quic), H2O_HTTP3_STREAM_TYPE_CONTROL, src, src_end,
                                        &err_desc)) != 0) {
            /* INCOMPLETE means more bytes are needed; anything else is fatal */
            if (ret != H2O_HTTP3_ERROR_INCOMPLETE)
                h2o_quic_close_connection(&conn->super, ret, err_desc);
            break;
        }
        /* the equality test enforces SETTINGS ordering: it is true either when a second SETTINGS arrives after the
         * first, or when the very first frame is something other than SETTINGS; DATA is never allowed here */
        if (h2o_http3_has_received_settings(conn) == (frame.type == H2O_HTTP3_FRAME_TYPE_SETTINGS) ||
            frame.type == H2O_HTTP3_FRAME_TYPE_DATA) {
            h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_FRAME_UNEXPECTED, NULL);
            break;
        }
        get_callbacks(conn)->handle_control_stream_frame(conn, frame.type, frame.payload, frame.length);
        /* stop if the frame handler initiated connection close */
        if (quicly_get_state(conn->super.quic) >= QUICLY_STATE_CLOSING)
            break;
    } while (*src != src_end);
}
319
/* sink handler for streams being discarded: swallows every byte; nothing to do on reset (src == NULL) */
static void discard_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream, const uint8_t **src,
                                 const uint8_t *src_end, int is_eos)
{
    if (src != NULL)
        *src = src_end;
}
327
/**
 * Initial input handler for an ingress unidirectional stream: reads the variable-length stream-type prefix, installs
 * the matching handler (control / QPACK encoder / QPACK decoder, or a discarding sink for unknown types), then lets
 * that handler consume the rest of the input.
 */
static void unknown_type_handle_input(h2o_http3_conn_t *conn, struct st_h2o_http3_ingress_unistream_t *stream, const uint8_t **src,
                                      const uint8_t *src_end, int is_eos)
{
    uint64_t type;

    /* resets are allowed at least until the type is being determined */
    if (src == NULL)
        return;

    /* read the type, or just return if incomplete */
    if ((type = quicly_decodev(src, src_end)) == UINT64_MAX)
        return;

    switch (type) {
    case H2O_HTTP3_STREAM_TYPE_CONTROL:
        conn->_control_streams.ingress.control = stream;
        stream->handle_input = control_stream_handle_input;
        break;
    case H2O_HTTP3_STREAM_TYPE_QPACK_ENCODER:
        conn->_control_streams.ingress.qpack_encoder = stream;
        stream->handle_input = qpack_encoder_stream_handle_input;
        break;
    case H2O_HTTP3_STREAM_TYPE_QPACK_DECODER:
        conn->_control_streams.ingress.qpack_decoder = stream;
        stream->handle_input = qpack_decoder_stream_handle_input;
        break;
    default:
        /* unknown stream type: ask the peer to stop sending, drain whatever arrives anyway */
        quicly_request_stop(stream->quic, H2O_HTTP3_ERROR_STREAM_CREATION);
        stream->handle_input = discard_handle_input;
        break;
    }

    /* delegate the remaining bytes to the newly installed handler; the call and the return are separated because
     * `return expr;` in a void function is a constraint violation in ISO C (C11 6.8.6.4) */
    stream->handle_input(conn, stream, src, src_end, is_eos);
}
362
/* quicly on_destroy callback for egress unidirectional streams; frees the send buffer and the stream object */
static void egress_unistream_on_destroy(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_egress_unistream_t *self = qs->data;

    h2o_buffer_dispose(&self->sendbuf);
    free(self);
}
369
/* quicly on_send_shift callback; drops `delta` acknowledged bytes from the head of the send buffer */
static void egress_unistream_on_send_shift(quicly_stream_t *qs, size_t delta)
{
    struct st_h2o_http3_egress_unistream_t *self = qs->data;

    h2o_buffer_consume(&self->sendbuf, delta);
}
375
/**
 * quicly on_send_emit callback; copies up to `*len` bytes starting at offset `off` of the send buffer into `dst`,
 * adjusting `*len` to the amount actually copied and setting `*wrote_all` when the buffer is exhausted.
 */
static void egress_unistream_on_send_emit(quicly_stream_t *qs, size_t off, void *dst, size_t *len, int *wrote_all)
{
    struct st_h2o_http3_egress_unistream_t *self = qs->data;
    size_t avail = self->sendbuf->size - off;

    if (*len >= avail) {
        *len = avail;
        *wrote_all = 1;
    } else {
        *wrote_all = 0;
    }
    memcpy(dst, self->sendbuf->bytes + off, *len);
}
388
/**
 * quicly on_send_stop callback; our egress unidirectional streams are all critical, so STOP_SENDING from the peer is
 * treated as a connection-level error.
 */
static void egress_unistream_on_send_stop(quicly_stream_t *qs, int err)
{
    struct st_h2o_http3_conn_t *conn = *quicly_get_data(qs->conn);
    h2o_quic_close_connection(&conn->super, H2O_HTTP3_ERROR_CLOSED_CRITICAL_STREAM, NULL);
}
394
/**
 * quicly stream-open hook for unidirectional streams. Self-initiated streams become egress streams carrying a send
 * buffer; peer-initiated streams become ingress streams whose type is yet unknown (`unknown_type_handle_input` reads
 * the type prefix and installs the real handler).
 */
void h2o_http3_on_create_unidirectional_stream(quicly_stream_t *qs)
{
    if (quicly_stream_is_self_initiated(qs)) {
        /* create egress unistream */
        static const quicly_stream_callbacks_t callbacks = {egress_unistream_on_destroy, egress_unistream_on_send_shift,
                                                            egress_unistream_on_send_emit, egress_unistream_on_send_stop};
        struct st_h2o_http3_egress_unistream_t *stream = h2o_mem_alloc(sizeof(*stream));
        qs->data = stream;
        qs->callbacks = &callbacks;
        stream->quic = qs;
        h2o_buffer_init(&stream->sendbuf, &h2o_socket_buffer_prototype);
    } else {
        /* create ingress unistream (receive-side callbacks only) */
        static const quicly_stream_callbacks_t callbacks = {
            ingress_unistream_on_destroy, NULL, NULL, NULL, ingress_unistream_on_receive, ingress_unistream_on_receive_reset};
        struct st_h2o_http3_ingress_unistream_t *stream = h2o_mem_alloc(sizeof(*stream));
        qs->data = stream;
        qs->callbacks = &callbacks;
        stream->quic = qs;
        h2o_buffer_init(&stream->recvbuf, &h2o_socket_buffer_prototype);
        stream->handle_input = unknown_type_handle_input;
    }
}
418
/**
 * Opens a self-initiated unidirectional stream, queues `initial_bytes` (typically the stream-type prefix) for
 * transmission, and schedules the send. Returns 0 on success or a quicly error code.
 */
static int open_egress_unistream(h2o_http3_conn_t *conn, struct st_h2o_http3_egress_unistream_t **stream, h2o_iovec_t initial_bytes)
{
    quicly_stream_t *qs;

    int ret = quicly_open_stream(conn->super.quic, &qs, 1);
    if (ret != 0)
        return ret;
    /* h2o_http3_on_create_unidirectional_stream has already attached our stream object to `qs` */
    struct st_h2o_http3_egress_unistream_t *st = qs->data;
    assert(st->quic == qs);
    *stream = st;

    h2o_buffer_append(&st->sendbuf, initial_bytes.base, initial_bytes.len);
    return quicly_stream_sync_sendbuf(st->quic, 1);
}
432
/**
 * Serializes `addr` into `p` for hashing (a one-byte family tag, then address and port for INET/INET6), returning the
 * position past the written bytes. Used by `calc_accept_hashkey` to build the 4-tuple fingerprint.
 */
static uint8_t *accept_hashkey_flatten_address(uint8_t *p, quicly_address_t *addr)
{
    switch (addr->sa.sa_family) {
    case AF_INET:
        *p++ = 4;
        memcpy(p, &addr->sin.sin_addr.s_addr, 4);
        p += 4;
        memcpy(p, &addr->sin.sin_port, 2);
        p += 2;
        break;
    case AF_INET6:
        *p++ = 6;
        memcpy(p, addr->sin6.sin6_addr.s6_addr, 16);
        p += 16;
        /* read the port through the IPv6 member of the union (was `addr->sin.sin_port`, which only worked because
         * sin_port and sin6_port happen to share the same offset) */
        memcpy(p, &addr->sin6.sin6_port, 2);
        p += 2;
        break;
    case AF_UNSPEC:
        *p++ = 0;
        break;
    default:
        h2o_fatal("unknown protocol family");
        break;
    }
    return p;
}
459
/**
 * Derives a 64-bit key identifying a connection attempt from (destaddr, srcaddr, client-chosen source CID), by
 * encrypting the flattened tuple with a per-process-random AES-128-CBC key. The randomized keying makes the hash
 * unpredictable to peers (hash-flooding resistance). Never returns 0, which is reserved to mean "no key".
 */
static uint64_t calc_accept_hashkey(quicly_address_t *destaddr, quicly_address_t *srcaddr, ptls_iovec_t src_cid)
{
    /* prepare key; the cipher context is per-thread, the key itself is process-wide and initialized once */
    static __thread EVP_CIPHER_CTX *cipher = NULL;
    if (cipher == NULL) {
        static uint8_t key[PTLS_AES128_KEY_SIZE];
        H2O_MULTITHREAD_ONCE({ ptls_openssl_random_bytes(key, sizeof(key)); });
        cipher = EVP_CIPHER_CTX_new();
        EVP_EncryptInit_ex(cipher, EVP_aes_128_cbc(), NULL, key, NULL);
    }

    /* buffer sized for two flattened addresses (tag + IPv6 address + port each), the CID, and block-cipher padding;
     * zero-initialized so that rounding up to the block size below encrypts deterministic padding */
    uint8_t buf[(1 + 16 + 2) * 2 + QUICLY_MAX_CID_LEN_V1 + PTLS_AES_BLOCK_SIZE] = {0};
    uint8_t *p = buf;

    /* build plaintext to encrypt */
    p = accept_hashkey_flatten_address(p, destaddr);
    p = accept_hashkey_flatten_address(p, srcaddr);
    memcpy(p, src_cid.base, src_cid.len);
    p += src_cid.len;
    assert(p <= buf + sizeof(buf));
    /* round up to a whole number of AES blocks */
    size_t bytes_to_encrypt = ((p - buf) + PTLS_AES_BLOCK_SIZE - 1) / PTLS_AES_BLOCK_SIZE * PTLS_AES_BLOCK_SIZE;
    assert(bytes_to_encrypt <= sizeof(buf));

    { /* encrypt in place (re-init resets the CBC chaining state so results are deterministic per input) */
        EVP_EncryptInit_ex(cipher, NULL, NULL, NULL, NULL);
        int bytes_encrypted = 0, ret = EVP_EncryptUpdate(cipher, buf, &bytes_encrypted, buf, (int)bytes_to_encrypt);
        assert(ret);
        assert(bytes_encrypted == bytes_to_encrypt);
    }

    /* use the last 8 bytes (sizeof(uint64_t)) of the CBC output as the result */
    uint64_t result;
    memcpy(&result, buf + bytes_to_encrypt - sizeof(result), sizeof(result));
    /* avoid 0 (used as nonexist) */
    if (result == 0)
        result = 1;
    return result;
}
498
/* removes `conn` from the accepting-connections map (if registered) and clears its hash key */
static void drop_from_acceptmap(h2o_quic_ctx_t *ctx, h2o_quic_conn_t *conn)
{
    if (conn->_accept_hashkey == 0)
        return; /* never registered, or already removed */

    khint_t iter = kh_get_h2o_quic_acceptmap(ctx->conns_accepting, conn->_accept_hashkey);
    if (iter != kh_end(ctx->conns_accepting))
        kh_del_h2o_quic_acceptmap(ctx->conns_accepting, iter);
    conn->_accept_hashkey = 0;
}
508
/* builds a Version Negotiation packet advertising `versions` and sends it as a single datagram */
static void send_version_negotiation(h2o_quic_ctx_t *ctx, quicly_address_t *destaddr, ptls_iovec_t dest_cid,
                                     quicly_address_t *srcaddr, ptls_iovec_t src_cid, const uint32_t *versions)
{
    uint8_t payload[QUICLY_MIN_CLIENT_INITIAL_SIZE];
    size_t payload_size = quicly_send_version_negotiation(ctx->quic, dest_cid, src_cid, versions, payload);
    assert(payload_size != SIZE_MAX);

    struct iovec datagram = {.iov_base = payload, .iov_len = payload_size};
    h2o_quic_send_datagrams(ctx, destaddr, srcaddr, &datagram, 1);
}
519
/**
 * Routes a batch of decoded QUIC packets (sharing the same 4-tuple and DCID group) to the owning connection, creating,
 * forwarding, or rejecting as needed. `ttl` limits how many times a packet may be forwarded between nodes/threads.
 */
static void process_packets(h2o_quic_ctx_t *ctx, quicly_address_t *destaddr, quicly_address_t *srcaddr, uint8_t ttl,
                            quicly_decoded_packet_t *packets, size_t num_packets)
{
    h2o_quic_conn_t *conn = NULL;
    size_t accepted_packet_index = SIZE_MAX; /* index of the packet consumed by the acceptor, skipped in the receive loop */

    assert(num_packets != 0);

    if (ctx->quic_stats != NULL) {
        ctx->quic_stats->packet_received += num_packets;
    }

#if H2O_USE_DTRACE
    if (PTLS_UNLIKELY(H2O_H3_PACKET_RECEIVE_ENABLED())) {
        for (size_t i = 0; i != num_packets; ++i)
            H2O_H3_PACKET_RECEIVE(&destaddr->sa, &srcaddr->sa, packets[i].octets.base, packets[i].octets.len);
    }
#endif

    /* drop packets with overlong source CIDs (cannot be ours) */
    if (packets[0].cid.src.len > QUICLY_MAX_CID_LEN_V1)
        return;

    /* find the matching connection, by first looking at the CID (all packets as client, or Handshake, 1-RTT packets as server) */
    if (packets[0].cid.dest.plaintext.node_id == ctx->next_cid.node_id &&
        packets[0].cid.dest.plaintext.thread_id == ctx->next_cid.thread_id) {
        /* the DCID claims this node+thread; look up by master connection id */
        khiter_t iter = kh_get_h2o_quic_idmap(ctx->conns_by_id, packets[0].cid.dest.plaintext.master_id);
        if (iter != kh_end(ctx->conns_by_id)) {
            conn = kh_val(ctx->conns_by_id, iter);
            /* CID-based matching on Initial and 0-RTT packets should only be applied for clients */
            if (!quicly_is_client(conn->quic) && packets[0].cid.dest.might_be_client_generated)
                conn = NULL;
        } else if (!packets[0].cid.dest.might_be_client_generated) {
            /* send stateless reset when we could not find a matching connection for a 1 RTT packet */
            if (packets[0].octets.len >= QUICLY_STATELESS_RESET_PACKET_MIN_LEN) {
                uint8_t payload[QUICLY_MIN_CLIENT_INITIAL_SIZE];
                size_t payload_size = quicly_send_stateless_reset(ctx->quic, packets[0].cid.dest.encrypted.base, payload);
                if (payload_size != SIZE_MAX) {
                    struct iovec vec = {.iov_base = payload, .iov_len = payload_size};
                    /* note: srcaddr/destaddr are swapped, as the reset goes back to the sender */
                    h2o_quic_send_datagrams(ctx, srcaddr, destaddr, &vec, 1);
                }
            }
            return;
        }
    } else if (!packets[0].cid.dest.might_be_client_generated) {
        /* forward 1-RTT packets belonging to different nodes, threads */
        if (ttl == 0)
            return;
        uint64_t offending_node_id = packets[0].cid.dest.plaintext.node_id;
        if (ctx->forward_packets != NULL && ctx->forward_packets(ctx, &offending_node_id, packets[0].cid.dest.plaintext.thread_id,
                                                                destaddr, srcaddr, ttl, packets, num_packets))
            return;
        /* non-authenticating 1-RTT packets are potentially stateless resets (FIXME handle them, note that we need to use a hashdos-
         * resistant hash map that also meets constant-time comparison requirements) */
        return;
    }

    if (conn == NULL) {
        /* Initial or 0-RTT packet, use 4-tuple to match the thread and the connection */
        assert(packets[0].cid.dest.might_be_client_generated);
        uint64_t accept_hashkey = calc_accept_hashkey(destaddr, srcaddr, packets[0].cid.src);
        /* determine which thread owns this 4-tuple; forward if it is not us */
        if (ctx->accept_thread_divisor != 0) {
            uint32_t offending_thread = accept_hashkey % ctx->accept_thread_divisor;
            if (offending_thread != ctx->next_cid.thread_id) {
                if (ctx->forward_packets != NULL)
                    ctx->forward_packets(ctx, NULL, offending_thread, destaddr, srcaddr, ttl, packets, num_packets);
                return;
            }
        }
        khiter_t iter = kh_get_h2o_quic_acceptmap(ctx->conns_accepting, accept_hashkey);
        if (iter == kh_end(ctx->conns_accepting)) {
            /* a new connection for this thread (at least on this process); accept or delegate to newer process */
            if (ctx->acceptor != NULL) {
                /* reject unsupported versions up-front with Version Negotiation */
                if (packets[0].version != 0 && !quicly_is_supported_version(packets[0].version)) {
                    send_version_negotiation(ctx, srcaddr, packets[0].cid.src, destaddr, packets[0].cid.dest.encrypted,
                                             quicly_supported_versions);
                    return;
                }
            } else {
                /* This is the offending thread but it is not accepting, which means that the process (or the thread) is not acting
                 * as a server (likely gracefully shutting down). Let the application process forward the packet to the next
                 * generation. */
                if (ctx->forward_packets != NULL &&
                    ctx->forward_packets(ctx, NULL, ctx->next_cid.thread_id, destaddr, srcaddr, ttl, packets, num_packets))
                    return;
                /* If not forwarded, send rejection to the peer. A Version Negotiation packet that carries only a greasing version
                 * number is used for the purpose, hoping that that signal will trigger immediate downgrade to HTTP/2, across the
                 * broad spectrum of the client implementations than if CONNECTION_REFUSED is being used. */
                if (packets[0].version != 0) {
                    static const uint32_t no_versions[] = {0};
                    send_version_negotiation(ctx, srcaddr, packets[0].cid.src, destaddr, packets[0].cid.dest.encrypted,
                                             no_versions);
                }
                return;
            }
            /* try to accept any of the Initial packets being received */
            size_t i;
            for (i = 0; i != num_packets; ++i)
                if ((packets[i].octets.base[0] & QUICLY_PACKET_TYPE_BITMASK) == QUICLY_PACKET_TYPE_INITIAL)
                    if ((conn = ctx->acceptor(ctx, destaddr, srcaddr, packets + i)) != NULL) {
                        /* non-null generally means success, except for H2O_QUIC_ACCEPT_CONN_DECRYPTION_FAILED */
                        if (conn == (h2o_quic_conn_t *)H2O_QUIC_ACCEPT_CONN_DECRYPTION_FAILED) {
                            /* failed to decrypt Initial packet <=> it could belong to a connection on a different node; forward it
                             * to the destination being claimed by the DCID */
                            uint64_t offending_node_id = packets[i].cid.dest.plaintext.node_id;
                            uint32_t offending_thread_id = packets[i].cid.dest.plaintext.thread_id;
                            if (ctx->forward_packets != NULL && ttl > 0 &&
                                (offending_node_id != ctx->next_cid.node_id || offending_thread_id != ctx->next_cid.thread_id))
                                ctx->forward_packets(ctx, &offending_node_id, offending_thread_id, destaddr, srcaddr, ttl, packets,
                                                     num_packets);
                            return;
                        }
                        break;
                    }
            if (conn == NULL)
                return;
            /* register the newly accepted connection in the accept map */
            accepted_packet_index = i;
            conn->_accept_hashkey = accept_hashkey;
            int r;
            iter = kh_put_h2o_quic_acceptmap(conn->ctx->conns_accepting, accept_hashkey, &r);
            assert(iter != kh_end(conn->ctx->conns_accepting));
            kh_val(conn->ctx->conns_accepting, iter) = conn;
        } else {
            /* existing connection */
            conn = kh_val(ctx->conns_accepting, iter);
            assert(conn != NULL);
            assert(!quicly_is_client(conn->quic));
            if (quicly_is_destination(conn->quic, &destaddr->sa, &srcaddr->sa, packets))
                goto Receive;
            uint64_t offending_node_id = packets[0].cid.dest.plaintext.node_id;
            uint32_t offending_thread_id = packets[0].cid.dest.plaintext.thread_id;
            if (offending_node_id != ctx->next_cid.node_id || offending_thread_id != ctx->next_cid.thread_id) {
                /* accept key matches to a connection being established, but DCID doesn't -- likely a second (or later) Initial that
                 * is supposed to be handled by another node. forward it. */
                if (ttl == 0)
                    return;
                if (ctx->forward_packets != NULL)
                    ctx->forward_packets(ctx, &offending_node_id, offending_thread_id, destaddr, srcaddr, ttl, packets,
                                         num_packets);
            }
            /* regardless of forwarding outcome, we need to drop this packet as it is not for us */
            return;
        }
    }

    { /* receive packets to the found connection */
        if (!quicly_is_destination(conn->quic, &destaddr->sa, &srcaddr->sa, packets))
            return;
        size_t i;
    Receive:
        for (i = 0; i != num_packets; ++i) {
            /* the packet consumed by the acceptor (if any) must not be fed to quicly_receive again */
            if (i != accepted_packet_index) {
                int ret = quicly_receive(conn->quic, &destaddr->sa, &srcaddr->sa, packets + i);
                switch (ret) {
                case QUICLY_ERROR_STATE_EXHAUSTION:
                case PTLS_ERROR_NO_MEMORY:
                    /* unrecoverable resource exhaustion: log and tear the connection down */
                    fprintf(stderr, "%s: `quicly_receive()` returned ret:%d\n", __func__, ret);
                    conn->callbacks->destroy_connection(conn);
                    return;
                }
                if (ret != QUICLY_ERROR_PACKET_IGNORED && ret != QUICLY_ERROR_DECRYPTION_FAILED) {
                    if (ctx->quic_stats != NULL) {
                        ++ctx->quic_stats->packet_processed;
                    }
                }
            }
        }
    }

    /* reschedule the connection's timer and notify the application of activity */
    h2o_quic_schedule_timer(conn);
    if (ctx->notify_conn_update != NULL)
        ctx->notify_conn_update(ctx, conn);
}
692
h2o_quic_read_socket(h2o_quic_ctx_t * ctx,h2o_socket_t * sock)693 void h2o_quic_read_socket(h2o_quic_ctx_t *ctx, h2o_socket_t *sock)
694 {
695 struct {
696 quicly_address_t destaddr, srcaddr;
697 struct iovec vec;
698 uint8_t ttl;
699 char controlbuf[
700 #ifdef IPV6_PKTINFO
701 CMSG_SPACE(sizeof(struct in6_pktinfo))
702 #elif defined(IP_PKTINFO)
703 CMSG_SPACE(sizeof(struct in_pktinfo))
704 #elif defined(IP_RECVDSTADDR)
705 CMSG_SPACE(sizeof(struct in_addr))
706 #else
707 CMSG_SPACE(1)
708 #endif
709 ];
710 uint8_t buf[1600];
711 } dgrams[10];
712 #ifdef __linux__
713 struct mmsghdr mess[PTLS_ELEMENTSOF(dgrams)];
714 #else
715 struct {
716 struct msghdr msg_hdr;
717 } mess[PTLS_ELEMENTSOF(dgrams)];
718 #endif
719
720 #define INIT_DGRAMS(i) \
721 do { \
722 mess[i].msg_hdr = (struct msghdr){ \
723 .msg_name = &dgrams[i].srcaddr, \
724 .msg_namelen = sizeof(dgrams[i].srcaddr), \
725 .msg_iov = &dgrams[i].vec, \
726 .msg_iovlen = 1, \
727 .msg_control = &dgrams[i].controlbuf, \
728 .msg_controllen = sizeof(dgrams[i].controlbuf), \
729 }; \
730 memset(&dgrams[i].destaddr, 0, sizeof(dgrams[i].destaddr)); \
731 dgrams[i].vec.iov_base = dgrams[i].buf; \
732 dgrams[i].vec.iov_len = sizeof(dgrams[i].buf); \
733 } while (0)
734
735 int fd = h2o_socket_get_fd(sock);
736 size_t dgram_index, num_dgrams;
737
738 /* Read datagrams. Sender should be provided an ACK every fraction of RTT, otherwise its behavior becomes bursty (assuming that
739 * pacing is not used), rather than packets being spread across entire round-trip. To minimize the chance of us entering such
740 * situation, number of datagrams being read at once is limited to `PTLS_ELEMENTSOF(dgrams)`. In other words, one ack is
741 * generated for no more than every 10 ack-eliciting packets being received, unless the ack-frequency extension is used. */
742 #ifdef __linux__
743 {
744 int rret;
745 do {
746 for (dgram_index = 0; dgram_index < PTLS_ELEMENTSOF(dgrams); ++dgram_index)
747 INIT_DGRAMS(dgram_index);
748 } while ((rret = recvmmsg(fd, mess, PTLS_ELEMENTSOF(mess), 0, NULL)) < 0 && errno == EINTR);
749 if (rret <= 0)
750 goto Exit;
751 num_dgrams = (size_t)rret;
752 for (dgram_index = 0; dgram_index < num_dgrams; ++dgram_index)
753 dgrams[dgram_index].vec.iov_len = (size_t)mess[dgram_index].msg_len;
754 }
755 #else
756 for (dgram_index = 0; dgram_index < PTLS_ELEMENTSOF(dgrams); ++dgram_index) {
757 ssize_t rret;
758 do {
759 INIT_DGRAMS(dgram_index);
760 } while ((rret = recvmsg(fd, &mess[dgram_index].msg_hdr, 0)) < 0 && errno == EINTR);
761 if (rret < 0)
762 break;
763 dgrams[dgram_index].vec.iov_len = rret;
764 }
765 num_dgrams = dgram_index;
766 if (num_dgrams == 0)
767 goto Exit;
768 #endif
769
770 /* normalize and store the obtained data into `dgrams` */
771 for (dgram_index = 0; dgram_index < num_dgrams; ++dgram_index) {
772 { /* fetch destination address */
773 struct cmsghdr *cmsg;
774 for (cmsg = CMSG_FIRSTHDR(&mess[dgram_index].msg_hdr); cmsg != NULL;
775 cmsg = CMSG_NXTHDR(&mess[dgram_index].msg_hdr, cmsg)) {
776 #ifdef IP_PKTINFO
777 if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) {
778 dgrams[dgram_index].destaddr.sin.sin_family = AF_INET;
779 dgrams[dgram_index].destaddr.sin.sin_addr = ((struct in_pktinfo *)CMSG_DATA(cmsg))->ipi_addr;
780 dgrams[dgram_index].destaddr.sin.sin_port = *ctx->sock.port;
781 goto DestAddrFound;
782 }
783 #endif
784 #ifdef IP_RECVDSTADDR
785 if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVDSTADDR) {
786 dgrams[dgram_index].destaddr.sin.sin_family = AF_INET;
787 dgrams[dgram_index].destaddr.sin.sin_addr = *(struct in_addr *)CMSG_DATA(cmsg);
788 dgrams[dgram_index].destaddr.sin.sin_port = *ctx->sock.port;
789 goto DestAddrFound;
790 }
791 #endif
792 #ifdef IPV6_PKTINFO
793 if (cmsg->cmsg_level == IPPROTO_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) {
794 dgrams[dgram_index].destaddr.sin6.sin6_family = AF_INET6;
795 dgrams[dgram_index].destaddr.sin6.sin6_addr = ((struct in6_pktinfo *)CMSG_DATA(cmsg))->ipi6_addr;
796 dgrams[dgram_index].destaddr.sin6.sin6_port = *ctx->sock.port;
797 goto DestAddrFound;
798 }
799 #endif
800 }
801 dgrams[dgram_index].destaddr.sa.sa_family = AF_UNSPEC;
802 DestAddrFound:;
803 }
804 dgrams[dgram_index].ttl = ctx->default_ttl;
805 /* preprocess (and drop the packet if it failed) */
806 if (ctx->preprocess_packet != NULL &&
807 !ctx->preprocess_packet(ctx, &mess[dgram_index].msg_hdr, &dgrams[dgram_index].destaddr,
808 &dgrams[dgram_index].srcaddr, &dgrams[dgram_index].ttl)) {
809 dgrams[dgram_index].vec.iov_len = 0; /* mark as unused */
810 } else {
811 assert(dgrams[dgram_index].srcaddr.sa.sa_family == AF_INET || dgrams[dgram_index].srcaddr.sa.sa_family == AF_INET6);
812 }
813 }
814
815 /* convert dgrams to decoded packets and process them in group of (4-tuple, dcid) */
816 quicly_decoded_packet_t packets[64];
817 size_t packet_index = 0;
818 dgram_index = 0;
819 while (dgram_index < num_dgrams) {
820 int has_decoded = 0; /* indicates if a decoded packet belonging to a different connection is stored at
821 * `packets[packet_index]` */
822 /* skip zero-sized datagrams (or the ones for which preprocessing failed) */
823 if (dgrams[dgram_index].vec.iov_len == 0) {
824 ++dgram_index;
825 continue;
826 }
827 /* dispatch packets in `packets`, if the datagram at dgram_index is from a different path */
828 if (packet_index != 0) {
829 assert(dgram_index != 0);
830 /* check source address */
831 if (h2o_socket_compare_address(&dgrams[dgram_index - 1].srcaddr.sa, &dgrams[dgram_index].srcaddr.sa, 1) != 0)
832 goto ProcessPackets;
833 /* check destination address, if available */
834 if (dgrams[dgram_index - 1].destaddr.sa.sa_family == AF_UNSPEC &&
835 dgrams[dgram_index].destaddr.sa.sa_family == AF_UNSPEC) {
836 /* ok */
837 } else if (h2o_socket_compare_address(&dgrams[dgram_index - 1].destaddr.sa, &dgrams[dgram_index].destaddr.sa, 1) ==
838 0) {
839 /* ok */
840 } else {
841 goto ProcessPackets;
842 }
843 /* TTL should be same for dispatched packets */
844 if (dgrams[dgram_index - 1].ttl != dgrams[dgram_index].ttl)
845 goto ProcessPackets;
846 }
847 /* decode the first packet */
848 size_t payload_off = 0;
849 if (quicly_decode_packet(ctx->quic, packets + packet_index, dgrams[dgram_index].vec.iov_base,
850 dgrams[dgram_index].vec.iov_len, &payload_off) == SIZE_MAX) {
851 ++dgram_index;
852 goto ProcessPackets;
853 }
854 /* dispatch packets in `packets` if the DCID is different, setting the `has_decoded` flag */
855 if (packet_index != 0) {
856 const ptls_iovec_t *prev_dcid = &packets[packet_index - 1].cid.dest.encrypted,
857 *cur_dcid = &packets[packet_index].cid.dest.encrypted;
858 if (!(prev_dcid->len == cur_dcid->len && memcmp(prev_dcid->base, cur_dcid->base, prev_dcid->len) == 0)) {
859 has_decoded = 1;
860 ++dgram_index;
861 goto ProcessPackets;
862 }
863 }
864 ++packet_index;
865 /* add rest of the packets */
866 while (payload_off < dgrams[dgram_index].vec.iov_len && packet_index < PTLS_ELEMENTSOF(packets)) {
867 if (quicly_decode_packet(ctx->quic, packets + packet_index, dgrams[dgram_index].vec.iov_base,
868 dgrams[dgram_index].vec.iov_len, &payload_off) == SIZE_MAX)
869 break;
870 ++packet_index;
871 }
872 ++dgram_index;
873 /* if we have enough room for the next datagram, that is, the expected worst case of 4 packets in a coalesced datagram,
874 * continue */
875 if (packet_index + 4 < PTLS_ELEMENTSOF(packets))
876 continue;
877
878 ProcessPackets:
879 if (packet_index != 0) {
880 process_packets(ctx, &dgrams[dgram_index - 1].destaddr, &dgrams[dgram_index - 1].srcaddr,
881 dgrams[dgram_index - 1].ttl, packets, packet_index);
882 if (has_decoded) {
883 packets[0] = packets[packet_index];
884 packet_index = 1;
885 } else {
886 packet_index = 0;
887 }
888 }
889 }
890 if (packet_index != 0)
891 process_packets(ctx, &dgrams[dgram_index - 1].destaddr, &dgrams[dgram_index - 1].srcaddr, dgrams[dgram_index - 1].ttl,
892 packets, packet_index);
893
894 Exit:;
895
896 #undef INIT_DGRAMS
897 }
898
/* Socket-level read callback; `sock->data` holds the owning QUIC context. */
static void on_read(h2o_socket_t *sock, const char *err)
{
    h2o_quic_read_socket((h2o_quic_ctx_t *)sock->data, sock);
}
904
/* Timer callback; the timer is embedded in the connection, so recover the container and invoke the send path. */
static void on_timeout(h2o_timer_t *timeout)
{
    h2o_quic_send(H2O_STRUCT_FROM_MEMBER(h2o_quic_conn_t, _timeout, timeout));
}
910
/**
 * Reads one HTTP/3 frame from the byte range [*_src, src_end). On success, populates `frame` (for non-DATA frames `payload`
 * points into the input buffer), advances `*_src` past the consumed bytes, and returns zero. Returns
 * H2O_HTTP3_ERROR_INCOMPLETE when more input is needed (in which case `*_src` is left untouched), or another
 * H2O_HTTP3_ERROR_* value upon a protocol violation, in which case `*err_desc` may be set to a static description.
 */
int h2o_http3_read_frame(h2o_http3_read_frame_t *frame, int is_client, uint64_t stream_type, const uint8_t **_src,
                         const uint8_t *src_end, const char **err_desc)
{
    const uint8_t *src = *_src;

    /* parse the frame header; type and length are QUIC varints */
    if ((frame->type = quicly_decodev(&src, src_end)) == UINT64_MAX)
        return H2O_HTTP3_ERROR_INCOMPLETE;
    if ((frame->length = quicly_decodev(&src, src_end)) == UINT64_MAX)
        return H2O_HTTP3_ERROR_INCOMPLETE;
    frame->_header_size = (uint8_t)(src - *_src);

    /* read the content of the frame (unless it's a DATA frame) */
    frame->payload = NULL;
    if (frame->type != H2O_HTTP3_FRAME_TYPE_DATA) {
        if (frame->length > H2O_HTTP3_MAX_FRAME_PAYLOAD_SIZE) {
            H2O_PROBE(H3_FRAME_RECEIVE, frame->type, NULL, frame->length);
            *err_desc = "H3 frame too large";
            return H2O_HTTP3_ERROR_GENERAL_PROTOCOL; /* FIXME is this the correct code? */
        }
        if (src_end - src < frame->length)
            return H2O_HTTP3_ERROR_INCOMPLETE;
        frame->payload = src;
        src += frame->length;
    }

    H2O_PROBE(H3_FRAME_RECEIVE, frame->type, frame->payload, frame->length);

    /* validate frame type; the table below encodes which peer may send each frame on which stream type */
    switch (frame->type) {
#define FRAME(id, req_clnt, req_srvr, ctl_clnt, ctl_srvr)                                                                          \
    case H2O_HTTP3_FRAME_TYPE_##id:                                                                                                \
        switch (stream_type) {                                                                                                     \
        case H2O_HTTP3_STREAM_TYPE_REQUEST:                                                                                        \
            if (req_clnt && !is_client)                                                                                            \
                goto Validation_Success;                                                                                           \
            if (req_srvr && is_client)                                                                                             \
                goto Validation_Success;                                                                                           \
            break;                                                                                                                 \
        case H2O_HTTP3_STREAM_TYPE_CONTROL:                                                                                        \
            if (ctl_clnt && !is_client)                                                                                            \
                goto Validation_Success;                                                                                           \
            if (ctl_srvr && is_client)                                                                                             \
                goto Validation_Success;                                                                                           \
            break;                                                                                                                 \
        default:                                                                                                                   \
            h2o_fatal("unexpected stream type");                                                                                   \
            break;                                                                                                                 \
        }                                                                                                                          \
        break
        /* clang-format off */
        /* +-----------------+-------------+-------------+
         * |                 |  req-stream | ctrl-stream |
         * |      frame      +------+------+------+------+
         * |                 |client|server|client|server|
         * +-----------------+------+------+------+------+ */
        FRAME( DATA            ,   1  ,   1  ,   0  ,   0  );
        FRAME( HEADERS         ,   1  ,   1  ,   0  ,   0  );
        FRAME( CANCEL_PUSH     ,   0  ,   0  ,   1  ,   1  );
        FRAME( SETTINGS        ,   0  ,   0  ,   1  ,   1  );
        FRAME( PUSH_PROMISE    ,   0  ,   1  ,   0  ,   0  );
        FRAME( GOAWAY          ,   0  ,   0  ,   1  ,   1  );
        FRAME( MAX_PUSH_ID     ,   0  ,   0  ,   1  ,   0  );
        FRAME( PRIORITY_UPDATE ,   0  ,   0  ,   1  ,   0  );
        /* +-----------------+------+------+------+------+ */
        /* clang-format on */
#undef FRAME
    default:
        /* ignore extension frames that we do not handle */
        goto Validation_Success;
    }
    return H2O_HTTP3_ERROR_FRAME_UNEXPECTED;
Validation_Success:;

    *_src = src;
    return 0;
}
987
/**
 * Initializes a QUIC context bound to `sock`, registering the accept / connection-update callbacks and starting to read from
 * the socket. Aborts if the socket's local address cannot be obtained or has an unexpected family.
 */
void h2o_quic_init_context(h2o_quic_ctx_t *ctx, h2o_loop_t *loop, h2o_socket_t *sock, quicly_context_t *quic,
                           h2o_quic_accept_cb acceptor, h2o_quic_notify_connection_update_cb notify_conn_update, uint8_t use_gso,
                           h2o_quic_stats_t *quic_stats)
{
    assert(quic->stream_open != NULL);

    /* zero the context, then fill in the supplied members; `next_cid.thread_id` / `node_id` are set later by
     * h2o_http3_set_context_identifier */
    memset(ctx, 0, sizeof(*ctx));
    ctx->loop = loop;
    ctx->sock.sock = sock;
    ctx->quic = quic;
    ctx->conns_by_id = kh_init_h2o_quic_idmap();
    ctx->conns_accepting = kh_init_h2o_quic_acceptmap();
    ctx->notify_conn_update = notify_conn_update;
    ctx->acceptor = acceptor;
    ctx->use_gso = use_gso;
    ctx->quic_stats = quic_stats;

    sock->data = ctx;

    /* resolve the local address and remember where the port number lives within it */
    ctx->sock.addrlen = h2o_socket_getsockname(sock, (void *)&ctx->sock.addr);
    assert(ctx->sock.addrlen != 0);
    if (ctx->sock.addr.ss_family == AF_INET) {
        ctx->sock.port = &((struct sockaddr_in *)&ctx->sock.addr)->sin_port;
    } else if (ctx->sock.addr.ss_family == AF_INET6) {
        ctx->sock.port = &((struct sockaddr_in6 *)&ctx->sock.addr)->sin6_port;
    } else {
        assert(!"unexpected address family");
    }

    h2o_socket_read_start(sock, on_read);
}
1023
/**
 * Disposes of a QUIC context. All connections must have been closed beforehand.
 */
void h2o_quic_dispose_context(h2o_quic_ctx_t *ctx)
{
    assert(kh_size(ctx->conns_by_id) == 0);
    assert(kh_size(ctx->conns_accepting) == 0);

    /* stop I/O, then release the (now empty) maps */
    h2o_socket_close(ctx->sock.sock);
    kh_destroy_h2o_quic_acceptmap(ctx->conns_accepting);
    kh_destroy_h2o_quic_idmap(ctx->conns_by_id);
}
1033
/**
 * Assigns the identifiers used for connection-ID encoding (thread / node) along with the packet-forwarding parameters and the
 * optional packet preprocessor.
 */
void h2o_quic_set_context_identifier(h2o_quic_ctx_t *ctx, uint32_t accept_thread_divisor, uint32_t thread_id, uint64_t node_id,
                                     uint8_t ttl, h2o_quic_forward_packets_cb forward_cb,
                                     h2o_quic_preprocess_packet_cb preprocess_cb)
{
    /* identifiers embedded into future connection IDs */
    ctx->next_cid.thread_id = thread_id;
    ctx->next_cid.node_id = node_id;
    /* packet-routing knobs */
    ctx->accept_thread_divisor = accept_thread_divisor;
    ctx->default_ttl = ttl;
    ctx->forward_packets = forward_cb;
    ctx->preprocess_packet = preprocess_cb;
}
1045
/**
 * Initiates closure of a QUIC connection; the exact action depends on the current quicly state.
 */
void h2o_quic_close_connection(h2o_quic_conn_t *conn, int err, const char *reason_phrase)
{
    int state = (int)quicly_get_state(conn->quic);

    if (state == QUICLY_STATE_FIRSTFLIGHT) {
        /* FIXME why is this separate? */
        conn->callbacks->destroy_connection(conn);
    } else if (state == QUICLY_STATE_CONNECTED) {
        /* start the closure handshake, arming the timer so that CONNECTION_CLOSE gets sent */
        quicly_close(conn->quic, err, reason_phrase);
        h2o_quic_schedule_timer(conn);
    } else {
        /* already closing / draining; only need to wait for the socket close */
    }
}
1061
/**
 * Initiates closure of every connection registered to the context.
 */
void h2o_quic_close_all_connections(h2o_quic_ctx_t *ctx)
{
    h2o_quic_conn_t *c;

    kh_foreach_value(ctx->conns_by_id, c, { h2o_quic_close_connection(c, 0, NULL); });

    /* closing a connection is expected to drop it from `conns_accepting` as well */
    assert(kh_size(ctx->conns_accepting) == 0);
}
1070
h2o_quic_num_connections(h2o_quic_ctx_t * ctx)1071 size_t h2o_quic_num_connections(h2o_quic_ctx_t *ctx)
1072 {
1073 /* throughout its lifetime, a connection is always registered to both conns_by_id and conns_accepting,
1074 thus counting conns_by_id is enough */
1075 return kh_size(ctx->conns_by_id);
1076 }
1077
/**
 * Initializes the transport-level part of a connection object: stores the back pointer and callback table (zeroing the rest),
 * and arms the embedded timer with the send callback. The timer itself is scheduled later.
 */
void h2o_quic_init_conn(h2o_quic_conn_t *conn, h2o_quic_ctx_t *ctx, const h2o_quic_conn_callbacks_t *callbacks)
{
    *conn = (h2o_quic_conn_t){ctx, NULL, callbacks};
    h2o_timer_init(&conn->_timeout, on_timeout);
}
1083
/**
 * Disposes of the transport-level part of a connection: unregisters it from the lookup maps, frees the quicly object (if one
 * was ever attached), and unlinks the timer.
 */
void h2o_quic_dispose_conn(h2o_quic_conn_t *conn)
{
    if (conn->quic != NULL) {
        /* remove from the id map, if registered */
        khiter_t iter = kh_get_h2o_quic_idmap(conn->ctx->conns_by_id, quicly_get_master_id(conn->quic)->master_id);
        if (iter != kh_end(conn->ctx->conns_by_id))
            kh_del_h2o_quic_idmap(conn->ctx->conns_by_id, iter);
        drop_from_acceptmap(conn->ctx, conn);
        quicly_free(conn->quic);
    }
    h2o_timer_unlink(&conn->_timeout);
}
1097
/**
 * Attaches a quicly connection to the connection object and registers it to the id-indexed map.
 */
void h2o_quic_setup(h2o_quic_conn_t *conn, quicly_conn_t *quic)
{
    int absent;
    khiter_t slot;

    conn->quic = quic;
    *quicly_get_data(quic) = conn;

    /* register to the idmap, keyed by the master connection id */
    slot = kh_put_h2o_quic_idmap(conn->ctx->conns_by_id, quicly_get_master_id(quic)->master_id, &absent);
    assert(slot != kh_end(conn->ctx->conns_by_id));
    kh_val(conn->ctx->conns_by_id, slot) = conn;
}
1109
/**
 * Initializes the HTTP/3 part of a connection object on top of h2o_quic_init_conn.
 */
void h2o_http3_init_conn(h2o_http3_conn_t *conn, h2o_quic_ctx_t *ctx, const h2o_http3_conn_callbacks_t *callbacks,
                         const h2o_http3_qpack_context_t *qpack_ctx)
{
    /* zero the entire object first, then initialize the transport-level members on top */
    memset(conn, 0, sizeof(*conn));
    h2o_quic_init_conn(&conn->super, ctx, &callbacks->super);
    conn->qpack.ctx = qpack_ctx;
}
1117
/**
 * Disposes of the HTTP/3 part of a connection (the QPACK codecs), then the transport-level part.
 */
void h2o_http3_dispose_conn(h2o_http3_conn_t *conn)
{
    if (conn->qpack.dec != NULL)
        h2o_qpack_destroy_decoder(conn->qpack.dec);
    if (conn->qpack.enc != NULL)
        h2o_qpack_destroy_encoder(conn->qpack.enc);

    h2o_quic_dispose_conn(&conn->super);
}
1126
/**
 * Serializes the first flight of the outgoing control stream (the stream-type byte followed by a SETTINGS frame) into
 * `bytebuf`, returning the number of bytes written. `capacity` must be large enough to hold the output; the process is
 * aborted otherwise.
 */
static size_t build_firstflight(h2o_http3_conn_t *conn, uint8_t *bytebuf, size_t capacity)
{
    ptls_buffer_t buf;
    int ret = 0; /* assigned by the ptls_buffer_push_* macros, which `goto Exit` on failure */

    ptls_buffer_init(&buf, bytebuf, capacity);

    /* push stream type */
    ptls_buffer_push_quicint(&buf, H2O_HTTP3_STREAM_TYPE_CONTROL);

    /* push SETTINGS frame */
    ptls_buffer_push_quicint(&buf, H2O_HTTP3_FRAME_TYPE_SETTINGS);
    ptls_buffer_push_block(&buf, -1, {
        quicly_context_t *qctx = quicly_get_context(conn->super.quic);
        /* advertise H3_DATAGRAM only when the transport is configured to accept DATAGRAM frames */
        if (qctx->transport_params.max_datagram_frame_size != 0) {
            ptls_buffer_push_quicint(&buf, H2O_HTTP3_SETTINGS_H3_DATAGRAM);
            ptls_buffer_push_quicint(&buf, 1);
        };
    });

    /* the output must have fit within the caller-provided buffer (i.e. no heap fallback) */
    assert(!buf.is_allocated);
    return buf.off;

Exit:
    h2o_fatal("unreachable");
}
1153
/**
 * Sets up the HTTP/3 layer on top of a quicly connection: registers the connection, creates the QPACK decoder, and opens the
 * egress control / QPACK streams (sending the initial SETTINGS on the control stream). Returns zero on success, otherwise the
 * error returned by `open_egress_unistream`.
 */
int h2o_http3_setup(h2o_http3_conn_t *conn, quicly_conn_t *quic)
{
    int ret;

    h2o_quic_setup(&conn->super, quic);
    conn->state = H2O_HTTP3_CONN_STATE_OPEN;

    /* setup h3 objects, only when the connection state has been created */
    if (quicly_get_state(quic) > QUICLY_STATE_CONNECTED)
        goto Exit;

    /* create decoder with the table size set to zero; see SETTINGS sent below. */
    conn->qpack.dec = h2o_qpack_create_decoder(0, 100 /* FIXME */);

    { /* open control streams, send SETTINGS */
        uint8_t firstflight[32];
        size_t firstflight_len = build_firstflight(conn, firstflight, sizeof(firstflight));
        if ((ret = open_egress_unistream(conn, &conn->_control_streams.egress.control,
                                         h2o_iovec_init(firstflight, firstflight_len))) != 0)
            return ret;
    }

    { /* open QPACK encoder & decoder streams */
        static const uint8_t encoder_first_flight[] = {H2O_HTTP3_STREAM_TYPE_QPACK_ENCODER};
        static const uint8_t decoder_first_flight[] = {H2O_HTTP3_STREAM_TYPE_QPACK_DECODER};
        if ((ret = open_egress_unistream(conn, &conn->_control_streams.egress.qpack_encoder,
                                         h2o_iovec_init(encoder_first_flight, sizeof(encoder_first_flight)))) != 0 ||
            (ret = open_egress_unistream(conn, &conn->_control_streams.egress.qpack_decoder,
                                         h2o_iovec_init(decoder_first_flight, sizeof(decoder_first_flight)))) != 0)
            return ret;
    }

Exit:
    h2o_quic_schedule_timer(&conn->super);
    return 0;
}
1190
/**
 * Generates up to ten UDP datagrams worth of output for the connection and sends them, then (re)schedules the timer. Returns
 * zero when the connection has been destroyed as a result of this call, otherwise one.
 */
int h2o_quic_send(h2o_quic_conn_t *conn)
{
    quicly_address_t dest, src;
    struct iovec vecs[10];
    size_t num_vecs = PTLS_ELEMENTSOF(vecs);
    uint8_t payload_buf[1500 * PTLS_ELEMENTSOF(vecs)];

    int ret = quicly_send(conn->quic, &dest, &src, vecs, &num_vecs, payload_buf, sizeof(payload_buf));
    if (ret == QUICLY_ERROR_STATE_EXHAUSTION || ret == QUICLY_ERROR_FREE_CONNECTION) {
        /* quicly has given up on the connection; tear it down */
        conn->callbacks->destroy_connection(conn);
        return 0;
    }
    if (ret != 0) {
        fprintf(stderr, "quicly_send returned %d\n", ret);
        abort();
    }

    if (num_vecs != 0 && !h2o_quic_send_datagrams(conn->ctx, &dest, &src, vecs, num_vecs)) {
        /* FIXME close the connection immediately */
    }

    h2o_quic_schedule_timer(conn);

    return 1;
}
1219
/**
 * Copies `len` bytes of `src` into the receive buffer at offset `off`, growing the buffer first when the write would extend
 * past its current size. Allocation failure of the buffer is fatal (cf. the assertion in h2o_http3_send_qpack_stream_cancel).
 */
void h2o_http3_update_recvbuf(h2o_buffer_t **buf, size_t off, const void *src, size_t len)
{
    size_t new_size = off + len;

    if ((*buf)->size < new_size) {
        /* check the result of the reservation; writing into an unexpanded buffer would be out-of-bounds */
        h2o_iovec_t reserved = h2o_buffer_reserve(buf, new_size - (*buf)->size);
        assert(reserved.base != NULL);
        (*buf)->size = new_size;
    }
    memcpy((*buf)->bytes + off, src, len);
}
1230
/**
 * (Re)arms the connection timer so that it fires at quicly's next timeout (immediately, if that moment is already past).
 */
void h2o_quic_schedule_timer(h2o_quic_conn_t *conn)
{
    int64_t at = quicly_get_first_timeout(conn->quic);

    if (h2o_timer_is_linked(&conn->_timeout)) {
#if !H2O_USE_LIBUV /* optimization to skip registering a timer specifying the same time */
        if (at == conn->_timeout.expire_at)
            return;
#endif
        h2o_timer_unlink(&conn->_timeout);
    }

    uint64_t now = h2o_now(conn->ctx->loop);
    h2o_timer_link(conn->ctx->loop, now < at ? at - now : 0, &conn->_timeout);
}
1244
/**
 * Parses the payload of the peer's SETTINGS frame, storing the recognized values into `conn->peer_settings`, and creates the
 * QPACK encoder using the negotiated table-capacity / blocked-streams limits. Returns zero on success; on a malformed frame,
 * sets `*err_desc` and returns H2O_HTTP3_ERROR_FRAME.
 */
int h2o_http3_handle_settings_frame(h2o_http3_conn_t *conn, const uint8_t *payload, size_t length, const char **err_desc)
{
    const uint8_t *src = payload, *src_end = src + length;
    uint32_t header_table_size = 0;
    uint64_t blocked_streams = 0;

    /* the caller guarantees that SETTINGS is processed at most once per connection */
    assert(!h2o_http3_has_received_settings(conn));

    while (src != src_end) {
        uint64_t id;
        uint64_t value;
        /* each setting is an (id, value) pair of QUIC varints */
        if ((id = quicly_decodev(&src, src_end)) == UINT64_MAX)
            goto Malformed;
        if ((value = quicly_decodev(&src, src_end)) == UINT64_MAX)
            goto Malformed;
        switch (id) {
        case H2O_HTTP3_SETTINGS_MAX_FIELD_SECTION_SIZE:
            conn->peer_settings.max_field_section_size = value;
            break;
        case H2O_HTTP3_SETTINGS_QPACK_MAX_TABLE_CAPACITY:
            /* clamp the peer-advertised capacity to what our encoder is configured to use */
            header_table_size =
                value < conn->qpack.ctx->encoder_table_capacity ? (uint32_t)value : conn->qpack.ctx->encoder_table_capacity;
            break;
        case H2O_HTTP3_SETTINGS_QPACK_BLOCKED_STREAMS:
            blocked_streams = value;
            break;
        case H2O_HTTP3_SETTINGS_H3_DATAGRAM:
            switch (value) {
            case 0:
                break;
            case 1: {
                /* H3_DATAGRAM=1 is valid only if the peer also advertised QUIC DATAGRAM frame support */
                const quicly_transport_parameters_t *remote_tp = quicly_get_remote_transport_parameters(conn->super.quic);
                if (remote_tp->max_datagram_frame_size == 0)
                    goto Malformed;
                conn->peer_settings.h3_datagram = 1;
            } break;
            default:
                goto Malformed;
            }
            break;
        default:
            /* unknown settings are ignored, as required by the spec */
            break;
        }
    }

    conn->qpack.enc = h2o_qpack_create_encoder(header_table_size, blocked_streams);
    return 0;
Malformed:
    *err_desc = "malformed SETTINGS frame";
    return H2O_HTTP3_ERROR_FRAME;
}
1296
/**
 * Queues a QPACK Stream Cancellation instruction for `stream_id` on the outgoing QPACK decoder stream and notifies the
 * transport that data is pending.
 */
void h2o_http3_send_qpack_stream_cancel(h2o_http3_conn_t *conn, quicly_stream_id_t stream_id)
{
    struct st_h2o_http3_egress_unistream_t *stream = conn->_control_streams.egress.qpack_decoder;

    /* reserve room for the worst-case encoding, then encode in place */
    h2o_iovec_t dst = h2o_buffer_reserve(&stream->sendbuf, stream->sendbuf->size + H2O_HPACK_ENCODE_INT_MAX_LENGTH);
    assert(dst.base != NULL);
    stream->sendbuf->size += h2o_qpack_decoder_send_stream_cancel(conn->qpack.dec, (uint8_t *)dst.base, stream_id);

    /* notify the transport */
    H2O_HTTP3_CHECK_SUCCESS(quicly_stream_sync_sendbuf(stream->quic, 1) == 0);
}
1309
/**
 * Queues QPACK encoder-stream bytes (header acknowledgements) and notifies the transport that data is pending.
 */
void h2o_http3_send_qpack_header_ack(h2o_http3_conn_t *conn, const void *bytes, size_t len)
{
    struct st_h2o_http3_egress_unistream_t *stream = conn->_control_streams.egress.qpack_encoder;

    assert(stream != NULL);
    h2o_buffer_append(&stream->sendbuf, bytes, len);
    /* quicly_stream_sync_sendbuf returns zero on success; compare against zero before asserting success (cf.
     * h2o_http3_send_qpack_stream_cancel) — the previous code asserted that the call had *failed* */
    H2O_HTTP3_CHECK_SUCCESS(quicly_stream_sync_sendbuf(stream->quic, 1) == 0);
}
1318
/**
 * Queues a GOAWAY frame carrying `stream_or_push_id` on the outgoing control stream and notifies the transport.
 */
void h2o_http3_send_goaway_frame(h2o_http3_conn_t *conn, uint64_t stream_or_push_id)
{
    size_t cap = h2o_http3_goaway_frame_capacity(stream_or_push_id);
    h2o_iovec_t alloced = h2o_buffer_reserve(&conn->_control_streams.egress.control->sendbuf, cap);
    /* check the reservation before writing; previously a failed reservation would have been dereferenced (cf. the same
     * pattern with an assertion in h2o_http3_send_qpack_stream_cancel) */
    assert(alloced.base != NULL);
    h2o_http3_encode_goaway_frame((uint8_t *)alloced.base, stream_or_push_id);
    conn->_control_streams.egress.control->sendbuf->size += cap;
    quicly_stream_sync_sendbuf(conn->_control_streams.egress.control->quic, 1);
}
1327
/**
 * Returns a boolean indicating whether HTTP/3 datagrams can be used: the peer must have advertised H3_DATAGRAM support and
 * our own transport must be configured to send DATAGRAM frames.
 */
int h2o_http3_can_use_h3_datagram(h2o_http3_conn_t *conn)
{
    quicly_context_t *qctx = quicly_get_context(conn->super.quic);
    return conn->peer_settings.h3_datagram && qctx->transport_params.max_datagram_frame_size != 0;
}
1335
/**
 * Sends the given payloads as HTTP/3 datagrams belonging to `flow_id`, each one prefixed with the varint-encoded flow
 * identifier, then reschedules the timer so the frames get flushed.
 */
void h2o_http3_send_h3_datagrams(h2o_http3_conn_t *conn, uint64_t flow_id, h2o_iovec_t *datagrams, size_t num_datagrams)
{
    size_t i;

    for (i = 0; i != num_datagrams; ++i) {
        /* build flow-id prefix followed by the payload on the stack */
        uint8_t buf[quicly_encodev_capacity(flow_id) + datagrams[i].len];
        uint8_t *tail = quicly_encodev(buf, flow_id);
        memcpy(tail, datagrams[i].base, datagrams[i].len);
        tail += datagrams[i].len;
        ptls_iovec_t frame = ptls_iovec_init(buf, tail - buf);
        quicly_send_datagram_frames(conn->super.quic, &frame, 1);
    }

    h2o_quic_schedule_timer(&conn->super);
}
1350
/**
 * Decodes an HTTP/3 datagram: returns the flow identifier (UINT64_MAX on decode failure) and, on success, sets `*payload` to
 * the bytes following the flow-id prefix.
 */
uint64_t h2o_http3_decode_h3_datagram(h2o_iovec_t *payload, const void *_src, size_t len)
{
    const uint8_t *p = _src;
    const uint8_t *end = p + len;

    uint64_t flow_id = ptls_decode_quicint(&p, end);
    if (flow_id != UINT64_MAX)
        *payload = h2o_iovec_init(p, end - p);
    return flow_id;
}
1360