1 /* Copyright (c) 2012, 2020, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 #include <assert.h>
24 #include <errno.h>
25 #ifndef __STDC_FORMAT_MACROS
26 #define __STDC_FORMAT_MACROS
27 #endif
28 #ifndef _WIN32
29 #include <inttypes.h>
30 #endif
31 #include <limits.h>
32 #include <signal.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <sys/time.h>
37 #include <sys/types.h>
38
39 #ifndef _WIN32
40 #include <poll.h>
41 #endif
42
43 #ifdef _WIN32
44 #define xcom_buf char
45 #else
46 #define xcom_buf void
47 #endif
48
49 /**
50 @file
51 xcom/xcom_base.c
52 The new version of xcom is a major rewrite to allow
53 transmission of multiple messages from several sources
54 simultaneously without collision. The interface to xcom is largely
55 intact, one notable change is that xcom will consider the message
56 delivered as soon as it has got a majority. Consequently, the VP
57 set will not necessarily show all nodes which will actually
58 receive the message.
59
60 OHKFIX Add wait for complete last known node set to mimic the old
61 semantics.
62
63
64 IMPORTANT: What xcom does and what it does not do:
65
66 xcom messages are received in the same order on all nodes.
67
68 xcom guarantees that if a message is delivered to one node, it will
69 eventually be seen on all other nodes as well.
70
71 xcom messages are available to a crashed node when it comes up
72 again if at least one node which knows the value of the message
73 has not crashed. The size of the message cache is configurable.
74
75 OHKFIX Add logging to disk to make messages durable across system
76 crash and to increase the number of messages which may be cached.
77
78 There is no guarantee whatsoever about the order of messages from
79 different nodes, not even the order of multiple messages from the
80 same node. It is up to the client to impose such an order by
81 waiting on a message before it sends the next.
82
83 xcom can notify the client that a message has timed out, and in
84 that case will try to cancel the message, but it cannot guarantee
85 that a message which has timed out will not be delivered.
86
87 xcom attaches a node set to each message as it is delivered to the
88 client. This node set reflects the current node set that xcom
89 believes is active, it does not mean that the message has been
90 delivered yet to all nodes in the set. Neither does it mean that
91 the message has not been delivered to the nodes not in the set.
92
93 A cache of Paxos state machines is central to the new design. The
94 purpose of the cache is both to store a window of messages, and to
95 decouple the different parts of xcom, like message proposal,
96 message delivery and execution, and recovery. The old cache was
97 limited to caching messages, and a single state machine ran the
98 combined VP and Paxos algorithm. This constrained xcom to deliver
99 only a single message at a time.
100
101 Each instance of the Paxos state machine implements the basic
102 Paxos protocol. Unlike the cache in the old system, it is not
103 cleared when a site is deleted. This removes some problems
104 related to message delivery during site deletion. The cache is a
105 classic fixed size LRU with a hash index.
106
Some extensions to the basic Paxos algorithm have been implemented:
108
109 A node has ownership to all synodes with its own node number. Only
110 a node with node number N can propose a value for synode {X N},
111 where X is the sequence number, and N is the node number. Other
112 nodes can only propose the special value no_op for synode {X N}.
113 The reason for this is to retain the leaderless Paxos algorithm,
114 but to avoid collisions between nodes which are competing for the
115 same synode number. With this scheme, each node has its own unique
116 number series during normal operation. The scheme has the
117 following implications:
118
119 1. If a node N has not already proposed a value for the synode {X N},
120 it may at any time send a LEARN message to the other nodes with
121 the reserved value no_op, without going through phase 1 and 2 of
122 Paxos. This is because the other nodes are constrained to propose
123 no_op for this synode, so the final outcome will always be no_op.
124 To avoid unnecessary message transmission, a node will try to
125 broadcast the no_op LEARN messages by piggybacking the information
126 on the messages of the basic Paxos protocol.
127
128 2. Other nodes which want to find the value of synode {X N} may do
129 so by trying to get the value no_op accepted by following the
130 basic Paxos algorithm. The result will be the actual value
131 proposed by node N if it has done so, otherwise no_op. This will
132 typically only be necessary when a node is down, and the other
133 nodes need to find the values from the missing node in order to be
134 able to continue execution.
135
136 Messages are delivered in order to the client, and the order is
137 determined by the sequence number and the node number, with the
138 sequence number as the most significant part.
139
140 The xcom network interface has been redesigned and is now
141 implemented directly on top of TCP, and has so far been completely
142 trouble free. We use poll() or select() to implement non-blocking
143 send and receive, but libev could equally well have been used.
144
145 Multicast is implemented on top of unicast as before, but the
146 implementation is prepared to use real multicast with relatively
147 minor changes.
148
149 The roles of proposer, acceptor/learner, and executor are now
150 directly mapped to unique task types which interact with the Paxos
151 state machines, whereas the previous implementation folded all the
152 roles into a single event driven state machine.
153
154 The following terminology will be used:
155
156 A node is an instance of the xcom thread. There is only one instance
157 of the xcom thread in the agent.
158 A client is the application which is using xcom to send messages.
159 A thread is a real OS thread.
160 A task is a logical process. It is implemented by coroutines and
161 an explicit stack.
162
163 The implementation of tasks and non-blocking socket operations is
164 isolated in task.h and task.c.
165
166 A node will open a tcp connection to each of the other nodes. This
167 connection is used for all communication initiated by the node,
168 and replies to messages will arrive on the connection on which it
169 was sent.
170
171 static int tcp_server(task_arg);
172
173 The tcp_server listens on the xcom port and starts an
174 acceptor_learner_task whenever a new connection is detected.
175
176 static int tcp_reaper_task(task_arg);
177
Closes tcp connections which have been unused for too long.
179
180 static int sender_task(task_arg);
181
The sender_task waits for tcp messages on its input queue and
sends them on the tcp socket. If the socket is closed for any
184 reason, the sender_task will reconnect the socket. There is one
185 sender_task for each socket. The sender task exists mainly to
186 simplify the logic in the other tasks, but it could have been
187 replaced with a coroutine which handles the connection logic after
188 having reserved the socket for its client task.
189
190 static int generator_task(task_arg);
191
192 The generator_task reads messages from the client queue and moves
193 them into the input queue of the proposer_task.
194
195 OHKFIX Use a tcp socket instead of the client queue. We can then
196 remove the generator_task and let the acceptor_learner_task do the
197 dispatching.
198
199 static int proposer_task(task_arg);
200
201 Assign a message number to an outgoing message and try to get it
202 accepted. There may be several proposer tasks on each node
203 working in parallel. If there are multiple proposer tasks, xcom can
204 not guarantee that the messages will be sent in the same order as
205 received from the client.
206
207 static int acceptor_learner_task(task_arg);
208
209 This is the server part of the xcom thread. There is one
acceptor_learner_task for each node in the system. The
acceptor_learner_task reads messages from the socket, finds the correct
212 Paxos state machine, and dispatches to the correct message handler
213 with the state machine and message as arguments.
214
215 static int reply_handler_task(task_arg);
216
217 The reply_handler_task does the same job as the
218 acceptor_learner_task, but listens on the socket which the node
219 uses to send messages, so it will handle only replies on that
220 socket.
221
222 static int executor_task(task_arg);
223
The executor_task waits for a Paxos message to be accepted. When
225 the message is accepted, it is delivered to the client,
226 unless it is a no-op. In either case, the executor_task steps to
227 the next message and repeats the wait. If it times out waiting for
228 a message, it will try to get a no-op accepted.
229
230 static int alive_task(task_arg);
231
232 Sends i-am-alive to other nodes if there has been no normal traffic
233 for a while. It also pings nodes which seem to be inactive.
234
235 static int detector_task(task_arg);
236
237 The detector_task periodically scans the set of connections from
238 other nodes and sees if there has been any activity. If there has
239 been no activity for some time, it will assume that the node is
240 dead, and send a view message to the client.
241
242
243 Reconfiguration:
244
245 The xcom reconfiguration process is essentially the one described in
246 "Reconfiguring a State Machine" by Lamport et al. as the R-alpha
247 algorithm.
248 We execute the reconfiguration command immediately, but the config is
249 only valid after a delay of alpha messages.
250 The parameter alpha is the same as
251 EVENT_HORIZON in this implementation. :/static.*too_far
252 All tcp messages from beyond the event horizon will be ignored.
253
254 */
255 #include "xcom/xcom_profile.h"
256
257 #ifndef XCOM_STANDALONE
258 #include "my_compiler.h"
259 #endif
260 #include "xcom/x_platform.h"
261
262 #ifndef _WIN32
263 #include <arpa/inet.h>
264 #include <net/if.h>
265 #include <sys/ioctl.h>
266 #include <sys/socket.h>
267 #ifndef __linux__
268 #include <sys/sockio.h>
269 #endif
270 #endif
271
272 #if defined(_WIN32)
273 #include <windows.h>
274 #endif
275
276 #include "xcom/app_data.h"
277 #include "xcom/get_synode_app_data.h"
278 #include "xcom/node_no.h"
279 #include "xcom/server_struct.h"
280 #include "xcom/simset.h"
281 #include "xcom/site_struct.h"
282 #include "xcom/task.h"
283 #include "xcom/task_net.h"
284 #include "xcom/task_os.h"
285 #include "xcom/xcom_base.h"
286 #include "xcom/xcom_common.h"
287 #include "xcom/xcom_detector.h"
288 #include "xcom/xcom_transport.h"
289 #include "xcom/xdr_utils.h"
290 #include "xdr_gen/xcom_vp.h"
291
292 #ifndef XCOM_WITHOUT_OPENSSL
293 #include "xcom/xcom_ssl_transport.h"
294 #endif
295
296 #include "xcom/bitset.h"
297 #include "xcom/node_list.h"
298 #include "xcom/node_set.h"
299 #include "xcom/pax_msg.h"
300 #include "xcom/site_def.h"
301 #include "xcom/sock_probe.h"
302 #include "xcom/synode_no.h"
303 #include "xcom/task_debug.h"
304 #include "xcom/task_net.h"
305 #include "xcom/xcom_cache.h"
306 #include "xcom/xcom_cfg.h"
307 #include "xcom/xcom_interface.h"
308 #include "xcom/xcom_memory.h"
309 #include "xcom/xcom_msg_queue.h"
310 #include "xcom/xcom_recover.h"
311 #include "xcom/xcom_statistics.h"
312 #include "xcom/xcom_vp_str.h"
313
314 #ifndef XCOM_WITHOUT_OPENSSL
315 #ifdef _WIN32
316 /* In OpenSSL before 1.1.0, we need this first. */
317 #include <winsock2.h>
318 #endif /* _WIN32 */
319
320 #include <openssl/ssl.h>
321
322 #endif
323
324 /* Defines and constants */
325
326 #define SYS_STRERROR_SIZE 512
327
328 /* Avoid printing the warning of protocol version mismatch too often */
329 #define PROTOVERSION_WARNING_TIMEOUT 600.0 /** Every 10 minutes */
330 static double protoversion_warning_time =
331 0.0; /** Timestamp of previous protoversion warning */
332
333 /* Skip prepare for first ballot */
334 #ifdef ALWAYS_THREEPHASE
335 int const threephase = 1;
336 #else
337 int const threephase = 0;
338 #endif
339
340 #include "xcom/retry.h"
341
342 #ifdef NODE_0_IS_ARBITRATOR
343 int ARBITRATOR_HACK = 1;
344 #else
345 int ARBITRATOR_HACK = 0;
346 #endif
347
348 static int const no_duplicate_payload = 1;
349
350 /* Use buffered read when reading messages from the network */
351 static int use_buffered_read = 1;
352
353 /* Used to handle OOM errors */
354 static unsigned short oom_abort = 0;
355
356 /* Forward declarations */
357 long xcom_unique_long(void);
358
359 static double wakeup_delay(double old);
360 static void note_snapshot(node_no node);
361
362 /* Task types */
363 static int proposer_task(task_arg arg);
364 static int executor_task(task_arg arg);
365 static int sweeper_task(task_arg arg);
366 extern int alive_task(task_arg arg);
367 extern int cache_manager_task(task_arg arg);
368 extern int detector_task(task_arg arg);
369
370 static int finished(pax_machine *p);
371 static int accepted(pax_machine *p);
372 static int started(pax_machine *p);
373 static synode_no first_free_synode(synode_no msgno);
374 static void free_forced_config_site_def();
375 static void activate_sweeper();
376 static void force_pax_machine(pax_machine *p, int enforcer);
377 static void handle_need_snapshot(linkage *reply_queue, pax_msg *pm);
378 static void handle_skip(site_def const *site, pax_machine *p, pax_msg *m);
379
380 extern void bit_set_or(bit_set *x, bit_set const *y);
381
382 /* Global variables */
383
384 int xcom_shutdown = 0; /* Xcom_Shutdown flag */
385 synode_no executed_msg; /* The message we are waiting to execute */
386 synode_no max_synode; /* Max message number seen so far */
387 task_env *boot = NULL;
388 task_env *detector = NULL;
389 task_env *killer = NULL;
390 task_env *net_boot = NULL;
391 task_env *net_recover = NULL;
392 void *xcom_thread_input = 0;
393
394 long xcom_debug_mask =
395 /* D_DETECT | */ D_FSM /* | D_FILEOP | D_CONS | D_BASE */ | D_TRANSPORT;
396 long xcom_dbg_stack[DBG_STACK_SIZE];
397 int xcom_dbg_stack_top = 0;
398
399 static void init_proposers();
400 void initialize_lsn(uint64_t n);
401
init_base_vars()402 void init_base_vars() {
403 xcom_shutdown = 0; /* Xcom_Shutdown flag */
404 executed_msg = null_synode; /* The message we are waiting to execute */
405 max_synode = null_synode; /* Max message number seen so far */
406 boot = NULL;
407 detector = NULL;
408 killer = NULL;
409 net_boot = NULL;
410 net_recover = NULL;
411 xcom_thread_input = 0;
412 }
413
414 static task_env *executor = NULL;
415 static task_env *sweeper = NULL;
416 static task_env *retry = NULL;
417 static task_env *proposer[PROPOSERS];
418 static task_env *alive_t = NULL;
419 static task_env *cache_task = NULL;
420
421 static uint32_t my_id = 0; /* Unique id of this instance */
get_my_xcom_id()422 uint32_t get_my_xcom_id() { return my_id; }
423 static synode_no current_message; /* Current message number */
424 static synode_no
425 last_config_modification_id; /*Last configuration change proposal*/
426 static uint64_t lsn = 0; /* Current log sequence number */
427
get_current_message()428 synode_no get_current_message() { return current_message; }
429
430 static channel prop_input_queue; /* Proposer task input queue */
431
432 extern int client_boot_done;
433 extern int netboot_ok;
434
435 static linkage exec_wait = {
436 0, &exec_wait, &exec_wait}; /* Executor will wake up tasks sleeping here */
437
438 linkage detector_wait = {0, &detector_wait,
439 &detector_wait}; /* Detector sleeps here */
440
441 static struct {
442 int n;
443 unsigned long id[MAX_DEAD];
444 } dead_sites;
445
get_max_synode()446 synode_no get_max_synode() { return max_synode; }
447
is_latest_config(site_def const * const config)448 static bool_t is_latest_config(site_def const *const config) {
449 site_def const *const latest_config = get_site_def();
450 assert(latest_config != NULL);
451 return config == latest_config;
452 }
453
454 /**
455 * Get the first pending configuration that reconfigures the event horizon.
456 *
457 * Retrieve the first pending site_def, i.e. with the smallest start synod that
458 * is greater than executed_msg, that reconfigures the event horizon.
459 */
first_event_horizon_reconfig()460 static site_def const *first_event_horizon_reconfig() {
461 site_def const *active_config = find_site_def(executed_msg);
462 xcom_event_horizon active_event_horizon = active_config->event_horizon;
463 site_def const *first_event_horizon_reconfig = NULL;
464 site_def const *next_config = NULL;
465 for (next_config = find_next_site_def(active_config->start);
466 next_config != NULL && first_event_horizon_reconfig == NULL;
467 next_config = find_next_site_def(next_config->start)) {
468 if (active_event_horizon != next_config->event_horizon) {
469 first_event_horizon_reconfig = next_config;
470 }
471 }
472 return first_event_horizon_reconfig;
473 }
474
475 /**
476 * Get the latest pending configuration that reconfigures the event horizon.
477 *
478 * Retrieve the last pending site_def, i.e. with the greatest start synod that
479 * is greater than executed_msg, that reconfigures the event horizon.
480 */
latest_event_horizon_reconfig()481 static site_def const *latest_event_horizon_reconfig() {
482 site_def const *active_config = find_site_def(executed_msg);
483 xcom_event_horizon previous_event_horizon = active_config->event_horizon;
484 site_def const *last_event_horizon_reconfig = NULL;
485 site_def const *next_config = NULL;
486 for (next_config = find_next_site_def(active_config->start);
487 next_config != NULL;
488 next_config = find_next_site_def(next_config->start)) {
489 if (previous_event_horizon != next_config->event_horizon) {
490 previous_event_horizon = next_config->event_horizon;
491 last_event_horizon_reconfig = next_config;
492 }
493 }
494 return last_event_horizon_reconfig;
495 }
496
497 /**
498 * Add the event horizon to the given base synod s.
499 *
500 * We are assuming right now that this function is used solely in the context of
501 * "we have received a reconfiguration command at synod s, when should it be
502 * scheduled to take effect?"
503 * The result of this function is *when* it should take effect.
504 *
505 * Common case: there are no configurations pending, or if there are, none of
506 * them reconfigure the event horizon. The common case result is:
507 *
508 * s + event_horizon(active_config) + 1
509 *
510 *
511 * If an event horizon reconfiguration R is pending, it means that the command C
512 * proposed for synod s is concurrent with R, i.e., s falls in the interval
513 * ]proposed(R), start(R)[.
514 *
515 * In this situation we apply the command C proposed for synod s *after* taking
516 * into account R's event horizon.
517 *
518 * This means that the result is:
519 *
520 * start(R) + event_horizon(R) + 1
521 */
522 /* purecov: begin deadcode */
add_default_event_horizon(synode_no s)523 static synode_no add_default_event_horizon(synode_no s) {
524 s.msgno += EVENT_HORIZON_MIN + 1;
525 return s;
526 }
527 /* purecov: end */
528
/* Compute the synode at which something received at synode s takes effect.

   With an active configuration (the one found for executed_msg):
   - if that configuration is the latest one, or no event horizon
     reconfiguration is pending, the result is
     s.msgno + event_horizon(active) + 1;
   - otherwise the result is start(pending) + event_horizon(pending) + 1,
     where pending is the latest pending event horizon reconfiguration.
   Only msgno is changed; the other fields of s are returned as given.

   Without an active configuration the default horizon is used when
   PERMISSIVE_EH_ACTIVE_CONFIG is defined; otherwise this asserts and
   returns null_synode. */
static synode_no add_event_horizon(synode_no s) {
  site_def const *active_config = find_site_def(executed_msg);

  if (!active_config) {
    /* This is initial boot or recovery, we have no config */
#ifdef PERMISSIVE_EH_ACTIVE_CONFIG
    return add_default_event_horizon(s);
#else
    /* We should always have an active config */
    /* purecov: begin deadcode */
    assert(active_config != NULL);
    return null_synode;
    /* purecov: end */
#endif
  }

  {
    site_def const *pending = latest_event_horizon_reconfig();
    if (is_latest_config(active_config) || pending == NULL) {
      s.msgno = s.msgno + active_config->event_horizon + 1;
    } else {
      s.msgno = pending->start.msgno + pending->event_horizon + 1;
    }
  }
  return s;
}
552
553 /**
554 Set node group
555 */
set_group(uint32_t id)556 void set_group(uint32_t id) {
557 IFDBG(D_NONE, FN; STRLIT("changing group id of global variables ");
558 NDBG((unsigned long)id, lu););
559 /* set_group_id(id); */
560 current_message.group_id = id;
561 executed_msg.group_id = id;
562 max_synode.group_id = id;
563 }
564
bury_site(uint32_t id)565 static void bury_site(uint32_t id) {
566 if (id != 0) {
567 dead_sites.id[dead_sites.n % MAX_DEAD] = id;
568 dead_sites.n = (dead_sites.n + 1) % MAX_DEAD;
569 }
570 }
571
is_dead_site(uint32_t id)572 static bool_t is_dead_site(uint32_t id) {
573 int i = 0;
574 for (i = 0; i < MAX_DEAD; i++) {
575 if (dead_sites.id[i] == id)
576 return TRUE;
577 else if (dead_sites.id[i] == 0)
578 return FALSE;
579 }
580 return FALSE;
581 }
582
583 extern node_set *init_node_set(node_set *set, u_int n);
584 extern node_set *alloc_node_set(node_set *set, u_int n);
585
586 #if 0
587 /* Find our previous message number. */
588 static synode_no decr_msgno(synode_no msgno)
589 {
590 synode_no ret = msgno;
591 ret.msgno--;
592 ret.node = get_nodeno(find_site_def(ret)); /* In case site and node number has changed */
593 return ret;
594 }
595 #endif
596
597 /* Find our next message number. */
incr_msgno(synode_no msgno)598 static synode_no incr_msgno(synode_no msgno) {
599 synode_no ret = msgno;
600 ret.msgno++;
601 ret.node = get_nodeno(
602 find_site_def(ret)); /* In case site and node number has changed */
603 return ret;
604 }
605
/* Return the synode following the given one: the next node number within
   the same message number, or node 0 of the next message number once the
   node number reaches get_maxnodes() of the site found for the input
   synode. */
synode_no incr_synode(synode_no synode) {
  synode_no next = synode;
  next.node++;
  if (next.node < get_maxnodes(find_site_def(synode))) {
    return next;
  }
  /* Wrapped past the last node: move on to the next message number */
  next.node = 0;
  next.msgno++;
  return next; /* Change this if we change message number type */
}
616
/* Return the synode preceding the given one: the previous node number
   within the same message number, or, when the node number is 0, the last
   node (get_maxnodes() - 1 of the site found for the decremented synode) of
   the previous message number. */
synode_no decr_synode(synode_no synode) {
  synode_no prev = synode;
  if (prev.node != 0) {
    prev.node--;
    return prev;
  }
  /* Step back to the previous message number and take its last node */
  prev.msgno--;
  prev.node = get_maxnodes(find_site_def(prev));
  prev.node--;
  return prev; /* Change this if we change message number type */
}
626
/* Turn p into a learn message carrying the no_op value. */
static void skip_value(pax_msg *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode));
  p->msg_type = no_op;
  p->op = learn_op;
}
632
633 /* Utilities and debug */
634
635 #ifndef _WIN32
636 /* Ignore this signal */
ignoresig(int signum)637 static int ignoresig(int signum) {
638 struct sigaction act;
639 struct sigaction oldact;
640
641 memset(&act, 0, sizeof(act));
642 act.sa_handler = SIG_IGN;
643 memset(&oldact, 0, sizeof(oldact));
644
645 return sigaction(signum, &act, &oldact);
646 }
647 #else
648 #define SIGPIPE 0
ignoresig(int signum)649 static int ignoresig(int signum) { return 0; }
650 #endif
651
/* Non-zero if p has a modification timestamp and that timestamp plus
   BUILD_TIMEOUT plus median_time() still lies after task_now(). */
static int recently_active(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->learner.msg ? pax_op_to_str(p->learner.msg->op) : "NULL");
        NDBG(p->last_modified, f); NDBG(task_now(), f));
  if (p->last_modified == 0.0) {
    return 0; /* Never modified */
  }
  return (p->last_modified + BUILD_TIMEOUT + median_time()) > task_now();
}
659
/* Non-zero if the learner of p holds a message whose op is learn_op or
   tiny_learn_op. */
static inline int finished(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->learner.msg ? pax_op_to_str(p->learner.msg->op) : "NULL"););
  if (!p->learner.msg) {
    return 0;
  }
  return p->learner.msg->op == learn_op ||
         p->learner.msg->op == tiny_learn_op;
}
666
/* Externally visible wrapper around the file-local finished(). */
int pm_finished(pax_machine *p) {
  return finished(p);
}
668
/* Non-zero if the acceptor of p holds a message whose op is something other
   than initial_op. */
static inline int accepted(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->acceptor.msg ? pax_op_to_str(p->acceptor.msg->op) : "NULL"););
  if (!p->acceptor.msg) {
    return 0;
  }
  return p->acceptor.msg->op != initial_op;
}
674
/* Non-zero if p has accepted a message (see accepted()) and that message is
   of type no_op. */
static inline int accepted_noop(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->acceptor.msg ? pax_op_to_str(p->acceptor.msg->op) : "NULL"););
  if (!accepted(p)) {
    return 0;
  }
  return p->acceptor.msg->msg_type == no_op;
}
680
/* Non-zero if pm is a no_op message and p has already accepted a no_op. */
static inline int noop_match(pax_machine *p, pax_msg *pm) {
  if (pm->msg_type != no_op) {
    return 0;
  }
  return accepted_noop(p) != 0;
}
684
/* Non-zero if any of the following holds for p: its op is not initial_op,
   its acceptor has a promise with a positive count, its proposer holds a
   message whose op is not initial_op, it has accepted a message, or it is
   finished. */
static inline int started(pax_machine *p) {
  if (p->op != initial_op) return 1;
  if (p->acceptor.promise.cnt > 0) return 1;
  if (p->proposer.msg && (p->proposer.msg->op != initial_op)) return 1;
  if (accepted(p)) return 1;
  return finished(p) != 0;
}
690
/* Record received_config_change as the last configuration change proposal
   (last_config_modification_id). */
void set_last_received_config(synode_no received_config_change) {
  last_config_modification_id = received_config_change;
}
694
695 /* Definition of majority */
max_check(site_def const * site)696 static inline node_no max_check(site_def const *site) {
697 #ifdef MAXACCEPT
698 return MIN(get_maxnodes(site), MAXACCEPT);
699 #else
700 return get_maxnodes(site);
701 #endif
702 }
703
704 static site_def *forced_config = 0;
/* Return the enforcer flag of p. */
static int is_forcing_node(pax_machine const *p) {
  return p->enforcer;
}
706 static int wait_forced_config = 0;
707
708 /* Definition of majority */
majority(bit_set const * nodeset,site_def const * s,int all,int delay MY_ATTRIBUTE ((unused)),int force)709 static inline int majority(bit_set const *nodeset, site_def const *s, int all,
710 int delay MY_ATTRIBUTE((unused)), int force) {
711 node_no ok = 0;
712 node_no i = 0;
713 int retval = 0;
714 #ifdef WAIT_FOR_ALL_FIRST
715 double sec = task_now();
716 #endif
717 node_no max = max_check(s);
718
719 /* IFDBG(D_NONE, FN; NDBG(max,lu); NDBG(all,d); NDBG(delay,d); NDBG(force,d));
720 */
721
722 /* Count nodes that has answered */
723 for (i = 0; i < max; i++) {
724 if (BIT_ISSET(i, nodeset)) {
725 ok++;
726 }
727 #ifdef WAIT_FOR_ALL_FIRST
728 else {
729 if (all) return 0; /* Delay until all nodes have answered */
730 if (delay && !may_be_dead(s->detected, i, sec)) {
731 return 0; /* Delay until all live nodes have answered */
732 }
733 }
734 #endif
735 }
736
737 /* If we are forcing messages, attempt to ensure consistency by
738 requiring all remaining nodes to agree. Forced_config points to
739 the config that should be used as acceptors in this
740 case. Another possibility is to use the original config and
741 count the number of live nodes, but since the force flag is
742 being used only to force a new config, it seems safer to use
743 the new config and no time-dependent info. Note that we are
744 counting the answers based on the normal config, but use the
745 number of nodes from forced_config. This is safe, since we can
746 assume that the nodes that are not in forced_config will never
747 answer. */
748
749 if (force) {
750 IFDBG(D_NONE, FN; STRLIT("force majority"); NDBG(ok, u); NDBG(max, u);
751 NDBG(get_maxnodes(forced_config), u));
752 return ok == get_maxnodes(forced_config);
753 } else {
754 /* Have now seen answer from all live nodes */
755 #ifdef NODE_0_IS_ARBITRATOR
756 retval = all ? ok == max
757 : ok > max / 2 ||
758 (ARBITRATOR_HACK && (get_nodeno(s) == 0) && (2 == max));
759 #else
760 retval = all ? ok == max : ok > max / 2 || (ARBITRATOR_HACK && (2 == max));
761 #endif
762 /* IFDBG(D_NONE, FN; NDBG(max,lu); NDBG(all,d); NDBG(delay,d);
763 * NDBG(retval,d)); */
764 return retval;
765 }
766 }
767
768 #define IS_CONS_ALL(p) \
769 ((p)->proposer.msg->a ? (p)->proposer.msg->a->consensus == cons_all : 0)
770
771 /* See if a majority of acceptors have answered our prepare */
prep_majority(site_def const * site,pax_machine * p)772 static int prep_majority(site_def const *site, pax_machine *p) {
773 int ok = 0;
774
775 assert(p);
776 assert(p->proposer.prep_nodeset);
777 assert(p->proposer.msg);
778 /* IFDBG(D_NONE, FN; BALCEXP(p->proposer.bal)); */
779 ok = majority(p->proposer.prep_nodeset, site, IS_CONS_ALL(p),
780 p->proposer.bal.cnt == 1,
781 p->proposer.msg->force_delivery || p->force_delivery);
782 return ok;
783 }
784
785 /* See if a majority of acceptors have answered our propose */
prop_majority(site_def const * site,pax_machine * p)786 static int prop_majority(site_def const *site, pax_machine *p) {
787 int ok = 0;
788
789 assert(p);
790 assert(p->proposer.prop_nodeset);
791 assert(p->proposer.msg);
792 /* IFDBG(D_NONE, FN; BALCEXP(p->proposer.bal)); */
793 ok = majority(p->proposer.prop_nodeset, site, IS_CONS_ALL(p),
794 p->proposer.bal.cnt == 1,
795 p->proposer.msg->force_delivery || p->force_delivery);
796 return ok;
797 }
798
799 /* Xcom thread */
800
801 static site_def *executor_site = 0;
802
get_executor_site()803 site_def const *get_executor_site() { return executor_site; }
get_executor_site_rw()804 site_def *get_executor_site_rw() { return executor_site; }
805
806 static site_def *proposer_site = 0;
807
get_proposer_site()808 site_def const *get_proposer_site() { return proposer_site; }
809
810 /* delivered_msg may point to a no_op message, which will not actually be
811 * delivered */
812 static synode_no delivered_msg = NULL_SYNODE;
813
get_delivered_msg()814 synode_no get_delivered_msg() { return delivered_msg; }
815
816 /* last_delivered_msg is the last synode we actually delivered */
817 static synode_no last_delivered_msg = NULL_SYNODE;
get_last_delivered_msg()818 synode_no get_last_delivered_msg() { return last_delivered_msg; }
819
init_xcom_base()820 void init_xcom_base() {
821 IFDBG(D_NONE, FN);
822 xcom_shutdown = 0;
823 current_message = null_synode;
824 executed_msg = null_synode;
825 delivered_msg = null_synode;
826 last_delivered_msg = null_synode;
827 max_synode = null_synode;
828 client_boot_done = 0;
829 netboot_ok = 0;
830
831 xcom_recover_init();
832 my_id = new_id();
833 push_site_def(NULL);
834 /* update_servers(NULL); */
835 xcom_cache_var_init();
836 median_filter_init();
837 link_init(&exec_wait, TYPE_HASH("task_env"));
838 link_init(&detector_wait, TYPE_HASH("task_env"));
839 link_init(&connect_wait, TYPE_HASH("task_env"));
840 executor_site = 0;
841 proposer_site = 0;
842
843 /** Reset lsn */
844 initialize_lsn(0);
845 IFDBG(D_NONE, FN);
846 }
847
init_tasks()848 static void init_tasks() {
849 IFDBG(D_NONE, FN);
850 set_task(&boot, NULL);
851 set_task(&net_boot, NULL);
852 set_task(&net_recover, NULL);
853 set_task(&killer, NULL);
854 set_task(&executor, NULL);
855 set_task(&retry, NULL);
856 set_task(&detector, NULL);
857 init_proposers();
858 set_task(&alive_t, NULL);
859 set_task(&sweeper, NULL);
860 set_task(&cache_task, NULL);
861 IFDBG(D_NONE, FN);
862 }
863
/* Initialize the xcom thread: ignore SIGPIPE (unless NO_SIGPIPE is defined),
   initialise base/site variables and the CRC table, seed the random number
   generator, reset XCom state and task handles, then set up the proposer
   input queue, the link list, the task system and the cache. */
void xcom_thread_init() {
#ifndef NO_SIGPIPE
  signal(SIGPIPE, SIG_IGN);
#endif
  init_base_vars();
  init_site_vars();
  init_crc32c();
  /* Seed from the current task time, truncated to long int. */
  xcom_srand48((long int)task_now());

  init_xcom_base();
  init_tasks();

  /* Initialize input queue */
  channel_init(&prop_input_queue, TYPE_HASH("msg_link"));
  init_link_list();
  task_sys_init();

  init_cache();
}
884
885 /* Empty the proposer input queue */
empty_prop_input_queue()886 static void empty_prop_input_queue() {
887 empty_msg_channel(&prop_input_queue);
888 IFDBG(D_NONE, FN; STRLIT("prop_input_queue empty"));
889 }
890
/* De-initialize the xcom thread. Each step below is announced with a debug
   trace before it runs: empty the proposer input queue, empty the link free
   list, de-initialize the cache, garbage collect servers, de-initialize the
   network cache, and finally de-initialize the xcom interface. */
void xcom_thread_deinit() {
  IFDBG(D_BUG, FN; STRLIT("Empty proposer input queue"));
  empty_prop_input_queue();
  IFDBG(D_BUG, FN; STRLIT("Empty link free list"));
  empty_link_free_list();
  IFDBG(D_BUG, FN; STRLIT("De-initialize cache"));
  deinit_cache();
  garbage_collect_servers();
  IFDBG(D_BUG, FN; STRLIT("De-initialize network cache"));
  deinit_network_cache();
  IFDBG(D_BUG, FN; STRLIT("De-initialize xcom_interface"));
  deinit_xcom_interface();
}
905
906 #define PROP_ITER \
907 int i; \
908 for (i = 0; i < PROPOSERS; i++)
909
init_proposers()910 static void init_proposers() {
911 PROP_ITER { set_task(&proposer[i], NULL); }
912 }
913
create_proposers()914 static void create_proposers() {
915 PROP_ITER {
916 set_task(&proposer[i], task_new(proposer_task, int_arg(i), "proposer_task",
917 XCOM_THREAD_DEBUG));
918 }
919 }
920
terminate_proposers()921 static void terminate_proposers() {
922 PROP_ITER { task_terminate(proposer[i]); }
923 }
924
free_forced_config_site_def()925 static void free_forced_config_site_def() {
926 free_site_def(forced_config);
927 forced_config = NULL;
928 }
929
#if TASK_DBUG_ON
/* Debug helper, compiled only when TASK_DBUG_ON is set: when the
   XCOM_DEBUG_TRACE debug flag is active, print PROPOSERS and every entry of
   proposer[] through the GOUT output macros. Marked unused so that builds
   which never call it do not warn. */
static void dbg_proposers() MY_ATTRIBUTE((unused));
static void dbg_proposers() {
  GET_GOUT;
  if (!IS_XCOM_DEBUG_WITH(XCOM_DEBUG_TRACE)) return;
  NDBG(PROPOSERS, d);
  {
    /* Extra block: PROP_ITER declares its own loop variable. */
    PROP_ITER { PPUT(proposer[i]); }
  }
  PRINT_GOUT;
  FREE_GOUT;
}
#endif
943
set_proposer_startpoint()944 static void set_proposer_startpoint() {
945 IFDBG(D_NONE, FN; STRLIT("changing current message"));
946 if (synode_gt(max_synode, get_current_message())) {
947 if (max_synode.msgno <= 1)
948 set_current_message(first_free_synode(max_synode));
949 else
950 set_current_message(incr_msgno(first_free_synode(max_synode)));
951 }
952 if (synode_gt(executed_msg, get_current_message())) {
953 set_current_message(first_free_synode(executed_msg));
954 }
955 }
956
957 /* Task functions */
958
959 static xcom_state_change_cb xcom_run_cb = 0;
960 static xcom_state_change_cb xcom_terminate_cb = 0;
961 static xcom_state_change_cb xcom_comms_cb = 0;
962 static xcom_state_change_cb xcom_exit_cb = 0;
963 static xcom_state_change_cb xcom_expel_cb = 0;
964 static xcom_input_try_pop_cb xcom_try_pop_from_input_cb = NULL;
965
set_xcom_run_cb(xcom_state_change_cb x)966 void set_xcom_run_cb(xcom_state_change_cb x) { xcom_run_cb = x; }
967
set_xcom_comms_cb(xcom_state_change_cb x)968 void set_xcom_comms_cb(xcom_state_change_cb x) { xcom_comms_cb = x; }
969 /* purecov: begin deadcode */
set_xcom_terminate_cb(xcom_state_change_cb x)970 void set_xcom_terminate_cb(xcom_state_change_cb x) { xcom_terminate_cb = x; }
971 /* purecov: end */
set_xcom_exit_cb(xcom_state_change_cb x)972 void set_xcom_exit_cb(xcom_state_change_cb x) { xcom_exit_cb = x; }
973
974 static xcom_recovery_cb recovery_begin_cb = NULL;
975 /* purecov: begin deadcode */
set_xcom_recovery_begin_cb(xcom_recovery_cb x)976 void set_xcom_recovery_begin_cb(xcom_recovery_cb x) { recovery_begin_cb = x; }
977 /* purecov: end */
978
979 static xcom_recovery_cb recovery_restart_cb = NULL;
980 /* purecov: begin deadcode */
set_xcom_recovery_restart_cb(xcom_recovery_cb x)981 void set_xcom_recovery_restart_cb(xcom_recovery_cb x) {
982 recovery_restart_cb = x;
983 }
984 /* purecov: end */
985
986 static xcom_recovery_cb recovery_init_cb = NULL;
987 /* purecov: begin deadcode */
set_xcom_recovery_init_cb(xcom_recovery_cb x)988 void set_xcom_recovery_init_cb(xcom_recovery_cb x) { recovery_init_cb = x; }
989 /* purecov: end */
990
991 static xcom_recovery_cb recovery_end_cb = NULL;
992 /* purecov: begin deadcode */
set_xcom_recovery_end_cb(xcom_recovery_cb x)993 void set_xcom_recovery_end_cb(xcom_recovery_cb x) { recovery_end_cb = x; }
994 /* purecov: end */
995
set_xcom_expel_cb(xcom_state_change_cb x)996 void set_xcom_expel_cb(xcom_state_change_cb x) { xcom_expel_cb = x; }
997
set_xcom_input_try_pop_cb(xcom_input_try_pop_cb pop)998 void set_xcom_input_try_pop_cb(xcom_input_try_pop_cb pop) {
999 xcom_try_pop_from_input_cb = pop;
1000 }
1001
/* Client-side end of the signalling connection. Set by
   xcom_input_new_signal_connection(), written to by xcom_input_signal(), and
   closed and reset to NULL by xcom_input_free_signal_connection(). */
static connection_descriptor *input_signal_connection = NULL;
1003
1004 #ifndef XCOM_WITHOUT_OPENSSL
xcom_input_signal_connection_shutdown_ssl_wait_for_peer()1005 static bool_t xcom_input_signal_connection_shutdown_ssl_wait_for_peer() {
1006 int ssl_error_code = 0;
1007 do {
1008 char buf[1024];
1009 ssl_error_code = SSL_read(input_signal_connection->ssl_fd, buf, 1024);
1010 } while (ssl_error_code > 0);
1011
1012 bool_t const successful =
1013 (SSL_get_error(input_signal_connection->ssl_fd, ssl_error_code) ==
1014 SSL_ERROR_ZERO_RETURN);
1015 return successful;
1016 }
xcom_input_signal_connection_shutdown_ssl()1017 static bool_t xcom_input_signal_connection_shutdown_ssl() {
1018 bool_t successful = FALSE;
1019
1020 int ssl_error_code = SSL_shutdown(input_signal_connection->ssl_fd);
1021
1022 bool_t const need_to_wait_for_peer_shutdown = (ssl_error_code == 0);
1023 bool_t const something_went_wrong = (ssl_error_code < 0);
1024 if (need_to_wait_for_peer_shutdown) {
1025 successful = xcom_input_signal_connection_shutdown_ssl_wait_for_peer();
1026 if (!successful) goto end;
1027 } else if (something_went_wrong) {
1028 goto end;
1029 }
1030
1031 ssl_free_con(input_signal_connection);
1032 successful = TRUE;
1033
1034 end:
1035 return successful;
1036 }
1037 #endif
1038
xcom_input_new_signal_connection(char const * address,xcom_port port)1039 bool_t xcom_input_new_signal_connection(char const *address, xcom_port port) {
1040 bool_t const SUCCESSFUL = TRUE;
1041 bool_t const UNSUCCESSFUL = FALSE;
1042 assert(input_signal_connection == NULL);
1043
1044 /* Try to connect. */
1045 input_signal_connection = xcom_open_client_connection(address, port);
1046 if (input_signal_connection == NULL) return UNSUCCESSFUL;
1047
1048 /* Have the server handle the rest of this connection using a local_server
1049 task. */
1050 if (xcom_client_convert_into_local_server(input_signal_connection) == 1) {
1051 G_TRACE(
1052 "Converted the signalling connection handler into a local_server "
1053 "task on the client side.");
1054 #ifndef XCOM_WITHOUT_OPENSSL
1055 /* No more SSL in this connection. */
1056 {
1057 bool_t const using_ssl = (input_signal_connection->ssl_fd != NULL);
1058 if (using_ssl) {
1059 bool_t successful = xcom_input_signal_connection_shutdown_ssl();
1060 if (!successful) {
1061 G_ERROR(
1062 "Error shutting down SSL on XCom's signalling connection on the "
1063 "client side.");
1064 xcom_input_free_signal_connection();
1065 return UNSUCCESSFUL;
1066 }
1067 }
1068 }
1069 #endif
1070 return SUCCESSFUL;
1071 } else {
1072 G_DEBUG(
1073 "Error converting the signalling connection handler into a "
1074 "local_server task on the client side.");
1075 xcom_input_free_signal_connection();
1076 return UNSUCCESSFUL;
1077 }
1078 }
1079 static int64_t socket_write(connection_descriptor *wfd, void *_buf, uint32_t n);
xcom_input_signal()1080 bool_t xcom_input_signal() {
1081 bool_t successful = FALSE;
1082 if (input_signal_connection != NULL) {
1083 unsigned char tiny_buf[1] = {0};
1084 int64_t error_code = socket_write(input_signal_connection, tiny_buf, 1);
1085 successful = (error_code == 1);
1086 }
1087 return successful;
1088 }
xcom_input_free_signal_connection()1089 void xcom_input_free_signal_connection() {
1090 if (input_signal_connection != NULL) {
1091 xcom_close_client_connection(input_signal_connection);
1092 input_signal_connection = NULL;
1093 }
1094 }
1095
1096 #ifndef XCOM_WITHOUT_OPENSSL
/* Task-style (DECL_ENV/TASK_BEGIN) routine that shuts down SSL on `con`.
   @param con  connection whose ssl_fd is shut down
   @param buf  scratch buffer handed to task_read while waiting for the peer
   @param n    size passed to task_read for buf
   @param ret  out: set to 0 on entry, and to 1 only after the shutdown
               succeeded and ssl_free_con(con) was called */
static int local_server_shutdown_ssl(connection_descriptor *con, void *buf,
                                     int n, int *ret) {
  DECL_ENV
  int ssl_error_code;
  bool_t need_to_wait_for_peer_shutdown;
  bool_t something_went_wrong;
  int64_t nr_read;
  END_ENV;
  *ret = 0;
  TASK_BEGIN
  ep->ssl_error_code = SSL_shutdown(con->ssl_fd);
  /* 0 means we still have to wait for the peer; negative means failure. */
  ep->need_to_wait_for_peer_shutdown = (ep->ssl_error_code == 0);
  ep->something_went_wrong = (ep->ssl_error_code < 0);
  if (ep->need_to_wait_for_peer_shutdown) {
    /* Read and discard until task_read reports no more data. */
    do {
      TASK_CALL(task_read(con, buf, n, &ep->nr_read));
    } while (ep->nr_read > 0);
    /* Anything other than SSL_ERROR_ZERO_RETURN counts as a failure. */
    ep->ssl_error_code = SSL_get_error(con->ssl_fd, ep->nr_read);
    ep->something_went_wrong = (ep->ssl_error_code != SSL_ERROR_ZERO_RETURN);
  }
  if (ep->something_went_wrong) TERMINATE;
  ssl_free_con(con);
  *ret = 1;
  FINALLY
  TASK_END;
}
1123 #endif
1124
/* Task that serves the server side of the signalling connection.

   `arg` carries a heap-allocated connection_descriptor; it is copied into the
   task environment and the original is freed. If the connection uses SSL, SSL
   is shut down first (unless built with XCOM_WITHOUT_OPENSSL). The task then
   loops until xcom_shutdown is set: each successful read from the connection
   is treated as a signal that requests are pending, and every request
   obtained through xcom_try_pop_from_input_cb is wrapped in a client_msg
   pax_msg, dispatched with dispatch_op, and answered with
   xcom_input_request_reply. The loop also ends when the read returns 0 or a
   negative value. On exit the connection is shut down.

   Requires xcom_try_pop_from_input_cb to have been registered (asserted). */
int local_server(task_arg arg) {
  DECL_ENV
  connection_descriptor rfd;
  int ssl_shutdown_ret;
  unsigned char buf[1024]; /* arbitrary size */
  int64_t nr_read;
  xcom_input_request_ptr request;
  xcom_input_request_ptr next_request;
  pax_msg *request_pax_msg;
  pax_msg *reply_payload;
  linkage internal_reply_queue;
  msg_link *internal_reply;
  END_ENV;
  TASK_BEGIN
  assert(xcom_try_pop_from_input_cb != NULL);
  {
    /* Copy the descriptor into the task environment and release the
       heap-allocated original passed in via arg. */
    connection_descriptor *arg_rfd = (connection_descriptor *)get_void_arg(arg);
    ep->rfd = *arg_rfd;
    free(arg_rfd);
  }
  /* Put the rest of the task environment into a known state. */
  ep->ssl_shutdown_ret = 0;
  memset(ep->buf, 0, 1024);
  ep->nr_read = 0;
  ep->request = NULL;
  ep->next_request = NULL;
  ep->request_pax_msg = NULL;
  ep->reply_payload = NULL;
  link_init(&ep->internal_reply_queue, TYPE_HASH("msg_link"));
  ep->internal_reply = NULL;

#ifndef XCOM_WITHOUT_OPENSSL
  /* No more SSL in this connection. */
  if (ep->rfd.ssl_fd) {
    TASK_CALL(local_server_shutdown_ssl(&ep->rfd, ep->buf, 1024,
                                        &ep->ssl_shutdown_ret));
    if (ep->ssl_shutdown_ret != 1) {
      G_ERROR(
          "Error shutting down SSL on XCom's signalling connection on the "
          "server side.");
      TERMINATE;
    }
  }
#endif

  while (!xcom_shutdown) {
    /* Wait for signal that there is work to consume from the queue. */
    TASK_CALL(task_read(&ep->rfd, ep->buf, 1024, &ep->nr_read));
    if (ep->nr_read == 0) {
      /* purecov: begin inspected */
      G_WARNING("local_server: client closed the signalling connection?");
      break;
      /* purecov: end */
    } else if (ep->nr_read < 0) {
      /* purecov: begin inspected */
      IFDBG(D_NONE, FN; NDBG64(ep->nr_read));
      G_WARNING("local_server: error reading from the signalling connection?");
      break;
      /* purecov: end */
    }
    /* Pop, dispatch, and reply. The content of the bytes read is not
       inspected; only the fact that something arrived matters. */
    ep->request = xcom_try_pop_from_input_cb();
    while (ep->request != NULL) {
      /* Take ownership of the tail of the list, otherwise we lose it when we
         free ep->request. */
      ep->next_request = xcom_input_request_extract_next(ep->request);
      /* Install a fresh pax_msg for this request in ep->request_pax_msg. */
      unchecked_replace_pax_msg(&ep->request_pax_msg,
                                pax_msg_new_0(null_synode));
      assert(ep->request_pax_msg->refcnt == 1);
      ep->request_pax_msg->op = client_msg;
      /* Take ownership of the request's app_data, otherwise the app_data is
         freed with ep->request. */
      ep->request_pax_msg->a = xcom_input_request_extract_app_data(ep->request);
      ep->request_pax_msg->to = VOID_NODE_NO;
      /* Only force_config_type requests are flagged for forced delivery. */
      ep->request_pax_msg->force_delivery =
          (ep->request_pax_msg->a->body.c_t == force_config_type);
      dispatch_op(NULL, ep->request_pax_msg, &ep->internal_reply_queue);
      if (!link_empty(&ep->internal_reply_queue)) {
        ep->internal_reply =
            (msg_link *)(link_extract_first(&ep->internal_reply_queue));
        assert(ep->internal_reply->p);
        assert(ep->internal_reply->p->refcnt == 1);
        /* We are going to take ownership of the pax_msg which has the reply
           payload, so we bump its reference count so that it is not freed by
           msg_link_delete. */
        ep->reply_payload = ep->internal_reply->p;
        ep->reply_payload->refcnt++;
        msg_link_delete(&ep->internal_reply);
        /* There should only have been one reply. */
        assert(link_empty(&ep->internal_reply_queue));
      } else {
        /* dispatch_op queued no reply: answer with a NULL payload. */
        ep->reply_payload = NULL;
      }
      /* Reply to the request. */
      xcom_input_request_reply(ep->request, ep->reply_payload);
      xcom_input_request_free(ep->request);
      /* Continue with the tail extracted at the top of the loop. */
      ep->request = ep->next_request;
    }
  }
  FINALLY
  IFDBG(D_BUG, FN; STRLIT(" shutdown "); NDBG(ep->rfd.fd, d);
        NDBG(task_now(), f));
  /* Close the signalling connection. */
  shutdown_connection(&ep->rfd);
  /* Drop the pax_msg held for the last request, if any. */
  unchecked_replace_pax_msg(&ep->request_pax_msg, NULL);
  IFDBG(D_NONE, FN; NDBG(xcom_shutdown, d));
  TASK_END;
}
1232
local_server_is_setup()1233 static bool_t local_server_is_setup() {
1234 return xcom_try_pop_from_input_cb != NULL;
1235 }
1236
/* Main entry point of the XCom thread.

   Initialises the transport for listen_port, announces the TCP port, starts
   the tcp_server and tcp_reaper_task tasks, invokes the recovery init/begin
   callbacks (when registered) and then runs task_loop(). When task_loop()
   returns, or when the port could not be announced, the cleanup section runs:
   SSL cleanup (unless XCOM_WITHOUT_OPENSSL), xcom_thread_deinit(), the exit
   callback, and a reset of the debug mask and debug stack.

   Registered callbacks used: xcom_comms_cb (XCOM_COMMS_OK or
   XCOM_COMMS_ERROR), xcom_terminate_cb (only when announcing fails),
   xcom_exit_cb (always, just before returning).

   @param listen_port port to listen on
   @return always 1 */
int xcom_taskmain2(xcom_port listen_port) {
  init_xcom_transport(listen_port);

  IFDBG(D_BUG, FN; STRLIT("enter taskmain"));
  ignoresig(SIGPIPE);

  {
    /* Setup tcp_server socket */
    result tcp_fd = {0, 0};

    if ((tcp_fd = announce_tcp(listen_port)).val < 0) {
      /* purecov: begin inspected */
      IFDBG(D_BUG, FN; STRLIT("cannot annonunce tcp "); NDBG(listen_port, d));
      task_dump_err(tcp_fd.funerr);
      g_critical("Unable to announce tcp port %d. Port already in use?",
                 listen_port);
      /* Report the failure, then skip straight to cleanup without ever
         entering task_loop(). */
      if (xcom_comms_cb) {
        xcom_comms_cb(XCOM_COMMS_ERROR);
      }
      if (xcom_terminate_cb) {
        xcom_terminate_cb(0);
      }
      goto cleanup;
      /* purecov: end */
    }

    if (xcom_comms_cb) {
      xcom_comms_cb(XCOM_COMMS_OK);
    }

    IFDBG(D_NONE, FN; STRLIT("Creating tasks"));
    /* task_new(generator_task, null_arg, "generator_task", XCOM_THREAD_DEBUG);
     */
    task_new(tcp_server, int_arg(tcp_fd.val), "tcp_server", XCOM_THREAD_DEBUG);
    task_new(tcp_reaper_task, null_arg, "tcp_reaper_task", XCOM_THREAD_DEBUG);
    IFDBG(D_BUG, FN; STRLIT("XCOM is listening on "); NPUT(listen_port, d));
  }

  if (recovery_init_cb) recovery_init_cb();

  if (recovery_begin_cb) recovery_begin_cb();

  task_loop();

cleanup:

#ifdef TASK_EVENT_TRACE
  dump_task_events();
#endif
#ifndef XCOM_WITHOUT_OPENSSL
  xcom_cleanup_ssl();
#endif

  xcom_thread_deinit();
  if (xcom_exit_cb) {
    xcom_exit_cb(0);
  }
  IFDBG(D_BUG, FN; STRLIT(" exit "); NDBG(xcom_dbg_stack_top, d);
        NDBG((unsigned)xcom_debug_mask, x));
  /* Leave debugging switched off and the debug stack empty. */
  xcom_debug_mask = 0;
  xcom_dbg_stack_top = 0;
  return 1;
}
1300
1301 /* Paxos message construction and sending */
1302
1303 /* Initialize a message for sending */
prepare(pax_msg * p,pax_op op)1304 static void prepare(pax_msg *p, pax_op op) {
1305 p->op = op;
1306 p->reply_to = p->proposal;
1307 }
1308
1309 /* Initialize a prepare_msg */
init_prepare_msg(pax_msg * p)1310 void init_prepare_msg(pax_msg *p) { prepare(p, prepare_op); }
1311
/* Initialize p as a prepare message and send it to the acceptors. Returns
   whatever send_to_acceptors() returns. */
static int prepare_msg(pax_msg *p) {
  int send_result;

  init_prepare_msg(p);
  /* p->msg_type = normal; */
  send_result = send_to_acceptors(p, "prepare_msg");
  return send_result;
}
1317
1318 /* Initialize a noop_msg */
create_noop(pax_msg * p)1319 pax_msg *create_noop(pax_msg *p) {
1320 init_prepare_msg(p);
1321 p->msg_type = no_op;
1322 return p;
1323 }
1324
1325 /* Initialize a read_msg */
create_read(site_def const * site,pax_msg * p)1326 static pax_msg *create_read(site_def const *site, pax_msg *p) {
1327 p->msg_type = normal;
1328 p->proposal.node = get_nodeno(site);
1329 prepare(p, read_op);
1330 return p;
1331 }
1332
/* Initialize p as a no_op skip message and send it to all nodes. Returns
   whatever send_to_all() returns. */
static int skip_msg(pax_msg *p) {
  int send_result;

  prepare(p, skip_op);
  IFDBG(D_NONE, FN; STRLIT("skipping message "); SYCEXP(p->synode));
  p->msg_type = no_op;

  send_result = send_to_all(p, "skip_msg");
  return send_result;
}
1339
/* Stamp the message's synode onto its application data (if any): app_key gets
   msgno, node and group_id, and the app data's own group_id is set to the
   same group. */
static void brand_app_data(pax_msg *p) {
  if (!p->a) {
    return;
  }

  p->a->app_key.msgno = p->synode.msgno;
  p->a->app_key.node = p->synode.node;
  p->a->group_id = p->synode.group_id;
  p->a->app_key.group_id = p->a->group_id;
}
1347
/* Return a copy of `synode` whose group_id is replaced by my_id, the random
   number derived from node number and timestamp which uniquely defines this
   instance. my_id must already have been assigned (asserted). */
static synode_no my_unique_id(synode_no synode) {
  synode_no branded = synode;

  assert(my_id != 0);
  branded.group_id = my_id;
  return branded;
}
1355
/* Assign `synode` as the unique_id of every element in the message's app_data
   list. */
static void set_unique_id(pax_msg *msg, synode_no synode) {
  app_data_ptr item;

  for (item = msg->a; item; item = item->next) {
    item->unique_id = synode;
  }
}
1363
/* Initialize p as a propose (accept_op) message: replies go to the message's
   own proposal ballot, and the app data is branded with the message synode. */
void init_propose_msg(pax_msg *p) {
  p->reply_to = p->proposal;
  p->op = accept_op;
  brand_app_data(p);
  /* set_unique_id(p, my_unique_id(synode)); */
}
1370
/* Send an already initialized propose message to the acceptors. */
static int send_propose_msg(pax_msg *p) {
  int const send_result = send_to_acceptors(p, "propose_msg");
  return send_result;
}
1374
/* Initialize p as a propose message, then send it to the acceptors. */
static int propose_msg(pax_msg *p) {
  int send_result;

  init_propose_msg(p);
  send_result = send_propose_msg(p);
  return send_result;
}
1379
/* Mark p as a learn message. The message type is `normal` when the message
   carries app data and `no_op` otherwise. */
static void set_learn_type(pax_msg *p) {
  p->op = learn_op;
  if (p->a) {
    p->msg_type = normal;
  } else {
    p->msg_type = no_op;
  }
}
1384
1385 /* purecov: begin deadcode */
/* Initialize p as a learn message: set op and type, direct replies at the
   message's own proposal ballot, and brand the app data. */
static void init_learn_msg(pax_msg *p) {
  set_learn_type(p);

  p->reply_to = p->proposal;

  brand_app_data(p);
}
1391
/* Send the learn message p to every node of `site`. Returns whatever
   send_to_all_site() returns. */
static int send_learn_msg(site_def const *site, pax_msg *p) {
  int send_result;

  IFDBG(D_NONE, FN; dbg_bitset(p->receivers, get_maxnodes(site)););
  send_result = send_to_all_site(site, p, "learn_msg");
  return send_result;
}
1396 /* purecov: end */
1397
/* Build a tiny_learn message from p: a clone of p without the app data, with
   a reference taken on it. The clone's type reflects whether p itself carries
   app data, and replies are directed at the proposer ballot of pm. The caller
   receives the referenced clone. */
static pax_msg *create_tiny_learn_msg(pax_machine *pm, pax_msg *p) {
  pax_msg *clone = clone_pax_msg_no_app(p);

  ref_msg(clone);
  if (p->a) {
    clone->msg_type = normal;
  } else {
    clone->msg_type = no_op;
  }
  clone->op = tiny_learn_op;
  clone->reply_to = pm->proposer.bal;
  brand_app_data(clone);

  return clone;
}
1409
/* Send the tiny_learn message p to every node of `site`, then drop this
   function's reference to p. Returns whatever send_to_all_site() returned. */
static int send_tiny_learn_msg(site_def const *site, pax_msg *p) {
  int const send_result = send_to_all_site(site, p, "tiny_learn_msg");

  unref_msg(&p);
  return send_result;
}
1415
1416 /* Proposer task */
1417
/* Set up the proposer state of p and the message msg for a three-phase
   push of synode msgno.
   - Clears the set of nodes that answered the prepare.
   - Picks a new ballot owned by this node whose count is one higher than both
     the proposer's previous ballot and the acceptor's current promise.
   - Fills in synode, proposal, type and force_delivery of msg. */
void prepare_push_3p(site_def const *site, pax_machine *p, pax_msg *msg,
                     synode_no msgno, pax_msg_type msg_type) {
  int next_cnt;

  IFDBG(D_NONE, FN; SYCEXP(msgno); NDBG(p->proposer.bal.cnt, d);
        NDBG(p->acceptor.promise.cnt, d));
  BIT_ZERO(p->proposer.prep_nodeset);
  p->proposer.bal.node = get_nodeno(site);

  next_cnt = MAX(p->proposer.bal.cnt, p->acceptor.promise.cnt);
  next_cnt += 1;
  p->proposer.bal.cnt = next_cnt;

  msg->synode = msgno;
  msg->proposal = p->proposer.bal;
  msg->msg_type = msg_type;
  msg->force_delivery = p->force_delivery;
}
1433
/* Set up the proposer state of p for a two-phase push of its own proposer
   message, which must exist (asserted).
   - Clears the set of nodes that answered the proposal.
   - Uses ballot count 0 owned by this node.
   - Copies ballot, synode and force_delivery into the proposer message. */
void prepare_push_2p(site_def const *site, pax_machine *p) {
  pax_msg *outgoing;

  assert(p->proposer.msg);

  BIT_ZERO(p->proposer.prop_nodeset);
  IFDBG(D_NONE, FN; SYCEXP(p->synode));
  p->proposer.bal.cnt = 0;
  p->proposer.bal.node = get_nodeno(site);

  outgoing = p->proposer.msg;
  outgoing->proposal = p->proposer.bal;
  outgoing->synode = p->synode;
  outgoing->force_delivery = p->force_delivery;
}
1445
/* Two-phase push: prepare the proposer state of p, then propose its message
   directly. The send result is not checked. */
static void push_msg_2p(site_def const *site, pax_machine *p) {
  prepare_push_2p(site, p);

  propose_msg(p->proposer.msg);
}
1450
/* Three-phase push: set up a new ballot for synode msgno and send msg as a
   prepare message to the acceptors. While a forced configuration is being
   awaited, the pax machine is marked as forced first. msgno.msgno must be
   non-zero and p must have a proposer message (both asserted). The send
   result is not checked. */
static void push_msg_3p(site_def const *site, pax_machine *p, pax_msg *msg,
                        synode_no msgno, pax_msg_type msg_type) {
  if (wait_forced_config) force_pax_machine(p, 1);

  assert(msgno.msgno != 0);
  prepare_push_3p(site, p, msg, msgno, msg_type);

  assert(p->proposer.msg);
  prepare_msg(msg);

  IFDBG(D_NONE, FN; BALCEXP(msg->proposal); SYCEXP(msgno); STRLIT(" op ");
        STRLIT(pax_op_to_str(msg->op)));
}
1464
1465 /* Brand client message with unique ID */
brand_client_msg(pax_msg * msg,synode_no msgno)1466 static void brand_client_msg(pax_msg *msg, synode_no msgno) {
1467 assert(!synode_eq(msgno, null_synode));
1468 set_unique_id(msg, my_unique_id(msgno));
1469 }
1470
/* Attach the app data `a` to msg, mark msg as a client message, wrap it in a
   new msg_link addressed to VOID_NODE_NO, and put that link on the proposer
   input queue. */
void xcom_send(app_data_ptr a, pax_msg *msg) {
  msg_link *queued;

  IFDBG(D_NONE, FN; PTREXP(a); SYCEXP(a->app_key); SYCEXP(msg->synode));
  msg->a = a;
  msg->op = client_msg;

  queued = msg_link_new(msg, VOID_NODE_NO);
  IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_pax_msg(msg)));
  channel_put(&prop_input_queue, &queued->l);
}
1481
1482 #define FNVSTART 0x811c9dc5
1483
/* Fowler-Noll-Vo type multiplicative hash.
   Folds `length` bytes from `buf` into the running value `sum`: for each
   byte, the running value is multiplied by 0x01000193 (modulo 2^32) and then
   XORed with the byte. With length 0 the input `sum` is returned unchanged,
   which allows calls to be chained over several buffers. */
static uint32_t fnv_hash(unsigned char *buf, size_t length, uint32_t sum) {
  size_t pos = 0;

  while (pos < length) {
    uint32_t const scaled = sum * (uint32_t)0x01000193;
    sum = scaled ^ (uint32_t)buf[pos];
    pos++;
  }
  return sum;
}
1492
/**
   Create a new (hopefully unique) ID. The basic idea is to create a hash from
   the host ID and a timestamp.

   The returned ID is never 0 and never one for which is_dead_site() is true.
   The first candidate is the plain hash of host ID and timestamp. If that
   candidate is rejected, a retry counter is mixed into the hash for every
   further attempt. Without it each retry would hash exactly the same inputs,
   produce the same rejected value, and the loop would never terminate.

   @return a non-zero ID that is not a known dead site
 */
uint32_t new_id() {
  long id = xcom_unique_long();
  double timestamp = task_now();
  uint32_t attempt = 0;
  uint32_t retval = 0;
  while (retval == 0 ||
         is_dead_site(retval)) { /* Avoid returning 0 or already used site id */
    retval = fnv_hash((unsigned char *)&id, sizeof(id), 0);
    retval = fnv_hash((unsigned char *)&timestamp, sizeof(timestamp), retval);
    if (attempt != 0) {
      /* Perturb retries only, so the first candidate is unchanged. */
      retval = fnv_hash((unsigned char *)&attempt, sizeof(attempt), retval);
    }
    attempt++;
  }
  return retval;
}
1508
getstart(app_data_ptr a)1509 static synode_no getstart(app_data_ptr a) {
1510 synode_no retval = null_synode;
1511 /* If a->group_id is null_id, we set the group id from app_key.group_id,
1512 * which is hopefully not null_id. If it is, we're out of luck. */
1513 if (a && a->group_id == null_id) {
1514 /* purecov: begin deadcode */
1515 a->group_id = a->app_key.group_id; /* app_key may have valid group */
1516 /* purecov: end */
1517 }
1518 G_DEBUG("pid %d getstart group_id %x", xpid(), a->group_id);
1519 if (!a || a->group_id == null_id) {
1520 retval.group_id = new_id();
1521 } else {
1522 a->app_key.group_id = a->group_id;
1523 retval = a->app_key;
1524 if (get_site_def() &&
1525 retval.msgno > 1) { /* Special case for initial boot of site */
1526 /* Not valid until after event horizon has been passed */
1527 retval = add_event_horizon(retval);
1528 }
1529 }
1530 return retval;
1531 }
1532
1533 /* purecov: begin deadcode */
get_default_start(app_data_ptr a)1534 synode_no get_default_start(app_data_ptr a) {
1535 synode_no retval = null_synode;
1536 /* If a->group_id is null_id, we set the group id from app_key.group_id,
1537 * which is hopefully not null_id. If it is, we're out of luck. */
1538 if (a && a->group_id == null_id) {
1539 a->group_id = a->app_key.group_id; /* app_key may have valid group */
1540 }
1541 G_DEBUG("pid %d getstart group_id %x", xpid(), a->group_id);
1542 if (!a || a->group_id == null_id) {
1543 retval.group_id = new_id();
1544 } else {
1545 a->app_key.group_id = a->group_id;
1546 retval = a->app_key;
1547 if (retval.msgno > 1) { /* Special case for initial boot of site */
1548 /* Not valid until after event horizon has been passed */
1549 retval = add_default_event_horizon(retval);
1550 }
1551 }
1552 return retval;
1553 }
1554 /* purecov: end */
1555
1556 /* purecov: begin deadcode */
/* Log the addresses of all nodes in `site` on one line, separated by spaces.
   Logs "no site" when site is NULL.

   The list is assembled in a fixed-size stack buffer. Every append is bounded
   by the space that is left, so an unexpectedly long node list is truncated
   instead of overflowing the buffer. */
static void dump_xcom_node_names(site_def const *site) {
  u_int i;
  char buf[NSERVERS * 256]; /* Expected to be big enough; appends are bounded */
  size_t used = 0;
  if (!site) {
    G_INFO("pid %d no site", xpid());
    return;
  }
  buf[0] = 0;
  for (i = 0; i < site->nodes.node_list_len; i++) {
    size_t const room = sizeof(buf) - used;
    int const written =
        snprintf(buf + used, room, "%s ", site->nodes.node_list_val[i].address);
    if (written < 0 || (size_t)written >= room) {
      /* Formatting error or out of space: keep what fits and stop. */
      break;
    }
    used += (size_t)written;
  }
  G_INFO("pid %d node names %s", xpid(), buf);
}
1572 /* purecov: end */
1573
/* Install `site` as the newest site definition.

   Steps, in order: raise max_synode to site->start when that start belongs
   to another group or lies beyond max_synode; compute this node's index in
   the site; push the site definition; set the current group; update the
   servers (only when the current site definition has nodes); record the
   install time; log the installation.

   @param site       site definition to install; its event_horizon must be
                     non-zero (asserted); nodeno and install_time are written
   @param operation  cargo type that caused the installation, forwarded to
                     update_servers() */
void site_install_action(site_def *site, cargo_type operation) {
  IFDBG(D_NONE, FN; NDBG(get_nodeno(get_site_def()), u));
  assert(site->event_horizon);
  if (group_mismatch(site->start, max_synode) ||
      synode_gt(site->start, max_synode))
    set_max_synode(site->start);
  site->nodeno = xcom_find_node_index(&site->nodes);
  push_site_def(site);
  IFDBG(D_NONE, dump_xcom_node_names(site));
  IFDBG(D_BUG, FN; SYCEXP(site->start); SYCEXP(site->boot_key));
  IFDBG(D_BUG, FN; COPY_AND_FREE_GOUT(dbg_site_def(site)));
  set_group(get_group_id(site));
  /* Note: the check looks at the current site definition as returned by
     get_site_def(), while the update itself is passed `site`. */
  if (get_maxnodes(get_site_def())) {
    update_servers(site, operation);
  }
  site->install_time = task_now();
  G_INFO("pid %d Installed site start=" SY_FMT " boot_key=" SY_FMT
         " event_horizon=%" PRIu32
         " node %u chksum_node_list(&site->nodes) %" PRIu32,
         xpid(), SY_MEM(site->start), SY_MEM(site->boot_key),
         site->event_horizon, get_nodeno(site), chksum_node_list(&site->nodes));
  IFDBG(D_NONE, FN; NDBG(get_nodeno(site), u));
  IFDBG(D_NONE, FN; SYCEXP(site->start); SYCEXP(site->boot_key);
        NDBG(site->install_time, f));
  IFDBG(D_NONE, FN; NDBG(get_nodeno(site), u));
  /* Record node number and boot key through the ADD_DBG event macros. */
  ADD_DBG(
      D_BASE, add_event(EVENT_DUMP_PAD, string_arg("nodeno"));
      add_event(EVENT_DUMP_PAD, uint_arg(get_nodeno(site)));
      add_event(EVENT_DUMP_PAD, string_arg("site->boot_key"));
      add_synode_event(site->boot_key);
      /* add_event(EVENT_DUMP_PAD, uint_arg(chksum_node_list(&site->nodes))); */
  );
}
1607
create_site_def_with_start(app_data_ptr a,synode_no start)1608 static site_def *create_site_def_with_start(app_data_ptr a, synode_no start) {
1609 site_def *site = new_site_def();
1610 IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)););
1611 init_site_def(a->body.app_u_u.nodes.node_list_len,
1612 a->body.app_u_u.nodes.node_list_val, site);
1613 site->start = start;
1614 site->boot_key = a->app_key;
1615 return site;
1616 }
1617
install_ng_with_start(app_data_ptr a,synode_no start)1618 static site_def *install_ng_with_start(app_data_ptr a, synode_no start) {
1619 if (a) {
1620 site_def *site = create_site_def_with_start(a, start);
1621 site_install_action(site, a->body.c_t);
1622 return site;
1623 }
1624 return 0;
1625 }
1626
install_node_group(app_data_ptr a)1627 site_def *install_node_group(app_data_ptr a) {
1628 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
1629 add_synode_event(a->app_key););
1630 if (a)
1631 return install_ng_with_start(a, getstart(a));
1632 else
1633 return 0;
1634 }
1635
/* Record `synode` as the new max_synode (the highest synode number tracked)
   and activate the sweeper. */
void set_max_synode(synode_no synode) {
  max_synode = synode;

  IFDBG(D_NONE, FN; STRLIT("new "); SYCEXP(max_synode));

  activate_sweeper();
}
1641
/* A synode is busy when a pax machine exists for it in the hash and that
   machine reports itself as started. Returns 0 when there is no machine. */
static int is_busy(synode_no s) {
  pax_machine *machine = hash_get(s);

  if (machine) {
    return started(machine);
  }
  return 0;
}
1650
match_my_msg(pax_msg * learned,pax_msg * mine)1651 bool_t match_my_msg(pax_msg *learned, pax_msg *mine) {
1652 IFDBG(D_NONE, FN; PTREXP(learned->a);
1653 if (learned->a) SYCEXP(learned->a->unique_id); PTREXP(mine->a);
1654 if (mine->a) SYCEXP(mine->a->unique_id););
1655 if (learned->a && mine->a) { /* Both have app data, see if data is mine */
1656 return synode_eq(learned->a->unique_id, mine->a->unique_id);
1657 } else if (!(learned->a || mine->a)) { /* None have app data, anything goes */
1658 return TRUE;
1659 } else { /* Definitely mismatch */
1660 return FALSE;
1661 }
1662 }
1663
1664 /*
1665 * Initialize the log sequence number (lsn).
1666 */
initialize_lsn(uint64_t n)1667 void initialize_lsn(uint64_t n) { lsn = n; }
1668
1669 /**
1670 * Assign the next log sequence number (lsn) for a message.
1671 *
1672 * Initial propose sets lsn to msgno of the max message number as safe starting
1673 * point, otherwise lsn shall be ever increasing. lsn ensures sender order known
1674 * on receiver side, as messages may arrive "out of order" due to
1675 * retransmission. We use max_synode instead of current_message to avoid any
1676 * conflict with lsn allocated by a previous instance of the node.
1677 */
assign_lsn()1678 static uint64_t assign_lsn() {
1679 if (lsn == 0) {
1680 initialize_lsn(max_synode.msgno);
1681 }
1682 lsn++;
1683 IFDBG(D_EXEC, NDBG64(lsn));
1684 return lsn;
1685 }
1686
1687 /* purecov: begin deadcode */
/* Return 1 when every element of the app_data list has a non-zero lsn (an
   empty list qualifies), 0 as soon as one element without lsn is found. */
static int check_lsn(app_data_ptr a) {
  app_data_ptr item;

  for (item = a; item; item = item->next) {
    if (!item->lsn) {
      return 0;
    }
  }
  return 1;
}
1695 /* purecov: end */
1696
1697 static void propose_noop(synode_no find, pax_machine *p);
1698
1699 /**
1700 * Checks if the given synod s is outside the event horizon.
1701 *
1702 * Common case: there are no configurations pending, or if there are, none of
1703 * them reconfigure the event horizon. The common case threshold is:
1704 *
1705 * last_executed_synod + event_horizon(active_config)
1706 *
1707 *
1708 * If an event horizon reconfiguration R is pending, it is possible that it
1709 * reduces the event horizon. In that case, it is possible that the threshold
1710 * above falls outside the new event horizon.
1711 *
1712 * For example, consider last_executed_synod = 42 and
1713 * event_horizon(active_config) = 10.
1714 * At this point this member participates in synods up to 52.
1715 * Now consider an event horizon reconfiguration that takes effect at synod 45,
1716 * which modifies the event horizon to 2. This means that when
1717 * last_executed_synod = 45, event_horizon(active_config) = 2. At this point
1718 * this member should only be able to participate in synods up to 47. The member
1719 * may have previously started processing messages directed to synods between 47
1720 * and 52, but will now ignore messages directed to those same synods.
1721 *
1722 * We do not want to start processing messages that will eventually fall out
1723 * of the event horizon. More importantly, the threshold above may not be safe
1724 * due to the exit logic of executor_task.
1725 *
1726 * When a node removes itself from the group on configuration C starting at
1727 * synod start(C), the exit logic relies on knowing *when* a majority has
1728 * executed synod start(C) - 1, i.e. the last message of the last configuration
1729 * to contain the leaving node.
1730 *
1731 * With a constant event horizon, we know that when synod
1732 * start(C) + event_horizon is learnt, it is because a majority already executed
1733 * or is ready to execute (and thus learned) synod start(C). This implies that a
1734 * majority already executed start(C) - 1.
1735 *
1736 * With a dynamic event horizon, we cannot be sure that when synod
1737 * start(C) + event_horizon(C) is learnt, a majority already executed or is
1738 * ready to execute synod start(C).
1739 * This is because it is possible for a new, smaller, event horizon to take
1740 * effect between start(C) and start(C) + event_horizon(C).
1741 * If that happens, the threshold above allows nodes to participate in synods
1742 * which are possibly beyond start(C) + event_horizon(C), which can lead to the
1743 * value of synod start(C) + event_horizon(C) being learnt without a majority
1744 * already having executed or being ready to execute synod start(C).
1745 *
1746 * In order to maintain the assumption made by the executor_task's exit logic,
1747 * when an event horizon reconfiguration R is pending we set the threshold to
1748 * the minimum between:
1749 *
1750 * last_executed_synod + event_horizon(active_config)
1751 *
1752 * and:
1753 *
1754 * start(R) - 1 + event_horizon(R)
1755 */
too_far_threshold(xcom_event_horizon active_event_horizon)1756 static uint64_t too_far_threshold(xcom_event_horizon active_event_horizon) {
1757 return executed_msg.msgno + active_event_horizon;
1758 }
1759
too_far_threshold_new_event_horizon_pending(site_def const * new_config)1760 static uint64_t too_far_threshold_new_event_horizon_pending(
1761 site_def const *new_config) {
1762 uint64_t last_executed = executed_msg.msgno;
1763 /* compute normal threshold */
1764 uint64_t possibly_unsafe_threshold;
1765 site_def const *active_config = find_site_def(executed_msg);
1766 xcom_event_horizon active_event_horizon = active_config->event_horizon;
1767 possibly_unsafe_threshold = last_executed + active_event_horizon;
1768 /* compute threshold taking into account new event horizon */ {
1769 uint64_t maximum_safe_threshold;
1770 xcom_event_horizon new_event_horizon;
1771 uint64_t start_new_event_horizon = new_config->start.msgno;
1772 new_event_horizon = new_config->event_horizon;
1773 maximum_safe_threshold = start_new_event_horizon - 1 + new_event_horizon;
1774 /* use the minimum of both for safety */
1775 return MIN(possibly_unsafe_threshold, maximum_safe_threshold);
1776 }
1777 }
1778
too_far(synode_no s)1779 static inline int too_far(synode_no s) {
1780 uint64_t threshold = 0;
1781 site_def const *active_config = find_site_def(executed_msg);
1782 if (active_config != NULL) {
1783 site_def const *pending_config = first_event_horizon_reconfig();
1784 bool_t const no_event_horizon_reconfig_pending = (pending_config == NULL);
1785 if (is_latest_config(active_config) || no_event_horizon_reconfig_pending) {
1786 threshold = too_far_threshold(active_config->event_horizon);
1787 } else {
1788 threshold = too_far_threshold_new_event_horizon_pending(pending_config);
1789 }
1790 } else {
1791 /* we have no configs, resort to default */
1792 threshold = too_far_threshold(EVENT_HORIZON_MIN);
1793 }
1794 return s.msgno >= threshold;
1795 }
1796
/* Jump to label x after tracing the label name through IFDBG.
   The expansion is a plain brace block (not do { } while (0)), so the
   semicolon written after GOTO(x) at a call site is a separate empty
   statement. */
#define GOTO(x)                                 \
  {                                             \
    IFDBG(D_NONE, STRLIT("goto "); STRLIT(#x)); \
    goto x;                                     \
  }
1802
is_view(cargo_type x)1803 static inline int is_view(cargo_type x) { return x == view_msg; }
1804
is_config(cargo_type x)1805 static inline int is_config(cargo_type x) {
1806 return x == unified_boot_type || x == add_node_type ||
1807 x == remove_node_type || x == set_event_horizon_type ||
1808 x == force_config_type;
1809 }
1810
/* Forward declaration; proposer_task calls this through TASK_CALL. */
static int wait_for_cache(pax_machine **pm, synode_no synode, double timeout);
/* Incremented by proposer_task each time it takes a client message off
   prop_input_queue. */
static int prop_started = 0;
/* Incremented by proposer_task each time it is done with a client message
   (at its `next` label). */
static int prop_finished = 0;
1814
/* Send messages by fetching from the input queue and trying to get it accepted
   by a Paxos instance.

   Per client message the task:
     1. optionally batches further non-config, non-view payloads from the queue,
     2. stamps every app_data in the batch with one log sequence number,
     3. looks for a non-busy synod starting at current_message, waiting on
        exec_wait while the next synod is too_far(),
     4. pushes the message (2-phase or 3-phase) and waits until the Paxos
        instance is finished,
     5. retries from step 3 (label retry_new) if the learned value is not ours.

   arg carries the integer ID of this proposer task. */
static int proposer_task(task_arg arg) {
  DECL_ENV
  int self;             /* ID of this proposer task */
  pax_machine *p;       /* Pointer to Paxos instance */
  msg_link *client_msg; /* The client message we are trying to push */
  synode_no msgno;      /* Synod we are currently trying to use */
  pax_msg *prepare_msg; /* Prepare message handed to push_msg_3p */
  double start_propose; /* task_now() when this client message was picked up */
  double start_push;    /* task_now() of the most recent push */
  double delay;         /* Current wakeup delay while waiting for a result */
  site_def const *site; /* Site definition looked up for msgno */
  size_t size;          /* Accumulated app_data_size() of the batch */
  size_t nr_batched_app_data; /* Number of app_datas in the batch */
  END_ENV;

  TASK_BEGIN

  ep->self = get_int_arg(arg);
  ep->p = NULL;
  ep->client_msg = NULL;
  ep->prepare_msg = NULL;
  ep->start_propose = 0.0;
  ep->start_push = 0.0;
  ep->delay = 0.0;
  ep->msgno = current_message;
  ep->site = 0;
  ep->size = 0;
  ep->nr_batched_app_data = 0;

  IFDBG(D_NONE, FN; NDBG(ep->self, d); NDBG(task_now(), f));

  while (!xcom_shutdown) { /* Loop until no more work to do */
    /* Wait for client message */
    assert(!ep->client_msg);
    CHANNEL_GET(&prop_input_queue, &ep->client_msg, msg_link);
    prop_started++;
    IFDBG(D_NONE, FN; PTREXP(ep->client_msg->p->a); STRLIT("extracted ");
          SYCEXP(ep->client_msg->p->a->app_key));

    /* Grab rest of messages in queue as well, but never batch config messages,
     * which need a unique number */

    /* The batch is limited either by size or number of batched app_datas.
     * We limit the number of elements because the XDR deserialization
     * implementation is recursive, and batching too many app_datas will cause a
     * call stack overflow. */
    if (!is_config(ep->client_msg->p->a->body.c_t) &&
        !is_view(ep->client_msg->p->a->body.c_t)) {
      ep->size = app_data_size(ep->client_msg->p->a);
      ep->nr_batched_app_data = 1;
      while (AUTOBATCH && ep->size <= MAX_BATCH_SIZE &&
             ep->nr_batched_app_data <= MAX_BATCH_APP_DATA &&
             !link_empty(&prop_input_queue
                              .data)) { /* Batch payloads into single message */
        msg_link *tmp;
        app_data_ptr atmp;

        CHANNEL_GET(&prop_input_queue, &tmp, msg_link);
        atmp = tmp->p->a;
        /* The candidate is counted before it is checked; if it is rejected
           below we leave the loop, so the counters are not used again for
           this batch. */
        ep->size += app_data_size(atmp);
        ep->nr_batched_app_data++;
        /* Abort batching if config or too big batch */
        if (is_config(atmp->body.c_t) || is_view(atmp->body.c_t) ||
            ep->nr_batched_app_data > MAX_BATCH_APP_DATA ||
            ep->size > MAX_BATCH_SIZE) {
          /* Put the rejected message back at the head of the queue. */
          channel_put_front(&prop_input_queue, &tmp->l);
          break;
        }
        ADD_T_EV(seconds(), __FILE__, __LINE__, "batching");

        tmp->p->a = 0;                     /* Steal this payload */
        msg_link_delete(&tmp);             /* Get rid of the empty message */
        atmp->next = ep->client_msg->p->a; /* Add to list of app_data */
        /* G_TRACE("Batching %s %s",
         * cargo_type_to_str(ep->client_msg->p->a->body.c_t), */
        /* cargo_type_to_str(atmp->body.c_t)); */
        ep->client_msg->p->a = atmp;
        IFDBG(D_NONE, FN; PTREXP(ep->client_msg->p->a); STRLIT("extracted ");
              SYCEXP(ep->client_msg->p->a->app_key));
      }
    }

    ep->start_propose = task_now();
    ep->delay = 0.0;

    assert(!ep->client_msg->p->a->chosen);

    /* It is a new message */

    assert(!synode_eq(current_message, null_synode));

    /* Assign a log sequence number only on initial propose */
    {
      uint64_t prop_lsn = assign_lsn();
      app_data_ptr ap = ep->client_msg->p->a;
      /* Assign to all app_data structs */
      while (ap) {
        ap->lsn = prop_lsn;
        ap = ap->next;
      }
    }
    DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
  retry_new:
    /* Find a free slot */

    assert(!synode_eq(current_message, null_synode));
    ep->msgno = current_message;
    proposer_site = find_site_def_rw(ep->msgno);
    ep->site = proposer_site;

    /* Walk forward one message number at a time until is_busy() reports a
       synod that is not busy. */
    while (is_busy(ep->msgno)) {
      /* exec_wait is woken by set_executed_msg() when the executed message
         number advances; otherwise we re-check after the 1.0 timeout. */
      while (/* ! ep->client_msg->p->force_delivery && */ too_far(
          incr_msgno(ep->msgno))) { /* Too far ahead of executor */
        TIMED_TASK_WAIT(&exec_wait, 1.0);
        IFDBG(D_NONE, FN; SYCEXP(ep->msgno); TIMECEXP(ep->start_propose);
              TIMECEXP(ep->client_msg->p->a->expiry_time); TIMECEXP(task_now());

              NDBG(enough_live_nodes(ep->site), d));
#ifdef DELIVERY_TIMEOUT
        if ((ep->start_propose + ep->client_msg->p->a->expiry_time) <
                task_now() &&
            !enough_live_nodes(ep->site)) {
          /* Give up */
          DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
          IFDBG(D_NONE, FN; STRLIT("timeout -> delivery_failure"));
          deliver_to_app(NULL, ep->client_msg->p->a, delivery_failure);
          GOTO(next);
        }
#endif
      }
      ep->msgno = incr_msgno(ep->msgno);
      /* Refresh site to next msgno */
      proposer_site = find_site_def_rw(ep->msgno);
      ep->site = proposer_site;
    }
    assert(!synode_eq(ep->msgno, null_synode));

    /* See if we can do anything with this message */
    if (!ep->site || get_nodeno(ep->site) == VOID_NODE_NO) {
      /* Give up */
      DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
      IFDBG(D_NONE, FN; STRLIT("delivery_failure "); SYCEXP(ep->msgno);
            PTREXP(ep->site); NDBG(get_nodeno(ep->site), u));
      deliver_to_app(NULL, ep->client_msg->p->a, delivery_failure);
      GOTO(next);
    }
    IFDBG(D_NONE, FN; STRLIT("changing current message to ");
          SYCEXP(ep->msgno));
    set_current_message(ep->msgno);

    brand_client_msg(ep->client_msg->p, ep->msgno);

    for (;;) { /* Loop until the client message has been learned */
      /* Get a Paxos instance to send the client message */

      TASK_CALL(wait_for_cache(&ep->p, ep->msgno, 60));
      if (!ep->p) {
        G_MESSAGE("Could not get a pax_machine for msgno %lu. Retrying",
                  (unsigned long)ep->msgno.msgno);
        goto retry_new;
      }

      assert(ep->p);
      if (ep->client_msg->p->force_delivery)
        ep->p->force_delivery = ep->client_msg->p->force_delivery;
      {
        int MY_ATTRIBUTE((unused)) lock = lock_pax_machine(ep->p);
        assert(!lock);
      }

      /* Set the client message as current proposal */
      assert(ep->client_msg->p);
      replace_pax_msg(&ep->p->proposer.msg, clone_pax_msg(ep->client_msg->p));
      /* A NULL proposer message here is treated as out of memory. */
      if (ep->p->proposer.msg == NULL) {
        g_critical(
            "Node %u has run out of memory while sending a message and "
            "will now exit.",
            get_nodeno(proposer_site));
        terminate_and_exit(); /* Tell xcom to stop */
        TERMINATE;
      }
      assert(ep->p->proposer.msg);
      PAX_MSG_SANITY_CHECK(ep->p->proposer.msg);

      /* Create the prepare message */
      unchecked_replace_pax_msg(&ep->prepare_msg,
                                pax_msg_new(ep->msgno, ep->site));
      IFDBG(D_NONE, FN; PTREXP(ep->client_msg->p->a); STRLIT("pushing ");
            SYCEXP(ep->msgno));
      IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_app_data(ep->prepare_msg->a)));

      /* Use 3 phase algorithm if threephase is set or we are forcing or we have
         already accepted something, which may happen if another node has timed
         out waiting for this node and proposed a no_op, which we have accepted.
       */
      if (threephase || ep->p->force_delivery || ep->p->acceptor.promise.cnt) {
        push_msg_3p(ep->site, ep->p, ep->prepare_msg, ep->msgno, normal);
      } else {
        push_msg_2p(ep->site, ep->p);
      }

      ep->start_push = task_now();

      while (!finished(ep->p)) { /* Try to get a value accepted */
        /* We will wake up periodically, and whenever a message arrives */
        TIMED_TASK_WAIT(&ep->p->rv, ep->delay = wakeup_delay(ep->delay));
        /* After the wait the pax_machine may no longer belong to our synod,
           or may have lost its proposer message; start over in that case.
           Note that the machine is not unlocked on this path. */
        if (!synode_eq(ep->msgno, ep->p->synode) ||
            ep->p->proposer.msg == NULL) {
          IFDBG(D_NONE, FN; STRLIT("detected stolen state machine, retry"););
          /* unlock_pax_machine(ep->p); */
          GOTO(retry_new); /* Need to break out of both loops,
                              and we have no "exit named
                              loop" construction */
        }
        assert(synode_eq(ep->msgno, ep->p->synode) && ep->p->proposer.msg);
        if (finished(ep->p)) break;
        {
          double now = task_now();
#ifdef DELIVERY_TIMEOUT
          if ((ep->start_propose + ep->client_msg->p->a->expiry_time) < now) {
            IFDBG(D_NONE, FN; STRLIT("timeout when pushing ");
                  SYCEXP(ep->msgno); SYCEXP(executed_msg));
            /* Proposing a no-op here is a last ditch effort to cancel the
               failed message. If any of the currently reachable nodes have
               participated in the failed consensus round, it is equivalent to
               retrying a final time, otherwise we could get a no-op
               accepted. Proposing a no-op is always harmless.
               Having a timeout on delivery and telling the client is really
               contrary to the spirit of
               Paxos, since we cannot guarantee that the message has not been
               delivered, but at the moment, MCM depends on it.
               Proposing a no-op here increases the probability that the outcome
               matches what we tell MCM about the outcome. */
            propose_noop(ep->msgno, ep->p);
            DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
            IFDBG(D_NONE, FN; STRLIT("timeout -> delivery_failure"));
            deliver_to_app(ep->p, ep->client_msg->p->a, delivery_failure);
            unlock_pax_machine(ep->p);
            GOTO(next);
          }
#endif
          /* The current delay has elapsed since the last push: push again,
             this time always with the 3-phase variant. */
          if ((ep->start_push + ep->delay) <= now) {
            PAX_MSG_SANITY_CHECK(ep->p->proposer.msg);
            IFDBG(D_NONE, FN; STRLIT("retry pushing "); SYCEXP(ep->msgno));
            IFDBG(D_NONE, FN;
                  COPY_AND_FREE_GOUT(dbg_app_data(ep->prepare_msg->a)););
            IFDBG(D_NONE, BALCEXP(ep->p->proposer.bal);
                  BALCEXP(ep->p->acceptor.promise));
            push_msg_3p(ep->site, ep->p, ep->prepare_msg, ep->msgno, normal);
            ep->start_push = now;
          }
        }
      }
      /* When we get here, we know the value for this message number,
         but it may not be the value we tried to push,
         so loop until we have a successful push. */
      unlock_pax_machine(ep->p);
      IFDBG(D_NONE, FN; STRLIT(" found finished message "); SYCEXP(ep->msgno);
            STRLIT("seconds since last push ");
            NPUT(task_now() - ep->start_push, f); STRLIT("ep->client_msg ");
            COPY_AND_FREE_GOUT(dbg_pax_msg(ep->client_msg->p)););
      IFDBG(D_NONE, FN; STRLIT("ep->p->learner.msg ");
            COPY_AND_FREE_GOUT(dbg_pax_msg(ep->p->learner.msg)););
      if (match_my_msg(ep->p->learner.msg, ep->client_msg->p)) {
        break;
      } else
        GOTO(retry_new);
    }
  /* Done with this client message (delivered or given up): record the time
     used, count it as finished and release the message link. */
  next : {
    double now = task_now();
    double used = now - ep->start_propose;
    add_to_filter(used);
    prop_finished++;
    IFDBG(D_NONE, FN; STRLIT("completed ep->msgno "); SYCEXP(ep->msgno);
          NDBG(used, f); NDBG(median_time(), f);
          STRLIT("seconds since last push "); NDBG(now - ep->start_push, f););
    IFDBG(D_NONE, FN; STRLIT("ep->client_msg ");
          COPY_AND_FREE_GOUT(dbg_pax_msg(ep->client_msg->p)););
    if (ep->p) {
      IFDBG(D_NONE, FN; STRLIT("ep->p->learner.msg ");
            COPY_AND_FREE_GOUT(dbg_pax_msg(ep->p->learner.msg)););
    }
    msg_link_delete(&ep->client_msg);
  }
  }
  FINALLY
  IFDBG(D_BUG, FN; STRLIT("exit "); NDBG(ep->self, d); NDBG(task_now(), f));
  /* Task teardown: unlock the last pax_machine we held a pointer to, drop
     the prepare message, and fail any client message still in flight. */
  if (ep->p) {
    unlock_pax_machine(ep->p);
  }
  replace_pax_msg(&ep->prepare_msg, NULL);
  if (ep->client_msg) { /* If we get here with a client message, we have
                           failed to deliver */
    DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
    IFDBG(D_NONE, FN;
          STRLIT("undelivered message at task end -> delivery_failure"));
    deliver_to_app(ep->p, ep->client_msg->p->a, delivery_failure);
    msg_link_delete(&ep->client_msg);
  }
  TASK_END;
}
2118
/* Lowest protocol version for which should_ignore_forced_config_or_view()
   answers true. */
static xcom_proto constexpr first_protocol_that_ignores_intermediate_forced_configs_or_views =
    x_1_8;
2121
should_ignore_forced_config_or_view(xcom_proto protocol_version)2122 static bool constexpr should_ignore_forced_config_or_view(
2123 xcom_proto protocol_version) {
2124 return protocol_version >=
2125 first_protocol_that_ignores_intermediate_forced_configs_or_views;
2126 }
2127
leader(site_def const * s)2128 static node_no leader(site_def const *s) {
2129 node_no leader = 0;
2130 for (leader = 0; leader < get_maxnodes(s); leader++) {
2131 if (!may_be_dead(s->detected, leader, task_now())) return leader;
2132 }
2133 return 0;
2134 }
2135
iamthegreatest(site_def const * s)2136 int iamthegreatest(site_def const *s) { return leader(s) == s->nodeno; }
2137
execute_msg(site_def * site,pax_machine * pma,pax_msg * p)2138 void execute_msg(site_def *site, pax_machine *pma, pax_msg *p) {
2139 app_data_ptr a = p->a;
2140 IFDBG(D_EXEC, FN; COPY_AND_FREE_GOUT(dbg_pax_msg(p)););
2141 if (a) {
2142 switch (a->body.c_t) {
2143 case unified_boot_type:
2144 case force_config_type:
2145 deliver_config(a);
2146 case add_node_type:
2147 case remove_node_type:
2148 break;
2149 case app_type:
2150 IFDBG(D_NONE, FN; STRLIT(" learner.msg ");
2151 COPY_AND_FREE_GOUT(dbg_pax_msg(pma->learner.msg)););
2152 /* DBGOUT_ASSERT(check_lsn(a), STRLIT("NULL lsn")); */
2153 deliver_to_app(pma, a, delivery_ok);
2154 break;
2155 case view_msg:
2156 IFDBG(D_EXEC, FN; STRLIT(" global view ");
2157 COPY_AND_FREE_GOUT(dbg_pax_msg(pma->learner.msg)););
2158 if (site && site->global_node_set.node_set_len ==
2159 a->body.app_u_u.present.node_set_len) {
2160 if ((p->force_delivery != 0) &&
2161 should_ignore_forced_config_or_view(site->x_proto)) {
2162 G_DEBUG(
2163 "execute_msg: Ignoring a forced intermediate, pending "
2164 "view_msg");
2165 } else {
2166 assert(site->global_node_set.node_set_len ==
2167 a->body.app_u_u.present.node_set_len);
2168 copy_node_set(&a->body.app_u_u.present, &site->global_node_set);
2169 deliver_global_view_msg(site, p->synode);
2170 ADD_DBG(D_BASE,
2171 add_event(EVENT_DUMP_PAD,
2172 string_arg("deliver_global_view_msg p->synode"));
2173 add_synode_event(p->synode););
2174 }
2175 }
2176 break;
2177 default:
2178 break;
2179 }
2180 }
2181 IFDBG(D_NONE, FN; SYCEXP(p->synode));
2182 }
2183
2184 static void read_missing_values(int n);
2185 static void propose_missing_values(int n);
2186
2187 #ifdef EXECUTOR_TASK_AGGRESSIVE_NO_OP
2188 /* With many nodes sending read_ops on instances that are not decided yet, it
2189 * may take a very long time until someone finally decides to start a new
2190 * consensus round. As the cost of a new proposal is not that great, it's
2191 * acceptable to go directly to proposing a no-op instead of first trying to get
2192 * the value with a read_op. An added benefit of this is that if more than one
2193 * node needs the result, they will get it all when the consensus round
2194 * finishes. */
find_value(site_def const * site,unsigned int * wait,int n)2195 static void find_value(site_def const *site, unsigned int *wait, int n) {
2196 IFDBG(D_NONE, FN; NDBG(*wait, d));
2197
2198 if (get_nodeno(site) == VOID_NODE_NO) {
2199 read_missing_values(n);
2200 return;
2201 }
2202
2203 if ((*wait) > 1 || /* Only leader will propose initially */
2204 ((*wait) > 0 && iamthegreatest(site)))
2205 propose_missing_values(n);
2206
2207 #ifdef TASK_EVENT_TRACE
2208 if ((*wait) > 1) dump_task_events();
2209 #endif
2210 (*wait)++;
2211 }
2212 #else
find_value(site_def const * site,unsigned int * wait,int n)2213 static void find_value(site_def const *site, unsigned int *wait, int n) {
2214 IFDBG(D_NONE, FN; NDBG(*wait, d));
2215
2216 if (get_nodeno(site) == VOID_NODE_NO) {
2217 read_missing_values(n);
2218 return;
2219 }
2220
2221 switch (*wait) {
2222 case 0:
2223 case 1:
2224 read_missing_values(n);
2225 (*wait)++;
2226 break;
2227 case 2:
2228 if (iamthegreatest(site))
2229 propose_missing_values(n);
2230 else
2231 read_missing_values(n);
2232 (*wait)++;
2233 break;
2234 case 3:
2235 propose_missing_values(n);
2236 break;
2237 default:
2238 break;
2239 }
2240 }
2241 #endif /* EXECUTOR_TASK_AGGRESSIVE_NO_OP */
2242
2243 static void dump_debug_exec_state();
2244
2245 #ifdef PROPOSE_IF_LEADER
/* Task: loop until the Paxos instance for msgno is finished, storing the
   instance pointer in *p (PROPOSE_IF_LEADER variant).

   On each round the site for msgno is looked up. If it has no nodes, a skip
   is fabricated via handle_skip() and the loop ends. Otherwise, if this node
   is the leader of a multi-node site and the originator msgno.node is absent
   from the global node set and may be dead, values are proposed directly;
   in all other cases find_value() decides between reading and proposing.
   n is passed through to the read/propose helpers. */
int get_xcom_message(pax_machine **p, synode_no msgno, int n) {
  DECL_ENV
  unsigned int wait;    /* Escalation counter handed to find_value() */
  double delay;         /* Current wakeup delay */
  site_def const *site; /* Site definition looked up for msgno */
  END_ENV;

  TASK_BEGIN

  ep->wait = 0;
  ep->delay = 0.0;
  *p = force_get_cache(msgno);
  ep->site = NULL;

  dump_debug_exec_state();
  while (!finished(*p)) {
    ep->site = find_site_def(msgno);
    /* The end of the world ?, fake message by skipping */
    if (get_maxnodes(ep->site) == 0) {
      pax_msg *msg = pax_msg_new(msgno, ep->site);
      handle_skip(ep->site, *p, msg);
      break;
    }
    IFDBG(D_NONE, FN; STRLIT(" not finished "); SYCEXP(msgno); PTREXP(*p);
          NDBG(ep->wait, u); SYCEXP(msgno));
    if (get_maxnodes(ep->site) > 1 && iamthegreatest(ep->site) &&
        ep->site->global_node_set.node_set_val &&
        !ep->site->global_node_set.node_set_val[msgno.node] &&
        may_be_dead(ep->site->detected, msgno.node, task_now())) {
      propose_missing_values(n);
    } else {
      find_value(ep->site, &ep->wait, n);
    }
    /* Sleep until something arrives for this instance or the delay expires,
       then fetch the instance from the cache again. */
    TIMED_TASK_WAIT(&(*p)->rv, ep->delay = wakeup_delay(ep->delay));
    *p = get_cache(msgno);
    dump_debug_exec_state();
  }

  FINALLY
  IFDBG(D_NONE, FN; SYCEXP(msgno); PTREXP(*p); NDBG(ep->wait, u);
        SYCEXP(msgno));
  TASK_END;
}
2289 #else
/* Task: loop until the Paxos instance for msgno is finished, storing the
   instance pointer in *p (default variant).

   On each round the site for msgno is looked up. If it has no nodes, a skip
   is fabricated via handle_skip() and the loop ends. Otherwise find_value()
   decides between reading and proposing, and the task sleeps on the
   instance before fetching it from the cache again.
   n is passed through to find_value(). */
int get_xcom_message(pax_machine **p, synode_no msgno, int n) {
  DECL_ENV
  unsigned int wait;    /* Escalation counter handed to find_value() */
  double delay;         /* Current wakeup delay */
  site_def const *site; /* Site definition looked up for msgno */
  END_ENV;

  TASK_BEGIN

  ep->wait = 0;
  ep->delay = 0.0;
  *p = force_get_cache(msgno);
  ep->site = NULL;

  dump_debug_exec_state();
  while (!finished(*p)) {
    ep->site = find_site_def(msgno);
    /* The end of the world ?, fake message by skipping */
    if (get_maxnodes(ep->site) == 0) {
      pax_msg *msg = pax_msg_new(msgno, ep->site);
      handle_skip(ep->site, *p, msg);
      break;
    }
    IFDBG(D_NONE, FN; STRLIT("before find_value"); SYCEXP(msgno); PTREXP(*p);
          NDBG(ep->wait, u); SYCEXP(msgno));
    find_value(ep->site, &ep->wait, n);
    IFDBG(D_NONE, FN; STRLIT("after find_value"); SYCEXP(msgno); PTREXP(*p);
          NDBG(ep->wait, u); SYCEXP(msgno));
    ep->delay = wakeup_delay(ep->delay);
    IFDBG(D_NONE, FN; NDBG(ep->delay, f));
    /* Sleep until something arrives for this instance or the delay expires,
       then fetch the instance from the cache again. */
    TIMED_TASK_WAIT(&(*p)->rv, ep->delay);
    *p = get_cache(msgno);
    dump_debug_exec_state();
  }

  FINALLY
  TASK_END;
}
2328 #endif
2329
set_executed_msg(synode_no msgno)2330 synode_no set_executed_msg(synode_no msgno) {
2331 IFDBG(D_EXEC, FN; STRLIT("changing executed_msg from "); SYCEXP(executed_msg);
2332 STRLIT(" to "); SYCEXP(msgno));
2333 if (group_mismatch(msgno, current_message) ||
2334 synode_gt(msgno, current_message)) {
2335 IFDBG(D_EXEC, FN; STRLIT("changing current message"));
2336 set_current_message(first_free_synode(msgno));
2337 }
2338
2339 if (msgno.msgno > executed_msg.msgno) task_wakeup(&exec_wait);
2340
2341 executed_msg = msgno;
2342 executor_site = find_site_def_rw(executed_msg);
2343 return executed_msg;
2344 }
2345
first_free_synode(synode_no msgno)2346 static synode_no first_free_synode(synode_no msgno) {
2347 site_def const *site = find_site_def(msgno);
2348 synode_no retval = msgno;
2349 if (!site) {
2350 /* purecov: begin deadcode */
2351 site = get_site_def();
2352 IFDBG(D_NONE, FN; PTREXP(site); SYCEXP(msgno));
2353 assert(get_group_id(site) != 0);
2354 return site->start;
2355 /* purecov: end */
2356 }
2357 if (get_group_id(site) == 0) {
2358 IFDBG(D_NONE, FN; PTREXP(site); SYCEXP(msgno));
2359 if (site) {
2360 IFDBG(D_NONE, FN; SYCEXP(site->boot_key); SYCEXP(site->start);
2361 COPY_AND_FREE_GOUT(dbg_site_def(site)));
2362 }
2363 }
2364 assert(get_group_id(site) != 0);
2365 assert(!synode_eq(msgno, null_synode));
2366 if (retval.msgno == 0) retval.msgno = 1;
2367 retval.node = get_nodeno(site);
2368 if (synode_lt(retval, msgno))
2369 return incr_msgno(retval);
2370 else
2371 return retval;
2372 }
2373
set_current_message(synode_no msgno)2374 synode_no set_current_message(synode_no msgno) {
2375 IFDBG(D_PROPOSE, FN; STRLIT("changing current_message from ");
2376 SYCEXP(current_message); STRLIT(" to "); SYCEXP(msgno));
2377 return current_message = msgno;
2378 }
2379
2380 static void update_max_synode(pax_msg *p);
2381
2382 #if TASK_DBUG_ON
static void perf_dbg(int *_n, int *_old_n, double *_old_t)
    MY_ATTRIBUTE((unused));
/* Debug-only throughput trace, active only when XCOM_DEBUG_TRACE is enabled.
   _n     : running call counter, incremented on every traced call
   _old_n : value of the counter at the last rate report
   _old_t : task_now() at the last rate report
   Every 5000th call prints the node number, time, counter, median_time()
   and executed_msg; whenever more than 1.0 has elapsed since *_old_t it
   prints the call rate and resets *_old_t / *_old_n. */
static void perf_dbg(int *_n, int *_old_n, double *_old_t) {
  /* Snapshot taken before the increment below; the printouts and the value
     stored back into *_old_n use this pre-increment count. */
  int n = *_n;
  int old_n = *_old_n;
  double old_t = *_old_t;

  if (!IS_XCOM_DEBUG_WITH(XCOM_DEBUG_TRACE)) return;

  IFDBG(D_NONE, FN; SYCEXP(executed_msg));
  if (!(n % 5000)) {
    GET_GOUT;
    NDBG(get_nodeno(get_site_def()), u);
    NDBG(task_now(), f);
    NDBG(n, d);
    NDBG(median_time(), f);
    SYCEXP(executed_msg);
    PRINT_GOUT;
    FREE_GOUT;
  }
  (*_n)++;
  if (task_now() - old_t > 1.0) {
    GET_GOUT;
    NDBG(get_nodeno(get_site_def()), u);
    NDBG(task_now(), f);
    NDBG(n, d);
    /* Calls per unit of task time since the previous report. */
    NDBG((n - old_n) / (task_now() - old_t), f);
    PRINT_GOUT;
    FREE_GOUT;
    *_old_t = task_now();
    *_old_n = n;
  }
}
2416 #endif
2417
2418 #ifdef IGNORE_LOSERS
2419
LOSER(synode_no x,site_def const * site)2420 static inline int LOSER(synode_no x, site_def const *site) {
2421 IFDBG(D_NONE, NEXP(x.node, u);
2422 NEXP(site->global_node_set.node_set_val[(x).node], d));
2423 return (!(site)->global_node_set.node_set_val[(x).node]);
2424 }
2425
2426 #else
2427 #define LOSER(x, site) 0
2428 #endif
2429
2430 static void debug_loser(synode_no x) MY_ATTRIBUTE((unused));
2431 #if defined(TASK_DBUG_ON) && TASK_DBUG_ON
debug_loser(synode_no x)2432 static void debug_loser(synode_no x) {
2433 if (!IS_XCOM_DEBUG_WITH(XCOM_DEBUG_TRACE)) return;
2434 if (1 || x.msgno < 10) {
2435 GET_GOUT;
2436 NDBG(get_nodeno(find_site_def(x)), u);
2437 STRLIT(" ignoring loser ");
2438 SYCEXP(x);
2439 SYCEXP(max_synode);
2440 PRINT_GOUT;
2441 FREE_GOUT;
2442 }
2443 }
2444 #else
/* purecov: begin deadcode */
/* Empty stand-in used when TASK_DBUG_ON is not enabled. */
static void debug_loser(synode_no x MY_ATTRIBUTE((unused))) {}
/* purecov: end */
2448 #endif
2449
send_value(site_def const * site,node_no to,synode_no synode)2450 static void send_value(site_def const *site, node_no to, synode_no synode) {
2451 pax_machine *pm = get_cache(synode);
2452 if (pm && pm->learner.msg) {
2453 pax_msg *msg = clone_pax_msg(pm->learner.msg);
2454 if (msg == NULL) return;
2455 ref_msg(msg);
2456 send_server_msg(site, to, msg);
2457 unref_msg(&msg);
2458 }
2459 }
2460
2461 /**
2462 * Returns the message number where it is safe for nodes in previous
2463 * configuration to exit.
2464 *
2465 * @param start start synod of the next configuration
2466 * @param event_horizon event horizon of the next configuration
2467 */
compute_delay(synode_no start,xcom_event_horizon event_horizon)2468 static synode_no compute_delay(synode_no start,
2469 xcom_event_horizon event_horizon) {
2470 start.msgno += event_horizon;
2471 return start;
2472 }
2473
2474 /* Push messages to all nodes which were in the previous site, but not in this
2475 */
inform_removed(int index,int all)2476 static void inform_removed(int index, int all) {
2477 site_def **sites = 0;
2478 uint32_t site_count = 0;
2479 IFDBG(D_NONE, FN; NEXP(index, d));
2480 get_all_site_defs(&sites, &site_count);
2481 while (site_count > 1 && index >= 0 && (uint32_t)(index + 1) < site_count) {
2482 site_def *s = sites[index];
2483 site_def *ps = sites[index + 1];
2484
2485 /* Compute diff and push messages */
2486 IFDBG(D_NONE, FN; NDBG(index, d); PTREXP(s); if (s) SYCEXP(s->boot_key);
2487 PTREXP(ps); if (ps) SYCEXP(ps->boot_key));
2488
2489 if (s && ps) {
2490 node_no i = 0;
2491 IFDBG(D_NONE, FN; SYCEXP(s->boot_key); SYCEXP(s->start);
2492 SYCEXP(ps->boot_key); SYCEXP(ps->start));
2493 for (i = 0; i < ps->nodes.node_list_len; i++) { /* Loop over prev site */
2494 if (ps->nodeno != i &&
2495 !node_exists(&ps->nodes.node_list_val[i], &s->nodes)) {
2496 synode_no synode = s->start;
2497 synode_no end = max_synode;
2498 while (!synode_gt(synode, end)) { /* Loop over relevant messages */
2499 send_value(ps, i, synode);
2500 synode = incr_synode(synode);
2501 }
2502 }
2503 }
2504 }
2505 if (!all) /* Early exit if not all configs should be examined */
2506 break;
2507 index--;
2508 }
2509 }
2510
backwards_compatible(xcom_event_horizon event_horizon)2511 static bool_t backwards_compatible(xcom_event_horizon event_horizon) {
2512 return event_horizon == EVENT_HORIZON_MIN;
2513 }
2514
/* Lowest protocol version for which reconfigurable_event_horizon() answers
   true. */
static xcom_proto const first_event_horizon_aware_protocol = x_1_4;
2516
reconfigurable_event_horizon(xcom_proto protocol_version)2517 static bool_t reconfigurable_event_horizon(xcom_proto protocol_version) {
2518 return protocol_version >= first_event_horizon_aware_protocol;
2519 }
2520
add_node_unsafe_against_ipv4_old_nodes(app_data_ptr a)2521 static bool_t add_node_unsafe_against_ipv4_old_nodes(app_data_ptr a) {
2522 assert(a->body.c_t == add_node_type);
2523
2524 {
2525 site_def const *latest_config = get_site_def();
2526 if (latest_config && latest_config->x_proto >= minimum_ipv6_version())
2527 return FALSE;
2528
2529 {
2530 u_int const nr_nodes_to_add = a->body.app_u_u.nodes.node_list_len;
2531 node_address *nodes_to_add = a->body.app_u_u.nodes.node_list_val;
2532
2533 u_int i;
2534 xcom_port node_port = 0;
2535 char node_addr[IP_MAX_SIZE];
2536
2537 for (i = 0; i < nr_nodes_to_add; i++) {
2538 if (get_ip_and_port(nodes_to_add[i].address, node_addr, &node_port)) {
2539 G_ERROR(
2540 "Error parsing address from a joining node. Join operation "
2541 "will be "
2542 "rejected");
2543 return TRUE;
2544 }
2545
2546 if (!is_node_v4_reachable(node_addr)) return TRUE;
2547 }
2548 }
2549
2550 return FALSE;
2551 }
2552 }
2553
2554 /**
2555 * Check if a node is compatible with the group's event horizon.
2556 *
2557 * A node is compatible with the group's configuration if:
2558 *
2559 * a) The node supports event horizon reconfigurations, or
2560 * b) The group's event horizon is, or is scheduled to be, the default event
2561 * horizon.
2562 */
unsafe_against_event_horizon(node_address const * node)2563 static bool_t unsafe_against_event_horizon(node_address const *node) {
2564 site_def const *latest_config = get_site_def();
2565 xcom_proto node_max_protocol_version = node->proto.max_proto;
2566 bool_t const compatible =
2567 reconfigurable_event_horizon(node_max_protocol_version) ||
2568 backwards_compatible(latest_config->event_horizon);
2569
2570 if (!compatible) {
2571 /*
2572 * The node that wants to join does not support event horizon
2573 * reconfigurations and the group's event horizon is, or is scheduled to
2574 * be, different from the default.
2575 * The node can not safely join the group so we deny its attempt to join.
2576 */
2577 G_INFO(
2578 "%s's request to join the group was rejected because the group's event "
2579 "horizon is, or will be %" PRIu32 " and %s only supports %" PRIu32,
2580 node->address, get_site_def()->event_horizon, node->address,
2581 EVENT_HORIZON_MIN);
2582 return TRUE;
2583 }
2584 return FALSE;
2585 }
2586
add_node_unsafe_against_event_horizon(app_data_ptr a)2587 static bool_t add_node_unsafe_against_event_horizon(app_data_ptr a) {
2588 assert(a->body.c_t == add_node_type);
2589 {
2590 u_int nodes_len = a->body.app_u_u.nodes.node_list_len;
2591 node_address *nodes_to_add = a->body.app_u_u.nodes.node_list_val;
2592 u_int i;
2593 for (i = 0; i < nodes_len; i++) {
2594 if (unsafe_against_event_horizon(&nodes_to_add[i])) return TRUE;
2595 }
2596 }
2597 return FALSE;
2598 }
2599
2600 /**
2601 * Reconfigure the group membership: add new member(s).
2602 *
2603 * It is possible that concurrent reconfigurations take effect between the time
2604 * this reconfiguration was proposed and now.
2605 *
2606 * Particularly, it is possible that any of the concurrent reconfigurations
2607 * modified the event horizon and that the new member(s) do not support event
2608 * horizon reconfigurations.
2609 *
2610 * We account for these situations by validating if adding the new members is
2611 * still possible under the current state.
2612 *
2613 * If it is not, this reconfiguration does not produce any effect, i.e. no new
2614 * configuration is installed.
2615 */
handle_add_node(app_data_ptr a)2616 site_def *handle_add_node(app_data_ptr a) {
2617 if (add_node_unsafe_against_event_horizon(a)) {
2618 /*
2619 * Note that the result of this function is only applicable to
2620 * unused and not-fully-implemented code paths where add_node_type is used
2621 * forcibly.
2622 * Should this fact change, this obviously does not work.
2623 */
2624 return NULL;
2625 }
2626 {
2627 site_def *site = clone_site_def(get_site_def());
2628 IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)););
2629 IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)););
2630 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
2631 add_synode_event(a->app_key););
2632 assert(get_site_def());
2633 assert(site);
2634 add_site_def(a->body.app_u_u.nodes.node_list_len,
2635 a->body.app_u_u.nodes.node_list_val, site);
2636 site->start = getstart(a);
2637 site->boot_key = a->app_key;
2638 site_install_action(site, a->body.c_t);
2639 return site;
2640 }
2641 }
2642
/**
 * Outcome of checking whether the event horizon can be reconfigured.
 *
 * We can reconfigure the event horizon if all group members support
 * reconfiguring the event horizon, and the new event horizon is in the domain
 * [EVENT_HORIZON_MIN, EVENT_HORIZON_MAX].
 *
 * We use the group's latest common XCom protocol as a proxy to decide if all
 * members support reconfiguring the event horizon.
 *
 * If the common protocol is at least version 5 (x_1_4) then all members run
 * compatible server instances.
 *
 * Otherwise there are older instances, and it follows that the event horizon
 * must be the default and cannot be reconfigured.
 */
typedef enum allow_event_horizon_result {
  /** The requested event horizon may be installed. */
  EVENT_HORIZON_ALLOWED,
  /** The requested value lies outside [EVENT_HORIZON_MIN, EVENT_HORIZON_MAX]. */
  EVENT_HORIZON_INVALID,
  /** The latest configuration's protocol does not allow reconfiguring it. */
  EVENT_HORIZON_UNCHANGEABLE
} allow_event_horizon_result;
2665
/**
 * Emits a warning explaining why an event horizon reconfiguration was refused.
 *
 * @param error_code              reason reported by allow_event_horizon();
 *                                EVENT_HORIZON_ALLOWED logs nothing
 * @param attempted_event_horizon the value that was requested
 */
static void log_event_horizon_reconfiguration_failure(
    allow_event_horizon_result error_code,
    xcom_event_horizon attempted_event_horizon) {
  switch (error_code) {
    case EVENT_HORIZON_INVALID:
      /* Adjacent literals are concatenated verbatim, so the separating space
       * before "because" has to be part of the literal itself. */
      G_WARNING("The event horizon was not reconfigured to %" PRIu32
                " because its domain is [%" PRIu32 ", %" PRIu32 "]",
                attempted_event_horizon, xcom_get_minimum_event_horizon(),
                xcom_get_maximum_event_horizon());
      break;
    case EVENT_HORIZON_UNCHANGEABLE:
      G_WARNING("The event horizon was not reconfigured to %" PRIu32
                " because some of the group's members do not support "
                "reconfiguring the event horizon",
                attempted_event_horizon);
      break;
    case EVENT_HORIZON_ALLOWED:
      /* Not a failure: nothing to report. */
      break;
  }
}
2686
allow_event_horizon(xcom_event_horizon event_horizon)2687 static allow_event_horizon_result allow_event_horizon(
2688 xcom_event_horizon event_horizon) {
2689 if (event_horizon < EVENT_HORIZON_MIN || event_horizon > EVENT_HORIZON_MAX)
2690 return EVENT_HORIZON_INVALID;
2691
2692 {
2693 const site_def *latest_config = get_site_def();
2694 if (!reconfigurable_event_horizon(latest_config->x_proto)) {
2695 assert(backwards_compatible(latest_config->event_horizon));
2696 return EVENT_HORIZON_UNCHANGEABLE;
2697 }
2698 }
2699 return EVENT_HORIZON_ALLOWED;
2700 }
2701
unsafe_event_horizon_reconfiguration(app_data_ptr a)2702 static bool_t unsafe_event_horizon_reconfiguration(app_data_ptr a) {
2703 assert(a->body.c_t == set_event_horizon_type);
2704 {
2705 xcom_event_horizon new_event_horizon = a->body.app_u_u.event_horizon;
2706 bool_t result = FALSE;
2707 allow_event_horizon_result error_code;
2708 error_code = allow_event_horizon(new_event_horizon);
2709 switch (error_code) {
2710 case EVENT_HORIZON_INVALID:
2711 case EVENT_HORIZON_UNCHANGEABLE:
2712 log_event_horizon_reconfiguration_failure(error_code,
2713 new_event_horizon);
2714 result = TRUE;
2715 break;
2716 case EVENT_HORIZON_ALLOWED:
2717 break;
2718 }
2719 return result;
2720 }
2721 }
2722
are_there_dead_nodes_in_new_config(app_data_ptr a)2723 static bool_t are_there_dead_nodes_in_new_config(app_data_ptr a) {
2724 assert(a->body.c_t == force_config_type);
2725
2726 {
2727 u_int nr_nodes_to_add = a->body.app_u_u.nodes.node_list_len;
2728 node_address *nodes_to_change = a->body.app_u_u.nodes.node_list_val;
2729 uint32_t i;
2730 G_DEBUG("Checking for dead nodes in Forced Configuration")
2731 for (i = 0; i < nr_nodes_to_add; i++) {
2732 node_no node = find_nodeno(get_site_def(), nodes_to_change[i].address);
2733
2734 if (node == get_nodeno(get_site_def()))
2735 continue; /* No need to validate myself */
2736
2737 if (node == VOID_NODE_NO) {
2738 G_ERROR(
2739 "%s is not in the current configuration."
2740 "Only members in the current configuration can be present"
2741 " in a forced configuration list",
2742 nodes_to_change[i].address)
2743 return TRUE;
2744 }
2745
2746 if (may_be_dead(get_site_def()->detected, node, task_now())) {
2747 G_ERROR(
2748 "%s is suspected to be failed."
2749 "Only alive members in the current configuration should be present"
2750 " in a forced configuration list",
2751 nodes_to_change[i].address)
2752 return TRUE;
2753 }
2754 }
2755 }
2756
2757 return FALSE;
2758 }
2759
2760 /**
2761 * Reconfigure the event horizon.
2762 *
2763 * It is possible that concurrent reconfigurations take effect between the
2764 * time this reconfiguration was proposed and now.
2765 *
2766 * Particularly, it is possible that any of the concurrent reconfigurations
2767 * added a new member which does not support reconfiguring the event
2768 * horizon.
2769 *
2770 * We account for these situations by validating if the event horizon
2771 * reconfiguration is still possible under the current state.
2772 *
2773 * If it is not, this reconfiguration does not produce any effect, i.e. no
2774 * new configuration is installed.
2775 */
handle_event_horizon(app_data_ptr a)2776 bool_t handle_event_horizon(app_data_ptr a) {
2777 if (unsafe_event_horizon_reconfiguration(a)) return FALSE;
2778
2779 {
2780 xcom_event_horizon new_event_horizon = a->body.app_u_u.event_horizon;
2781 const site_def *latest_config = get_site_def();
2782 site_def *new_config = clone_site_def(latest_config);
2783 IFDBG(D_NONE, FN; NDBG(new_event_horizon, u));
2784 IFDBG(D_NONE, FN; NDBG(new_event_horizon, u));
2785 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
2786 add_synode_event(a->app_key););
2787 assert(get_site_def());
2788 assert(new_config);
2789 new_config->event_horizon = new_event_horizon;
2790 new_config->start = getstart(a);
2791 new_config->boot_key = a->app_key;
2792 site_install_action(new_config, a->body.c_t);
2793 G_INFO("The event horizon was reconfigured to %" PRIu32, new_event_horizon);
2794 }
2795 return TRUE;
2796 }
2797
terminate_and_exit()2798 void terminate_and_exit() {
2799 IFDBG(D_NONE, FN;);
2800 ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
2801 XCOM_FSM(x_fsm_terminate, int_arg(0)); /* Tell xcom to stop */
2802 XCOM_FSM(x_fsm_exit, int_arg(0)); /* Tell xcom to exit */
2803 if (xcom_expel_cb) xcom_expel_cb(0);
2804 }
2805
/* Return 1 if the site definition lists no nodes at all, otherwise 0. */
static inline int is_empty_site(site_def const *s) {
  return (0 == s->nodes.node_list_len) ? 1 : 0;
}
2809
handle_remove_node(app_data_ptr a)2810 site_def *handle_remove_node(app_data_ptr a) {
2811 site_def *site = clone_site_def(get_site_def());
2812 IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)));
2813 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
2814 add_synode_event(a->app_key);
2815 add_event(EVENT_DUMP_PAD, string_arg("nodeno"));
2816 add_event(EVENT_DUMP_PAD, uint_arg(get_nodeno(site))););
2817
2818 remove_site_def(a->body.app_u_u.nodes.node_list_len,
2819 a->body.app_u_u.nodes.node_list_val, site);
2820 site->start = getstart(a);
2821 site->boot_key = a->app_key;
2822 site_install_action(site, a->body.c_t);
2823 return site;
2824 }
2825
/**
 * Logs, at debug level, that a forced reconfiguration is being ignored.
 *
 * @param a           the ignored request; only reconfiguration cargo types
 *                    produce a message
 * @param caller_name name of the calling function, used as message prefix
 */
static void log_ignored_forced_config(app_data_ptr a,
                                      char const *const caller_name) {
  switch (a->body.c_t) {
    /* Membership changes: report the first listed node. */
    case add_node_type:
      G_DEBUG("%s: Ignoring a forced intermediate, pending add_node for %s",
              caller_name, a->body.app_u_u.nodes.node_list_val[0].address);
      break;
    case remove_node_type:
      G_DEBUG("%s: Ignoring a forced intermediate, pending remove_node for %s",
              caller_name, a->body.app_u_u.nodes.node_list_val[0].address);
      break;

    /* Event horizon change: report the requested value. */
    case set_event_horizon_type:
      G_DEBUG(
          "%s: Ignoring a forced intermediate, pending set_event_horizon for "
          "%" PRIu32,
          caller_name, a->body.app_u_u.event_horizon);
      break;

    /* Whole-group (re)definitions: no payload is reported. */
    case unified_boot_type:
      G_DEBUG("%s: Ignoring a forced intermediate, pending unified_boot",
              caller_name);
      break;
    case force_config_type:
      G_DEBUG("%s: Ignoring a forced intermediate, pending force_config",
              caller_name);
      break;

    /* Meaningless for any other `cargo_type`s. Ignore. The labels are listed
     * explicitly, without a default, so the switch stays exhaustive. */
    case abort_trans:
    case app_type:
    case begin_trans:
    case convert_into_local_server_type:
    case disable_arbitrator:
    case enable_arbitrator:
    case exit_type:
    case get_event_horizon_type:
    case get_synode_app_data_type:
    case prepared_trans:
    case remove_reset_type:
    case reset_type:
    case set_cache_limit:
    case view_msg:
    case x_terminate_and_exit:
    case xcom_boot_type:
    case xcom_set_group:
      break;
  }
}
2872
handle_config(app_data_ptr a,bool const forced)2873 bool_t handle_config(app_data_ptr a, bool const forced) {
2874 assert(a->body.c_t == unified_boot_type ||
2875 a->next == NULL); /* Reconfiguration commands are not batched. */
2876 {
2877 bool_t success = FALSE;
2878 if (forced &&
2879 should_ignore_forced_config_or_view(get_executor_site()->x_proto)) {
2880 log_ignored_forced_config(a, "handle_config");
2881 goto end;
2882 }
2883 switch (a->body.c_t) {
2884 case unified_boot_type:
2885 success = (install_node_group(a) != NULL);
2886 assert(success);
2887 break;
2888 case add_node_type:
2889 /*
2890 * May fail if meanwhile the event horizon was reconfigured and the
2891 * node is incompatible.
2892 */
2893 success = (handle_add_node(a) != NULL);
2894 break;
2895 case remove_node_type:
2896 ADD_DBG(D_BASE,
2897 add_event(EVENT_DUMP_PAD, string_arg("got remove_node_type"));)
2898 success = (handle_remove_node(a) != NULL);
2899 assert(success);
2900 break;
2901 case set_event_horizon_type:
2902 /* May fail if meanwhile an incompatible node joined. */
2903 success = handle_event_horizon(a);
2904 break;
2905 case force_config_type:
2906 success = (install_node_group(a) != NULL);
2907 assert(success);
2908 break;
2909 default:
2910 assert(FALSE); /* Boy oh boy, something is really wrong... */
2911 break;
2912 }
2913 end:
2914 return success;
2915 }
2916 }
2917
/* Return 1 if the site definition assigns this node a node number (i.e. its
   nodeno is not VOID_NODE_NO), otherwise 0. */
static inline int is_member(site_def const *site) {
  return (VOID_NODE_NO == site->nodeno) ? 0 : 1;
}
2921
2922 /*
2923 Execute xcom message stream.
2924
Beware of the exit logic in this task, which is both simple and
not so simple. Consider three configs C1, C2 and C3. C1 has two
nodes, A and B. C2 has only node B. C3 is empty. A config with
2928 message number N will be activated after a delay of (at least)
2929 alpha messages, where alpha is the size of the pipeline (or the
2930 event horizon).
2931
2932 So, C1.start = C1+alpha, and C2.start = C2+alpha. A, which is re‐
2933 moved from C1, cannot exit until a majority of nodes in the new
2934 config C2 (in this case B) has learned all the messages from con‐
2935 fig C1, which means all messages less than C2.start. How can A
2936 know that a majority of C2 has learned those messages?
2937
2938 If we denote the first message that is not yet decided (and exe‐
2939 cuted) by E, the proposers will not try to propose messages with
2940 number >= E+alpha, and all incoming tcp messages with message
2941 number >= E+alpha will be ignored. E is incremented by the ex‐
2942 ecutor task, so all messages < E are known. This means that when
2943 the value of E+alpha is known, all messages up to and including E
2944 are also known, although not all messages E+1..E+alpha‐1 neces‐
2945 sarily are known.
2946
2947 This leads to the requirement that a node which is removed (A)
2948 needs to wait until it knows the value of C2.start+alpha, since
2949 by then it knows that a majority of the nodes in C2 are ready to
2950 execute C2.start, which in turn implies that a majority of nodes
2951 in C2 knows all the values from config C1. Note that the last
2952 message that should be delivered to the application by a node
2953 that is leaving C1 is C2.start‐1, which is the last message of
2954 C1.
2955
2956 How does a node that is removed get to know values from the next
2957 config? There are two ways, and we use both. First, the node
2958 that tries to exit can simply ask for the message. get_xcom_mes‐
2959 sage() will do this for all messages <= max_synode, but it may
2960 take some time. Second, the nodes of C2 can send the messages
2961 C2.start..C2.start+alpha to the nodes that are removed (nodes
2962 that are in C1 but not in C2). inform_removed() does this. We
2963 take care to handle the case where configs are close enough that
2964 C0 < C1 <= C0+alpha by tracking the oldest config that contains
2965 nodes that are leaving.
2966
2967 This takes care of nodes leaving C1. What about nodes that leave
2968 C2? C3 is empty, so B, which is leaving C2, cannot wait for mes‐
2969 sages from C3. But since C3 is empty, there is no need to wait.
2970 It can exit immediately after having executed C3.start‐1, the
2971 last message of C2. What if C3.start‐1 < C2.start+alpha? This can
2972 happen if C2 and C3 are close. In that case, B will exit before A
2973 gets the chance to learn C2.start+alpha, which will leave A hang‐
2974 ing forever. Clearly, we need to impose an additional constraint,
2975 that C3.start must be greater than C2.start+alpha. This is taken
2976 care of by the special test for an empty config.
2977
2978 Complicated and confusing? Not really, but there is a clean and
2979 simple solution which has not been implemented yet, since it re‐
2980 quires more changes to the consensus logic. If we require that
2981 for the messages C2..C2.start‐1 we have a majority from both the
2982 nodes in C1 and the nodes in C2, the nodes not in C2 can exit
2983 when they have executed message C2.start‐1, since we then know
2984 that a majority of the nodes of C2 has agreed on those messages
2985 as well, so they do not depend on the nodes not in C2 any more.
2986 This holds even if C2 is empty. Note that requiring a majority
2987 from both C1 and C2 is different from requiring a majority from
2988 C1+C2, which means that the proposer logic needs to consider an‐
2989 swers from two different sets of acceptors for those messages.
2990 Since acceptors are identified by their node number, and the node
2991 numbers need not be the same for both configs, we need to main‐
2992 tain a mapping between the nodes numbers of any two consecutive
2993 configs. Alternatively, we could remove the node numbers alto‐
2994 gether, and always use a unique, unchanging ID for a node, like
2995 IP address + port.
2996
2997 TODO:
2998
2999 Move the delayed delivery logic into MCM-specific code, since it is
3000 only needed by MCM. Is it still needed?
3001
3002 Rewrite exit logic as FSM with more states. (RUN, EMPTY_EXIT,
3003 NOT_MEMBER_EXIT) to avoid unnecessary tests.
3004
3005 */
3006
3007 /* FIFO which tracks the message numbers where we should deliver queued messages
3008 or
3009 inform the removed nodes */
3010 #define FIFO_SIZE 1000
3011 static struct {
3012 int n;
3013 int front;
3014 int rear;
3015 synode_no q[FIFO_SIZE];
3016 } delay_fifo;
3017
addone(int i)3018 static inline int addone(int i) { return ((i + 1) % FIFO_SIZE); }
3019
3020 /* Is queue empty? */
fifo_empty()3021 static inline int fifo_empty() { return delay_fifo.n <= 0; }
3022
3023 /* Is queue full? */
fifo_full()3024 static inline int fifo_full() { return delay_fifo.n >= FIFO_SIZE; }
3025
3026 /* Insert in queue */
fifo_insert(synode_no s)3027 static inline void fifo_insert(synode_no s) {
3028 if (!fifo_full()) {
3029 delay_fifo.n++;
3030 delay_fifo.q[delay_fifo.rear] = s;
3031 delay_fifo.rear = addone(delay_fifo.rear);
3032 }
3033 }
3034
3035 /* Extract first from queue */
fifo_extract()3036 static inline synode_no fifo_extract() {
3037 if (!fifo_empty()) {
3038 synode_no ret = delay_fifo.q[delay_fifo.front];
3039 delay_fifo.front = addone(delay_fifo.front);
3040 delay_fifo.n--;
3041 return ret;
3042 } else {
3043 return null_synode;
3044 }
3045 }
3046
3047 /* Return first in queue, but do not dequeue */
fifo_front()3048 static inline synode_no fifo_front() {
3049 if (!fifo_empty()) {
3050 return delay_fifo.q[delay_fifo.front];
3051 } else {
3052 return null_synode;
3053 }
3054 }
3055
3056 struct execute_context;
3057 typedef struct execute_context execute_context;
3058
3059 typedef void (*exec_fp)(execute_context *xc);
3060
3061 struct execute_context {
3062 pax_machine *p;
3063 int n;
3064 int old_n;
3065 double old_t;
3066 synode_no exit_synode;
3067 synode_no delivery_limit;
3068 exec_fp state;
3069 int exit_flag; /* To avoid state explosion */
3070 int inform_index;
3071 };
3072
3073 static void dump_exec_state(execute_context *xc, long dbg);
3074 static int x_check_exit(execute_context *xc);
3075 static int x_check_execute_inform(execute_context *xc);
3076 static void x_fetch(execute_context *xc);
3077 static void x_execute(execute_context *xc);
3078 static void x_check_increment_fetch(execute_context *xc);
3079 static void x_check_increment_execute(execute_context *xc);
3080 static void x_terminate(execute_context *xc);
3081
3082 struct fp_name {
3083 exec_fp fp;
3084 char const *name;
3085 };
3086
3087 #define NAME(f) \
3088 { f, #f }
3089
3090 /* List of fp, name pairs */
3091 static struct fp_name oblist[] = {
3092 NAME(x_fetch), NAME(x_execute), NAME(x_terminate), {0, 0}};
3093 #undef NAME
3094
3095 /* purecov: begin deadcode */
get_fp_name(exec_fp fp)3096 char const *get_fp_name(exec_fp fp) {
3097 struct fp_name *list = oblist;
3098 while (list->fp) {
3099 if (list->fp == fp) return list->name;
3100 list++;
3101 }
3102 return "no such fp";
3103 }
3104 /* purecov: end */
3105
/* Prepare the executor for a newly installed site.

   In every case a trigger synode, one event horizon past site->start, is
   pushed on the delay FIFO and xc->inform_index is incremented.

   When this node is not a member of the new site, the exit bookkeeping in xc
   (delivery_limit, exit_synode, exit_flag) is set up as well. */
static void setup_exit_handling(execute_context *xc, site_def *site) {
  synode_no delay_until;

  if (!is_member(site)) { /* Not in this site */
    /* Note limit of delivery. We should never deliver anything after the
     * start of the next site. */
    xc->delivery_limit = site->start;

    /* If we are not a member of the new site, we should exit
       after having seen enough messages beyond the end of the current site.
       This ensures that a majority of the next site will have agreed upon all
       messages that belong to the current site.
     */
    xc->exit_synode = compute_delay(site->start, site->event_horizon);

    /* See if site will be empty when we leave. If the new site
     * is empty, we should exit after having delivered the last
     * message from the old site. */
    if (is_empty_site(site)) {
      /* If site is empty, increase start to allow nodes to terminate before
       * start. This works as if there was a non-empty group after the
       * exit_synode, effectively allowing the majority of the current group
       * to agree on all messages up to exit_synode.
       */
      synode_no const one_horizon_later =
          compute_delay(site->start, site->event_horizon);
      site->start = compute_delay(one_horizon_later, site->event_horizon);
    }

    if (!synode_lt(xc->exit_synode, max_synode)) {
      /* We need messages from the next site, so set max_synode accordingly. */
      set_max_synode(incr_synode(xc->exit_synode));
    }

    /* Note where we switch to execute and inform removed nodes */
    delay_until = xc->exit_synode;

    IFDBG(D_EXEC, FN; SYCEXP(delay_until); SYCEXP(executed_msg);
          SYCEXP(max_synode));
    IFDBG(D_EXEC, FN; SYCEXP(xc->exit_synode); SYCEXP(executed_msg);
          SYCEXP(max_synode));

    /* Note that we will exit */
    xc->exit_flag = 1;
  } else {
    delay_until = compute_delay(site->start, site->event_horizon);
  }

  /* Ensure that max_synode is greater than trigger for delivery */
  if (synode_gt(delay_until, max_synode)) {
    set_max_synode(incr_msgno(delay_until));
  }
  fifo_insert(delay_until);
  xc->inform_index++;

  /* If I am the leader, will propose no-ops until current max_synode
     (NOTE(review): no code in this function does that - presumably handled
     elsewhere; confirm). */
}
3160
3161 /* Called immediately after we have got a new message.
3162 Terminate if we have no site.
3163 Otherwise, handle config messages immediately.
3164 Afterwards, switch to check_exit_fetch. */
x_fetch(execute_context * xc)3165 static void x_fetch(execute_context *xc) {
3166 /* Execute unified_boot immediately, but do not deliver site message
3167 * until we are ready to execute messages from the new site
3168 * definition. At that point we can be certain that a majority have
3169 * learned everything from the old site. */
3170
3171 app_data *app = xc->p->learner.msg->a;
3172 if (app && is_config(app->body.c_t) &&
3173 synode_gt(executed_msg, get_site_def()->boot_key)) /* Redo test */
3174 {
3175 site_def *site = 0;
3176 bool_t reconfiguration_successful =
3177 handle_config(app, (xc->p->learner.msg->force_delivery != 0));
3178 if (reconfiguration_successful) {
3179 /* If the reconfiguration failed then it does not have any
3180 * effect. What follows only makes sense if the reconfiguration
3181 * took effect. */
3182 set_last_received_config(executed_msg);
3183 garbage_collect_site_defs(delivered_msg);
3184 site = get_site_def_rw();
3185 if (site == 0) {
3186 xc->state = x_terminate;
3187 return;
3188 }
3189 IFDBG(D_EXEC, FN; STRLIT("new config "); SYCEXP(site->boot_key););
3190
3191 if (xc->exit_flag == 0) {
3192 /* We have not yet set the exit trigger */
3193 setup_exit_handling(xc, site);
3194 }
3195 }
3196 } else {
3197 IFDBG(D_EXEC, FN; SYCEXP(executed_msg); SYCEXP(get_site_def()->boot_key));
3198 }
3199 /* Check for exit and increment executed_msg */
3200 x_check_increment_fetch(xc);
3201 }
3202
3203 /* Push messages to nodes that have been removed.
3204 Signal switch to execute when nothing left to push by returning 1 */
x_check_execute_inform(execute_context * xc)3205 static int x_check_execute_inform(execute_context *xc) {
3206 IFDBG(D_EXEC, FN; SYCEXP(fifo_front()); SYCEXP(executed_msg);
3207 SYCEXP(xc->exit_synode); NDBG(xc->exit_flag, d));
3208 if (fifo_empty()) {
3209 return 1;
3210 } else if (!synode_lt(executed_msg, fifo_front())) {
3211 while (
3212 !fifo_empty() &&
3213 !synode_lt(executed_msg, fifo_front())) { /* More than one may match */
3214 inform_removed(xc->inform_index, 0);
3215 fifo_extract();
3216 (xc->inform_index)--;
3217 }
3218 garbage_collect_servers();
3219 return 1;
3220 }
3221 dump_exec_state(xc, D_EXEC);
3222 return 0;
3223 }
3224
3225 /* Check for exit and return 1 if we should exit. */
x_check_exit(execute_context * xc)3226 static int x_check_exit(execute_context *xc) {
3227 /* See if we should exit when having seen this message */
3228 return (xc->exit_flag && !synode_lt(executed_msg, xc->exit_synode) &&
3229 !synode_lt(delivered_msg, xc->delivery_limit));
3230 }
3231
3232 /* Terminate if we should exit, else increment executed_msg and see if we should
3233 * switch to execute */
x_check_increment_fetch(execute_context * xc)3234 static void x_check_increment_fetch(execute_context *xc) {
3235 if (x_check_exit(xc)) {
3236 xc->state = x_terminate;
3237 } else {
3238 SET_EXECUTED_MSG(incr_synode(executed_msg));
3239 if (x_check_execute_inform(xc)) {
3240 xc->state = x_execute;
3241 }
3242 }
3243 }
3244
3245 /* Terminate if we should exit, else increment delivered_msg and see if we
3246 * should switch to fetch */
x_check_increment_execute(execute_context * xc)3247 static void x_check_increment_execute(execute_context *xc) {
3248 if (x_check_exit(xc)) {
3249 xc->state = x_terminate;
3250 } else {
3251 /* Increment delivered_msg and switch to fetch if delivered_msg equals
3252 * executed_msg; */
3253 delivered_msg = incr_synode(delivered_msg);
3254 if (synode_eq(delivered_msg, executed_msg)) {
3255 xc->state = x_fetch;
3256 }
3257 }
3258 }
3259
3260 /* Deliver one message if it should be delivered. Switch state to see if
3261 we should exit */
x_execute(execute_context * xc)3262 static void x_execute(execute_context *xc) {
3263 site_def const *x_site = find_site_def(delivered_msg);
3264
3265 IFDBG(D_EXEC, FN; SYCEXP(delivered_msg); SYCEXP(delivered_msg);
3266 SYCEXP(executed_msg); SYCEXP(xc->exit_synode); NDBG(xc->exit_flag, d));
3267 if (!is_cached(delivered_msg)) {
3268 /* purecov: begin deadcode */
3269 #ifdef TASK_EVENT_TRACE
3270 dump_task_events();
3271 #endif
3272 /* purecov: end */
3273 }
3274 assert(is_cached(delivered_msg) && "delivered_msg should have been cached");
3275 xc->p = get_cache(delivered_msg);
3276 if (LOSER(delivered_msg, x_site)) {
3277 #ifdef IGNORE_LOSERS
3278 IFDBG(D_EXEC, FN; debug_loser(delivered_msg); PTREXP(x_site);
3279 dbg_node_set(x_site->global_node_set));
3280 #endif
3281 } else if (xc->p->learner.msg->msg_type != no_op) {
3282 /* Avoid delivery after start if we should exit */
3283 if (xc->exit_flag == 0 || synode_lt(delivered_msg, xc->delivery_limit)) {
3284 /* IFDBG(D_EXEC, FN; NDBG(ep->state, d); STRLIT("executing ");
3285 SYCEXP(delivered_msg); SYCEXP(executed_msg);
3286 SYCEXP(xc->delivery_limit); NDBG(xc->exit_flag, d)); */
3287 last_delivered_msg = delivered_msg;
3288 execute_msg(find_site_def_rw(delivered_msg), xc->p, xc->p->learner.msg);
3289 }
3290 }
3291 /* Garbage collect old servers */
3292 if (synode_eq(delivered_msg, x_site->start)) {
3293 garbage_collect_servers();
3294 }
3295 #if defined(TASK_DBUG_ON) && TASK_DBUG_ON
3296 IFDBG(D_EXEC, perf_dbg(&xc->n, &xc->old_n, &xc->old_t));
3297 #endif
3298 /* Check for exit and increment delivered_msg */
3299 x_check_increment_execute(xc);
3300 }
3301
3302 static execute_context *debug_xc;
3303
dump_exec_state(execute_context * xc MY_ATTRIBUTE ((unused)),long dbg MY_ATTRIBUTE ((unused)))3304 static void dump_exec_state(execute_context *xc MY_ATTRIBUTE((unused)),
3305 long dbg MY_ATTRIBUTE((unused))) {
3306 IFDBG(dbg, FN; SYCEXP(executed_msg); SYCEXP(delivered_msg);
3307 SYCEXP(max_synode); SYCEXP(last_delivered_msg); NDBG(delay_fifo.n, d);
3308 NDBG(delay_fifo.front, d); NDBG(delay_fifo.rear, d);
3309 SYCEXP(fifo_front()); SYCEXP(xc->exit_synode);
3310 SYCEXP(xc->delivery_limit); NDBG(xc->exit_flag, d);
3311 NDBG(xc->inform_index, d); NDBG(prop_started, d);
3312 NDBG(prop_finished, d););
3313 }
3314
dump_debug_exec_state()3315 static void dump_debug_exec_state() {
3316 if (debug_xc) dump_exec_state(debug_xc, D_EXEC);
3317 }
3318
3319 /* Terminate the excutor_task. */
x_terminate(execute_context * xc)3320 static void x_terminate(execute_context *xc) {
3321 dump_exec_state(xc, D_BUG);
3322 xc->state = 0;
3323 }
3324
/* Task which runs the executor state machine until xcom shuts down or a state
   function clears the state pointer, then informs the removed nodes and,
   unless NO_DELAYED_TERMINATION is defined, terminates xcom after a delay. */
static int executor_task(task_arg arg MY_ATTRIBUTE((unused))) {
  DECL_ENV
  execute_context xc;
  END_ENV;
  /* xcom_debug_mask = D_BUG; */
  IFDBG(D_EXEC, FN; NDBG(stack->sp->state, d); SYCEXP(executed_msg););
  TASK_BEGIN

  /* Start from a clean execution context ... */
  ep->xc.p = NULL;
  ep->xc.exit_flag = 0;
  ep->xc.exit_synode = null_synode;
  ep->xc.delivery_limit = null_synode;
  ep->xc.inform_index = -1;
  ep->xc.n = 0;
  ep->xc.old_n = 0;
  ep->xc.old_t = task_now();

  /* ... and an empty delay FIFO. */
  delay_fifo.n = 0;
  delay_fifo.front = 0;
  delay_fifo.rear = 0;

  debug_xc = &ep->xc;

  /* Message number 0 is never executed; begin at 1. */
  if (executed_msg.msgno == 0) executed_msg.msgno = 1;
  delivered_msg = executed_msg;
  ep->xc.state = x_fetch;
  executor_site = find_site_def_rw(executed_msg);

  /* The following loop implements a state machine based on function pointers,
     effectively acting as non-local gotos.
     The functions all operate on data in the execution context xc, and
     switch state by setting xc->state to the function corresponding to the new
     state.
  */
  while (!xcom_shutdown && ep->xc.state != 0) {
    IFDBG(D_EXEC, FN; STRLIT(get_fp_name(ep->xc.state)););
    if (ep->xc.state != x_fetch) {
      ep->xc.state(&ep->xc);
    } else if (LOSER(executed_msg, executor_site)) {
      /* x_fetch is a special case because of task macros */
      x_check_increment_fetch(&ep->xc); /* Just increment past losers */
    } else {
      /* The blocking fetch has to be issued from the task body itself. */
      TASK_CALL(get_xcom_message(&ep->xc.p, executed_msg, FIND_MAX));
      IFDBG(D_EXEC, FN; STRLIT("got message "); SYCEXP(ep->xc.p->synode);
            COPY_AND_FREE_GOUT(dbg_app_data(ep->xc.p->learner.msg->a)));
      x_fetch(&ep->xc);
    }
  }

  /* Inform all removed nodes before we exit */
  ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
  inform_removed(ep->xc.inform_index, 1);
  dump_exec_state(&ep->xc, D_BUG);

#ifndef NO_DELAYED_TERMINATION
  IFDBG(D_EXEC, FN; STRLIT("delayed terminate and exit"));

  /* Wait to allow messages to propagate */
  TASK_DELAY(TERMINATE_DELAY);

  /* Start termination of xcom */
  terminate_and_exit();
#endif

  FINALLY
  dump_exec_state(&ep->xc, D_BUG);
  IFDBG(D_BUG, FN; STRLIT(" shutdown "); SYCEXP(executed_msg);
        NDBG(task_now(), f));
  TASK_END;
}
3393
get_sweep_start()3394 static synode_no get_sweep_start() {
3395 synode_no find = executed_msg;
3396 find.node = get_nodeno(find_site_def(find));
3397 if (find.node < executed_msg.node) {
3398 find = incr_msgno(find);
3399 }
3400 return find;
3401 }
3402
/* Task which walks this node's synodes below max_synode and skips those whose
   Paxos machine shows no activity. It deactivates itself after each sweep and
   runs until xcom shuts down. */
static int sweeper_task(task_arg arg MY_ATTRIBUTE((unused))) {
  DECL_ENV
  synode_no find;
  END_ENV;

  TASK_BEGIN

  ep->find = get_sweep_start();

  while (!xcom_shutdown) {
    /* In case group id has changed */
    ep->find.group_id = executed_msg.group_id;
#ifndef AGGRESSIVE_SWEEP
    /* Give way for as long as any other task wants to run. */
    while (!is_only_task()) {
      TASK_YIELD;
    }
#endif
    ADD_DBG(D_NONE, add_event(EVENT_DUMP_PAD, string_arg("sweeper ready"));
            add_synode_event(executed_msg););

    while (synode_lt(ep->find, max_synode) && !too_far(ep->find)) {
      pax_machine *pm = 0;
      ADD_DBG(D_NONE,
              add_event(EVENT_DUMP_PAD, string_arg("sweeper examining"));
              add_synode_event(ep->find););

      /* Without a valid node number there is nothing to sweep: try to
       * recompute the start once executed_msg has moved past us, and give up
       * for this round if that does not help. */
      if (ep->find.node == VOID_NODE_NO) {
        if (synode_gt(executed_msg, ep->find)) {
          ep->find = get_sweep_start();
        }
        if (ep->find.node == VOID_NODE_NO) goto deactivate;
      }

      pm = get_cache(ep->find);
      ADD_DBG(D_CONS,
              add_event(EVENT_DUMP_PAD, string_arg("sweeper checking"));
              add_synode_event(ep->find);
              add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(pm->op)));
              add_event(EVENT_DUMP_PAD, string_arg("pm"));
              add_event(EVENT_DUMP_PAD, void_arg(pm)););

      /* We want full 3 phase Paxos for forced messages */
      if (pm && !pm->force_delivery) {
        ADD_DBG(
            D_CONS, add_event(EVENT_DUMP_PAD, string_arg("sweeper checking"));
            add_synode_event(ep->find);
            add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(pm->op)));
            add_event(EVENT_DUMP_PAD, string_arg("is_busy_machine"));
            add_event(EVENT_DUMP_PAD, int_arg(is_busy_machine(pm)));
            add_event(EVENT_DUMP_PAD, string_arg("pm->acceptor.promise.cnt"));
            add_event(EVENT_DUMP_PAD, int_arg(pm->acceptor.promise.cnt));
            add_event(EVENT_DUMP_PAD, string_arg("finished(pm)"));
            add_event(EVENT_DUMP_PAD, int_arg(finished(pm)));
            add_event(EVENT_DUMP_PAD, string_arg("pm->acceptor.msg"));
            add_event(EVENT_DUMP_PAD, void_arg(pm->acceptor.msg)););

        /* Skip only machines that are not busy, have promised nothing, have
         * accepted nothing and are not finished. */
        if (!is_busy_machine(pm) && pm->acceptor.promise.cnt == 0 &&
            !pm->acceptor.msg && !finished(pm)) {
          pm->op = skip_op;
          ADD_DBG(D_CONS,
                  add_event(EVENT_DUMP_PAD, string_arg("sweeper skipping"));
                  add_synode_event(ep->find); add_event(
                      EVENT_DUMP_PAD, string_arg(pax_op_to_str(pm->op))););
          skip_msg(pax_msg_new(ep->find, find_site_def(ep->find)));
          IFDBG(D_NONE, FN; STRLIT("skipping "); SYCEXP(ep->find));
        }
      }

      ep->find = incr_msgno(ep->find);
    }

  deactivate:
    TASK_DEACTIVATE;
  }
  FINALLY
  IFDBG(D_BUG, FN; STRLIT(" shutdown sweeper "); SYCEXP(executed_msg);
        NDBG(task_now(), f));
  TASK_END;
}
3486
/**
  Compute the next wakeup delay from the previous one.

  @param old the previous delay; 0.0 requests a fresh initial delay
  @return the new delay. An initial delay is derived from median_time() plus
          a random component from xcom_drand48(); otherwise the old delay is
          multiplied by 1.4142136. The result is repeatedly divided by
          1.31415926 until it no longer exceeds the maximum threshold.
*/
static double wakeup_delay(double old) {
#ifdef EXECUTOR_TASK_AGGRESSIVE_NO_OP
  double const maximum_threshold = 1.0;
#else
  double const maximum_threshold = 3.0;
#endif /* EXECUTOR_TASK_AGGRESSIVE_NO_OP */
  double delay = 0.0;

  if (old != 0.0) {
    delay = old * 1.4142136; /* Exponential backoff */
  } else {
    double m = median_time();
    /* Fall back to 0.1 when no usable median is available */
    if (m == 0.0 || m > 0.3) m = 0.1;
    delay = 0.1 + 5.0 * m + m * xcom_drand48();
  }

  /* Shrink until the delay is within the allowed maximum */
  while (delay > maximum_threshold) delay /= 1.31415926;

  return delay;
}
3507
/**
  Propose a no_op for instance `find`.

  A fresh message is installed as p->proposer.msg and initialised with
  create_noop(); a clone of it is handed to push_msg_3p(). If cloning fails,
  nothing is pushed and a debug message is logged.

  @param find the instance to propose the no_op for
  @param p    the pax_machine for that instance
*/
static void propose_noop(synode_no find, pax_machine *p) {
  site_def const *site = find_site_def(find);
  pax_msg *clone = NULL;

  IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(executed_msg));
  assert(!too_far(find));

  /* Prepare to send a noop */
  replace_pax_msg(&p->proposer.msg, pax_msg_new(find, site));
  assert(p->proposer.msg);
  create_noop(p->proposer.msg);

  clone = clone_pax_msg(p->proposer.msg);
  if (clone == NULL) {
    /* purecov: begin inspected */
    G_DEBUG("Unable to propose NoOp due to an OOM error.");
    /* purecov: end */
    return;
  }

  push_msg_3p(site, p, clone, find, no_op);
}
3527
/**
  Send a read_op for instance `find` to other nodes.

  Nothing is sent when find_site_def() yields no site. Otherwise a read
  message is created and:
   - sent to all the others when `find.node` equals our node number, or when
     we have no node number (VOID_NODE_NO);
   - sent via send_to_someone() (intended to pick a random node) in the
     remaining case.

  @param find the instance whose value is requested
*/
static void send_read(synode_no find) {
  site_def const *site = find_site_def(find);

  IFDBG(D_NONE, FN; NDBG(get_maxnodes(site), u); NDBG(get_nodeno(site), u););
  ADD_DBG(D_CONS, add_event(EVENT_DUMP_PAD, string_arg("find"));
          add_synode_event(find); add_event(EVENT_DUMP_PAD, string_arg("site"));
          add_event(EVENT_DUMP_PAD, void_arg((void *)find_site_def_rw(find)));
          add_event(EVENT_DUMP_PAD, string_arg("get_nodeno(site)"));
          add_event(EVENT_DUMP_PAD, uint_arg(get_nodeno(site))););

  if (!site) return;

  {
    /* See if node number matches ours */
    int const own_slot = (find.node == get_nodeno(site));
    pax_msg *pm = pax_msg_new(find, site);

    /* Hold a reference for as long as the message is being sent */
    ref_msg(pm);
    create_read(site, pm);

    if (own_slot) {
      /* Node number matches our own number, ask all the others */
      send_to_others(site, pm, "send_read");
    } else {
      IFDBG(D_NONE, FN; SYCEXP(find););

      IFDBG(D_NONE, FN; NDBG(get_maxnodes(site), u); NDBG(get_nodeno(site), u);
            PTREXP(pm));
      if (get_nodeno(site) == VOID_NODE_NO) {
        /* We have no node number, ask all the others */
        send_to_others(site, pm, "send_read");
      } else {
        /* Ask a random node */
        send_to_someone(site, pm, "send_read");
      }
    }

    unref_msg(&pm);
  }
}
3570
3571 /* Find missing values */
3572
/**
  Decide whether a no_op may be proposed for the machine `p`.

  @return non-zero when `p` is neither finished nor busy, and additionally
          either is a forcing node or has not been recently active;
          0 otherwise.
*/
static int ok_to_propose(pax_machine *p) {
  int retval = 0;

  if (is_forcing_node(p) || !recently_active(p)) {
    retval = !finished(p) && !is_busy_machine(p);
  }

  IFDBG(D_NONE, FN; NDBG(p->synode.node, u); NDBG(recently_active(p), d);
        NDBG(finished(p), d); NDBG(is_busy_machine(p), d); NDBG(retval, d));
  return retval;
}
3580
read_missing_values(int n)3581 static void read_missing_values(int n) {
3582 synode_no find = executed_msg;
3583 synode_no end = max_synode;
3584 int i = 0;
3585
3586 IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end));
3587 if (synode_gt(executed_msg, max_synode) ||
3588 synode_eq(executed_msg, null_synode))
3589 return;
3590
3591 while (!synode_gt(find, end) && i < n && !too_far(find)) {
3592 pax_machine *p = force_get_cache(find);
3593 ADD_DBG(D_NONE, add_synode_event(find); add_synode_event(end);
3594 add_event(EVENT_DUMP_PAD, string_arg("active "));
3595 add_event(EVENT_DUMP_PAD, int_arg(recently_active(p)));
3596 add_event(EVENT_DUMP_PAD, string_arg("finished "));
3597 add_event(EVENT_DUMP_PAD, int_arg(finished(p)));
3598 add_event(EVENT_DUMP_PAD, string_arg("busy "));
3599 add_event(EVENT_DUMP_PAD, int_arg(is_busy_machine(p))););
3600 IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end); NDBG(recently_active(p), d);
3601 NDBG(finished(p), d); NDBG(is_busy_machine(p), d));
3602 if (!recently_active(p) && !finished(p) && !is_busy_machine(p)) {
3603 send_read(find);
3604 }
3605 find = incr_synode(find);
3606 i++;
3607 }
3608 }
3609
propose_missing_values(int n)3610 static void propose_missing_values(int n) {
3611 synode_no find = executed_msg;
3612 synode_no end = max_synode;
3613 int i = 0;
3614
3615 IFDBG(D_NONE, FN; NDBG(get_maxnodes(get_site_def()), u); SYCEXP(find);
3616 SYCEXP(end));
3617 if (synode_gt(executed_msg, max_synode) ||
3618 synode_eq(executed_msg, null_synode))
3619 return;
3620
3621 IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end));
3622 i = 0;
3623 while (!synode_gt(find, end) && i < n && !too_far(find)) {
3624 pax_machine *p = force_get_cache(find);
3625 if (wait_forced_config) {
3626 force_pax_machine(p, 1);
3627 }
3628 IFDBG(D_NONE, FN; NDBG(ok_to_propose(p), d); TIMECEXP(task_now());
3629 TIMECEXP(p->last_modified); SYCEXP(find));
3630 if (get_nodeno(find_site_def(find)) == VOID_NODE_NO) break;
3631 if (ok_to_propose(p)) {
3632 propose_noop(find, p);
3633 }
3634 find = incr_synode(find);
3635 i++;
3636 }
3637 }
3638
3639 /* Propose a noop for the range find..end */
request_values(synode_no find,synode_no end)3640 void request_values(synode_no find, synode_no end) {
3641 IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end););
3642 while (!synode_gt(find, end) && !too_far(find)) {
3643 pax_machine *p = get_cache(find);
3644 site_def const *site = find_site_def(find);
3645 if (get_nodeno(site) == VOID_NODE_NO) break;
3646 if (!finished(p) && !is_busy_machine(p)) {
3647 /* Prepare to send a noop */
3648 replace_pax_msg(&p->proposer.msg, pax_msg_new(find, site));
3649 assert(p->proposer.msg);
3650 create_noop(p->proposer.msg);
3651
3652 IFDBG(D_NONE, FN; STRLIT("propose "); SYCEXP(find););
3653 push_msg_3p(site, p, pax_msg_new(find, site), find, no_op);
3654 }
3655 find = incr_synode(find);
3656 }
3657 }
3658
3659 /* Message handlers */
3660
/*
  Reply to the sender of a message.
  Avoid using the outbound TCP connection to the node that sent the message,
  since it is simpler and safer to always use the same TCP connection as the
  one the message arrived on. We then know that the answer will always go to
  the same client (and the same instance of that client) that sent the request.

  reply_msg(m) picks up `site` and `reply_queue` from the scope in which it is
  expanded. When is_local_node() accepts m->from, the message is passed
  straight to dispatch_op(); otherwise it is wrapped in a new msg_link and
  linked into reply_queue.

  NOTE(review): the expansion is a bare { } block, not do { } while (0), so
  `reply_msg(x);` cannot be followed directly by `else`.
*/
#define reply_msg(m)                                              \
  {                                                               \
    if (is_local_node((m)->from, site)) {                         \
      dispatch_op(site, m, NULL);                                 \
    } else {                                                      \
      link_into(&(msg_link_new((m), (m)->from)->l), reply_queue); \
    }                                                             \
  }

/*
  Declare a local `pax_msg *reply` and initialise it from x through
  CLONE_PAX_MSG. Because it declares a variable it can be used only once per
  scope.
*/
#define CREATE_REPLY(x)  \
  pax_msg *reply = NULL; \
  CLONE_PAX_MSG(reply, x)

/*
  Send the local `reply` with reply_msg() and then drop our pointer to it.

  NOTE(review): this expands to TWO statements. In `if (cond) SEND_REPLY;`
  only reply_msg() is guarded by the condition; replace_pax_msg(&reply, NULL)
  always runs. Presumably that is harmless when reply is already NULL -
  confirm against replace_pax_msg().
*/
#define SEND_REPLY  \
  reply_msg(reply); \
  replace_pax_msg(&reply, NULL)
3684
safe_app_data_copy(pax_msg ** target,app_data_ptr source)3685 bool_t safe_app_data_copy(pax_msg **target, app_data_ptr source) {
3686 copy_app_data(&(*target)->a, source);
3687 if ((*target)->a == NULL && source != NULL) {
3688 oom_abort = 1;
3689 replace_pax_msg(target, NULL);
3690 return FALSE;
3691 }
3692 return TRUE;
3693 }
3694
/**
  Build a learn message carrying the value already learned by `p`, as a
  reply to `pm`.

  The reply is created from `pm` via CREATE_REPLY, takes its proposal,
  message type and application data from p->learner.msg, and is marked with
  set_learn_type().

  @param p      machine holding the learned value in p->learner.msg
  @param pm     the message being replied to
  @param synode instance number to put in the reply
  @return the reply, or NULL if safe_app_data_copy() released it because
          copying the application data failed
*/
static pax_msg *create_learn_msg_for_ignorant_node(pax_machine *p, pax_msg *pm,
                                                   synode_no synode) {
  pax_msg *learned = p->learner.msg;
  CREATE_REPLY(pm);
  IFDBG(D_NONE, FN; SYCEXP(synode));

  reply->synode = synode;
  reply->proposal = learned->proposal;
  reply->msg_type = learned->msg_type;

  /* On failure this resets reply, so it must be re-checked before use */
  safe_app_data_copy(&reply, learned->a);
  if (reply != NULL) {
    set_learn_type(reply);
  }
  return reply;
}
3707
/**
  Reply with the value already learned by `p` to the sender of `pm`.

  `site` and `reply_queue` are not referenced by name in the body; they are
  consumed by the reply_msg() macro inside SEND_REPLY.

  @param site        site definition used by reply_msg()
  @param p           machine holding the learned value
  @param pm          the message being answered
  @param synode      instance number to put in the reply
  @param reply_queue queue that reply_msg() may link the reply into
*/
static void teach_ignorant_node(site_def const *site, pax_machine *p,
                                pax_msg *pm, synode_no synode,
                                linkage *reply_queue) {
  pax_msg *reply = create_learn_msg_for_ignorant_node(p, pm, synode);
  /* NOTE(review): SEND_REPLY is two statements; only reply_msg() is guarded
     by this if, replace_pax_msg(&reply, NULL) runs unconditionally. */
  if (reply != NULL) SEND_REPLY;
}
3714
3715 /* Handle incoming read */
handle_read(site_def const * site,pax_machine * p,linkage * reply_queue,pax_msg * pm)3716 static void handle_read(site_def const *site, pax_machine *p,
3717 linkage *reply_queue, pax_msg *pm) {
3718 IFDBG(D_NONE, FN; BALCEXP(pm->proposal); BALCEXP(p->acceptor.promise);
3719 if (p->acceptor.msg) BALCEXP(p->acceptor.msg->proposal);
3720 STRLIT("type "); STRLIT(pax_msg_type_to_str(pm->msg_type)));
3721
3722 if (finished(p)) { /* We have learned a value */
3723 teach_ignorant_node(site, p, pm, pm->synode, reply_queue);
3724 }
3725 }
3726
/**
  Build the answer to a prepare message.

  If `p` has not accepted a value, the reply is an ack_prepare_empty_op.
  Otherwise the reply is an ack_prepare_op carrying the proposal, message
  type and application data of p->acceptor.msg.

  @param p      the machine the prepare was addressed to
  @param pm     the prepare message being answered
  @param synode instance number to put in the reply
  @return the reply; NULL if safe_app_data_copy() released it because copying
          the accepted value failed
*/
static pax_msg *create_ack_prepare_msg(pax_machine *p, pax_msg *pm,
                                       synode_no synode) {
  CREATE_REPLY(pm);
  reply->synode = synode;

  if (!accepted(p)) {
    IFDBG(D_NONE, FN; STRLIT(" no value synode "); SYCEXP(synode));
    reply->op = ack_prepare_empty_op;
    return reply;
  }

  /* We have accepted a value, pass it on */
  reply->proposal = p->acceptor.msg->proposal;
  reply->msg_type = p->acceptor.msg->msg_type;
  IFDBG(D_NONE, FN; STRLIT(" already accepted value "); SYCEXP(synode));
  reply->op = ack_prepare_op;
  safe_app_data_copy(&reply, p->acceptor.msg->a);
  return reply;
}
3743
/**
  Process a prepare message for machine `p` and build the answer, if any.

  - If `p` has already learned a value, the answer is a learn message.
  - Otherwise, if the proposal of `pm` is greater than the current promise,
    or noop_match() holds, the machine is touched, the promise is raised
    when the proposal is greater, and an ack_prepare answer is built.
  - Otherwise there is no answer.

  @return the reply to send, or NULL when nothing should be sent
*/
pax_msg *handle_simple_prepare(pax_machine *p, pax_msg *pm, synode_no synode) {
  int greater = 0;

  if (finished(p)) { /* We have learned a value */
    IFDBG(D_NONE, FN; SYCEXP(synode); BALCEXP(pm->proposal);
          NDBG(finished(p), d));
    return create_learn_msg_for_ignorant_node(p, pm, synode);
  }

  /* Paxos acceptor phase 1 decision */
  greater = gt_ballot(pm->proposal, p->acceptor.promise);
  IFDBG(D_NONE, FN; SYCEXP(synode); BALCEXP(pm->proposal); NDBG(greater, d));

  if (!greater && !noop_match(p, pm)) return NULL;

  p->last_modified = task_now();
  if (greater) {
    p->acceptor.promise = pm->proposal; /* promise to not accept any less */
  }
  return create_ack_prepare_msg(p, pm, synode);
}
3765
/**
  Handle incoming prepare.

  Records debug events, lets handle_simple_prepare() decide on the answer
  and, if there is one, sends it back with SEND_REPLY. `site` and
  `reply_queue` are consumed by the reply_msg() macro inside SEND_REPLY.

  @param site        site definition used by reply_msg()
  @param p           the machine the prepare is addressed to
  @param reply_queue queue that reply_msg() may link the reply into
  @param pm          the incoming prepare message
*/
static void handle_prepare(site_def const *site, pax_machine *p,
                           linkage *reply_queue, pax_msg *pm) {
  ADD_DBG(D_CONS, add_synode_event(p->synode);
          add_event(EVENT_DUMP_PAD, string_arg("pm->from"));
          add_event(EVENT_DUMP_PAD, uint_arg(pm->from));
          add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(pm->op)));
          add_event(EVENT_DUMP_PAD, string_arg("proposal"));
          add_ballot_event(pm->proposal);
          add_event(EVENT_DUMP_PAD, string_arg("promise"));
          add_ballot_event(p->acceptor.promise););
  IFDBG(D_NONE, FN; BALCEXP(pm->proposal); BALCEXP(p->acceptor.promise);
        if (p->acceptor.msg) BALCEXP(p->acceptor.msg->proposal);
        STRLIT("type "); STRLIT(pax_msg_type_to_str(pm->msg_type)));

  {
    pax_msg *reply = handle_simple_prepare(p, pm, pm->synode);
    /* NOTE(review): SEND_REPLY is two statements; only reply_msg() is guarded
       by this if, replace_pax_msg(&reply, NULL) runs unconditionally. */
    if (reply != NULL) SEND_REPLY;
  }
}
3786
check_propose(site_def const * site,pax_machine * p)3787 bool_t check_propose(site_def const *site, pax_machine *p) {
3788 IFDBG(D_NONE, FN; SYCEXP(p->synode);
3789 COPY_AND_FREE_GOUT(dbg_machine_nodeset(p, get_maxnodes(site))););
3790 PAX_MSG_SANITY_CHECK(p->proposer.msg);
3791 {
3792 bool_t can_propose = FALSE;
3793 if (prep_majority(site, p)) {
3794 p->proposer.msg->proposal = p->proposer.bal;
3795 BIT_ZERO(p->proposer.prop_nodeset);
3796 p->proposer.msg->synode = p->synode;
3797 init_propose_msg(p->proposer.msg);
3798 p->proposer.sent_prop = p->proposer.bal;
3799 can_propose = TRUE;
3800 }
3801 return can_propose;
3802 }
3803 }
3804
check_learn(site_def const * site,pax_machine * p)3805 static pax_msg *check_learn(site_def const *site, pax_machine *p) {
3806 IFDBG(D_NONE, FN; SYCEXP(p->synode);
3807 COPY_AND_FREE_GOUT(dbg_machine_nodeset(p, get_maxnodes(site))););
3808 PAX_MSG_SANITY_CHECK(p->proposer.msg);
3809 {
3810 pax_msg *learn_msg = NULL;
3811 if (get_nodeno(site) != VOID_NODE_NO && prop_majority(site, p)) {
3812 p->proposer.msg->synode = p->synode;
3813 if (p->proposer.msg->receivers) free_bit_set(p->proposer.msg->receivers);
3814 p->proposer.msg->receivers = clone_bit_set(p->proposer.prep_nodeset);
3815 BIT_SET(get_nodeno(site), p->proposer.msg->receivers);
3816 if (no_duplicate_payload) {
3817 learn_msg = create_tiny_learn_msg(p, p->proposer.msg);
3818 } else {
3819 /* purecov: begin deadcode */
3820 init_learn_msg(p->proposer.msg);
3821 learn_msg = p->proposer.msg;
3822 /* purecov: end */
3823 }
3824 p->proposer.sent_learn = p->proposer.bal;
3825 }
3826 return learn_msg;
3827 }
3828 }
3829
/**
  Record `m` as the learned value of machine `p`.

  Marks the application data of `m` (if any) as chosen, installs `m` as both
  the accepted and the learned message of `p`, then updates the cache size
  accounting and lets the cache shrink.

  @param site only referenced from debug output
  @param p    the machine that learns the value
  @param m    the message carrying the value
*/
static void do_learn(site_def const *site MY_ATTRIBUTE((unused)),
                     pax_machine *p, pax_msg *m) {
  ADD_DBG(D_CONS, add_synode_event(p->synode);
          add_event(EVENT_DUMP_PAD, string_arg("m->from"));
          add_event(EVENT_DUMP_PAD, uint_arg(m->from));
          add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op)));
          add_event(EVENT_DUMP_PAD, string_arg("proposal"));
          add_ballot_event(m->proposal);
          add_event(EVENT_DUMP_PAD, string_arg("promise"));
          add_ballot_event(p->acceptor.promise););
  /* FN; SYCEXP(p->synode); SYCEXP(m->synode); STRLIT(NEWLINE); */
  IFDBG(D_NONE, FN; SYCEXP(p->synode); SYCEXP(m->synode);
        dbg_bitset(m->receivers, get_maxnodes(site)););
  if (m->a) m->a->chosen = TRUE;
  /* The same message becomes both the accepted and the learned value */
  replace_pax_msg(&p->acceptor.msg, m);
  replace_pax_msg(&p->learner.msg, m);
  /*
     Track memory used by client data in the cache.
     If we do not care about instances that are being decided,
     it is only necessary to compute the added memory when we
     record the outcome of a consensus round.
  */
  add_cache_size(p);
  /* Shrink the cache size if necessary */
  shrink_cache();
}
3856
handle_simple_ack_prepare(site_def const * site,pax_machine * p,pax_msg * m)3857 bool_t handle_simple_ack_prepare(site_def const *site, pax_machine *p,
3858 pax_msg *m) {
3859 if (get_nodeno(site) != VOID_NODE_NO)
3860 BIT_SET(m->from, p->proposer.prep_nodeset);
3861
3862 {
3863 bool_t can_propose = FALSE;
3864 if (m->op == ack_prepare_op &&
3865 gt_ballot(m->proposal, p->proposer.msg->proposal)) { /* greater */
3866 replace_pax_msg(&p->proposer.msg, m);
3867 assert(p->proposer.msg);
3868 }
3869 if (gt_ballot(m->reply_to, p->proposer.sent_prop)) {
3870 can_propose = check_propose(site, p);
3871 }
3872 return can_propose;
3873 }
3874 }
3875
3876 /* Other node has already accepted a value */
handle_ack_prepare(site_def const * site,pax_machine * p,pax_msg * m)3877 static void handle_ack_prepare(site_def const *site, pax_machine *p,
3878 pax_msg *m) {
3879 ADD_DBG(D_CONS, add_synode_event(p->synode);
3880 add_event(EVENT_DUMP_PAD, string_arg("m->from"));
3881 add_event(EVENT_DUMP_PAD, uint_arg(m->from));
3882 add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op))););
3883 assert(m);
3884 IFDBG(D_NONE, FN; if (p->proposer.msg) BALCEXP(p->proposer.msg->proposal);
3885 BALCEXP(p->proposer.bal); BALCEXP(m->reply_to);
3886 BALCEXP(p->proposer.sent_prop); SYCEXP(m->synode));
3887 /*
3888 If the node is preparing a Noop for another node's slot, it is possible
3889 that the leader of the slot has since proposed a value. Hence, there is
3890 no need to move forward if we know that the value has been accepted. This
3891 also prevents changing the size of a learned pax_machine, which would
3892 cause inconsistent reporting of memory usage in P_S.
3893 */
3894 if (finished(p)) return;
3895
3896 if (m->from != VOID_NODE_NO &&
3897 eq_ballot(p->proposer.bal, m->reply_to)) { /* answer to my prepare */
3898 bool_t can_propose = handle_simple_ack_prepare(site, p, m);
3899 if (can_propose) send_propose_msg(p->proposer.msg);
3900 }
3901 }
3902
3903 /* #define AUTO_MSG(p,synode) {if(!(p)){replace_pax_msg(&(p),
3904 * pax_msg_new(synode, site));} */
3905
/**
  Build the acknowledgement of an accept message.

  @param m      the accept message being answered
  @param synode instance number to put in the reply
  @return a reply created from `m` via CREATE_REPLY, with op ack_accept_op
*/
static pax_msg *create_ack_accept_msg(pax_msg *m, synode_no synode) {
  CREATE_REPLY(m);
  reply->synode = synode;
  reply->op = ack_accept_op;
  return reply;
}
3912
/**
  Process an accept message for machine `p` and build the answer, if any.

  - If `p` has already learned a value, the answer is a learn message.
  - Otherwise the value is accepted unless the current promise is greater
    than the proposal of `m` and noop_match() does not hold. Accepting
    touches the machine, installs `m` as the accepted message and yields an
    ack_accept answer.

  @return the reply to send, or NULL when the accept is rejected
*/
pax_msg *handle_simple_accept(pax_machine *p, pax_msg *m, synode_no synode) {
  if (finished(p)) { /* We have learned a value */
    return create_learn_msg_for_ignorant_node(p, m, synode);
  }

  /* Paxos acceptor phase 2 decision */
  if (gt_ballot(p->acceptor.promise, m->proposal) && !noop_match(p, m)) {
    return NULL;
  }

  IFDBG(D_NONE, FN; SYCEXP(m->synode); STRLIT("accept ");
        BALCEXP(m->proposal));
  p->last_modified = task_now();
  replace_pax_msg(&p->acceptor.msg, m);
  return create_ack_accept_msg(m, synode);
}
3928
/**
  Accept value if promise is not greater.

  Records debug events, lets handle_simple_accept() decide on the answer and,
  if there is one, sends it back with SEND_REPLY. `site` and `reply_queue`
  are consumed by the reply_msg() macro inside SEND_REPLY.

  @param site        site definition used by reply_msg()
  @param p           the machine the accept is addressed to
  @param reply_queue queue that reply_msg() may link the reply into
  @param m           the incoming accept message
*/
static void handle_accept(site_def const *site, pax_machine *p,
                          linkage *reply_queue, pax_msg *m) {
  IFDBG(D_NONE, FN; BALCEXP(p->acceptor.promise); BALCEXP(m->proposal);
        STREXP(pax_msg_type_to_str(m->msg_type)));
  PAX_MSG_SANITY_CHECK(m);
  ADD_DBG(D_CONS, add_synode_event(p->synode);
          add_event(EVENT_DUMP_PAD, string_arg("m->from"));
          add_event(EVENT_DUMP_PAD, uint_arg(m->from));
          add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op)));
          add_event(EVENT_DUMP_PAD, string_arg("proposal"));
          add_ballot_event(m->proposal);
          add_event(EVENT_DUMP_PAD, string_arg("promise"));
          add_ballot_event(p->acceptor.promise););

  {
    pax_msg *reply = handle_simple_accept(p, m, m->synode);
    /* NOTE(review): SEND_REPLY is two statements; only reply_msg() is guarded
       by this if, replace_pax_msg(&reply, NULL) runs unconditionally. */
    if (reply != NULL) SEND_REPLY;
  }
}
3949
3950 /* Handle answer to accept */
handle_simple_ack_accept(site_def const * site,pax_machine * p,pax_msg * m)3951 pax_msg *handle_simple_ack_accept(site_def const *site, pax_machine *p,
3952 pax_msg *m) {
3953 pax_msg *learn_msg = NULL;
3954 if (get_nodeno(site) != VOID_NODE_NO && m->from != VOID_NODE_NO &&
3955 eq_ballot(p->proposer.bal, m->reply_to)) { /* answer to my accept */
3956 BIT_SET(m->from, p->proposer.prop_nodeset);
3957 if (gt_ballot(m->proposal, p->proposer.sent_learn)) {
3958 learn_msg = check_learn(site, p);
3959 }
3960 }
3961 return learn_msg;
3962 }
/**
  Handle an incoming answer to an accept message.

  Delegates the decision to handle_simple_ack_accept(). If that yields a
  learn message, it is sent with send_tiny_learn_msg() when its op is
  tiny_learn_op and with send_learn_msg() otherwise.
*/
static void handle_ack_accept(site_def const *site, pax_machine *p,
                              pax_msg *m) {
  pax_msg *learn_msg = NULL;

  ADD_DBG(D_CONS, add_synode_event(p->synode);
          add_event(EVENT_DUMP_PAD, string_arg("m->from"));
          add_event(EVENT_DUMP_PAD, uint_arg(m->from));
          add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op))););
  IFDBG(D_NONE, FN; SYCEXP(m->synode); BALCEXP(p->proposer.bal);
        BALCEXP(p->proposer.sent_learn); BALCEXP(m->proposal);
        BALCEXP(m->reply_to););
  IFDBG(D_NONE, FN; SYCEXP(p->synode);
        if (p->acceptor.msg) BALCEXP(p->acceptor.msg->proposal);
        BALCEXP(p->proposer.bal); BALCEXP(m->reply_to););

  learn_msg = handle_simple_ack_accept(site, p, m);
  if (learn_msg == NULL) return;

  if (learn_msg->op != tiny_learn_op) {
    /* purecov: begin deadcode */
    assert(learn_msg->op == learn_op);
    send_learn_msg(site, learn_msg);
    /* purecov: end */
    return;
  }

  send_tiny_learn_msg(site, learn_msg);
}
3990
3991 /* Handle incoming learn. */
3992 static void activate_sweeper();
/**
  Handle a tiny learn message, i.e. a learn that does not carry the payload.

  If the machine holds an accepted message whose proposal equals the one in
  `p`, that accepted message is promoted to a learn_op and learned through
  handle_learn(). Otherwise the value is not available locally and a read is
  sent for the instance.

  @param site site definition passed on to handle_learn()
  @param pm   the machine for the instance
  @param p    the incoming tiny learn message
*/
void handle_tiny_learn(site_def const *site, pax_machine *pm, pax_msg *p) {
  assert(p->msg_type != no_op);

  if (pm->acceptor.msg &&
      eq_ballot(pm->acceptor.msg->proposal, p->proposal)) {
    pm->acceptor.msg->op = learn_op;
    pm->last_modified = task_now();
    update_max_synode(p);
    handle_learn(site, pm, pm->acceptor.msg);
    return;
  }

  /* We do not have the matching value, ask for it */
  send_read(p->synode);
  if (pm->acceptor.msg) {
    IFDBG(D_NONE, FN; STRLIT("tiny_learn"); SYCEXP(p->synode);
          BALCEXP(pm->acceptor.msg->proposal); BALCEXP(p->proposal));
  } else {
    IFDBG(D_NONE, FN; STRLIT("tiny_learn"); SYCEXP(p->synode);
          BALCEXP(p->proposal));
  }
}
4013
/**
  Mark the machine `p` as forced.

  When `enforcer` is set and `p` is not yet marked as enforcer, the proposer
  ballot count is raised by a third of the distance to INT32_MAX, which is a
  large increment that cannot overflow. In all cases force_delivery is set
  and p->enforcer takes the value of `enforcer`.

  @param p        the machine to mark
  @param enforcer non-zero if this node is the forcing node
*/
static void force_pax_machine(pax_machine *p, int enforcer) {
  /* Only for the forcing node, and only once per machine */
  if (enforcer && !p->enforcer) {
    /* p->proposer.bal.cnt may be -1, hence the clamp to 0 */
    int32_t delta = (INT32_MAX - MAX(p->proposer.bal.cnt, 0)) / 3;
    p->proposer.bal.cnt += delta;
  }

  p->force_delivery = 1;
  p->enforcer = enforcer;
}
4026
4027 /* Configure all messages in interval start, end to be forced */
force_interval(synode_no start,synode_no end,int enforcer)4028 static void force_interval(synode_no start, synode_no end, int enforcer) {
4029 while (!synode_gt(start, end)) {
4030 pax_machine *p = get_cache(start);
4031 if (get_nodeno(find_site_def(start)) == VOID_NODE_NO) break;
4032
4033 /* The forcing node will call force_interval twice, first when
4034 the new config is originally installed, and again when it
4035 receives it as an xcom message. start may be the same, but
4036 end will be greater the second time, since it is calculated
4037 based on the message number of the incoming config. Since the forcing
4038 node is the one responsible for delivering all messages until the
4039 start of the new site, it is important that all instances belonging to
4040 the old site are correctly marked. */
4041
4042 if (p->enforcer) enforcer = 1; /* Extend to new instances */
4043 force_pax_machine(p, enforcer);
4044
4045 /* Old nodesets are null and void */
4046 BIT_ZERO(p->proposer.prep_nodeset);
4047 BIT_ZERO(p->proposer.prop_nodeset);
4048 start = incr_synode(start);
4049 }
4050 }
4051
/**
  Install `s` as the forced configuration and force everything that is
  currently in the pipeline.

  max_synode is first raised to the event horizon of s->boot_key if that
  lies beyond it. Any previous forced site definition is released,
  wait_forced_config is cleared, and all instances from executed_msg to
  max_synode are marked as forced.

  @param s        the new forced site definition
  @param enforcer non-zero if this node is the forcing node
*/
static void start_force_config(site_def *s, int enforcer) {
  synode_no end = add_event_horizon(s->boot_key);

  IFDBG(D_NONE, FN; SYCEXP(executed_msg); SYCEXP(end));
  if (synode_gt(end, max_synode)) {
    set_max_synode(end);
  }

  /* Swap in the new forced config */
  free_forced_config_site_def();
  wait_forced_config = 0;
  forced_config = s;

  /* Force everything in the pipeline */
  force_interval(executed_msg, max_synode, enforcer);
}
4064
/**
  Learn this value.

  The machine is always touched and the task waiting on p->rv is always woken
  up. If the machine has not already learned a value, the sweeper is
  activated, the value is recorded with do_learn(), and two kinds of special
  payload are acted upon:
   - unified_boot_type is forwarded to the XCom state machine;
   - a forced message carrying add_node_type, remove_node_type or
     force_config_type starts a forced configuration, with this node not
     acting as enforcer.

  @param site site definition, passed on to do_learn()
  @param p    the machine for the instance
  @param m    the message carrying the learned value
*/
void handle_learn(site_def const *site, pax_machine *p, pax_msg *m) {
  IFDBG(D_NONE, FN; STRLIT("proposer nodeset ");
        dbg_bitset(p->proposer.prop_nodeset, get_maxnodes(site)););
  IFDBG(D_NONE, FN; STRLIT("receivers ");
        dbg_bitset(m->receivers, get_maxnodes(site)););
  IFDBG(D_NONE, FN; NDBG(task_now(), f); SYCEXP(p->synode);
        COPY_AND_FREE_GOUT(dbg_app_data(m->a)););

  PAX_MSG_SANITY_CHECK(m);
  p->last_modified = task_now();
  if (!finished(p)) { /* Avoid re-learn */
    activate_sweeper();
    do_learn(site, p, m);
    /* Check for special messages */
    if (m->a && m->a->body.c_t == unified_boot_type) {
      IFDBG(D_NONE, FN; STRLIT("Got unified_boot "); SYCEXP(p->synode);
            SYCEXP(m->synode););
      XCOM_FSM(x_fsm_net_boot, void_arg(m->a));
    }
    /* See if someone is forcing a new config */
    if (m->force_delivery && m->a) {
      IFDBG(D_NONE, FN; STRLIT("Got forced config "); SYCEXP(p->synode);
            SYCEXP(m->synode););
      /* Configure all messages from executed_msg until start of new config
         as forced messages so they will eventually be finished */
      /* Immediately install this new config */
      switch (m->a->body.c_t) {
        case add_node_type:
          /* purecov: begin deadcode */
          /* NOTE(review): find_site_def() result is dereferenced without a
             NULL check here - confirm it cannot be NULL at this point. */
          if (should_ignore_forced_config_or_view(
                  find_site_def(p->synode)->x_proto)) {
            log_ignored_forced_config(m->a, "handle_learn");
          } else {
            start_force_config(clone_site_def(handle_add_node(m->a)), 0);
          }
          break;
          /* purecov: end */
        case remove_node_type:
          /* purecov: begin deadcode */
          if (should_ignore_forced_config_or_view(
                  find_site_def(p->synode)->x_proto)) {
            log_ignored_forced_config(m->a, "handle_learn");
          } else {
            start_force_config(clone_site_def(handle_remove_node(m->a)), 0);
          }
          break;
          /* purecov: end */
        case force_config_type:
          start_force_config(clone_site_def(install_node_group(m->a)), 0);
          break;
        default:
          break;
      }
    }
  }

  /* Wake up whoever waits on this machine, also on a re-learn */
  task_wakeup(&p->rv);
}
4124
4125 /* Skip this value */
handle_skip(site_def const * site,pax_machine * p,pax_msg * m)4126 static void handle_skip(site_def const *site, pax_machine *p, pax_msg *m) {
4127 /* IFDBG(D_NONE, FN;); */
4128 /* IFDBG(D_NONE, FN; NDBG(task_now(),f); SYCEXP(p->msg->synode)); */
4129 if (!finished(p)) {
4130 p->last_modified = task_now();
4131 skip_value(m);
4132 do_learn(site, p, m);
4133 }
4134 /* IFDBG(D_NONE, FN; STRLIT("taskwakeup "); SYCEXP(p->msg->synode)); */
4135 task_wakeup(&p->rv);
4136 }
4137
/**
  Queue a client message for the proposer.

  Messages that are NULL or carry no application data are discarded. Any
  other message is wrapped in a msg_link and put on prop_input_queue.

  @param p the client message, may be NULL
*/
static void handle_client_msg(pax_msg *p) {
  msg_link *ml = NULL;

  /* discard invalid message */
  if (p == NULL || p->a == NULL) return;

  ml = msg_link_new(p, VOID_NODE_NO);

  /* Put it in the proposer queue */
  ADD_T_EV(task_now(), __FILE__, __LINE__, "handle_client_msg");
  channel_put(&prop_input_queue, &ml->l);
}
4149
#ifdef ACCEPT_SITE_TEST
/**
  See if we should process an incoming ping from a node.
  The purpose is to avoid doing recovery from a node with an obsolete site
  definition.

  @param site the site definition that came with the ping, may be NULL
  @return non-zero if the ping should be processed, 0 otherwise
*/
static int accept_site(site_def const *site) {
  site_def *mysite = (site_def *)get_site_def();

  /* Always accept a NULL site */
  if (!site) {
    IFDBG(D_NONE, FN; PTREXP(site));
    return 1;
  }

  if (!mysite) {
    site_def *prev = (site_def *)find_prev_site_def(site->boot_key);
    IFDBG(
        D_NONE, FN; PTREXP(site); PTREXP(mysite); PTREXP(prev);
        SYCEXP(site->boot_key); if (prev) { SYCEXP(prev->boot_key); });

    /* We have no site, but know a previous definition: accept only if the
       incoming site has a greater boot_key than that definition */
    if (prev) return synode_gt(site->boot_key, prev->boot_key);

    /* We have no site and no previous definition: accept a group id of 0,
       otherwise accept only if we are present in the new node list */
    if (site->boot_key.group_id == 0) return 1;
    return (xcom_find_node_index((node_list *)&site->nodes) != VOID_NODE_NO);
  }

  IFDBG(D_NONE, FN; PTREXP(site); PTREXP(mysite); SYCEXP(site->boot_key);
        SYCEXP(mysite->boot_key));

  /* alive from different site should never be accepted */
  if (get_group_id(site) != get_group_id(mysite)) return 0;

  {
    /* alive from same site should be accepted if boot_key is larger than
       mine, and we are found in at least one of the two node lists */
    node_no my_nodeno = xcom_find_node_index((node_list *)&mysite->nodes);
    node_no site_nodeno = xcom_find_node_index((node_list *)&site->nodes);
    return (synode_gt(site->boot_key, mysite->boot_key) &&
            ((my_nodeno != VOID_NODE_NO) || (site_nodeno != VOID_NODE_NO)));
  }
}
#endif
4196
4197 /* Handle incoming "need boot" message. */
4198 /* purecov: begin deadcode */
handle_boot(site_def const * site,linkage * reply_queue,pax_msg * p)4199 static inline void handle_boot(site_def const *site, linkage *reply_queue,
4200 pax_msg *p) {
4201 /* This should never be TRUE, but validate it instead of asserting. */
4202 if (site == NULL || site->nodes.node_list_len < 1) {
4203 G_DEBUG(
4204 "handle_boot: Received an unexpected need_boot_op when site == NULL or "
4205 "site->nodes.node_list_len < 1");
4206 return;
4207 }
4208
4209 if (ALWAYS_HANDLE_NEED_BOOT || should_handle_need_boot(site, p)) {
4210 handle_need_snapshot(reply_queue, p);
4211 } else {
4212 G_DEBUG(
4213 "Ignoring a need_boot_op message from an XCom incarnation that does "
4214 "not belong to the group.");
4215 }
4216 }
4217 /* purecov: end */
4218
should_handle_need_boot(site_def const * site,pax_msg * p)4219 bool_t should_handle_need_boot(site_def const *site, pax_msg *p) {
4220 bool_t should_handle = FALSE;
4221 bool_t const sender_advertises_identity =
4222 (p->a != NULL && p->a->body.c_t == xcom_boot_type);
4223
4224 /*
4225 If the message advertises the sender's identity, check if it matches the
4226 membership information.
4227
4228 The sender's identity may not match if, e.g.:
4229
4230 a. The member was already removed, or
4231 b. It is a new incarnation of a crashed member that is yet to be removed.
4232
4233 ...or some other reason.
4234
4235 If it is due to reason (b), we do not want to boot the sender because XCom
4236 only implements a simple fail-stop model. Allowing the sender to rejoin the
4237 group without going through the full remove+add node path could violate
4238 safety because the sender does not remember any previous Paxos acceptances it
4239 acknowledged before crashing.
4240 Since the pre-crash incarnation may have accepted a value for a given synod
4241 but the post-crash incarnation has forgotten that fact, the post-crash
4242 incarnation will fail to propagate the previously accepted value to a higher
4243 ballot. Since majorities can overlap on a single node, if the overlap node
4244 is the post-crash incarnation which has forgotten about the previously
4245 accepted value, a higher ballot proposer may get a different value accepted,
4246 leading to conflicting values to be accepted for different proposers, which
4247 is a violation of the safety properties of the Paxos protocol.
4248
4249 If the sender does not advertise its identity, we boot it unconditionally.
4250 This is for backwards compatibility.
4251 */
4252 if (sender_advertises_identity) {
4253 bool_t const sender_advertises_one_identity =
4254 (p->a->body.app_u_u.nodes.node_list_len == 1);
4255
4256 /* Defensively accept only messages with a single identity. */
4257 if (sender_advertises_one_identity) {
4258 node_address *sender_identity = p->a->body.app_u_u.nodes.node_list_val;
4259
4260 should_handle = node_exists_with_uid(sender_identity, &site->nodes);
4261 }
4262 } else {
4263 should_handle = TRUE;
4264 }
4265
4266 return should_handle;
4267 }
4268
/**
  Turn `p` into a need_boot_op message.

  When `identity` is given, the message additionally gets new application
  data of type xcom_boot_type whose node list holds that single identity.

  @param p        the message to initialise
  @param identity the sender's identity to advertise, or NULL for none
*/
void init_need_boot_op(pax_msg *p, node_address *identity) {
  p->op = need_boot_op;

  /* Nothing to advertise */
  if (identity == NULL) return;

  p->a = new_app_data();
  p->a->body.c_t = xcom_boot_type;
  init_node_list(1, identity, &p->a->body.app_u_u.nodes);
}
4277
4278 #define PING_GATHERING_TIME_WINDOW 5.0
4279 #define PINGS_GATHERED_BEFORE_CONNECTION_SHUTDOWN 3
4280
pre_process_incoming_ping(site_def const * site,pax_msg const * pm,int has_client_already_booted,double current_time)4281 int pre_process_incoming_ping(site_def const *site, pax_msg const *pm,
4282 int has_client_already_booted,
4283 double current_time) {
4284 // Yes... it is a ping for me, boot is done and it is a are_you_alive_op
4285 // This means that something wrong is not right...
4286 int did_shutdown = 0;
4287
4288 if ((pm->from != get_nodeno(site)) && has_client_already_booted &&
4289 (pm->op == are_you_alive_op)) {
4290 G_DEBUG(
4291 "Received a ping to myself. This means that something must be wrong in "
4292 "a bi-directional connection")
4293 // Going to kill the connection for that node...
4294 if (site && (pm->from < site->nodes.node_list_len)) {
4295 // This is not the first ping received in the last 5 seconds...
4296 if (site->servers[pm->from]->last_ping_received >
4297 (current_time - PING_GATHERING_TIME_WINDOW)) {
4298 site->servers[pm->from]->number_of_pings_received++;
4299 } else { // First ping since at least more than 5 seconds...
4300 site->servers[pm->from]->number_of_pings_received = 1;
4301 }
4302
4303 site->servers[pm->from]->last_ping_received = current_time;
4304
4305 // If we keep on receiving periodical pings... lets kill the connection
4306 if (is_connected(&site->servers[pm->from]->con) &&
4307 site->servers[pm->from]->number_of_pings_received ==
4308 PINGS_GATHERED_BEFORE_CONNECTION_SHUTDOWN) {
4309 shutdown_connection(&site->servers[pm->from]->con);
4310 G_WARNING(
4311 "Shutting down an outgoing connection. This happens because "
4312 "something might be wrong on a bi-directional connection to node "
4313 "%s:%d. Please check the connection status to this member",
4314 site->servers[pm->from]->srv, site->servers[pm->from]->port);
4315 did_shutdown = 1;
4316 }
4317 }
4318 }
4319
4320 return did_shutdown;
4321 }
4322
/* Handle incoming alive message */

/* task_now() value recorded the last time handle_alive() sent need_boot_op;
   used to send at most one such reply per second. */
static double sent_alive = 0.0;

/*
  React to a message that shows the sender considers us alive.

  The message is first passed to pre_process_incoming_ping(). If this node has
  not finished booting and more than one second has passed since the last
  need_boot_op was sent, a need_boot_op carrying our identity is queued as a
  reply, unless the message is our own ping, advertises an identity that is
  not in the current configuration, or belongs to a dead site.

  @param site        site definition the message was matched to, may be NULL
  @param reply_queue queue used by SEND_REPLY
  @param pm          the incoming message
*/
static inline void handle_alive(site_def const *site, linkage *reply_queue,
                                pax_msg *pm) {
  pre_process_incoming_ping(site, pm, client_boot_done, task_now());

  if (client_boot_done || !(task_now() - sent_alive > 1.0)) /* Already done? */
    return;

#ifdef ACCEPT_SITE_TEST
  if (!accept_site(site)) return;
#endif

  /* Avoid responding to own ping */
  if (pm->from == get_nodeno(site) || pm->from == pm->to) return;

  /*
   This code will check if the ping is intended to us.
   If the encoded node does not exist in the current configuration,
   we avoid sending need_boot_op, since it must be from a different
   reincarnation of this node.
   */
  if (site && pm->a && pm->a->body.c_t == xcom_boot_type) {
    site_def const *latest_config = get_site_def();

    IFDBG(D_NONE, FN;
          COPY_AND_FREE_GOUT(dbg_list(&pm->a->body.app_u_u.nodes)););

    /*
     The node list comes straight from the wire. A message that claims to
     carry an identity but has an empty list cannot be matched against the
     configuration, so it is ignored instead of reading node_list_val[0].
     */
    if (pm->a->body.app_u_u.nodes.node_list_len == 0 ||
        pm->a->body.app_u_u.nodes.node_list_val == NULL)
      return;

    /* No current configuration means the identity cannot be a member. */
    if (latest_config == NULL) return;

    if (!node_exists_with_uid(&pm->a->body.app_u_u.nodes.node_list_val[0],
                              &latest_config->nodes))
      return;
  }

  if (is_dead_site(pm->group_id)) return; /* Avoid dealing with zombies */

  {
    CREATE_REPLY(pm);
    init_need_boot_op(reply, cfg_app_xcom_get_identity());
    sent_alive = task_now();
    SEND_REPLY;
  }
  IFDBG(D_NONE, FN; STRLIT("sent need_boot_op"););
}
4364
/*
  Advance the global max_synode from the synode and max_synode carried by p.

  Messages from dead sites are ignored. If either the current site's group id
  or max_synode's group id is 0, p->synode is adopted unconditionally.
  Otherwise max_synode only moves forward, and only when p->synode belongs to
  the same group as max_synode.
*/
static void update_max_synode(pax_msg *p) {
  if (is_dead_site(p->group_id)) return;

  /* No group established yet: take whatever the message says. */
  if (get_group_id(get_site_def()) == 0 || max_synode.group_id == 0) {
    set_max_synode(p->synode);
    return;
  }

  /* Never mix synodes from different groups. */
  if (max_synode.group_id != p->synode.group_id) return;

  if (synode_gt(p->synode, max_synode)) set_max_synode(p->synode);
  if (synode_gt(p->max_synode, max_synode)) set_max_synode(p->max_synode);
}
4378
/* Message dispatch */

/* printf-style format fragment for a ballot; expects the two arguments that
   BAL_MEM(x) expands to. */
#define BAL_FMT "ballot {cnt %d node %d}"
/* Expands to the cnt and node members of x, in the order BAL_FMT expects. */
#define BAL_MEM(x) (x).cnt, (x).node

/* Incremented by dispatch_op() each time it handles a client_msg. */
static int clicnt = 0;
4384
xcom_get_minimum_event_horizon()4385 xcom_event_horizon xcom_get_minimum_event_horizon() {
4386 return EVENT_HORIZON_MIN;
4387 }
4388
xcom_get_maximum_event_horizon()4389 xcom_event_horizon xcom_get_maximum_event_horizon() {
4390 return EVENT_HORIZON_MAX;
4391 }
4392
4393 /**
4394 * Retrieves the latest event horizon.
4395 *
4396 * There is no specific reason for this method to return the latest event
4397 * horizon instead of the current one. Both would be acceptable results of
4398 * this function, but we had to make a decision of one over the other.
4399 *
4400 * @param[out] event_horizon the latest event horizon
4401 * @retval REQUEST_FAIL XCom is not initialized yet
4402 * @retval REQUEST_OK function was successful and event_horizon contains the
4403 * latest event horizon
4404 */
xcom_get_event_horizon(xcom_event_horizon * event_horizon)4405 static client_reply_code xcom_get_event_horizon(
4406 xcom_event_horizon *event_horizon) {
4407 site_def const *latest_config = get_site_def();
4408 if (latest_config == NULL) return REQUEST_FAIL;
4409 *event_horizon = latest_config->event_horizon;
4410 return REQUEST_OK;
4411 }
4412
allow_add_node(app_data_ptr a)4413 static u_int allow_add_node(app_data_ptr a) {
4414 /* Get information on the current site definition */
4415 const site_def *new_site_def = get_site_def();
4416 const site_def *valid_site_def = find_site_def(executed_msg);
4417
4418 /* Get information on the nodes to be added */
4419 u_int nr_nodes_to_add = a->body.app_u_u.nodes.node_list_len;
4420 node_address *nodes_to_change = a->body.app_u_u.nodes.node_list_val;
4421
4422 if (add_node_unsafe_against_event_horizon(a)) return 0;
4423
4424 if (add_node_unsafe_against_ipv4_old_nodes(a)) {
4425 G_MESSAGE(
4426 "This server is unable to join the group as the NIC used is configured "
4427 "with IPv6 only and there are members in the group that are unable to "
4428 "communicate using IPv6, only IPv4.Please configure this server to "
4429 "join the group using an IPv4 address instead.");
4430 return 0;
4431 }
4432
4433 {
4434 u_int i;
4435 for (i = 0; i < nr_nodes_to_add; i++) {
4436 if (node_exists(&nodes_to_change[i], &new_site_def->nodes) ||
4437 node_exists(&nodes_to_change[i], &valid_site_def->nodes)) {
4438 /*
4439 We are simply ignoring the attempt to add a node to the
4440 group when there is an old incarnation of it, meaning
4441 that the node has crashed and restarted so fastly that
4442 nobody has noticed that it has gone.
4443
4444 In XCOM, the group is not automatically reconfigured
4445 and it is possible to start reusing a node that has
4446 crashed and restarted without reconfiguring the group
4447 by adding the node back to it.
4448
4449 However, this operation may be unsafe because XCOM
4450 does not implement a crash-recovery model and nodes
4451 suffer from amnesia after restarting the service. In
4452 other words this may lead to inconsistency issues in
4453 the paxos protocol.
4454
4455 Unfortunately, preventing that a node is added back
4456 to the system where there is an old incarnation will
4457 not fix this problem since other changes are required.
4458 */
4459 G_MESSAGE(
4460 "Old incarnation found while trying to "
4461 "add node %s %.*s.",
4462 nodes_to_change[i].address, nodes_to_change[i].uuid.data.data_len,
4463 nodes_to_change[i].uuid.data.data_val);
4464 return 0;
4465 }
4466 }
4467 }
4468
4469 return 1;
4470 }
4471
allow_remove_node(app_data_ptr a)4472 static u_int allow_remove_node(app_data_ptr a) {
4473 /* Get information on the current site definition */
4474 const site_def *new_site_def = get_site_def();
4475
4476 /* Get information on the nodes to be added */
4477 u_int nodes_len = a->body.app_u_u.nodes.node_list_len;
4478 node_address *nodes_to_change = a->body.app_u_u.nodes.node_list_val;
4479
4480 u_int i;
4481 for (i = 0; i < nodes_len; i++) {
4482 if (!node_exists_with_uid(&nodes_to_change[i], &new_site_def->nodes)) {
4483 /*
4484 If the UID does not exist, then 1) the node has already been
4485 removed or 2) it has reincarnated.
4486 */
4487 /* purecov: begin inspected */
4488 if (node_exists(&nodes_to_change[i], &new_site_def->nodes)) {
4489 /*
4490 We also cannot allow an upper-layer to remove a new incarnation
4491 of a node when it tries to remove an old one.
4492 */
4493 G_MESSAGE(
4494 "New incarnation found while trying to "
4495 "remove node %s %.*s.",
4496 nodes_to_change[i].address, nodes_to_change[i].uuid.data.data_len,
4497 nodes_to_change[i].uuid.data.data_val);
4498 } else {
4499 /* The node has already been removed, so we block the request */
4500 G_MESSAGE(
4501 "Node has already been removed: "
4502 "%s %.*s.",
4503 nodes_to_change[i].address, nodes_to_change[i].uuid.data.data_len,
4504 nodes_to_change[i].uuid.data.data_val);
4505 }
4506 return 0;
4507 /* purecov: end */
4508 }
4509 }
4510
4511 return 1;
4512 }
4513
4514 /**
4515 * Logs the fact that an add/remove node request is aimed at another group.
4516 *
4517 * @param a a pointer to the app_data of the configuration command
4518 * @param message_fmt a formatted message to log, containing a single %s that
4519 * will be replaced by the node's address
4520 */
log_cfgchange_wrong_group(app_data_ptr a,const char * const message_fmt)4521 static void log_cfgchange_wrong_group(app_data_ptr a,
4522 const char *const message_fmt) {
4523 u_int const nr_nodes = a->body.app_u_u.nodes.node_list_len;
4524 u_int i;
4525 for (i = 0; i < nr_nodes; i++) {
4526 char const *const address = a->body.app_u_u.nodes.node_list_val[i].address;
4527 G_WARNING(message_fmt, address);
4528 }
4529 }
4530
4531 /**
4532 * Validates if a configuration command can be executed.
4533 * Checks whether the configuration command is aimed at the correct group.
4534 * Checks whether the configuration command pertains to a node reincarnation.
4535 *
4536 * @param p a pointer to the pax_msg of the configuration command
4537 * @retval REQUEST_OK if the reconfiguration command can be executed
4538 * @retval REQUEST_RETRY if XCom is still booting
4539 * @retval REQUEST_FAIL if the configuration command cannot be executed
4540 */
can_execute_cfgchange(pax_msg * p)4541 static client_reply_code can_execute_cfgchange(pax_msg *p) {
4542 app_data_ptr a = p->a;
4543
4544 if (executed_msg.msgno <= 2) return REQUEST_RETRY;
4545
4546 if (a && a->group_id != 0 && a->group_id != executed_msg.group_id) {
4547 switch (a->body.c_t) {
4548 case add_node_type:
4549 log_cfgchange_wrong_group(
4550 a,
4551 "The request to add %s to the group has been rejected because it "
4552 "is aimed at another group");
4553 break;
4554 case remove_node_type:
4555 log_cfgchange_wrong_group(
4556 a,
4557 "The request to remove %s from the group has been rejected because "
4558 "it is aimed at another group");
4559 break;
4560 case force_config_type:
4561 G_WARNING(
4562 "The request to force the group membership has been rejected "
4563 "because it is aimed at another group");
4564 break;
4565 default:
4566 assert(0 &&
4567 "A cargo_type different from {add_node_type, remove_node_type, "
4568 "force_config_type} should not have hit this code path");
4569 }
4570 return REQUEST_FAIL;
4571 }
4572
4573 if (a && a->body.c_t == add_node_type && !allow_add_node(a))
4574 return REQUEST_FAIL;
4575
4576 if (a && a->body.c_t == remove_node_type && !allow_remove_node(a))
4577 return REQUEST_FAIL;
4578
4579 if (a && a->body.c_t == set_event_horizon_type &&
4580 unsafe_event_horizon_reconfiguration(a))
4581 return REQUEST_FAIL;
4582
4583 if (a && a->body.c_t == force_config_type &&
4584 are_there_dead_nodes_in_new_config(a))
4585 return REQUEST_FAIL;
4586
4587 return REQUEST_OK;
4588 }
4589
activate_sweeper()4590 static void activate_sweeper() {
4591 if (sweeper) {
4592 ADD_DBG(D_CONS, add_event(EVENT_DUMP_PAD,
4593 string_arg("sweeper activated max_synode"));
4594 add_synode_event(max_synode););
4595 task_activate(sweeper);
4596 }
4597 }
4598
/* Compared in dispatch_op() against the highest boot key of an incoming
   gcs_snapshot_op to skip duplicate snapshots. Where it is updated is not
   visible in this part of the file. */
static synode_no start_config = NULL_SYNODE;
4600
/*
  Answer a get_event_horizon client request.

  The reply is an xcom_client_reply whose cli_err is the result of
  xcom_get_event_horizon() and whose event_horizon field receives the value
  on success.

  @param site        passed through for use by the reply macros
  @param p           the client request
  @param reply_queue queue used by SEND_REPLY
*/
void dispatch_get_event_horizon(site_def const *site, pax_msg *p,
                                linkage *reply_queue) {
  CREATE_REPLY(p);
  IFDBG(D_NONE, FN; STRLIT("Got get_event_horizon from client");
        SYCEXP(p->synode););

  reply->cli_err = xcom_get_event_horizon(&reply->event_horizon);
  reply->op = xcom_client_reply;

  SEND_REPLY;
}
4610
4611 /*
4612 * Log the result of the get_synode_app_data command.
4613 */
log_get_synode_app_data_failure(xcom_get_synode_app_data_result error_code)4614 static void log_get_synode_app_data_failure(
4615 xcom_get_synode_app_data_result error_code) {
4616 switch (error_code) {
4617 case XCOM_GET_SYNODE_APP_DATA_OK:
4618 break;
4619 case XCOM_GET_SYNODE_APP_DATA_ERROR:
4620 G_DEBUG("Could not reply successfully to request for synode data.");
4621 break;
4622 case XCOM_GET_SYNODE_APP_DATA_NOT_CACHED:
4623 G_DEBUG(
4624 "Could not reply successfully to request for synode data because "
4625 "some of the requested synodes are no longer cached.");
4626 break;
4627 case XCOM_GET_SYNODE_APP_DATA_NOT_DECIDED:
4628 G_DEBUG(
4629 "Could not reply successfully to request for synode data because "
4630 "some of the requested synodes are still undecided.");
4631 break;
4632 case XCOM_GET_SYNODE_APP_DATA_NO_MEMORY:
4633 G_DEBUG(
4634 "Could not reply successfully to request for synode data because "
4635 "memory could not be allocated.");
4636 break;
4637 }
4638 }
4639
/*
  Answer a get_synode_app_data client request.

  Calls xcom_get_synode_app_data() for the synodes listed in the request and
  stores the outcome in the reply's requested_synode_app_data. The reply is an
  xcom_client_reply with cli_err REQUEST_OK on success, or REQUEST_FAIL (plus
  a debug log entry) for any of the known failure codes.

  @param site        passed through for use by the reply macros
  @param p           the client request; p->a must be non-NULL
  @param reply_queue queue used by SEND_REPLY
*/
void dispatch_get_synode_app_data(site_def const *site, pax_msg *p,
                                  linkage *reply_queue) {
  IFDBG(D_NONE, FN; STRLIT("Got get_synode_app_data from client");
        SYCEXP(p->synode););

  {
    xcom_get_synode_app_data_result outcome;
    CREATE_REPLY(p);
    reply->op = xcom_client_reply;

    outcome = xcom_get_synode_app_data(&p->a->body.app_u_u.synodes,
                                       &reply->requested_synode_app_data);

    switch (outcome) {
      case XCOM_GET_SYNODE_APP_DATA_ERROR:
      case XCOM_GET_SYNODE_APP_DATA_NO_MEMORY:
      case XCOM_GET_SYNODE_APP_DATA_NOT_DECIDED:
      case XCOM_GET_SYNODE_APP_DATA_NOT_CACHED:
        log_get_synode_app_data_failure(outcome);
        reply->cli_err = REQUEST_FAIL;
        break;
      case XCOM_GET_SYNODE_APP_DATA_OK:
        reply->cli_err = REQUEST_OK;
        break;
    }

    SEND_REPLY;
  }
}
4670
4671 static int can_send_snapshot();
4672
dispatch_op(site_def const * site,pax_msg * p,linkage * reply_queue)4673 pax_msg *dispatch_op(site_def const *site, pax_msg *p, linkage *reply_queue) {
4674 pax_machine *pm = NULL;
4675 site_def *dsite = find_site_def_rw(p->synode);
4676 int in_front = too_far(p->synode);
4677
4678 if (p->force_delivery) {
4679 /* Ensure that forced message can be processed */
4680 in_front = 0;
4681 }
4682
4683 if (dsite && p->op != client_msg && is_server_connected(dsite, p->from)) {
4684 /* Wake up the detector task if this node was previously marked as
4685 * potentially failed. */
4686 if (!note_detected(dsite, p->from)) task_wakeup(&detector_wait);
4687 update_delivered(dsite, p->from, p->delivered_msg);
4688 }
4689
4690 IFDBG(D_NONE, FN; STRLIT("incoming message ");
4691 COPY_AND_FREE_GOUT(dbg_pax_msg(p)););
4692 ADD_DBG(D_NONE, add_synode_event(p->synode);
4693 add_event(EVENT_DUMP_PAD, string_arg("p->from"));
4694 add_event(EVENT_DUMP_PAD, uint_arg(p->from));
4695 add_event(EVENT_DUMP_PAD, string_arg("in_front"));
4696 add_event(EVENT_DUMP_PAD, int_arg(in_front));
4697 add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(p->op))););
4698
4699 switch (p->op) {
4700 case client_msg:
4701 clicnt++;
4702 if (p->a && (p->a->body.c_t == exit_type)) {
4703 /* purecov: begin deadcode */
4704 IFDBG(D_NONE, FN; STRLIT("Got exit from client"); SYCEXP(p->synode););
4705 bury_site(get_group_id(get_site_def()));
4706 ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4707 terminate_and_exit();
4708 break;
4709 /* purecov: end */
4710 }
4711 if (p->a && (p->a->body.c_t == reset_type)) {
4712 /* purecov: begin deadcode */
4713 IFDBG(D_NONE, FN; STRLIT("Got reset from client"); SYCEXP(p->synode););
4714 bury_site(get_group_id(get_site_def()));
4715 ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4716 XCOM_FSM(x_fsm_terminate, int_arg(0));
4717 break;
4718 /* purecov: end */
4719 }
4720 if (p->a && (p->a->body.c_t == remove_reset_type)) {
4721 /* purecov: begin deadcode */
4722 IFDBG(D_NONE, FN; STRLIT("Got remove_reset from client");
4723 SYCEXP(p->synode););
4724 ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4725 XCOM_FSM(x_fsm_terminate, int_arg(0));
4726 break;
4727 /* purecov: end */
4728 }
4729 if (p->a && (p->a->body.c_t == enable_arbitrator)) {
4730 CREATE_REPLY(p);
4731 IFDBG(D_NONE, FN; STRLIT("Got enable_arbitrator from client");
4732 SYCEXP(p->synode););
4733 ARBITRATOR_HACK = 1;
4734 reply->op = xcom_client_reply;
4735 reply->cli_err = REQUEST_OK;
4736 SEND_REPLY;
4737 break;
4738 }
4739 if (p->a && (p->a->body.c_t == disable_arbitrator)) {
4740 CREATE_REPLY(p);
4741 IFDBG(D_NONE, FN; STRLIT("Got disable_arbitrator from client");
4742 SYCEXP(p->synode););
4743 ARBITRATOR_HACK = 0;
4744 reply->op = xcom_client_reply;
4745 reply->cli_err = REQUEST_OK;
4746 SEND_REPLY;
4747 break;
4748 }
4749 if (p->a && (p->a->body.c_t == set_cache_limit)) {
4750 CREATE_REPLY(p);
4751 IFDBG(D_NONE, FN; STRLIT("Got set_cache_limit from client");
4752 SYCEXP(p->synode););
4753 if (the_app_xcom_cfg) {
4754 set_max_cache_size(p->a->body.app_u_u.cache_limit);
4755 reply->cli_err = REQUEST_OK;
4756 } else {
4757 reply->cli_err = REQUEST_FAIL;
4758 }
4759 reply->op = xcom_client_reply;
4760 SEND_REPLY;
4761 break;
4762 }
4763 if (p->a && (p->a->body.c_t == x_terminate_and_exit)) {
4764 /* purecov: begin deadcode */
4765 CREATE_REPLY(p);
4766 IFDBG(D_NONE, FN; STRLIT("Got terminate_and_exit from client");
4767 SYCEXP(p->synode););
4768 reply->op = xcom_client_reply;
4769 reply->cli_err = REQUEST_OK;
4770 SEND_REPLY;
4771 /*
4772 The function frees sites which is used by SEND_REPLY,
4773 so it should be called after SEND_REPLY.
4774 */
4775 IFDBG(D_NONE, FN; STRLIT("terminate_and_exit"));
4776 ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4777 terminate_and_exit();
4778 break;
4779 /* purecov: end */
4780 }
4781 if (p->a && (p->a->body.c_t == get_event_horizon_type)) {
4782 dispatch_get_event_horizon(site, p, reply_queue);
4783 break;
4784 }
4785 if (p->a && (p->a->body.c_t == get_synode_app_data_type)) {
4786 dispatch_get_synode_app_data(site, p, reply_queue);
4787 break;
4788 }
4789 if (p->a && (p->a->body.c_t == add_node_type ||
4790 p->a->body.c_t == remove_node_type ||
4791 p->a->body.c_t == force_config_type ||
4792 p->a->body.c_t == set_event_horizon_type)) {
4793 client_reply_code cli_err;
4794 CREATE_REPLY(p);
4795 reply->op = xcom_client_reply;
4796 reply->cli_err = cli_err = can_execute_cfgchange(p);
4797 SEND_REPLY;
4798 if (cli_err != REQUEST_OK) {
4799 break;
4800 }
4801 }
4802 if (p->a && p->a->body.c_t == unified_boot_type) {
4803 IFDBG(D_NONE, FN; STRLIT("Got unified_boot from client");
4804 SYCEXP(p->synode););
4805 IFDBG(D_NONE, FN;
4806 COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4807 IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4808 XCOM_FSM(x_fsm_net_boot, void_arg(p->a));
4809 }
4810 if (p->a && p->a->body.c_t == add_node_type) {
4811 IFDBG(D_NONE, FN; STRLIT("Got add_node from client");
4812 SYCEXP(p->synode););
4813 IFDBG(D_NONE, FN;
4814 COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4815 IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4816 assert(get_site_def());
4817 }
4818 if (p->a && p->a->body.c_t == remove_node_type) {
4819 IFDBG(D_NONE, FN; STRLIT("Got remove_node from client");
4820 SYCEXP(p->synode););
4821 IFDBG(D_NONE, FN;
4822 COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4823 IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4824 assert(get_site_def());
4825 }
4826 if (p->a && p->a->body.c_t == set_event_horizon_type) {
4827 IFDBG(D_NONE, FN; STRLIT("Got set_event_horizon from client");
4828 SYCEXP(p->synode););
4829 IFDBG(D_NONE, FN; NDBG(p->a->body.app_u_u.event_horizon, u));
4830 IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4831 assert(get_site_def());
4832 }
4833 if (p->a && p->a->body.c_t == force_config_type) {
4834 IFDBG(D_NONE, FN; STRLIT("Got new force config from client");
4835 SYCEXP(p->synode););
4836 IFDBG(D_NONE, FN;
4837 COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4838 IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4839 assert(get_site_def());
4840 XCOM_FSM(x_fsm_force_config, void_arg(p->a));
4841 }
4842 handle_client_msg(p);
4843 break;
4844 case initial_op:
4845 break;
4846 case read_op:
4847 pm = get_cache(p->synode);
4848 assert(pm);
4849
4850 handle_read(site, pm, reply_queue, p);
4851 break;
4852 case prepare_op:
4853 pm = get_cache(p->synode);
4854 assert(pm);
4855 if (p->force_delivery) pm->force_delivery = 1;
4856 IFDBG(D_NONE, FN; dbg_pax_msg(p));
4857
4858 /*
4859 We can only be a productive Paxos Acceptor if we have been booted, i.e.
4860 added to the group and received an up-to-date snapshot from some member.
4861
4862 We do not allow non-booted members to participate in Paxos because they
4863 might be a reincarnation of a member that crashed and was then brought up
4864 without having gone through the remove+add node path.
4865 Since the pre-crash incarnation may have accepted a value for a given
4866 synod but the post-crash incarnation has forgotten that fact, the
4867 post-crash incarnation will fail to propagate the previously accepted
4868 value to a higher ballot. Since majorities can overlap on a single node,
4869 if the overlap node is the post-crash incarnation which has forgotten
4870 about the previously accepted value, the higher ballot proposer may get
4871 a different value accepted, leading to conflicting values to be accepted
4872 for different proposers, which is a violation of the safety requirements
4873 of the Paxos protocol.
4874 */
4875 if (ALWAYS_HANDLE_CONSENSUS || client_boot_done) {
4876 handle_prepare(site, pm, reply_queue, p);
4877 }
4878 break;
4879 case ack_prepare_op:
4880 case ack_prepare_empty_op:
4881 if (in_front || !is_cached(p->synode)) break;
4882 pm = get_cache(p->synode);
4883 if (p->force_delivery) pm->force_delivery = 1;
4884 if (!pm->proposer.msg) break;
4885 assert(pm && pm->proposer.msg);
4886 handle_ack_prepare(site, pm, p);
4887 break;
4888 case accept_op:
4889 pm = get_cache(p->synode);
4890 assert(pm);
4891 if (p->force_delivery) pm->force_delivery = 1;
4892 IFDBG(D_NONE, FN; dbg_pax_msg(p));
4893
4894 /*
4895 We can only be a productive Paxos Acceptor if we have been booted, i.e.
4896 added to the group and received an up-to-date snapshot from some member.
4897
4898 We do not allow non-booted members to participate in Paxos because they
4899 might be a reincarnation of a member that crashed and was then brought up
4900 without having gone through the remove+add node path.
4901 Since the pre-crash incarnation may have accepted a value for a given
4902 synod but the post-crash incarnation has forgotten that fact, the
4903 post-crash incarnation will fail to propagate the previously accepted
4904 value to a higher ballot. Since majorities can overlap on a single node,
4905 if the overlap node is the post-crash incarnation which has forgotten
4906 about the previously accepted value, the higher ballot proposer may get
4907 a different value accepted, leading to conflicting values to be accepted
4908 for different proposers, which is a violation of the safety requirements
4909 of the Paxos protocol.
4910 */
4911 if (ALWAYS_HANDLE_CONSENSUS || client_boot_done) {
4912 handle_alive(site, reply_queue, p);
4913
4914 handle_accept(site, pm, reply_queue, p);
4915 }
4916 break;
4917 case ack_accept_op:
4918 if (in_front || !is_cached(p->synode)) break;
4919 pm = get_cache(p->synode);
4920 if (p->force_delivery) pm->force_delivery = 1;
4921 if (!pm->proposer.msg) break;
4922 assert(pm && pm->proposer.msg);
4923 handle_ack_accept(site, pm, p);
4924 break;
4925 case recover_learn_op:
4926 IFDBG(D_NONE, FN; STRLIT("recover_learn_op receive "); SYCEXP(p->synode));
4927 pm = get_cache(p->synode);
4928 assert(pm);
4929 if (p->force_delivery) pm->force_delivery = 1;
4930 update_max_synode(p);
4931 {
4932 IFDBG(D_NONE, FN; STRLIT("recover_learn_op learn "); SYCEXP(p->synode));
4933 p->op = learn_op;
4934 handle_learn(site, pm, p);
4935 }
4936 break;
4937 case learn_op:
4938 learnop:
4939 pm = get_cache(p->synode);
4940 assert(pm);
4941 if (p->force_delivery) pm->force_delivery = 1;
4942 update_max_synode(p);
4943 handle_learn(site, pm, p);
4944 break;
4945 case tiny_learn_op:
4946 if (p->msg_type == no_op) goto learnop;
4947 pm = get_cache(p->synode);
4948 assert(pm);
4949 if (p->force_delivery) pm->force_delivery = 1;
4950 handle_tiny_learn(site, pm, p);
4951 break;
4952 case skip_op:
4953 pm = get_cache(p->synode);
4954 assert(pm);
4955 if (p->force_delivery) pm->force_delivery = 1;
4956 handle_skip(site, pm, p);
4957 break;
4958 case i_am_alive_op:
4959 /* Update max_synode, but use only p->max_synode, ignore p->synode */
4960 if (!is_dead_site(p->group_id)) {
4961 if (max_synode.group_id == p->synode.group_id &&
4962 synode_gt(p->max_synode, max_synode)) {
4963 set_max_synode(p->max_synode);
4964 }
4965 }
4966 handle_alive(site, reply_queue, p);
4967 break;
4968 case are_you_alive_op:
4969 handle_alive(site, reply_queue, p);
4970 break;
4971 case need_boot_op:
4972 /* purecov: begin deadcode */
4973 /* Only in run state. Test state and do it here because we need to use
4974 * reply queue */
4975 if (can_send_snapshot() &&
4976 !synode_eq(get_site_def()->boot_key, null_synode)) {
4977 handle_boot(site, reply_queue, p);
4978 }
4979 /* Wake senders waiting to connect, since new node has appeared */
4980 wakeup_sender();
4981 break;
4982 /* purecov: end */
4983 case gcs_snapshot_op:
4984 /* Avoid duplicate snapshots and snapshots from zombies */
4985 IFDBG(D_BUG, FN; SYCEXP(executed_msg););
4986 IFDBG(D_BUG, FN; SYCEXP(start_config););
4987 if (!synode_eq(start_config, get_highest_boot_key(p->gcs_snap)) &&
4988 !is_dead_site(p->group_id)) {
4989 update_max_synode(p);
4990 /* For incoming messages, note delivery of snapshot from sender node */
4991 note_snapshot(p->from);
4992 XCOM_FSM(x_fsm_snapshot, void_arg(p->gcs_snap));
4993 }
4994 break;
4995 case die_op:
4996 /* assert("die horribly" == "need coredump"); */
4997 {
4998 GET_GOUT;
4999 FN;
5000 STRLIT("die_op ");
5001 SYCEXP(executed_msg);
5002 SYCEXP(delivered_msg);
5003 SYCEXP(p->synode);
5004 SYCEXP(p->delivered_msg);
5005 SYCEXP(p->max_synode);
5006 PRINT_GOUT;
5007 FREE_GOUT;
5008 }
5009 /*
5010 If the message with the number in the incoming die_op message
5011 already has been executed (delivered), then it means that we
5012 actually got consensus on it, since otherwise we would not have
5013 delivered it.Such a situation could arise if one of the nodes has
5014 expelled the message from its cache, but others have not. So when
5015 sending out a request, we might get two different answers, one
5016 indicating that we are too far behind and should restart, and
5017 another with the actual consensus value. If the value arrives
5018 first, we will deliver it, and then the die_op may arrive later.
5019 But it this case it does not matter, since we got what we needed
5020 anyway. It is only a partial guard against exiting without really
5021 needing it of course, since the die_op may arrive first, and we
5022 do not wait for a die_op from all the other nodes. We could do
5023 that with some extra housekeeping in the pax_machine (a new bit
5024 vector), but I am not convinced that it is worth the effort.
5025 */
5026 if (!synode_lt(p->synode, executed_msg)) {
5027 ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
5028 g_critical("Node %u is unable to get message {%x %" PRIu64
5029 " %u}, since the group is too far "
5030 "ahead. Node will now exit.",
5031 get_nodeno(site), SY_MEM(p->synode));
5032 terminate_and_exit();
5033 }
5034 default:
5035 break;
5036 }
5037 if (oom_abort) {
5038 g_critical("Node %u has run out of memory and will now exit.",
5039 get_nodeno(site));
5040 terminate_and_exit();
5041 }
5042 return (p);
5043 }
5044
/* Acceptor-learner task */

/*
  Address msg back to the sender of ep->p (to/from swapped), stamp it with
  get_delivered_msg() and get_max_synode(), and serialize it into ep->buf /
  ep->buflen using the protocol version in ep->rfd.x_proto.
  Expands to several statements and expects `ep` to be in scope, so it must
  not be used as the unbraced body of an if/else or loop.
*/
#define SERIALIZE_REPLY(msg)                \
  msg->to = ep->p->from;                    \
  msg->from = ep->p->to;                    \
  msg->delivered_msg = get_delivered_msg(); \
  msg->max_synode = get_max_synode();       \
  serialize_msg(msg, ep->rfd.x_proto, &ep->buflen, &ep->buf);

/*
  If ep->buflen is non-zero, write ep->buf to ep->rfd through task_write()
  (invoked via TASK_CALL), update the send_count / send_bytes counters for
  the op of ep->p, and free the buffer. ep->buf is reset to NULL in every
  case. Expands to an if statement followed by an assignment and expects `ep`
  to be in scope, so it must not be used as the unbraced body of an if/else
  or loop.
*/
#define WRITE_REPLY                                                     \
  if (ep->buflen) {                                                     \
    int64_t sent;                                                       \
    IFDBG(D_TRANSPORT, FN; STRLIT("task_write "); NDBG(ep->rfd.fd, d);  \
          NDBG(ep->buflen, u));                                         \
    TASK_CALL(task_write(&ep->rfd, ep->buf, ep->buflen, &sent));        \
    send_count[ep->p->op]++;                                            \
    send_bytes[ep->p->op] += ep->buflen;                                \
    X_FREE(ep->buf);                                                    \
  }                                                                     \
  ep->buf = NULL;
5064
/*
  Make *target refer to srv, adjusting reference counts.

  The new server (if any) is referenced before the old one (if any) is
  unreferenced, so the count of a server is raised before it is lowered when
  *target already equals srv.
*/
static inline void update_srv(server **target, server *srv) {
  server *previous;

  if (srv != NULL) srv_ref(srv);

  previous = *target;
  if (previous != NULL) srv_unref(previous);

  *target = srv;
}
5070
5071 /* A message is harmless if it cannot change the outcome of a consensus round.
5072 * learn_op does change the value, but we trust that the sender has correctly
5073 * derived the value from a majority of the acceptors, so in that sense it is
5074 * harmless. */
harmless(pax_msg const * p)5075 static int harmless(pax_msg const *p) {
5076 if (p->synode.msgno == 0) return 1;
5077 switch (p->op) {
5078 case i_am_alive_op:
5079 case are_you_alive_op:
5080 case need_boot_op:
5081 case gcs_snapshot_op:
5082 case learn_op:
5083 case recover_learn_op:
5084 case tiny_learn_op:
5085 case die_op:
5086 return 1;
5087 default:
5088 return 0;
5089 }
5090 }
5091
/*
  Task helper: retry get_cache(synode) until it yields a pax_machine or more
  than `timeout` has elapsed since entry (same unit as task_now(), presumably
  seconds).

  @param[out] pm      receives the pax_machine, or NULL if the wait timed out
  @param      synode  the synode whose pax_machine is wanted
  @param      timeout how long to keep retrying

  Written with the task macros (DECL_ENV / TASK_BEGIN / FINALLY / TASK_END),
  so state that is needed across waits lives in `ep`.
*/
static int wait_for_cache(pax_machine **pm, synode_no synode, double timeout) {
  DECL_ENV
  /* task_now() sampled on entry; the timeout is measured from it. */
  double now;
  END_ENV;

  TASK_BEGIN
  ep->now = task_now();
  while ((*pm = get_cache(synode)) == NULL) {
    /* Wait for executor to make progress */
    TIMED_TASK_WAIT(&exec_wait, 0.5);
    if (task_now() - ep->now > timeout) break; /* Timeout, return NULL. */
  }
  FINALLY
  TASK_END;
}
5107
5108 /*
5109 Verify if we need to poll the cache before calling dispatch_op.
5110 Avoid waiting for a machine if it is not going to be used.
5111 */
should_poll_cache(pax_op op)5112 static bool_t should_poll_cache(pax_op op) {
5113 if (op == die_op || op == gcs_snapshot_op || op == initial_op ||
5114 op == client_msg)
5115 return FALSE;
5116 return TRUE;
5117 }
5118
/**
  Task serving one accepted incoming connection.

  The file descriptor is taken from arg. The socket is made non-blocking,
  the SSL accept handshake is performed when xcom_use_ssl() says so, and the
  task then loops reading one pax_msg at a time until xcom_shutdown is set or
  a read returns n <= 0. For each message it either
    - ignores it (operation outside the valid range),
    - converts the connection into a local_server connection on request,
    - answers with a no-op (synode node number outside the site's node list),
    - dispatches it and writes the queued replies back on the same fd, or
    - rejects it, possibly answering with die_op.

  The FINALLY section releases the message, the buffers, the connection and
  the server reference.

  NOTE: locals declared inside the task body do not survive a yield
  (TASK_YIELD, TASK_CALL, WRITE_REPLY); anything that must survive lives in
  the task environment (ep->...).
*/
int acceptor_learner_task(task_arg arg) {
  DECL_ENV
  connection_descriptor rfd;
  srv_buf *in_buf;

  pax_msg *p;
  u_int buflen;
  char *buf;
  linkage reply_queue;
  int errors;
  server *srv;
  site_def const *site;
  int behind;
  int handed_over; /* TRUE once the connection was given to local_server */
  END_ENV;

  TASK_BEGIN

  ep->rfd.fd = get_int_arg(arg);
#ifndef XCOM_WITHOUT_OPENSSL
  ep->rfd.ssl_fd = 0;
#endif
  ep->p = NULL;
  ep->buflen = 0;
  ep->buf = NULL;
  ep->errors = 0;
  ep->srv = 0;
  ep->behind = FALSE;
  ep->handed_over = FALSE;

  /* Allocate the read buffer only after the descriptor is initialised, so
     that the cleanup in FINALLY operates on the real fd if we bail out. */
  ep->in_buf = (srv_buf *)calloc(1, sizeof(srv_buf));
  if (ep->in_buf == NULL) {
    G_DEBUG("acceptor learner out of memory, closing fd %d", ep->rfd.fd);
    /* Same close-then-terminate sequence as the SSL failure path below. */
    close_connection(&ep->rfd);
    TERMINATE;
  }

  /* We have a connection, make socket non-blocking and wait for request */
  unblock_fd(ep->rfd.fd);
  set_nodelay(ep->rfd.fd);
  wait_io(stack, ep->rfd.fd, 'r');
  TASK_YIELD;

#ifndef XCOM_WITHOUT_OPENSSL
  if (xcom_use_ssl()) {
    ep->rfd.ssl_fd = SSL_new(server_ctx);
    SSL_set_fd(ep->rfd.ssl_fd, ep->rfd.fd);

    {
      int ret_ssl;
      int err;
      ERR_clear_error();
      ret_ssl = SSL_accept(ep->rfd.ssl_fd);
      err = SSL_get_error(ep->rfd.ssl_fd, ret_ssl);

      /* Retry the handshake for as long as OpenSSL only asks for more I/O.
         ret_ssl and err are reassigned after every yield before being
         read again. */
      while (ret_ssl != SSL_SUCCESS) {
        if (err == SSL_ERROR_WANT_READ) {
          wait_io(stack, ep->rfd.fd, 'r');
        } else if (err == SSL_ERROR_WANT_WRITE) {
          wait_io(stack, ep->rfd.fd, 'w');
        } else { /* Some other error, give up */
          break;
        }
        TASK_YIELD;
        SET_OS_ERR(0);
        G_DEBUG("acceptor learner accept retry fd %d", ep->rfd.fd);
        ERR_clear_error();
        ret_ssl = SSL_accept(ep->rfd.ssl_fd);
        err = SSL_get_error(ep->rfd.ssl_fd, ret_ssl);
      }

      if (ret_ssl != SSL_SUCCESS) {
        ssl_free_con(&ep->rfd);
        close_connection(&ep->rfd);
        TERMINATE;
      }
    }

  } else {
    ep->rfd.ssl_fd = 0;
  }
#endif
  set_connected(&ep->rfd, CON_FD);
  link_init(&ep->reply_queue, TYPE_HASH("msg_link"));

again:
  while (!xcom_shutdown) {
    int64_t n;
    ep->site = 0;
    unchecked_replace_pax_msg(&ep->p, pax_msg_new_0(null_synode));

    if (use_buffered_read) {
      TASK_CALL(buffered_read_msg(&ep->rfd, ep->in_buf, ep->p, ep->srv, &n));
    } else {
      TASK_CALL(read_msg(&ep->rfd, ep->p, ep->srv, &n));
    }
    ADD_DBG(D_NONE, add_synode_event(ep->p->synode);
            add_event(EVENT_DUMP_PAD, string_arg("ep->p->from"));
            add_event(EVENT_DUMP_PAD, uint_arg(ep->p->from));
            add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(ep->p->op))););

    if (ep->srv && !ep->srv->invalid && ((int)ep->p->op != (int)client_msg) &&
        is_connected(&ep->srv->con))
      server_detected(ep->srv);

    if (((int)ep->p->op < (int)client_msg || ep->p->op > LAST_OP)) {
      /* invalid operation, ignore message */
      delete_pax_msg(ep->p);
      ep->p = NULL;
      TASK_YIELD;
      continue;
    }
    if (n <= 0) {
      break;
    }
    ep->site = find_site_def(ep->p->synode);

    /* Handle this connection on a local_server task instead of this
       acceptor_learner_task task. */
    if (ep->p->op == client_msg && ep->p->a &&
        ep->p->a->body.c_t == convert_into_local_server_type) {
      if (local_server_is_setup()) {
        /* Launch local_server task to handle this connection. */
        {
          connection_descriptor *con =
              (connection_descriptor *)malloc(sizeof(connection_descriptor));
          if (con != NULL) {
            *con = ep->rfd;
            task_new(local_server, void_arg(con), "local_server",
                     XCOM_THREAD_DEBUG);
            ep->handed_over = TRUE;
          }
        }
      }
      /* Reply to client:
         - OK if the connection was handed over to a local_server task, or
         - FAIL otherwise (local_server not set up, or out of memory). */
      {
        CREATE_REPLY(ep->p);
        reply->op = xcom_client_reply;
        reply->cli_err = ep->handed_over ? REQUEST_OK : REQUEST_FAIL;
        SERIALIZE_REPLY(reply);
        replace_pax_msg(&reply, NULL);
      }
      WRITE_REPLY;
      delete_pax_msg(ep->p);
      ep->p = NULL;
      if (ep->handed_over) {
        /* Relinquish ownership of the connection. It is now owned by the
           launched local_server task. */
        reset_connection(&ep->rfd);
      }
      /* Terminate this task. */
      TERMINATE;
    }

    /*
      Getting a pointer to the server needs to be done after we have
      received a message, since without having received a message, we
      cannot know who it is from. We could peek at the message and
      deserialize the message number and from field, but since the server
      does not change, it should be sufficient to cache the server in
      the acceptor_learner task. A cleaner solution would have been to
      move the timestamps out of the server object, and have a map
      indexed by IP/port or UUID to track the timestamps, since this is
      common to both the sender_task, reply_handler_task, and the
      acceptor_learner_task.
    */
    update_srv(&ep->srv, get_server(ep->site, ep->p->from));
    ep->p->refcnt = 1; /* Refcnt from other end is void here */
    IFDBG(D_NONE, FN; NDBG(ep->rfd.fd, d); NDBG(task_now(), f);
          COPY_AND_FREE_GOUT(dbg_pax_msg(ep->p)););
    receive_count[ep->p->op]++;
    receive_bytes[ep->p->op] += (uint64_t)n + MSG_HDR_SIZE;
    {
      if (get_maxnodes(ep->site) > 0) {
        ep->behind = ep->p->synode.msgno < delivered_msg.msgno;
      }
      ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("before dispatch "));
              add_synode_event(ep->p->synode);
              add_event(EVENT_DUMP_PAD, string_arg("ep->p->from"));
              add_event(EVENT_DUMP_PAD, uint_arg(ep->p->from));
              add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(ep->p->op)));
              add_event(EVENT_DUMP_PAD,
                        string_arg(pax_msg_type_to_str(ep->p->msg_type)));
              add_event(EVENT_DUMP_PAD, string_arg("is_cached(ep->p->synode)"));
              add_event(EVENT_DUMP_PAD, int_arg(is_cached(ep->p->synode)));
              add_event(EVENT_DUMP_PAD, string_arg("behind"));
              add_event(EVENT_DUMP_PAD, int_arg(ep->behind)););
      /* Special treatment to see if synode number is valid. Return no-op if
       * not. */
      if (ep->p->op == read_op || ep->p->op == prepare_op ||
          ep->p->op == accept_op) {
        if (ep->site) {
          ADD_DBG(
              D_BASE, add_event(EVENT_DUMP_PAD, string_arg("ep->p->synode"));
              add_synode_event(ep->p->synode);
              add_event(EVENT_DUMP_PAD, string_arg("ep->site->start"));
              add_synode_event(ep->site->start); add_event(
                  EVENT_DUMP_PAD, string_arg("ep->site->nodes.node_list_len"));
              add_event(EVENT_DUMP_PAD,
                        uint_arg(ep->site->nodes.node_list_len)););
          if (ep->p->synode.node >= ep->site->nodes.node_list_len) {
            {
              CREATE_REPLY(ep->p);
              create_noop(reply);
              set_learn_type(reply);
              SERIALIZE_REPLY(reply);
              delete_pax_msg(reply); /* Deallocate BEFORE potentially blocking
                                        call which will lose value of reply */
            }
            WRITE_REPLY;
            goto again;
          }
        }
      }
      /* Reject any message that might compromise the integrity of a consensus
       * instance. We do this by not processing any message which may change the
       * outcome if the consensus instance has been evicted from the cache */
      if (harmless(ep->p) ||          /* Harmless message */
          is_cached(ep->p->synode) || /* Already in cache */
          (!ep->behind)) { /* Guard against cache pollution from other nodes */

        if (should_poll_cache(ep->p->op)) {
          pax_machine *pm;
          TASK_CALL(wait_for_cache(&pm, ep->p->synode, 10));
          if (!pm) continue; /* Could not get a machine, discarding message. */
        }

        dispatch_op(ep->site, ep->p, &ep->reply_queue);

        /* Send replies on same fd */
        while (!link_empty(&ep->reply_queue)) {
          {
            msg_link *reply =
                (msg_link *)(link_extract_first(&ep->reply_queue));
            IFDBG(D_DISPATCH, FN; PTREXP(reply);
                  COPY_AND_FREE_GOUT(dbg_linkage(&ep->reply_queue));
                  COPY_AND_FREE_GOUT(dbg_msg_link(reply));
                  COPY_AND_FREE_GOUT(dbg_pax_msg(reply->p)););
            assert(reply->p);
            assert(reply->p->refcnt > 0);
            IFDBG(D_DISPATCH, FN; STRLIT("serialize "); PTREXP(reply));
            SERIALIZE_REPLY(reply->p);
            msg_link_delete(&reply); /* Deallocate BEFORE potentially blocking
                                        call which will lose value of reply */
          }
          WRITE_REPLY;
        }
      } else {
        IFDBG(D_EXEC, FN; STRLIT("rejecting ");
              STRLIT(pax_op_to_str(ep->p->op)); NDBG(ep->p->from, d);
              NDBG(ep->p->to, d); SYCEXP(ep->p->synode);
              BALCEXP(ep->p->proposal));
        if (/* xcom_booted() && */ ep->behind) {
          if (/*ep->p->op == prepare_op && */ was_removed_from_cache(
              ep->p->synode)) {
            IFDBG(D_NONE, FN; STRLIT("send_die ");
                  STRLIT(pax_op_to_str(ep->p->op)); NDBG(ep->p->from, d);
                  NDBG(ep->p->to, d); SYCEXP(ep->p->synode);
                  BALCEXP(ep->p->proposal));
            if (get_maxnodes(ep->site) > 0) {
              {
                pax_msg *np = NULL;
                np = pax_msg_new(ep->p->synode, ep->site);
                np->op = die_op;
                SERIALIZE_REPLY(np);
                IFDBG(D_NONE, FN; STRLIT("sending die_op to node ");
                      NDBG(np->to, d); SYCEXP(executed_msg); SYCEXP(max_synode);
                      SYCEXP(np->synode));
                delete_pax_msg(np); /* Deallocate BEFORE potentially blocking
                                       call which will lose value of np */
              }
              WRITE_REPLY;
            }
          }
        }
      }
    }
    /* TASK_YIELD; */
  }

  FINALLY
  IFDBG(D_BUG, FN; STRLIT(" shutdown "); NDBG(ep->rfd.fd, d);
        NDBG(task_now(), f));
  /* reply_queue.suc is tested first because the early-exit paths reach
     FINALLY before link_init() has run. */
  if (ep->reply_queue.suc && !link_empty(&ep->reply_queue))
    empty_msg_list(&ep->reply_queue);
  unchecked_replace_pax_msg(&ep->p, NULL);
  shutdown_connection(&ep->rfd);
  IFDBG(D_NONE, FN; NDBG(xcom_shutdown, d));
  if (ep->buf) X_FREE(ep->buf);
  free(ep->in_buf);

  /* Unref srv to avoid leak */
  update_srv(&ep->srv, 0);

  IFDBG(D_BUG, FN; STRLIT(" shutdown completed"); NDBG(ep->rfd.fd, d);
        NDBG(task_now(), f));
  TASK_END;
}
5409
5410 /* Reply handler task */
5411
5412 static void server_handle_need_snapshot(server *srv, site_def const *s,
5413 node_no node);
5414
reply_handler_task(task_arg arg)5415 int reply_handler_task(task_arg arg) {
5416 DECL_ENV
5417 server *s;
5418 pax_msg *reply;
5419 double dtime;
5420 END_ENV;
5421
5422 TASK_BEGIN
5423
5424 ep->dtime = INITIAL_CONNECT_WAIT; /* Initial wait is short, to avoid
5425 unnecessary waiting */
5426 ep->s = (server *)get_void_arg(arg);
5427 srv_ref(ep->s);
5428 ep->reply = NULL;
5429
5430 while (!xcom_shutdown) {
5431 while (!is_connected(&ep->s->con)) {
5432 IFDBG(D_NONE, FN; STRLIT("waiting for connection"));
5433 TASK_DELAY(ep->dtime);
5434 if (xcom_shutdown) {
5435 TERMINATE;
5436 }
5437 ep->dtime *= CONNECT_WAIT_INCREASE; /* Increase wait time for next try */
5438 if (ep->dtime > MAX_CONNECT_WAIT) {
5439 ep->dtime = MAX_CONNECT_WAIT;
5440 }
5441 }
5442 ep->dtime = INITIAL_CONNECT_WAIT;
5443 {
5444 int64_t n;
5445 unchecked_replace_pax_msg(&ep->reply, pax_msg_new_0(null_synode));
5446
5447 ADD_DBG(D_NONE, add_event(EVENT_DUMP_PAD, string_arg("ep->s->con.fd"));
5448 add_event(EVENT_DUMP_PAD, int_arg(ep->s->con.fd)););
5449 TASK_CALL(read_msg(&ep->s->con, ep->reply, ep->s, &n));
5450 ADD_DBG(D_NONE, add_event(EVENT_DUMP_PAD, string_arg("ep->s->con.fd"));
5451 add_event(EVENT_DUMP_PAD, int_arg(ep->s->con.fd)););
5452 ep->reply->refcnt = 1; /* Refcnt from other end is void here */
5453 if (n <= 0) {
5454 shutdown_connection(&ep->s->con);
5455 continue;
5456 }
5457 receive_bytes[ep->reply->op] += (uint64_t)n + MSG_HDR_SIZE;
5458 }
5459 IFDBG(D_NONE, FN; NDBG(ep->s->con.fd, d); NDBG(task_now(), f);
5460 COPY_AND_FREE_GOUT(dbg_pax_msg(ep->reply)););
5461 receive_count[ep->reply->op]++;
5462
5463 ADD_DBG(D_NONE, add_synode_event(ep->reply->synode);
5464 add_event(EVENT_DUMP_PAD, string_arg("ep->reply->from"));
5465 add_event(EVENT_DUMP_PAD, uint_arg(ep->reply->from));
5466 add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(ep->reply->op)));
5467 add_event(EVENT_DUMP_PAD, string_arg("get_site_def()->boot_key"));
5468 add_synode_event(get_site_def()->boot_key););
5469 /* Special test for need_snapshot, since node and site may not be consistent
5470 */
5471 if (ep->reply->op == need_boot_op &&
5472 !synode_eq(get_site_def()->boot_key, null_synode)) {
5473 pax_msg *p = ep->reply;
5474
5475 ADD_DBG(D_BASE,
5476 add_event(EVENT_DUMP_PAD,
5477 string_arg("calling server_handle_need_snapshot")););
5478 if (should_handle_need_boot(find_site_def(p->synode), p)) {
5479 server_handle_need_snapshot(ep->s, find_site_def(p->synode), p->from);
5480 /* Wake senders waiting to connect, since new node has appeared */
5481 wakeup_sender();
5482 } else {
5483 ep->s->invalid = 1;
5484 }
5485 } else {
5486 /* We only handle messages from this connection if the server is valid. */
5487 if (ep->s->invalid == 0)
5488 dispatch_op(find_site_def(ep->reply->synode), ep->reply, NULL);
5489 }
5490 TASK_YIELD;
5491 }
5492
5493 FINALLY
5494 replace_pax_msg(&ep->reply, NULL);
5495
5496 shutdown_connection(&ep->s->con);
5497 ep->s->reply_handler = NULL;
5498 IFDBG(D_BUG, FN; STRLIT(" shutdown "); NDBG(ep->s->con.fd, d);
5499 NDBG(task_now(), f));
5500 srv_unref(ep->s);
5501
5502 TASK_END;
5503 }
5504
5505 /* purecov: begin deadcode */
/* Block the caller for the given number of whole seconds. */
static inline void xcom_sleep(unsigned int seconds) {
#ifndef _WIN32
  sleep(seconds);
#else
  Sleep((DWORD)seconds * 1000); /* windows sleep takes milliseconds */
#endif
}
5513 /* purecov: end */
5514
5515 /*
5516 * Get a unique long as the basis for XCom group id creation.
5517 *
5518 * NOTE:
 * As there is no gethostid() on Windows, we use seconds since the epoch
 * instead, so it might fail if you try to create sites simultaneously within
 * the same second.
5521 */
/* Return a value used as the basis for XCom group id creation: the host id
   XORed with the process id, or on Windows the current time XORed with the
   process id. */
long xcom_unique_long(void) {
#if defined(_WIN32)
  __time64_t ltime;

  _time64(&ltime); /* fills ltime with the current time */
  return (long)(ltime ^ GetCurrentProcessId());
#else
  return gethostid() ^ getpid();
#endif
}
5532
init_config_with_group(app_data * a,node_list * nl,cargo_type type,uint32_t group_id)5533 app_data_ptr init_config_with_group(app_data *a, node_list *nl, cargo_type type,
5534 uint32_t group_id) {
5535 init_app_data(a);
5536 a->app_key.group_id = a->group_id = group_id;
5537 a->body.c_t = type;
5538 init_node_list(nl->node_list_len, nl->node_list_val, &a->body.app_u_u.nodes);
5539 return a;
5540 }
5541
init_set_event_horizon_msg(app_data * a,uint32_t group_id,xcom_event_horizon event_horizon)5542 app_data_ptr init_set_event_horizon_msg(app_data *a, uint32_t group_id,
5543 xcom_event_horizon event_horizon) {
5544 init_app_data(a);
5545 a->app_key.group_id = a->group_id = group_id;
5546 a->body.c_t = set_event_horizon_type;
5547 a->body.app_u_u.event_horizon = event_horizon;
5548 return a;
5549 }
5550
init_get_event_horizon_msg(app_data * a,uint32_t group_id)5551 app_data_ptr init_get_event_horizon_msg(app_data *a, uint32_t group_id) {
5552 init_app_data(a);
5553 a->app_key.group_id = a->group_id = group_id;
5554 a->body.c_t = get_event_horizon_type;
5555 return a;
5556 }
5557
init_app_msg(app_data * a,char * payload,u_int payload_size)5558 app_data_ptr init_app_msg(app_data *a, char *payload, u_int payload_size) {
5559 init_app_data(a);
5560 a->body.c_t = app_type;
5561 a->body.app_u_u.data.data_val = payload; /* Takes ownership of payload. */
5562 a->body.app_u_u.data.data_len = payload_size;
5563 return a;
5564 }
5565
init_terminate_command(app_data * a)5566 app_data_ptr init_terminate_command(app_data *a) {
5567 init_app_data(a);
5568 a->body.c_t = x_terminate_and_exit;
5569 return a;
5570 }
5571
init_get_synode_app_data_msg(app_data * a,uint32_t group_id,synode_no_array * const synodes)5572 static app_data_ptr init_get_synode_app_data_msg(
5573 app_data *a, uint32_t group_id, synode_no_array *const synodes) {
5574 init_app_data(a);
5575 a->app_key.group_id = a->group_id = group_id;
5576 a->body.c_t = get_synode_app_data_type;
5577 /* Move synodes (as in C++ move semantics) into a->body.app_u_u.synodes. */
5578 synode_array_move(&a->body.app_u_u.synodes, synodes);
5579 return a;
5580 }
5581
init_set_cache_size_msg(app_data * a,uint64_t cache_limit)5582 app_data_ptr init_set_cache_size_msg(app_data *a, uint64_t cache_limit) {
5583 init_app_data(a);
5584 a->body.c_t = set_cache_limit;
5585 a->body.app_u_u.cache_limit = cache_limit;
5586 return a;
5587 }
5588
init_convert_into_local_server_msg(app_data * a)5589 app_data_ptr init_convert_into_local_server_msg(app_data *a) {
5590 init_app_data(a);
5591 a->body.c_t = convert_into_local_server_type;
5592 return a;
5593 }
5594
/* Wrap gcs_snap in a gcs_snapshot_op message keyed by its log_start and send
   it through srv to node, using the node number and group id of s as
   sender. */
static void server_send_snapshot(server *srv, site_def const *s,
                                 gcs_snapshot *gcs_snap, node_no node) {
  pax_msg *snap_msg = pax_msg_new(gcs_snap->log_start, get_site_def());

  snap_msg->op = gcs_snapshot_op;
  snap_msg->gcs_snap = gcs_snap;

  /* Hold a reference for the duration of the send. */
  ref_msg(snap_msg);
  send_msg(srv, s->nodeno, node, get_group_id(s), snap_msg);
  unref_msg(&snap_msg);
}
5604
/* Send every finished, cached consensus instance from push up to
   get_max_synode() to node through srv, as recover_learn_op messages.
   Does nothing when srv or the current site definition is missing. */
static void server_push_log(server *srv, synode_no push, node_no node) {
  site_def const *s = get_site_def();

  if (!srv || !s) return;

  for (; !synode_gt(push, get_max_synode()); push = incr_synode(push)) {
    pax_machine *machine;
    pax_msg *learned;

    if (!is_cached(push)) continue;

    machine = get_cache_no_touch(push, FALSE);
    if (!pm_finished(machine)) continue;

    /* Need to clone message here since pax_machine may be re-used while
     * message is sent */
    learned = clone_pax_msg(machine->learner.msg);
    if (learned == NULL) continue;

    ref_msg(learned);
    learned->op = recover_learn_op;
    IFDBG(D_NONE, FN; PTREXP(srv); PTREXP(s););
    send_msg(srv, s->nodeno, node, get_group_id(s), learned);
    unref_msg(&learned);
  }
}
5628
5629 /* purecov: begin deadcode */
/* Queue every finished, cached consensus instance from push up to
   get_max_synode() on reply_queue, as recover_learn_op messages. */
static void reply_push_log(synode_no push, linkage *reply_queue) {
  while (!synode_gt(push, get_max_synode())) {
    if (is_cached(push)) {
      /* Need to clone message here since pax_machine may be re-used while
       * message is sent */
      pax_machine *p = get_cache_no_touch(push, FALSE);
      if (pm_finished(p)) {
        pax_msg *reply = clone_pax_msg(p->learner.msg);
        /* Skip this instance if the clone could not be made, as
           server_push_log does. */
        if (reply != NULL) {
          ref_msg(reply);
          reply->op = recover_learn_op;
          {
            msg_link *msg_x = msg_link_new(reply, reply->from);
            IFDBG(D_NONE, FN; PTREXP(msg_x));
            link_into(&(msg_x->l), reply_queue);
          }
          /* NOTE(review): the second release below looks redundant after
             replace_pax_msg(&reply, NULL); kept as in the original because
             neither helper is visible here. */
          replace_pax_msg(&reply, NULL);
          unref_msg(&reply);
        }
      }
    }
    push = incr_synode(push);
  }
}
5652 /* purecov: end */
5653
5654 static app_snap_getter get_app_snap_cb;
5655 static app_snap_handler handle_app_snap_cb;
5656
create_snapshot()5657 static gcs_snapshot *create_snapshot() {
5658 gcs_snapshot *gs = 0;
5659 if (get_app_snap_cb) {
5660 /* purecov: begin deadcode */
5661 blob app_snap = {
5662 {0,
5663 0}}; /* Initialize in case get_app_snap_cb does not assign a value */
5664 synode_no app_lsn = get_app_snap_cb(&app_snap);
5665
5666 /* We have a valid callback, abort if it did not return anything */
5667 if (app_snap.data.data_len == 0) {
5668 ADD_DBG(D_BASE,
5669 add_event(EVENT_DUMP_PAD, string_arg("no data, return")););
5670 return 0;
5671 }
5672 gs = export_config();
5673 if (!gs) return 0;
5674 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("export config ok")););
5675 gs->app_snap = app_snap;
5676 IFDBG(D_BUG, FN; SYCEXP(app_lsn); SYCEXP(gs->log_start);
5677 SYCEXP(gs->log_end));
5678
5679 /* Set starting point of log to match the snapshot */
5680 /* If we have a valid synode from application snapshot, see if it should be
5681 * used */
5682 if (!synode_eq(null_synode, app_lsn)) {
5683 /* If log_start is null_synode, always use valid synode from application
5684 * snapshot */
5685 if (synode_eq(null_synode, gs->log_start) ||
5686 !synode_gt(app_lsn, gs->log_start)) {
5687 gs->log_start = app_lsn;
5688 IFDBG(D_BUG, FN; STRLIT("using "); SYCEXP(app_lsn));
5689 }
5690 }
5691 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("gs->log_start"));
5692 add_synode_event(gs->log_start);
5693 add_event(EVENT_DUMP_PAD, string_arg("gs->log_end"));
5694 add_synode_event(gs->log_end););
5695 /* purecov: end */
5696 } else {
5697 gs = export_config();
5698 if (!gs) return 0;
5699 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("export config ok")););
5700 if (!synode_eq(null_synode, last_config_modification_id)) {
5701 /* No valid valid synode from application snapshot, use
5702 * last_config_modification_id if not null_synode */
5703 gs->log_start = last_config_modification_id;
5704 IFDBG(D_BUG, FN; STRLIT("using "); SYCEXP(last_config_modification_id));
5705 }
5706 IFDBG(D_BUG, FN; SYCEXP(gs->log_start); SYCEXP(gs->log_end));
5707 ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("gs->log_start"));
5708 add_synode_event(gs->log_start);
5709 add_event(EVENT_DUMP_PAD, string_arg("gs->log_end"));
5710 add_synode_event(gs->log_end););
5711 }
5712 IFDBG(D_BUG, FN; SYCEXP(gs->log_start); SYCEXP(gs->log_end));
5713 return gs;
5714 }
5715
5716 /* purecov: begin deadcode */
/* Answer a snapshot request: queue a gcs_snapshot_op reply (a clone of pm
   carrying a freshly created snapshot) on reply_queue, then queue the log
   from the snapshot's log_start and send the global view. Does nothing when
   no snapshot can be created. */
static void handle_need_snapshot(linkage *reply_queue, pax_msg *pm) {
  gcs_snapshot *gs = create_snapshot();
  if (gs) {
    pax_msg *reply = clone_pax_msg(pm);
    if (reply == NULL) {
      /* Could not clone the request, so there is nothing to attach the
         snapshot to.
         NOTE(review): gs is not released on this path; its deallocator is
         not visible from here. */
      return;
    }
    ref_msg(reply);
    reply->op = gcs_snapshot_op;
    reply->gcs_snap = gs;
    {
      msg_link *msg_x = msg_link_new(reply, reply->from);
      IFDBG(D_NONE, FN; PTREXP(msg_x));
      link_into(&(msg_x->l), reply_queue);
    }
    unref_msg(&reply);
    IFDBG(D_NONE, FN; STRLIT("sent snapshot"););
    reply_push_log(gs->log_start, reply_queue);
    send_global_view();
  }
}
5735 /* purecov: end */
5736
5737 static task_env *x_timer = NULL;
5738
5739 /* Timer for use with the xcom FSM. Will deliver x_fsm_timeout */
xcom_timer(task_arg arg)5740 static int xcom_timer(task_arg arg) {
5741 DECL_ENV
5742 double t;
5743 END_ENV;
5744
5745 TASK_BEGIN
5746
5747 ep->t = get_double_arg(arg);
5748 TASK_DELAY(ep->t);
5749 XCOM_FSM(x_fsm_timeout, double_arg(ep->t));
5750 FINALLY
5751 if (stack == x_timer) set_task(&x_timer, NULL);
5752 IFDBG(D_BUG, FN; STRLIT(" timeout "));
5753 TASK_END;
5754 }
5755
5756 /* Stop the xcom FSM timer */
stop_x_timer()5757 static void stop_x_timer() {
5758 if (x_timer) {
5759 task_terminate(x_timer);
5760 set_task(&x_timer, NULL);
5761 }
5762 }
5763
5764 /* Start the xcom FSM timer */
start_x_timer(double t)5765 static void start_x_timer(double t) {
5766 stop_x_timer();
5767 set_task(&x_timer, task_new(xcom_timer, double_arg(t), "xcom_timer",
5768 XCOM_THREAD_DEBUG));
5769 }
5770
5771 /* Deliver x_fsm_complete to xcom FSM */
5772 /* purecov: begin deadcode */
/* One-shot task that delivers x_fsm_complete to the xcom FSM and then
   exits. arg is not used. */
static int x_fsm_completion_task(task_arg arg) {
  DECL_ENV
  int dummy;
  END_ENV;

  TASK_BEGIN

  (void)arg;
  XCOM_FSM(x_fsm_complete, null_arg);

  FINALLY
  IFDBG(D_BUG, FN; STRLIT(" delivered "));
  TASK_END;
}
5787 /* purecov: end */
5788
5789 /* Send x_fsm_complete to xcom FSM in the context of the xcom thread. The
5790 * calling thread and the xcom thread must be in a rendezvous. Using a task to
5791 * deliver a message is an abstraction inversion, but it's the simplest solution
5792 * until we get a proper queue-based communication system going. */
5793 /* purecov: begin deadcode */
send_x_fsm_complete()5794 void send_x_fsm_complete() {
5795 task_new(x_fsm_completion_task, null_arg, "x_fsm_completion_task",
5796 XCOM_THREAD_DEBUG);
5797 }
5798 /* purecov: end */
5799
/* Answer a need_boot request from node over srv: send a freshly created
   snapshot, push the log starting at the snapshot's log_start, then send
   the global view. Does nothing when no snapshot can be created. */
static void server_handle_need_snapshot(server *srv, site_def const *s,
                                        node_no node) {
  gcs_snapshot *gs = create_snapshot();

  if (!gs) return;

  server_send_snapshot(srv, s, gs, node);
  IFDBG(D_NONE, FN; STRLIT("sent snapshot"););
  server_push_log(srv, gs->log_start, node);
  send_global_view();
}
5811
/* Printable names of the FSM actions: X is temporarily defined to stringify
   its argument while the x_actions list is expanded, giving one string per
   entry of that list. */
#define X(b) #b
const char *xcom_actions_name[] = {x_actions};
#undef X
5815
5816 static int snapshots[NSERVERS];
5817
5818 /* Note that we have received snapshot from node */
note_snapshot(node_no node)5819 static void note_snapshot(node_no node) {
5820 if (node != VOID_NODE_NO) {
5821 snapshots[node] = 1;
5822 }
5823 }
5824
5825 /* Reset set of received snapshots */
reset_snapshot_mask()5826 static void reset_snapshot_mask() {
5827 int i;
5828 for (i = 0; i < NSERVERS; i++) {
5829 snapshots[i] = 0;
5830 }
5831 }
5832
5833 /* See if we have got a snapshot from every node */
got_all_snapshots()5834 static int got_all_snapshots() {
5835 node_no i;
5836 node_no max = get_maxnodes(get_site_def());
5837 if (0 == max) {
5838 return 0;
5839 }
5840 for (i = 0; i < max; i++) {
5841 if (!snapshots[i]) {
5842 return 0;
5843 }
5844 }
5845 return 1;
5846 }
5847
5848 static synode_no log_start_max; /* Initialized by xcom_fsm */
5849 static synode_no log_end_max; /* Initialized by xcom_fsm */
5850
5851 /* See if this snapshot is better than what we already have */
5852 /* purecov: begin deadcode */
better_snapshot(gcs_snapshot * gcs)5853 static int better_snapshot(gcs_snapshot *gcs) {
5854 synode_no boot_key = config_max_boot_key(gcs);
5855 return synode_gt(boot_key, get_site_def()->boot_key) ||
5856 (synode_eq(boot_key, get_site_def()->boot_key) &&
5857 (synode_gt(gcs->log_start, log_start_max) ||
5858 (synode_eq(gcs->log_start, log_start_max) &&
5859 synode_gt(gcs->log_end, log_end_max))));
5860 }
5861 /* purecov: end */
5862
5863 /* Install snapshot */
/* Install snapshot gcs: import its configuration, hand the application
   snapshot to the registered handler (if any), and set max synode, executed
   message, log_start_max/log_end_max and the last received config from
   it. */
static void handle_x_snapshot(gcs_snapshot *gcs) {
  import_config(gcs);
  if (get_nodeno(get_site_def()) == VOID_NODE_NO) {
    IFDBG(D_BUG, FN; STRLIT("Not member of site, not executing log"));
    gcs->log_end =
        gcs->log_start; /* Avoid executing log if not member of site */
  }
  /* The handler is optional, like the other callbacks in this file; do not
     call through a null pointer. */
  if (handle_app_snap_cb) {
    handle_app_snap_cb(&gcs->app_snap, gcs->log_start, gcs->log_end);
  }
  set_max_synode(gcs->log_end);
  set_executed_msg(incr_synode(gcs->log_start));
  log_start_max = gcs->log_start;
  log_end_max = gcs->log_end;

  set_last_received_config(get_highest_boot_key(gcs));

  IFDBG(D_BUG, FN; SYCEXP(gcs->log_start); SYCEXP(gcs->log_end);
        SYCEXP(last_config_modification_id); SYCEXP(executed_msg););
}
5882
5883 /* Note that we have received snapshot, and install if better than old */
5884 /* purecov: begin deadcode */
/* Install gcs when there is no site definition yet, or when gcs is better
   than what we already have; otherwise leave the current state alone. */
static void update_best_snapshot(gcs_snapshot *gcs) {
  if (get_site_def() != 0 && !better_snapshot(gcs)) return;

  handle_x_snapshot(gcs);
}
5890 /* purecov: end */
5891
5892 /* Send need_boot_op to all nodes in current config */
5893 /* purecov: begin deadcode */
send_need_boot()5894 static void send_need_boot() {
5895 pax_msg *p = pax_msg_new_0(null_synode);
5896 ref_msg(p);
5897 p->synode = get_site_def()->start;
5898 p->op = need_boot_op;
5899 send_to_all_except_self(get_site_def(), p, "need_boot_op");
5900 unref_msg(&p);
5901 }
5902 /* purecov: end */
5903
5904 /* Set log_end of snapshot based on log_end in snapshot and max synode */
/* Raise gcs->log_end to the current max synode when that is greater;
   otherwise leave it unchanged. */
void set_log_end(gcs_snapshot *gcs) {
  synode_no const max_seen = get_max_synode();

  if (synode_gt(max_seen, gcs->log_end)) gcs->log_end = max_seen;
}
5910
struct xcom_fsm_state;
typedef struct xcom_fsm_state xcom_fsm_state;

/* Function pointer corresponding to a state. Return 1 if execution should
 * continue, 0 otherwise */
typedef int (*xcom_fsm_fp)(xcom_actions action, task_arg fsmargs,
                           xcom_fsm_state *ctxt);

/* Function pointer and name */
struct xcom_fsm_state {
  xcom_fsm_fp state_fp;   /* Handler for the current state */
  char const *state_name; /* Name of that handler, as a string */
};

/* Initializer for an xcom_fsm_state: the function s and its name. */
#define X_FSM_STATE(s) \
  { s, #s }
/* Switch the state held in the local variable ctxt to function s, recording
   its name as well. Requires a variable named ctxt in scope. */
#define SET_X_FSM_STATE(s) \
  do {                     \
    ctxt->state_fp = s;    \
    ctxt->state_name = #s; \
  } while (0)

/* The state functions/thunks */
static int xcom_fsm_init(xcom_actions action, task_arg fsmargs,
                         xcom_fsm_state *ctxt);
static int xcom_fsm_start_enter(xcom_actions action, task_arg fsmargs,
                                xcom_fsm_state *ctxt);
static int xcom_fsm_start(xcom_actions action, task_arg fsmargs,
                          xcom_fsm_state *ctxt);
static int xcom_fsm_snapshot_wait_enter(xcom_actions action, task_arg fsmargs,
                                        xcom_fsm_state *ctxt);
static int xcom_fsm_snapshot_wait(xcom_actions action, task_arg fsmargs,
                                  xcom_fsm_state *ctxt);
static int xcom_fsm_recover_wait_enter(xcom_actions action, task_arg fsmargs,
                                       xcom_fsm_state *ctxt);
static int xcom_fsm_recover_wait(xcom_actions action, task_arg fsmargs,
                                 xcom_fsm_state *ctxt);
static int xcom_fsm_run_enter(xcom_actions action, task_arg fsmargs,
                              xcom_fsm_state *ctxt);
static int xcom_fsm_run(xcom_actions action, task_arg fsmargs,
                        xcom_fsm_state *ctxt);
5952
5953 /* You are in a twisting maze of little functions ... */
5954
5955 /* init state */
/* init state: run xcom_thread_init() and move on to start_enter. Both
   arguments besides ctxt are unused. Always returns 1 (continue). */
static int xcom_fsm_init(xcom_actions action, task_arg fsmargs,
                         xcom_fsm_state *ctxt) {
  (void)fsmargs;
  (void)action;

  IFDBG(D_NONE, FN;);

  /* Initialize basic xcom data */
  xcom_thread_init();

  SET_X_FSM_STATE(xcom_fsm_start_enter);
  return 1;
}
5966
5967 /* start_enter state */
/* start_enter state: push the D_FSM debug mask, empty the proposer input
   queue, reset the snapshot bookkeeping and the last received config, then
   move on to the start state. Always returns 1 (continue). */
static int xcom_fsm_start_enter(xcom_actions action, task_arg fsmargs,
                                xcom_fsm_state *ctxt) {
  (void)fsmargs;
  (void)action;

  /* push_dbg(D_DETECT | D_FSM | D_FILEOP | D_CONS | D_BASE | D_TRANSPORT);
   */
  push_dbg(D_FSM);
  IFDBG(D_NONE, FN; STRLIT("state x_start"););

  empty_prop_input_queue();
  reset_snapshot_mask();
  set_last_received_config(null_synode);

  SET_X_FSM_STATE(xcom_fsm_start);
  return 1;
}
5983
/* Handle x_fsm_net_boot: install the node group carried in fsmargs. If we
   are a member of the resulting site, set the executed message to the
   site's start (msgno 0 is bumped to 1), switch to run_enter and return 1.
   Otherwise return cont unchanged. */
static int handle_fsm_net_boot(task_arg fsmargs, xcom_fsm_state *ctxt,
                               int cont) {
  app_data *a = (app_data *)get_void_arg(fsmargs);
  synode_no start;

  install_node_group(a);
  if (!is_member(get_site_def())) return cont;

  empty_prop_input_queue();

  start = get_site_def()->start;
  if (start.msgno == 0) { /* May happen during initial boot */
    start.msgno = 1;
  }
  set_executed_msg(start);

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_run_enter);
  return 1;
}
6003
/* Handle x_fsm_snapshot in the start state: we are recovering from another
   node, so install the snapshot carried in fsmargs right away instead of
   waiting for more snapshots, run the recovery callbacks that are set, and
   switch to run_enter. Always returns 1 (continue). */
static int handle_fsm_snapshot(task_arg fsmargs, xcom_fsm_state *ctxt) {
  gcs_snapshot *gcs = (gcs_snapshot *)get_void_arg(fsmargs);

  empty_prop_input_queue();
  set_log_end(gcs);
  handle_x_snapshot(gcs);

  /* Get recovery manager going again */
  if (recovery_restart_cb) {
    recovery_restart_cb();
  }

  /* If we run under control of the recovery manager, we need to call
   * recovery_begin_cb and then recovery_end_cb to rendezvous with the
   * recovery manager */
  if (recovery_begin_cb) {
    recovery_begin_cb();
  }
  if (recovery_end_cb) {
    recovery_end_cb();
  }

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_run_enter);
  return 1;
}
6029
6030 /* purecov: begin deadcode */
/* Handle x_fsm_snapshot_wait in the start state: empty the proposer input
   queue, arm the FSM timer with SNAPSHOT_WAIT_TIME and switch to
   snapshot_wait_enter. Always returns 1 (continue). */
static int handle_fsm_snapshot_wait(xcom_fsm_state *ctxt) {
  empty_prop_input_queue();
  start_x_timer(SNAPSHOT_WAIT_TIME);

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_snapshot_wait_enter);

  return 1;
}
6038 /* purecov: end */
6039
handle_fsm_exit()6040 static void handle_fsm_exit() {
6041 /* Xcom is finished when we get here */
6042 push_dbg(D_BUG);
6043 bury_site(get_group_id(get_site_def()));
6044 task_terminate_all(); /* Kill, kill, kill, kill, kill, kill. This is
6045 the end. */
6046
6047 /* init_xcom_base(); */ /* Reset shared variables */
6048 init_tasks(); /* Reset task variables */
6049 free_site_defs();
6050 free_forced_config_site_def();
6051 wait_forced_config = 0;
6052 garbage_collect_servers();
6053 IFDBG(D_NONE, FN; STRLIT("shutting down"));
6054 xcom_shutdown = 1;
6055 start_config = null_synode;
6056 G_DEBUG("Exiting xcom thread");
6057 }
6058
6059 /* start state */
/* start state: react to init, net boot, snapshot, snapshot wait and exit
   actions; any other action is ignored. Returns 1 if the FSM should
   continue executing, 0 otherwise. The cache is re-initialised on
   x_fsm_init only after this state has already been entered once. */
static int xcom_fsm_start(xcom_actions action, task_arg fsmargs,
                          xcom_fsm_state *ctxt) {
  static int need_init_cache = 0;
  int cont = 0; /* Set to 1 if we should continue execution */

  if (action == x_fsm_init) {
    xcom_shutdown = 0;
    sent_alive = 0.0;
    oom_abort = 0;
    if (need_init_cache) init_cache();
  } else if (action == x_fsm_net_boot) {
    cont = handle_fsm_net_boot(fsmargs, ctxt, cont);
  } else if (action == x_fsm_snapshot) {
    cont = handle_fsm_snapshot(fsmargs, ctxt);
    /* purecov: begin deadcode */
  } else if (action == x_fsm_snapshot_wait) {
    /* This is the entry point for the initial recovery after the process
     * has started when running under an external recovery manager.
     * If we get x_fsm_snapshot_wait, we are called from the recovery
     * manager thread */
    cont = handle_fsm_snapshot_wait(ctxt);
    /* purecov: end */
  } else if (action == x_fsm_exit) {
    handle_fsm_exit();
  }

  need_init_cache = 1;
  return cont;
}
6101
6102 /* snapshot_wait_enter state */
6103 /* purecov: begin deadcode */
/* Entry function of the snapshot_wait state.
 *
 * Ignores its action and arguments. Pushes a debug mask, resets log_start_max
 * and log_end_max to null_synode, and selects xcom_fsm_snapshot_wait as the
 * state function. Returns 0, so the trampoline stops until the next action
 * arrives. */
static int xcom_fsm_snapshot_wait_enter(xcom_actions action, task_arg fsmargs,
                                        xcom_fsm_state *ctxt) {
  (void)fsmargs;
  (void)action;

  push_dbg(D_DETECT | D_FSM | D_FILEOP | D_CONS | D_BASE | D_TRANSPORT);
  IFDBG(D_NONE, FN; STRLIT("state x_snapshot_wait"););

  /* Start with an empty log range. */
  log_start_max = null_synode;
  log_end_max = null_synode;

  SET_X_FSM_STATE(xcom_fsm_snapshot_wait);
  return 0;
}
6115 /* purecov: end */
6116
6117 /* purecov: begin deadcode */
/* Handler for x_fsm_local_snapshot in the snapshot_wait state.
 *
 * fsmargs carries a gcs_snapshot pointer (as a void argument). The snapshot
 * is passed to update_best_snapshot(), the local node is noted as having
 * delivered a snapshot, a need_boot request is sent, and the FSM moves to
 * xcom_fsm_recover_wait_enter. Always returns 1. */
static int handle_local_snapshot(task_arg fsmargs, xcom_fsm_state *ctxt) {
  gcs_snapshot *local_snap = (gcs_snapshot *)get_void_arg(fsmargs);

  update_best_snapshot(local_snap);

  /* When recovering locally, the node number is only read from the site_def
   * after the snapshot has been processed. */
  note_snapshot(get_site_def()->nodeno);
  send_need_boot();

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_recover_wait_enter);
  return 1;
}
6128 /* purecov: end */
6129
6130 /* purecov: begin deadcode */
/* Handler for x_fsm_snapshot (a snapshot from another node) in the
 * snapshot_wait state.
 *
 * fsmargs carries a gcs_snapshot pointer (as a void argument). Applies
 * set_log_end() and update_best_snapshot() to it, notes the local node's
 * snapshot as processed, sends a need_boot request and moves the FSM to
 * xcom_fsm_recover_wait_enter. Always returns 1. */
static int handle_snapshot(task_arg fsmargs, xcom_fsm_state *ctxt) {
  gcs_snapshot *remote_snap = (gcs_snapshot *)get_void_arg(fsmargs);

  set_log_end(remote_snap);
  update_best_snapshot(remote_snap);

  /* We now have a site, so record the local snapshot as processed even if
   * none was seen: if we are here, no local snapshot will ever arrive. This
   * simplifies the test in got_all_snapshots(). */
  note_snapshot(get_site_def()->nodeno);
  send_need_boot();

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_recover_wait_enter);
  return 1;
}
6146 /* purecov: end */
6147
6148 /* snapshot_wait state */
6149 /* purecov: begin deadcode */
/* State function of the snapshot_wait state.
 *
 * - x_fsm_local_snapshot: delegated to handle_local_snapshot().
 * - x_fsm_snapshot:       delegated to handle_snapshot().
 * - x_fsm_timeout:        calls recovery_end_cb (if set), pops the debug mask
 *                         and goes back to xcom_fsm_start_enter; returns 1.
 * - anything else:        ignored; returns 0. */
static int xcom_fsm_snapshot_wait(xcom_actions action, task_arg fsmargs,
                                  xcom_fsm_state *ctxt) {
  /* If we get x_fsm_local_snapshot, we are called from the recovery
   * manager thread */
  if (action == x_fsm_local_snapshot) {
    return handle_local_snapshot(fsmargs, ctxt);
  }

  if (action == x_fsm_snapshot) {
    return handle_snapshot(fsmargs, ctxt);
  }

  if (action == x_fsm_timeout) {
    /* Times out if no snapshot is available. When running under control of
     * the recovery manager, recovery_end_cb is the rendezvous with it. */
    if (recovery_end_cb) recovery_end_cb();
    pop_dbg();
    SET_X_FSM_STATE(xcom_fsm_start_enter);
    return 1;
  }

  return 0;
}
6175 /* purecov: end */
6176
6177 /* recover_wait_enter state */
6178 /* purecov: begin deadcode */
/* Entry function of the recover_wait state.
 *
 * Ignores its action and arguments. Pushes a debug mask, sends an
 * x_fsm_complete message if got_all_snapshots() already holds, and selects
 * xcom_fsm_recover_wait as the state function. Returns 0. */
static int xcom_fsm_recover_wait_enter(xcom_actions action, task_arg fsmargs,
                                       xcom_fsm_state *ctxt) {
  (void)fsmargs;
  (void)action;

  push_dbg(D_DETECT | D_FSM | D_FILEOP | D_CONS | D_BASE | D_TRANSPORT);
  IFDBG(D_NONE, FN; STRLIT("state x_recover_wait"););

  /* The transition has to be triggered by a message so that it happens in
   * the context of the xcom thread. */
  if (got_all_snapshots()) send_x_fsm_complete();

  SET_X_FSM_STATE(xcom_fsm_recover_wait);
  return 0;
}
6193 /* purecov: end */
6194
6195 /* recover_wait state */
6196 /* purecov: begin deadcode */
/* State function of the recover_wait state.
 *
 * - x_fsm_snapshot: records the received snapshot (set_log_end +
 *   update_best_snapshot), then falls through to the completion check.
 * - x_fsm_timeout / x_fsm_complete: calls recovery_end_cb (if set), pops the
 *   debug mask, moves to xcom_fsm_run_enter and returns 1.
 * - Every other action only performs the completion check.
 *
 * The completion check sends x_fsm_complete when got_all_snapshots() holds.
 * Returns 0 unless the state was left. */
static int xcom_fsm_recover_wait(xcom_actions action, task_arg fsmargs,
                                 xcom_fsm_state *ctxt) {
  switch (action) {
    case x_fsm_snapshot: {
      gcs_snapshot *snap = (gcs_snapshot *)get_void_arg(fsmargs);
      set_log_end(snap);
      update_best_snapshot(snap);
      break;
    }

    case x_fsm_timeout:
    case x_fsm_complete:
      /* Wait terminated by timeout or because all nodes have sent a
       * snapshot. When running under control of the recovery manager,
       * recovery_end_cb is the rendezvous with it. */
      if (recovery_end_cb) recovery_end_cb();
      pop_dbg();
      SET_X_FSM_STATE(xcom_fsm_run_enter);
      return 1;

    default:
      break;
  }

  /* The transition has to be triggered by a message so that it happens in
   * the context of the xcom thread. */
  if (got_all_snapshots()) send_x_fsm_complete();

  return 0;
}
6220 /* purecov: end */
6221
6222 /* run_enter state */
xcom_fsm_run_enter(xcom_actions action,task_arg fsmargs,xcom_fsm_state * ctxt)6223 static int xcom_fsm_run_enter(xcom_actions action, task_arg fsmargs,
6224 xcom_fsm_state *ctxt) {
6225 (void)action;
6226 (void)fsmargs;
6227 start_config = get_site_def()->boot_key;
6228
6229 /* Final sanity check of executed_msg */
6230 if (find_site_def(executed_msg) == 0) {
6231 /* No site_def matches executed_msg, set it to site->start */
6232 set_executed_msg(get_site_def()->start);
6233 }
6234
6235 IFDBG(D_NONE, FN; STRLIT("state x_run"););
6236 IFDBG(D_BUG, FN; SYCEXP(executed_msg););
6237 IFDBG(D_BUG, FN; SYCEXP(start_config););
6238 stop_x_timer();
6239 if (xcom_run_cb) xcom_run_cb(0);
6240 client_boot_done = 1;
6241 netboot_ok = 1;
6242 set_proposer_startpoint();
6243 create_proposers();
6244 set_task(&executor, task_new(executor_task, null_arg, "executor_task",
6245 XCOM_THREAD_DEBUG));
6246 set_task(&sweeper,
6247 task_new(sweeper_task, null_arg, "sweeper_task", XCOM_THREAD_DEBUG));
6248 set_task(&detector, task_new(detector_task, null_arg, "detector_task",
6249 XCOM_THREAD_DEBUG));
6250 set_task(&alive_t,
6251 task_new(alive_task, null_arg, "alive_task", XCOM_THREAD_DEBUG));
6252 set_task(&cache_task, task_new(cache_manager_task, null_arg,
6253 "cache_manager_task", XCOM_THREAD_DEBUG));
6254
6255 push_dbg(D_FSM /* | D_EXEC | D_BASE | D_TRANSPORT */);
6256 SET_X_FSM_STATE(xcom_fsm_run);
6257 return 1;
6258 }
6259
/* Handler for x_fsm_terminate in the run state.
 *
 * Dumps the debug execution state, clears the boot flags and oom_abort,
 * terminates and re-initializes the proposers, terminates each service task
 * and clears its task pointer, resets the shared xcom variables, releases
 * site definitions and servers, and invokes xcom_terminate_cb (if set) with
 * the integer carried in fsmargs. The FSM then returns to
 * xcom_fsm_start_enter. Always returns 1. */
static int handle_fsm_terminate(task_arg fsmargs, xcom_fsm_state *ctxt) {
  dump_debug_exec_state();

  client_boot_done = 0;
  netboot_ok = 0;
  oom_abort = 0;

  /* Proposers */
  terminate_proposers();
  init_proposers();

  /* Service tasks: terminate, then drop the reference. */
  task_terminate(executor);
  set_task(&executor, NULL);

  task_terminate(sweeper);
  set_task(&sweeper, NULL);

  task_terminate(detector);
  set_task(&detector, NULL);

  task_terminate(alive_t);
  set_task(&alive_t, NULL);

  task_terminate(cache_task);
  set_task(&cache_task, NULL);

  /* Reset shared variables and release configuration state. */
  init_xcom_base();
  free_site_defs();
  free_forced_config_site_def();
  wait_forced_config = 0;
  garbage_collect_servers();

  if (xcom_terminate_cb) xcom_terminate_cb(get_int_arg(fsmargs));

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_start_enter);
  return 1;
}
6288
/* Handler for x_fsm_force_config in the run state.
 *
 * fsmargs carries an app_data pointer (as a void argument). A new site_def
 * is created from it starting at executed_msg, with executed_msg as its boot
 * key. Servers are invalidated relative to the current site, the forced
 * configuration is started, and wait_forced_config is raised. */
static void handle_fsm_force_config(task_arg fsmargs) {
  app_data *forced_app = (app_data *)get_void_arg(fsmargs);
  site_def *forced_site = create_site_def_with_start(forced_app, executed_msg);

  forced_site->boot_key = executed_msg;
  invalidate_servers(get_site_def(), forced_site);
  start_force_config(forced_site, 1);

  /* Note that the forced config has not yet arrived */
  wait_forced_config = 1;
}
6298
6299 /* run state */
xcom_fsm_run(xcom_actions action,task_arg fsmargs,xcom_fsm_state * ctxt)6300 static int xcom_fsm_run(xcom_actions action, task_arg fsmargs,
6301 xcom_fsm_state *ctxt) {
6302 switch (action) {
6303 case x_fsm_terminate:
6304 return handle_fsm_terminate(fsmargs, ctxt);
6305
6306 /* purecov: begin deadcode */
6307 case x_fsm_need_snapshot:
6308 IFDBG(D_NONE, STRLIT("got snapshot request in x_run state"));
6309 break;
6310 /* purecov: end */
6311
6312 case x_fsm_force_config:
6313 handle_fsm_force_config(fsmargs);
6314 break;
6315
6316 default:
6317 break;
6318 }
6319 return 0;
6320 }
6321
6322 /* Trampoline which loops calling thunks pointed to by ctxt.state_fp until 0 is
6323 * returned. Return pointer to ctxt. */
xcom_fsm_impl(xcom_actions action,task_arg fsmargs)6324 xcom_fsm_state *xcom_fsm_impl(xcom_actions action, task_arg fsmargs) {
6325 static xcom_fsm_state ctxt = X_FSM_STATE(xcom_fsm_init);
6326
6327 G_DEBUG("%f pid %d xcom_id %x state %s action %s", seconds(), xpid(),
6328 get_my_xcom_id(), ctxt.state_name, xcom_actions_name[action]);
6329 ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("state"));
6330 add_event(EVENT_DUMP_PAD, string_arg(ctxt.state_name));
6331 add_event(EVENT_DUMP_PAD, string_arg("action"));
6332 add_event(EVENT_DUMP_PAD, string_arg(xcom_actions_name[action]));
6333 add_event(EVENT_DUMP_PAD, string_arg("executed_msg"));
6334 add_synode_event(executed_msg););
6335 #ifdef TASK_EVENT_TRACE
6336 dump_task_events();
6337 #endif
6338 /* Crank the state machine until it stops */
6339 IFDBG(D_BUG, FN; STREXP(ctxt.state_name); STREXP(xcom_actions_name[action]));
6340 while (ctxt.state_fp(action, fsmargs, &ctxt)) {
6341 IFDBG(D_BUG, FN; STREXP(ctxt.state_name);
6342 STREXP(xcom_actions_name[action]));
6343 }
6344 return &ctxt;
6345 }
6346
6347 /* Call FSM trampoline and return state name of resulting state */
xcom_fsm(xcom_actions action,task_arg fsmargs)6348 char const *xcom_fsm(xcom_actions action, task_arg fsmargs) {
6349 xcom_fsm_state *s = xcom_fsm_impl(action, fsmargs);
6350 return s->state_name;
6351 }
6352
6353 /* See if we can send a snapshot to another node */
6354 /* purecov: begin deadcode */
can_send_snapshot()6355 static int can_send_snapshot() {
6356 xcom_fsm_state *state = xcom_fsm_impl(x_fsm_need_snapshot, null_arg);
6357 return state->state_fp == xcom_fsm_run;
6358 }
6359 /* purecov: end */
6360
/* Install `x` as the application snapshot handler callback
 * (stored in handle_app_snap_cb). */
void set_app_snap_handler(app_snap_handler x) {
  handle_app_snap_cb = x;
}
6362
6363 /* purecov: begin deadcode */
/* Install `x` as the application snapshot getter callback
 * (stored in get_app_snap_cb). */
void set_app_snap_getter(app_snap_getter x) {
  get_app_snap_cb = x;
}
6365 /* purecov: end */
6366
checked_create_socket(int domain,int type,int protocol)6367 static result checked_create_socket(int domain, int type, int protocol) {
6368 result retval = {0, 0};
6369 int nr_attempts = 1005;
6370
6371 do {
6372 SET_OS_ERR(0);
6373 retval.val = (int)socket(domain, type, protocol);
6374 retval.funerr = to_errno(GET_OS_ERR);
6375 if (nr_attempts % 10 == 0) xcom_sleep(1);
6376 } while (--nr_attempts && retval.val == -1 &&
6377 (from_errno(retval.funerr) == SOCK_EAGAIN));
6378
6379 if (retval.val == -1) {
6380 task_dump_err(retval.funerr);
6381 #if defined(_WIN32)
6382 G_MESSAGE("Socket creation failed with error %d.", retval.funerr);
6383 #else
6384 G_MESSAGE("Socket creation failed with error %d - %s.", retval.funerr,
6385 strerror(retval.funerr));
6386 #endif
6387 }
6388 return retval;
6389 }
6390
6391 /* Read max n bytes from socket fd into buffer buf */
socket_read(connection_descriptor * rfd,void * buf,int n)6392 static result socket_read(connection_descriptor *rfd, void *buf, int n) {
6393 result ret = {0, 0};
6394
6395 assert(n >= 0);
6396
6397 do {
6398 ret = con_read(rfd, buf, n);
6399 task_dump_err(ret.funerr);
6400 } while (ret.val < 0 && can_retry_read(ret.funerr));
6401 return ret;
6402 }
6403
6404 /* Read exactly n bytes from socket fd into buffer buf */
socket_read_bytes(connection_descriptor * rfd,char * p,uint32_t n)6405 static int64_t socket_read_bytes(connection_descriptor *rfd, char *p,
6406 uint32_t n) {
6407 uint32_t left = n;
6408 char *bytes = p;
6409
6410 result nread = {0, 0};
6411
6412 while (left > 0) {
6413 /*
6414 socket_read just reads no more than INT_MAX bytes. We should not pass
6415 a length more than INT_MAX to it.
6416 */
6417 int r = (int)MIN(left, INT_MAX);
6418
6419 nread = socket_read(rfd, bytes, r);
6420 if (nread.val == 0) {
6421 return 0;
6422 } else if (nread.val < 0) {
6423 return -1;
6424 } else {
6425 bytes += nread.val;
6426 left -= (uint32_t)nread.val;
6427 }
6428 }
6429 assert(left == 0);
6430 return n;
6431 }
6432
6433 /* Write n bytes from buffer buf to socket fd */
socket_write(connection_descriptor * wfd,void * _buf,uint32_t n)6434 static int64_t socket_write(connection_descriptor *wfd, void *_buf,
6435 uint32_t n) {
6436 char *buf = (char *)_buf;
6437 result ret = {0, 0};
6438
6439 uint32_t total; /* Keeps track of number of bytes written so far */
6440
6441 total = 0;
6442 while (total < n) {
6443 int w = (int)MIN(n - total, INT_MAX);
6444
6445 while ((ret = con_write(wfd, buf + total, w)).val < 0 &&
6446 can_retry_write(ret.funerr)) {
6447 task_dump_err(ret.funerr);
6448 IFDBG(D_NONE, FN; STRLIT("retry "); NEXP(total, d); NEXP(n, d));
6449 }
6450 if (ret.val <= 0) { /* Something went wrong */
6451 task_dump_err(ret.funerr);
6452 return -1;
6453 } else {
6454 total += (uint32_t)ret.val; /* Add number of bytes written to total */
6455 }
6456 }
6457 IFDBG(D_TRANSPORT, FN; NEXP(total, u); NEXP(n, u));
6458 assert(total == n);
6459 return (total);
6460 }
6461
xcom_close_socket(int * sock)6462 static inline result xcom_close_socket(int *sock) {
6463 result res = {0, 0};
6464 if (*sock != -1) {
6465 IFDBG(D_FILEOP, FN; STRLIT("closing socket "); NDBG(*sock, d));
6466 do {
6467 SET_OS_ERR(0);
6468 res.val = CLOSESOCKET(*sock);
6469 res.funerr = to_errno(GET_OS_ERR);
6470 } while (res.val == -1 && from_errno(res.funerr) == SOCK_EINTR);
6471 *sock = -1;
6472 }
6473 return res;
6474 }
6475
xcom_shut_close_socket(int * sock)6476 static inline result xcom_shut_close_socket(int *sock) {
6477 result res = {0, 0};
6478 if (*sock >= 0) {
6479 shutdown_socket(sock);
6480 res = xcom_close_socket(sock);
6481 }
6482 return res;
6483 }
6484
/* Abort a connection attempt: mark the descriptor as invalid and jump to the
 * cleanup code. Expects a local `int ret_fd` and a label `end` in the
 * enclosing function.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement;
 * with the bare two-statement form, `if (cond) CONNECT_FAIL;` would execute
 * the `goto end` unconditionally. */
#define CONNECT_FAIL \
  do {               \
    ret_fd = -1;     \
    goto end;        \
  } while (0)
6488
6489 /*
6490
6491 */
6492
6493 /**
6494 @brief Retreives a node IPv4 address, if it exists.
6495
6496 If a node is v4 reachable, means one of two:
6497 - The raw address is V4
6498 - a name was resolved to a V4/V6 address
6499
6500 If the later is the case, we are going to prefer the first v4
6501 address in the list, since it is the common language between
6502 old and new version. If you want exclusive V6, please configure your
6503 DNS server to serve V6 names
6504
6505 @param retrieved a previously retrieved struct addrinfo
6506 @return struct addrinfo* An addrinfo of the first IPv4 address. Else it will
6507 return the entry parameter.
6508 */
does_node_have_v4_address(struct addrinfo * retrieved)6509 struct addrinfo *does_node_have_v4_address(struct addrinfo *retrieved) {
6510 struct addrinfo *cycle = NULL;
6511
6512 int v4_reachable = is_node_v4_reachable_with_info(retrieved);
6513
6514 if (v4_reachable) {
6515 cycle = retrieved;
6516 while (cycle) {
6517 if (cycle->ai_family == AF_INET) {
6518 return cycle;
6519 }
6520 cycle = cycle->ai_next;
6521 }
6522 }
6523
6524 /* If something goes really wrong... we fallback to avoid crashes */
6525 return retrieved;
6526 }
6527
timed_connect_msec(int fd,struct sockaddr * sock_addr,socklen_t sock_size,int timeout)6528 static int timed_connect_msec(int fd, struct sockaddr *sock_addr,
6529 socklen_t sock_size, int timeout) {
6530 int ret_fd = fd;
6531 int syserr;
6532 int sysret;
6533 struct pollfd fds;
6534
6535 fds.fd = fd;
6536 fds.events = POLLOUT;
6537 fds.revents = 0;
6538
6539 /* Set non-blocking */
6540 if (unblock_fd(fd) < 0) return -1;
6541
6542 /* Trying to connect with timeout */
6543 SET_OS_ERR(0);
6544 sysret = connect(fd, sock_addr, sock_size);
6545
6546 if (is_socket_error(sysret)) {
6547 syserr = GET_OS_ERR;
6548 /* If the error is SOCK_EWOULDBLOCK or SOCK_EINPROGRESS or SOCK_EALREADY,
6549 * wait. */
6550 switch (syserr) {
6551 case SOCK_EWOULDBLOCK:
6552 case SOCK_EINPROGRESS:
6553 case SOCK_EALREADY:
6554 break;
6555 default:
6556 G_DEBUG(
6557 "connect - Error connecting "
6558 "(socket=%d, error=%d).",
6559 fd, GET_OS_ERR);
6560 CONNECT_FAIL;
6561 }
6562
6563 SET_OS_ERR(0);
6564 IFDBG(D_TRANSPORT, FN; STRLIT("poll - Starting. "); NEXP(timeout, d);
6565 NEXP(sysret, d));
6566 while ((sysret = poll(&fds, 1, timeout)) < 0) {
6567 syserr = GET_OS_ERR;
6568 if (syserr != SOCK_EINTR && syserr != SOCK_EINPROGRESS) break;
6569 SET_OS_ERR(0);
6570 }
6571 IFDBG(D_TRANSPORT, FN; STRLIT("poll - Finished. "); NEXP(timeout, d);
6572 NEXP(sysret, d));
6573
6574 if (sysret == 0) {
6575 G_DEBUG(
6576 "Timed out while waiting for connection to be established! "
6577 "Cancelling connection attempt. (socket= %d, error=%d)",
6578 fd, sysret);
6579 /* G_WARNING("poll - Timeout! Cancelling connection..."); */
6580 CONNECT_FAIL;
6581 }
6582
6583 if (is_socket_error(sysret)) {
6584 G_DEBUG(
6585 "poll - Error while connecting! "
6586 "(socket= %d, error=%d)",
6587 fd, GET_OS_ERR);
6588 CONNECT_FAIL;
6589 }
6590
6591 {
6592 int socket_errno = 0;
6593 socklen_t socket_errno_len = sizeof(socket_errno);
6594
6595 if ((fds.revents & POLLOUT) == 0) {
6596 IFDBG(D_NONE, FN; STRLIT("POLLOUT not set - Socket failure!"););
6597 ret_fd = -1;
6598 }
6599
6600 if (fds.revents & (POLLERR | POLLHUP | POLLNVAL)) {
6601 IFDBG(D_NONE, FN;
6602 STRLIT("POLLERR | POLLHUP | POLLNVAL set - Socket failure!"););
6603 ret_fd = -1;
6604 }
6605 if (getsockopt(fd, SOL_SOCKET, SO_ERROR, (xcom_buf *)&socket_errno,
6606 &socket_errno_len) != 0) {
6607 G_DEBUG("getsockopt socket %d failed.", fd);
6608 ret_fd = -1;
6609 } else {
6610 if (socket_errno != 0) {
6611 G_DEBUG("Connection to socket %d failed with error %d.", fd,
6612 socket_errno);
6613 ret_fd = -1;
6614 }
6615 }
6616 }
6617 }
6618
6619 end:
6620 /* Set blocking */
6621 SET_OS_ERR(0);
6622 if (block_fd(fd) < 0) {
6623 G_DEBUG(
6624 "Unable to set socket back to blocking state. "
6625 "(socket=%d, error=%d).",
6626 fd, GET_OS_ERR);
6627 return -1;
6628 }
6629 return ret_fd;
6630 }
6631
/* Connect fd to sock_addr with a fixed timeout of 10000 milliseconds.
 * Returns whatever timed_connect_msec() returns. */
static int timed_connect(int fd, struct sockaddr *sock_addr,
                         socklen_t sock_size) {
  int const connect_timeout_msec = 10000;

  return timed_connect_msec(fd, sock_addr, sock_size, connect_timeout_msec);
}
6636
6637 /* purecov: begin deadcode */
/* Connect fd to sock_addr with a timeout given in seconds.
 * The timeout is converted to milliseconds and passed to
 * timed_connect_msec(), whose result is returned. */
int timed_connect_sec(int fd, struct sockaddr *sock_addr, socklen_t sock_size,
                      int timeout) {
  int const timeout_msec = timeout * 1000;

  return timed_connect_msec(fd, sock_addr, sock_size, timeout_msec);
}
6642 /* purecov: end */
6643
6644 /* Connect to server on given port */
6645 #ifndef XCOM_WITHOUT_OPENSSL
connect_xcom(char const * server,xcom_port port,int use_ssl)6646 static connection_descriptor *connect_xcom(char const *server, xcom_port port,
6647 int use_ssl) {
6648 #else
6649 static connection_descriptor *connect_xcom(char const *server, xcom_port port) {
6650 #endif
6651 result fd = {0, 0};
6652 result ret = {0, 0};
6653 connection_descriptor *cd = NULL;
6654 char buf[SYS_STRERROR_SIZE];
6655
6656 IFDBG(D_NONE, FN; STREXP(server); NEXP(port, d));
6657 G_DEBUG("connecting to %s %d", server, port);
6658
6659 {
6660 struct addrinfo *addr = NULL, *from_ns = NULL;
6661
6662 char buffer[20];
6663 sprintf(buffer, "%d", port);
6664
6665 checked_getaddrinfo(server, buffer, 0, &from_ns);
6666
6667 if (from_ns == NULL) {
6668 /* purecov: begin inspected */
6669 G_ERROR("Error retrieving server information.");
6670 goto end;
6671 /* purecov: end */
6672 }
6673
6674 addr = does_node_have_v4_address(from_ns);
6675
6676 /* Create socket after knowing the family that we are dealing with
6677 getaddrinfo returns a list of possible addresses. We will alays default
6678 to the first one in the list, which is V4 if applicable.
6679 */
6680 if ((fd = checked_create_socket(addr->ai_family, SOCK_STREAM, IPPROTO_TCP))
6681 .val < 0) {
6682 /* purecov: begin inspected */
6683 G_ERROR(
6684 "Error creating socket in local GR->GCS connection to address %s.",
6685 server);
6686 goto end;
6687 /* purecov: end */
6688 }
6689
6690 /* Connect socket to address */
6691
6692 SET_OS_ERR(0);
6693
6694 if (timed_connect(fd.val, addr->ai_addr, (socklen_t)addr->ai_addrlen) ==
6695 -1) {
6696 fd.funerr = to_errno(GET_OS_ERR);
6697 G_DEBUG(
6698 "Connecting socket to address %s in port %d failed with error %d - "
6699 "%s.",
6700 server, port, fd.funerr, strerr_msg(buf, sizeof(buf), fd.funerr));
6701 xcom_close_socket(&fd.val);
6702 goto end;
6703 }
6704 {
6705 int peer = 0;
6706 /* Sanity check before return */
6707 SET_OS_ERR(0);
6708 {
6709 socklen_t ai_addrlen = (socklen_t)addr->ai_addrlen;
6710 ret.val = peer = xcom_getpeername(fd.val, addr->ai_addr, &ai_addrlen);
6711 }
6712 ret.funerr = to_errno(GET_OS_ERR);
6713 if (peer >= 0) {
6714 ret = set_nodelay(fd.val);
6715 if (ret.val < 0) {
6716 /* purecov: begin inspected */
6717 task_dump_err(ret.funerr);
6718 xcom_shut_close_socket(&fd.val);
6719 #if defined(_WIN32)
6720 G_DEBUG(
6721 "Setting node delay failed while connecting to %s with error "
6722 "%d.",
6723 server, ret.funerr);
6724 #else
6725 G_DEBUG(
6726 "Setting node delay failed while connecting to %s with error %d "
6727 "- "
6728 "%s.",
6729 server, ret.funerr, strerror(ret.funerr));
6730 #endif
6731 goto end;
6732 /* purecov: end */
6733 }
6734 G_DEBUG("client connected to %s %d fd %d", server, port, fd.val);
6735 } else {
6736 /* Something is wrong */
6737 /* purecov: begin inspected */
6738 socklen_t errlen = sizeof(ret.funerr);
6739 IFDBG(D_NONE, FN; STRLIT("xcom_getpeername failed"););
6740 if (ret.funerr) {
6741 IFDBG(D_NONE, FN; NEXP(from_errno(ret.funerr), d);
6742 STRLIT(strerror(from_errno(ret.funerr))));
6743 }
6744 getsockopt(fd.val, SOL_SOCKET, SO_ERROR, (xcom_buf *)&ret.funerr,
6745 &errlen);
6746 if (ret.funerr == 0) {
6747 ret.funerr = to_errno(SOCK_ECONNREFUSED);
6748 }
6749 xcom_shut_close_socket(&fd.val);
6750 #if defined(_WIN32)
6751 G_DEBUG(
6752 "Getting the peer name failed while connecting to server %s with "
6753 "error %d.",
6754 server, ret.funerr);
6755 #else
6756 G_DEBUG(
6757 "Getting the peer name failed while connecting to server %s with "
6758 "error %d -%s.",
6759 server, ret.funerr, strerror(ret.funerr));
6760 #endif
6761 goto end;
6762 /* purecov: end */
6763 }
6764
6765 #ifndef XCOM_WITHOUT_OPENSSL
6766 if (use_ssl && xcom_use_ssl()) {
6767 SSL *ssl = SSL_new(client_ctx);
6768 G_DEBUG("Trying to connect using SSL.")
6769 SSL_set_fd(ssl, fd.val);
6770
6771 ERR_clear_error();
6772 ret.val = SSL_connect(ssl);
6773 ret.funerr = to_ssl_err(SSL_get_error(ssl, ret.val));
6774
6775 if (ret.val != SSL_SUCCESS) {
6776 /* purecov: begin inspected */
6777 G_MESSAGE("Error connecting using SSL %d %d.", ret.funerr,
6778 SSL_get_error(ssl, ret.val));
6779 task_dump_err(ret.funerr);
6780 SSL_shutdown(ssl);
6781 SSL_free(ssl);
6782 xcom_shut_close_socket(&fd.val);
6783
6784 goto end;
6785 /* purecov: end */
6786 }
6787 IFDBG(D_NONE, FN; STRLIT("ssl connected to "); STRLIT(server);
6788 NDBG(port, d); NDBG(fd.val, d); PTREXP(ssl));
6789
6790 if (ssl_verify_server_cert(ssl, server)) {
6791 /* purecov: begin inspected */
6792 G_MESSAGE("Error validating certificate and peer.");
6793 task_dump_err(ret.funerr);
6794 SSL_shutdown(ssl);
6795 SSL_free(ssl);
6796 xcom_shut_close_socket(&fd.val);
6797
6798 goto end;
6799 /* purecov: end */
6800 }
6801
6802 cd = new_connection(fd.val, ssl);
6803 set_connected(cd, CON_FD);
6804 G_DEBUG("Success connecting using SSL.")
6805
6806 goto end;
6807 } else {
6808 cd = new_connection(fd.val, 0);
6809 set_connected(cd, CON_FD);
6810
6811 goto end;
6812 }
6813 #else
6814 {
6815 cd = new_connection(fd.val);
6816 set_connected(cd, CON_FD);
6817
6818 goto end;
6819 }
6820 #endif
6821 }
6822
6823 end:
6824 if (from_ns) freeaddrinfo(from_ns);
6825 }
6826 return cd;
6827 }
6828
6829 connection_descriptor *xcom_open_client_connection(char const *server,
6830 xcom_port port) {
6831 #ifndef XCOM_WITHOUT_OPENSSL
6832 return connect_xcom(server, port, TRUE);
6833 #else
6834 return connect_xcom(server, port);
6835 #endif
6836 }
6837
6838 /* Send a protocol negotiation message on connection con */
6839 static int xcom_send_proto(connection_descriptor *con, xcom_proto x_proto,
6840 x_msg_type x_type, unsigned int tag) {
6841 char buf[MSG_HDR_SIZE];
6842 memset(buf, 0, MSG_HDR_SIZE);
6843
6844 if (con->fd >= 0) {
6845 con->snd_tag = tag;
6846 write_protoversion(VERS_PTR((unsigned char *)buf), x_proto);
6847 put_header_1_0((unsigned char *)buf, 0, x_type, tag);
6848 {
6849 int sent;
6850 sent = (int)socket_write(con, buf, MSG_HDR_SIZE);
6851 if (con->fd < 0) {
6852 return -1;
6853 }
6854 return sent;
6855 }
6856 } else {
6857 return -1;
6858 }
6859 }
6860
6861 static int xcom_recv_proto(connection_descriptor *rfd, xcom_proto *x_proto,
6862 x_msg_type *x_type, unsigned int *tag) {
6863 int n;
6864 unsigned char header_buf[MSG_HDR_SIZE];
6865 uint32_t msgsize;
6866
6867 /* Read length field, protocol version, and checksum */
6868 n = (int)socket_read_bytes(rfd, (char *)header_buf, MSG_HDR_SIZE);
6869
6870 if (n != MSG_HDR_SIZE) {
6871 IFDBG(D_NONE, FN; NDBG(n, d));
6872 return -1;
6873 }
6874
6875 *x_proto = read_protoversion(VERS_PTR(header_buf));
6876 get_header_1_0(header_buf, &msgsize, x_type, tag);
6877
6878 return n;
6879 }
6880
/* Tag sent with the client's protocol negotiation request; the reply is
 * expected to carry the same tag. */
enum {
  TAG_START = 313
};
6882
6883 /**
6884 * @brief Checks if a given app_data is from a given cargo_type.
6885 *
6886 * @param a the app_data
6887 * @param t the cargo type
6888 * @return int TRUE (1) if app_data a is from cargo_type t
6889 */
6890
6891 static inline int is_cargo_type(app_data_ptr a, cargo_type t) {
6892 return a ? (a->body.c_t == t) : 0;
6893 }
6894
6895 /**
6896 * @brief Retrieves the address that was used in the add_node request
6897 *
6898 * @param a app data containing the node to add
6899 * @param member address we used to present ourselves to other nodes
6900 * @return char* a pointer to the address being added.
6901 */
6902 static char *get_add_node_address(app_data_ptr a, unsigned int *member) {
6903 char *retval = NULL;
6904 if (!is_cargo_type(a, add_node_type)) return NULL;
6905
6906 if ((*member) < a->body.app_u_u.nodes.node_list_len) {
6907 retval = a->body.app_u_u.nodes.node_list_val[(*member)].address;
6908 (*member)++;
6909 }
6910
6911 return retval;
6912 }
6913
6914 int is_node_v4_reachable_with_info(struct addrinfo *retrieved_addr_info) {
6915 int v4_reachable = 0;
6916
6917 /* Verify if we are reachable either by V4 and by V6 with the provided
6918 address. */
6919 struct addrinfo *my_own_information_loop = NULL;
6920
6921 my_own_information_loop = retrieved_addr_info;
6922 while (!v4_reachable && my_own_information_loop) {
6923 if (my_own_information_loop->ai_family == AF_INET) {
6924 v4_reachable = 1;
6925 }
6926 my_own_information_loop = my_own_information_loop->ai_next;
6927 }
6928
6929 return v4_reachable;
6930 }
6931
/* Tell whether node_address resolves to at least one IPv4 address.
 *
 * Resolves the address with checked_getaddrinfo() and inspects the result
 * with is_node_v4_reachable_with_info(). Returns 0 when nothing could be
 * resolved. The resolved list is freed before returning. */
int is_node_v4_reachable(char *node_address) {
  struct addrinfo *resolved = NULL;
  int reachable;

  checked_getaddrinfo(node_address, NULL, NULL, &resolved);
  if (resolved == NULL) return 0;

  reachable = is_node_v4_reachable_with_info(resolved);
  freeaddrinfo(resolved);

  return reachable;
}
6950
6951 int are_we_allowed_to_upgrade_to_v6(app_data_ptr a) {
6952 /* This should the address we used to present ourselves to other nodes. */
6953 unsigned int list_member = 0;
6954 char *added_node = NULL;
6955
6956 int is_v4_reachable = 0;
6957 while ((added_node = get_add_node_address(a, &list_member)) != NULL) {
6958 xcom_port my_own_port;
6959 char my_own_address[IP_MAX_SIZE];
6960 int ip_and_port_error =
6961 get_ip_and_port(added_node, my_own_address, &my_own_port);
6962
6963 if (ip_and_port_error) {
6964 G_DEBUG("Error retrieving IP and Port information");
6965 return 0;
6966 }
6967
6968 /* Verify if we are reachable either by V4 and by V6 with the provided
6969 address.
6970 This means that the other side won't be able to contact us since we
6971 do not provide a public V4 address */
6972 if (!(is_v4_reachable = is_node_v4_reachable(my_own_address))) {
6973 G_ERROR(
6974 "Unable to add node to a group of older nodes. Please "
6975 "reconfigure "
6976 "you local address to an IPv4 address or configure your DNS to "
6977 "provide "
6978 "an IPv4 address");
6979 return 0;
6980 }
6981 }
6982
6983 return is_v4_reachable;
6984 }
6985
6986 int64_t xcom_send_client_app_data(connection_descriptor *fd, app_data_ptr a,
6987 int force) {
6988 pax_msg *msg = pax_msg_new(null_synode, 0);
6989 uint32_t buflen = 0;
6990 char *buf = 0;
6991 int64_t retval = 0;
6992 int serialized = 0;
6993
6994 if (!proto_done(fd)) {
6995 xcom_proto x_proto;
6996 x_msg_type x_type;
6997 unsigned int tag;
6998 retval = xcom_send_proto(fd, my_xcom_version, x_version_req, TAG_START);
6999 G_DEBUG("client sent negotiation request for protocol %d", my_xcom_version);
7000 if (retval < 0) goto end;
7001 retval = xcom_recv_proto(fd, &x_proto, &x_type, &tag);
7002 if (retval < 0) goto end;
7003 if (tag != TAG_START) {
7004 retval = -1;
7005 goto end;
7006 }
7007 if (x_type != x_version_reply) {
7008 retval = -1;
7009 goto end;
7010 }
7011
7012 if (x_proto == x_unknown_proto) {
7013 G_DEBUG("no common protocol, returning error");
7014 retval = -1;
7015 goto end;
7016 }
7017
7018 /* This code will check if, in case of an upgrade if:
7019 - We are a node able to speak IPv6.
7020 - If we are connecting to a group that does not speak IPv6.
7021 - If our address is IPv4-compatible in order for the old group to be able
7022 to contact us back. */
7023 if (is_cargo_type(a, add_node_type) && x_proto < minimum_ipv6_version() &&
7024 !are_we_allowed_to_upgrade_to_v6(a)) {
7025 retval = -1;
7026 goto end;
7027 }
7028
7029 G_DEBUG("client connection will use protocol version %d", x_proto);
7030 IFDBG(D_NONE, STRLIT("client connection will use protocol version ");
7031 NDBG(x_proto, u); STRLIT(xcom_proto_to_str(x_proto)));
7032 fd->x_proto = x_proto;
7033 set_connected(fd, CON_PROTO);
7034 }
7035 msg->a = a;
7036 msg->to = VOID_NODE_NO;
7037 msg->op = client_msg;
7038 msg->force_delivery = force;
7039
7040 serialized = serialize_msg(msg, fd->x_proto, &buflen, &buf);
7041 if (serialized) {
7042 retval = socket_write(fd, buf, buflen);
7043 if (buflen != retval) {
7044 IFDBG(D_NONE, FN; STRLIT("write failed "); NDBG(fd->fd, d);
7045 NDBG(buflen, d); NDBG64(retval));
7046 }
7047 } else {
7048 /* Failed to serialize, set retval accordingly. */
7049 retval = -1;
7050 }
7051 X_FREE(buf);
7052 end:
7053 msg->a = 0; /* Do not deallocate a */
7054 XCOM_XDR_FREE(xdr_pax_msg, msg);
7055 return retval;
7056 }
7057
7058 /* purecov: begin tested */
7059 /*
7060 * Tested by TEST_F(XComMultinodeSmokeTest,
7061 * 3_nodes_member_crashes_with_dieop_and_joins_again_immediately) GCS smoke test
7062 */
7063 int64_t xcom_client_send_die(connection_descriptor *fd) {
7064 uint32_t buflen = 0;
7065 char *buf = 0;
7066 int64_t retval = 0;
7067 app_data a;
7068 pax_msg *msg = pax_msg_new(null_synode, 0);
7069
7070 if (!proto_done(fd)) {
7071 xcom_proto x_proto;
7072 x_msg_type x_type;
7073 unsigned int tag;
7074 retval = xcom_send_proto(fd, my_xcom_version, x_version_req, TAG_START);
7075 G_DEBUG("client sent negotiation request for protocol %d", my_xcom_version);
7076 if (retval < 0) goto end;
7077 retval = xcom_recv_proto(fd, &x_proto, &x_type, &tag);
7078 if (retval < 0) goto end;
7079 if (tag != TAG_START) {
7080 retval = -1;
7081 goto end;
7082 }
7083 if (x_type != x_version_reply) {
7084 retval = -1;
7085 goto end;
7086 }
7087
7088 if (x_proto == x_unknown_proto) {
7089 G_DEBUG("no common protocol, returning error");
7090 retval = -1;
7091 goto end;
7092 }
7093 G_DEBUG("client connection will use protocol version %d", x_proto);
7094 IFDBG(D_NONE, STRLIT("client connection will use protocol version ");
7095 NDBG(x_proto, u); STRLIT(xcom_proto_to_str(x_proto)));
7096 fd->x_proto = x_proto;
7097 set_connected(fd, CON_PROTO);
7098 }
7099 init_app_data(&a);
7100 a.body.c_t = app_type;
7101 msg->a = &a;
7102 msg->op = die_op;
7103 /*
7104 Set the msgno to a value that ensures the die_op will be processed by
7105 XCom when it is received (it needs to be higher than the msgno of the
7106 executed_msg, otherwise XCom will simply ignore it).
7107 */
7108 msg->synode.msgno = UINT64_MAX;
7109
7110 serialize_msg(msg, fd->x_proto, &buflen, &buf);
7111 if (buflen) {
7112 retval = socket_write(fd, buf, buflen);
7113 if (buflen != retval) {
7114 IFDBG(D_NONE, FN; STRLIT("write failed "); NDBG(fd->fd, d);
7115 NDBG(buflen, d); NDBG64(retval));
7116 }
7117 X_FREE(buf);
7118 }
7119 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7120 end:
7121 msg->a = 0;
7122 XCOM_XDR_FREE(xdr_pax_msg, msg);
7123 return retval > 0 && retval == buflen ? 1 : 0;
7124 }
7125 /* purecov: end */
7126
7127 /* purecov: begin deadcode */
7128 int64_t xcom_client_send_data(uint32_t size, char *data,
7129 connection_descriptor *fd) {
7130 app_data a;
7131 int64_t retval = 0;
7132 init_app_data(&a);
7133 a.body.c_t = app_type;
7134 a.body.app_u_u.data.data_len = size;
7135 a.body.app_u_u.data.data_val = data;
7136 retval = xcom_send_client_app_data(fd, &a, 0);
7137 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7138 return retval;
7139 }
7140 /* purecov: end */
7141
7142 #ifndef _WIN32
7143 #include <arpa/inet.h>
7144 #include <netinet/in.h>
7145 #include <sys/socket.h>
7146 #endif
7147
7148 /* Output warning in log periodically if we receive messages
7149 with a protocol version that does not match our own */
7150 /* purecov: begin inspected */
7151 void warn_protoversion_mismatch(connection_descriptor *rfd) {
7152 struct sockaddr_storage sock_addr;
7153 socklen_t sock_size = sizeof(sock_addr);
7154
7155 if (task_now() - protoversion_warning_time > PROTOVERSION_WARNING_TIMEOUT) {
7156 if (0 ==
7157 xcom_getpeername(rfd->fd, (struct sockaddr *)&sock_addr, &sock_size)) {
7158 char buf[INET6_ADDRSTRLEN + 1];
7159 struct sockaddr_in *s4 = (struct sockaddr_in *)&sock_addr;
7160 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&sock_addr;
7161 char const *ok;
7162
7163 memset((void *)buf, 0, sizeof(buf));
7164 if (sock_addr.ss_family == AF_INET) {
7165 ok = inet_ntop(sock_addr.ss_family, (void *)&s4->sin_addr, buf,
7166 sizeof(buf));
7167 } else {
7168 ok = inet_ntop(sock_addr.ss_family, (void *)&s6->sin6_addr, buf,
7169 sizeof(buf));
7170 }
7171 if (ok) {
7172 G_WARNING(
7173 "Detected incorrect xcom protocol version in connection from %s "
7174 "indicates "
7175 "missing cleanup of, or incorrect, xcom group definition on remote "
7176 "host. Please upgrade the process running on %s to a compatible "
7177 "version or stop it.",
7178 buf, buf);
7179 protoversion_warning_time = task_now();
7180 }
7181 }
7182 }
7183 }
7184 /* purecov: end */
7185
7186 static pax_msg *socket_read_msg(connection_descriptor *rfd, pax_msg *p)
7187 /* Should buffer reads as well */
7188 {
7189 int64_t n;
7190 char *bytes;
7191 unsigned char header_buf[MSG_HDR_SIZE];
7192 xcom_proto x_version;
7193 uint32_t msgsize;
7194 x_msg_type x_type;
7195 unsigned int tag;
7196 int deserialize_ok = 0;
7197
7198 bytes = NULL;
7199
7200 /* Read version, length, type, and tag */
7201 n = socket_read_bytes(rfd, (char *)header_buf, MSG_HDR_SIZE);
7202
7203 if (n <= 0) {
7204 IFDBG(D_NONE, FN; NDBG64(n));
7205 return 0;
7206 }
7207 assert(n == MSG_HDR_SIZE);
7208 x_version = (xcom_proto)get_32(VERS_PTR(header_buf));
7209 /* Check the protocol version before doing anything else */
7210 #ifdef XCOM_PARANOID
7211 assert(check_protoversion(x_version, rfd->x_proto));
7212 #endif
7213 if (!check_protoversion(x_version, rfd->x_proto)) {
7214 /* purecov: begin inspected */
7215 warn_protoversion_mismatch(rfd);
7216 return 0;
7217 /* purecov: end */
7218 }
7219
7220 /* OK, we can grok this version */
7221
7222 get_header_1_0(header_buf, &msgsize, &x_type, &tag);
7223
7224 /* Allocate buffer space for message */
7225 bytes = (char *)calloc(1, msgsize);
7226
7227 /* Read message */
7228 n = socket_read_bytes(rfd, bytes, msgsize);
7229
7230 if (n > 0) {
7231 /* Deserialize message */
7232 deserialize_ok = deserialize_msg(p, rfd->x_proto, bytes, msgsize);
7233 IFDBG(D_NONE, FN; STRLIT(" deserialized message"));
7234 }
7235 /* Deallocate buffer */
7236 X_FREE(bytes);
7237 if (n <= 0 || deserialize_ok == 0) {
7238 IFDBG(D_NONE, FN; NDBG64(n));
7239 return 0;
7240 }
7241 return (p);
7242 }
7243
7244 int xcom_close_client_connection(connection_descriptor *connection) {
7245 int retval = 0;
7246
7247 #ifndef XCOM_WITHOUT_OPENSSL
7248 if (connection->ssl_fd) {
7249 SSL_shutdown(connection->ssl_fd);
7250 ssl_free_con(connection);
7251 }
7252 #endif
7253 retval = xcom_shut_close_socket(&connection->fd).val;
7254 free(connection);
7255 return retval;
7256 }
7257
7258 /* purecov: begin deadcode */
7259 int xcom_client_boot(connection_descriptor *fd, node_list *nl,
7260 uint32_t group_id) {
7261 app_data a;
7262 int retval = 0;
7263 retval = (int)xcom_send_client_app_data(
7264 fd, init_config_with_group(&a, nl, unified_boot_type, group_id), 0);
7265 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7266 return retval;
7267 }
7268 /* purecov: end */
7269
/** Outcome of sending a client request and waiting for its reply. */
typedef enum xcom_send_app_wait_result {
  SEND_REQUEST_FAILED = 0, /* the request could not be sent */
  RECEIVE_REQUEST_FAILED,  /* no reply could be read */
  REQUEST_BOTCHED,         /* the reply carried an unexpected reply code */
  RETRIES_EXCEEDED,        /* every attempt was answered with REQUEST_RETRY */
  REQUEST_OK_RECEIVED,     /* the reply was REQUEST_OK */
  REQUEST_FAIL_RECEIVED    /* the reply was REQUEST_FAIL */
} xcom_send_app_wait_result;
7279
7280 /**
7281 * Send a message and wait for response.
7282 *
7283 * The caller is reponsible for freeing p after calling this function,
7284 * i.e. xdr_free((xdrproc_t)xdr_pax_msg, (char *)p)
7285 */
7286 static xcom_send_app_wait_result xcom_send_app_wait_and_get(
7287 connection_descriptor *fd, app_data *a, int force, pax_msg *p) {
7288 int retval = 0;
7289 int retry_count = 10; /* Same as 'connection_attempts' */
7290 pax_msg *rp = 0;
7291
7292 do {
7293 retval = (int)xcom_send_client_app_data(fd, a, force);
7294 memset(p, 0, sizeof(*p)); /* before return so caller can free p */
7295 if (retval < 0) return SEND_REQUEST_FAILED;
7296 rp = socket_read_msg(fd, p);
7297 if (rp) {
7298 client_reply_code cli_err = rp->cli_err;
7299 switch (cli_err) {
7300 case REQUEST_OK:
7301 return REQUEST_OK_RECEIVED;
7302 case REQUEST_FAIL:
7303
7304 G_DEBUG("cli_err %d", cli_err);
7305 return REQUEST_FAIL_RECEIVED;
7306 case REQUEST_RETRY:
7307 G_DEBUG("cli_err %d", cli_err);
7308 if (retry_count > 1) xdr_free((xdrproc_t)xdr_pax_msg, (char *)p);
7309 xcom_sleep(1);
7310 break;
7311 default:
7312 G_WARNING("client protocol botched");
7313 return REQUEST_BOTCHED;
7314 }
7315 } else {
7316 G_WARNING("read failed");
7317 return RECEIVE_REQUEST_FAILED;
7318 }
7319 } while (--retry_count);
7320 /* Timeout after REQUEST_RETRY has been received 'retry_count' times */
7321 G_MESSAGE(
7322 "Request failed: maximum number of retries (10) has been exhausted.");
7323 return RETRIES_EXCEEDED;
7324 }
7325
7326 int xcom_send_app_wait(connection_descriptor *fd, app_data *a, int force) {
7327 pax_msg p;
7328 int result = 0;
7329 xcom_send_app_wait_result res = xcom_send_app_wait_and_get(fd, a, force, &p);
7330 switch (res) {
7331 case SEND_REQUEST_FAILED:
7332 case RECEIVE_REQUEST_FAILED:
7333 case REQUEST_BOTCHED:
7334 case RETRIES_EXCEEDED:
7335 case REQUEST_FAIL_RECEIVED:
7336 result = 0;
7337 break;
7338 case REQUEST_OK_RECEIVED:
7339 result = 1;
7340 break;
7341 }
7342 xdr_free((xdrproc_t)xdr_pax_msg, (char *)&p);
7343 return result;
7344 }
7345
7346 int xcom_send_cfg_wait(connection_descriptor *fd, node_list *nl,
7347 uint32_t group_id, cargo_type ct, int force) {
7348 app_data a;
7349 int retval = 0;
7350 IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(nl)););
7351 retval = xcom_send_app_wait(fd, init_config_with_group(&a, nl, ct, group_id),
7352 force);
7353 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7354 return retval;
7355 }
7356
7357 int xcom_client_add_node(connection_descriptor *fd, node_list *nl,
7358 uint32_t group_id) {
7359 u_int i;
7360 for (i = 0; i < nl->node_list_len; i++) {
7361 assert(nl->node_list_val[i].proto.max_proto > x_unknown_proto);
7362 }
7363 return xcom_send_cfg_wait(fd, nl, group_id, add_node_type, 0);
7364 }
7365
7366 int xcom_client_remove_node(connection_descriptor *fd, node_list *nl,
7367 uint32_t group_id) {
7368 return xcom_send_cfg_wait(fd, nl, group_id, remove_node_type, 0);
7369 }
7370
7371 /* purecov: begin deadcode */
7372 int xcom_client_get_event_horizon(connection_descriptor *fd, uint32_t group_id,
7373 xcom_event_horizon *event_horizon) {
7374 pax_msg p;
7375 app_data a;
7376 int result = 0;
7377
7378 xcom_send_app_wait_result res = xcom_send_app_wait_and_get(
7379 fd, init_get_event_horizon_msg(&a, group_id), 0, &p);
7380
7381 switch (res) {
7382 case RECEIVE_REQUEST_FAILED:
7383 case REQUEST_BOTCHED:
7384 case RETRIES_EXCEEDED:
7385 case SEND_REQUEST_FAILED:
7386 case REQUEST_FAIL_RECEIVED:
7387 result = 0;
7388 break;
7389 case REQUEST_OK_RECEIVED:
7390 *event_horizon = p.event_horizon;
7391 result = 1;
7392 break;
7393 }
7394
7395 xdr_free((xdrproc_t)xdr_pax_msg, (char *)&p);
7396 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7397
7398 return result;
7399 }
7400 /* purecov: end */
7401
7402 /* purecov: begin deadcode */
7403 int xcom_client_set_event_horizon(connection_descriptor *fd, uint32_t group_id,
7404 xcom_event_horizon event_horizon) {
7405 app_data a;
7406 int retval = 0;
7407 retval = xcom_send_app_wait(
7408 fd, init_set_event_horizon_msg(&a, group_id, event_horizon), 0);
7409 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7410 return retval;
7411 }
7412 /* purecov: end */
7413
7414 int xcom_client_get_synode_app_data(connection_descriptor *const fd,
7415 uint32_t group_id,
7416 synode_no_array *const synodes,
7417 synode_app_data_array *const reply) {
7418 bool_t const success = TRUE;
7419 bool_t const failure = FALSE;
7420 bool_t result = failure;
7421 pax_msg p;
7422 app_data a;
7423 u_int const nr_synodes_requested = synodes->synode_no_array_len;
7424
7425 /* This call moves, as in C++ move semantics, synodes into app_data a. */
7426 init_get_synode_app_data_msg(&a, group_id, synodes);
7427
7428 {
7429 xcom_send_app_wait_result res = xcom_send_app_wait_and_get(fd, &a, 0, &p);
7430 switch (res) {
7431 case RECEIVE_REQUEST_FAILED:
7432 case REQUEST_BOTCHED:
7433 case RETRIES_EXCEEDED:
7434 case SEND_REQUEST_FAILED:
7435 case REQUEST_FAIL_RECEIVED: {
7436 G_TRACE(
7437 "xcom_client_get_synode_app_data: XCom did not have the required "
7438 "%u "
7439 "synodes.",
7440 nr_synodes_requested);
7441 break;
7442 }
7443 case REQUEST_OK_RECEIVED: {
7444 u_int const nr_synodes_received =
7445 p.requested_synode_app_data.synode_app_data_array_len;
7446 G_TRACE(
7447 "xcom_client_get_synode_app_data: Got %u synode payloads, we asked "
7448 "for %u.",
7449 nr_synodes_received, nr_synodes_requested);
7450
7451 /* This should always be TRUE.
7452 * But rather than asserting it, let's treat an unexpected number of
7453 * synode payloads in the reply as a failure. */
7454 if (nr_synodes_received == nr_synodes_requested) {
7455 /* Move (as in C++ move semantics) into reply */
7456 synode_app_data_array_move(reply, &p.requested_synode_app_data);
7457 result = success;
7458 }
7459 break;
7460 }
7461 }
7462 }
7463
7464 xdr_free((xdrproc_t)xdr_pax_msg, (char *)&p);
7465 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7466
7467 return result;
7468 }
7469
7470 #ifdef NOTDEF
/* Not completely implemented; these need to be handled properly
   when received as a client message in dispatch_op.
   They should have an opcode separate from the normal add/remove,
   like force_config_type. */
7475 int xcom_client_force_add_node(connection_descriptor *, node_list *nl,
7476 uint32_t group_id) {
7477 return xcom_send_cfg_wait(fd, nl, group_id, add_node_type, 1);
7478 }
7479
7480 int xcom_client_force_remove_node(connection_descriptor *, node_list *nl,
7481 uint32_t group_id) {
7482 return xcom_send_cfg_wait(fd, nl, group_id, remove_node_type, 1);
7483 }
7484 #endif
7485
7486 int xcom_client_force_config(connection_descriptor *fd, node_list *nl,
7487 uint32_t group_id) {
7488 return xcom_send_cfg_wait(fd, nl, group_id, force_config_type, 1);
7489 }
7490
7491 /* purecov: begin deadcode */
7492 int xcom_client_enable_arbitrator(connection_descriptor *fd) {
7493 app_data a;
7494 int retval = 0;
7495 init_app_data(&a);
7496 a.body.c_t = enable_arbitrator;
7497 retval = xcom_send_app_wait(fd, &a, 0);
7498 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7499 return retval;
7500 }
7501 /* purecov: end */
7502
7503 /* purecov: begin deadcode */
7504 int xcom_client_disable_arbitrator(connection_descriptor *fd) {
7505 app_data a;
7506 int retval = 0;
7507 init_app_data(&a);
7508 a.body.c_t = disable_arbitrator;
7509 retval = xcom_send_app_wait(fd, &a, 0);
7510 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7511 return retval;
7512 }
7513 /* purecov: end */
7514
7515 /* purecov: begin deadcode */
7516 int xcom_client_terminate_and_exit(connection_descriptor *fd) {
7517 app_data a;
7518 int retval = 0;
7519 init_app_data(&a);
7520 a.body.c_t = x_terminate_and_exit;
7521 retval = xcom_send_app_wait(fd, &a, 0);
7522 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7523 return retval;
7524 }
7525 /* purecov: end */
7526
7527 /* purecov: begin deadcode */
7528 int xcom_client_set_cache_limit(connection_descriptor *fd,
7529 uint64_t cache_limit) {
7530 app_data a;
7531 int retval = 0;
7532 init_app_data(&a);
7533 a.body.c_t = set_cache_limit;
7534 a.body.app_u_u.cache_limit = cache_limit;
7535 retval = xcom_send_app_wait(fd, &a, 0);
7536 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7537 return retval;
7538 }
7539 /* purecov: end */
7540
7541 int xcom_client_convert_into_local_server(connection_descriptor *const fd) {
7542 app_data a;
7543 int retval = 0;
7544 retval = xcom_send_app_wait(fd, init_convert_into_local_server_msg(&a), 0);
7545 xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7546 return retval;
7547 }
7548