1 /* Copyright (c) 2012, 2020, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 #include <assert.h>
24 #include <errno.h>
25 #ifndef __STDC_FORMAT_MACROS
26 #define __STDC_FORMAT_MACROS
27 #endif
28 #ifndef _WIN32
29 #include <inttypes.h>
30 #endif
31 #include <limits.h>
32 #include <signal.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <sys/time.h>
37 #include <sys/types.h>
38 
39 #ifndef _WIN32
40 #include <poll.h>
41 #endif
42 
43 #ifdef _WIN32
44 #define xcom_buf char
45 #else
46 #define xcom_buf void
47 #endif
48 
49 /**
50   @file
51   xcom/xcom_base.c
52     The new version of xcom is a major rewrite to allow
53     transmission of multiple messages from several sources
54     simultaneously without collision. The interface to xcom is largely
55     intact, one notable change is that xcom will consider the message
56     delivered as soon as it has got a majority. Consequently, the VP
57     set will not necessarily show all nodes which will actually
58     receive the message.
59 
60     OHKFIX Add wait for complete last known node set to mimic the old
61     semantics.
62 
63 
64     IMPORTANT: What xcom does and what it does not do:
65 
66     xcom messages are received in the same order on all nodes.
67 
68     xcom guarantees that if a message is delivered to one node, it will
69     eventually be seen on all other nodes as well.
70 
71     xcom messages are available to a crashed node when it comes up
72     again if at least one node which knows the value of the message
73     has not crashed. The size of the message cache is configurable.
74 
75     OHKFIX Add logging to disk to make messages durable across system
76     crash and to increase the number of messages which may be cached.
77 
78     There is no guarantee whatsoever about the order of messages from
79     different nodes, not even the order of multiple messages from the
80     same node. It is up to the client to impose such an order by
81     waiting on a message before it sends the next.
82 
83     xcom can notify the client that a message has timed out, and in
84     that case will try to cancel the message, but it cannot guarantee
85     that a message which has timed out will not be delivered.
86 
87     xcom attaches a node set to each message as it is delivered to the
88     client. This node set reflects the current node set that xcom
89     believes is active, it does not mean that the message has been
90     delivered yet to all nodes in the set. Neither does it mean that
91     the message has not been delivered to the nodes not in the set.
92 
93     A cache of Paxos state machines is central to the new design. The
94     purpose of the cache is both to store a window of messages, and to
95     decouple the different parts of xcom, like message proposal,
96     message delivery and execution, and recovery.  The old cache was
97     limited to caching messages, and a single state machine ran the
98     combined VP and Paxos algorithm. This constrained xcom to deliver
99     only a single message at a time.
100 
101     Each instance of the Paxos state machine implements the basic
102     Paxos protocol.  Unlike the cache in the old system, it is not
103     cleared when a site is deleted.  This removes some problems
104     related to message delivery during site deletion.  The cache is a
105     classic fixed size LRU with a hash index.
106 
    Some extensions to the basic Paxos algorithm have been implemented:
108 
109     A node has ownership to all synodes with its own node number. Only
110     a node with node number N can propose a value for synode {X N},
111     where X is the sequence number, and N is the node number. Other
112     nodes can only propose the special value no_op for synode {X N}.
113     The reason for this is to retain the leaderless Paxos algorithm,
114     but to avoid collisions between nodes which are competing for the
115     same synode number. With this scheme, each node has its own unique
116     number series during normal operation. The scheme has the
117     following implications:
118 
119     1. If a node N has not already proposed a value for the synode {X N},
120     it may at any time send a LEARN message to the other nodes with
121     the reserved value no_op, without going through phase 1 and 2 of
122     Paxos. This is because the other nodes are constrained to propose
123     no_op for this synode, so the final outcome will always be no_op.
124     To avoid unnecessary message transmission, a node will try to
125     broadcast the no_op LEARN messages by piggybacking the information
126     on the messages of the basic Paxos protocol.
127 
128     2. Other nodes which want to find the value of synode {X N} may do
129     so by trying to get the value no_op accepted by following the
130     basic Paxos algorithm. The result will be the actual value
131     proposed by node N if it has done so, otherwise no_op. This will
132     typically only be necessary when a node is down, and the other
133     nodes need to find the values from the missing node in order to be
134     able to continue execution.
135 
136     Messages are delivered in order to the client, and the order is
137     determined by the sequence number and the node number, with the
138     sequence number as the most significant part.
139 
140     The xcom network interface has been redesigned and is now
141     implemented directly on top of TCP, and has so far been completely
142     trouble free. We use poll() or select() to implement non-blocking
143     send and receive, but libev could equally well have been used.
144 
145     Multicast is implemented on top of unicast as before, but the
146     implementation is prepared to use real multicast with relatively
147     minor changes.
148 
149     The roles of proposer, acceptor/learner, and executor are now
150     directly mapped to unique task types which interact with the Paxos
151     state machines, whereas the previous implementation folded all the
152     roles into a single event driven state machine.
153 
154     The following terminology will be used:
155 
156     A node is an instance of the xcom thread. There is only one instance
157     of the xcom thread in the agent.
158     A client is the application which is using xcom to send messages.
159     A thread is a real OS thread.
160     A task is a logical process. It is implemented by coroutines and
161     an explicit stack.
162 
163     The implementation of tasks and non-blocking socket operations is
164     isolated in task.h and task.c.
165 
166     A node will open a tcp connection to each of the other nodes. This
167     connection is used for all communication initiated by the node,
    and replies to messages will arrive on the connection on which the
    messages were sent.
170 
171     static int tcp_server(task_arg);
172 
173     The tcp_server listens on the xcom port and starts an
174     acceptor_learner_task whenever a new connection is detected.
175 
176     static int tcp_reaper_task(task_arg);
177 
    Closes tcp connections which have been unused for too long.
179 
180     static int sender_task(task_arg);
181 
    The sender_task waits for tcp messages on its input queue and
    sends them on the tcp socket. If the socket is closed for any
184     reason, the sender_task will reconnect the socket. There is one
185     sender_task for each socket. The sender task exists mainly to
186     simplify the logic in the other tasks, but it could have been
187     replaced with a coroutine which handles the connection logic after
188     having reserved the socket for its client task.
189 
190     static int generator_task(task_arg);
191 
192     The generator_task reads messages from the client queue and moves
193     them into the input queue of the proposer_task.
194 
195     OHKFIX Use a tcp socket instead of the client queue. We can then
196     remove the generator_task and let the acceptor_learner_task do the
197     dispatching.
198 
199     static int proposer_task(task_arg);
200 
201     Assign a message number to an outgoing message and try to get it
202     accepted. There may be several proposer tasks on each node
203     working in parallel. If there are multiple proposer tasks, xcom can
204     not guarantee that the messages will be sent in the same order as
205     received from the client.
206 
207     static int acceptor_learner_task(task_arg);
208 
209     This is the server part of the xcom thread. There is one
    acceptor_learner_task for each node in the system. The
    acceptor_learner_task reads messages from the socket, finds the correct
212     Paxos state machine, and dispatches to the correct message handler
213     with the state machine and message as arguments.
214 
215     static int reply_handler_task(task_arg);
216 
217     The reply_handler_task does the same job as the
218     acceptor_learner_task, but listens on the socket which the node
219     uses to send messages, so it will handle only replies on that
220     socket.
221 
222     static int executor_task(task_arg);
223 
    The executor_task waits for a Paxos message to be accepted. When
225     the message is accepted, it is delivered to the client,
226     unless it is a no-op. In either case, the executor_task steps to
227     the next message and repeats the wait. If it times out waiting for
228     a message, it will try to get a no-op accepted.
229 
230     static int alive_task(task_arg);
231 
232     Sends i-am-alive to other nodes if there has been no normal traffic
233     for a while. It also pings nodes which seem to be inactive.
234 
235     static int detector_task(task_arg);
236 
237     The detector_task periodically scans the set of connections from
238     other nodes and sees if there has been any activity. If there has
239     been no activity for some time, it will assume that the node is
240     dead, and send a view message to the client.
241 
242 
243     Reconfiguration:
244 
245     The xcom reconfiguration process is essentially the one described in
246     "Reconfiguring a State Machine" by Lamport et al. as the R-alpha
247     algorithm.
248     We execute the reconfiguration command immediately, but the config is
249     only valid after a delay of alpha messages.
250     The parameter alpha is the same as
251     EVENT_HORIZON in this implementation. :/static.*too_far
252     All tcp messages from beyond the event horizon will be ignored.
253 
254 */
255 #include "xcom/xcom_profile.h"
256 
257 #ifndef XCOM_STANDALONE
258 #include "my_compiler.h"
259 #endif
260 #include "xcom/x_platform.h"
261 
262 #ifndef _WIN32
263 #include <arpa/inet.h>
264 #include <net/if.h>
265 #include <sys/ioctl.h>
266 #include <sys/socket.h>
267 #ifndef __linux__
268 #include <sys/sockio.h>
269 #endif
270 #endif
271 
272 #if defined(_WIN32)
273 #include <windows.h>
274 #endif
275 
276 #include "xcom/app_data.h"
277 #include "xcom/get_synode_app_data.h"
278 #include "xcom/node_no.h"
279 #include "xcom/server_struct.h"
280 #include "xcom/simset.h"
281 #include "xcom/site_struct.h"
282 #include "xcom/task.h"
283 #include "xcom/task_net.h"
284 #include "xcom/task_os.h"
285 #include "xcom/xcom_base.h"
286 #include "xcom/xcom_common.h"
287 #include "xcom/xcom_detector.h"
288 #include "xcom/xcom_transport.h"
289 #include "xcom/xdr_utils.h"
290 #include "xdr_gen/xcom_vp.h"
291 
292 #ifndef XCOM_WITHOUT_OPENSSL
293 #include "xcom/xcom_ssl_transport.h"
294 #endif
295 
296 #include "xcom/bitset.h"
297 #include "xcom/node_list.h"
298 #include "xcom/node_set.h"
299 #include "xcom/pax_msg.h"
300 #include "xcom/site_def.h"
301 #include "xcom/sock_probe.h"
302 #include "xcom/synode_no.h"
303 #include "xcom/task_debug.h"
304 #include "xcom/task_net.h"
305 #include "xcom/xcom_cache.h"
306 #include "xcom/xcom_cfg.h"
307 #include "xcom/xcom_interface.h"
308 #include "xcom/xcom_memory.h"
309 #include "xcom/xcom_msg_queue.h"
310 #include "xcom/xcom_recover.h"
311 #include "xcom/xcom_statistics.h"
312 #include "xcom/xcom_vp_str.h"
313 
314 #ifndef XCOM_WITHOUT_OPENSSL
315 #ifdef _WIN32
316 /* In OpenSSL before 1.1.0, we need this first. */
317 #include <winsock2.h>
318 #endif /* _WIN32 */
319 
320 #include <openssl/ssl.h>
321 
322 #endif
323 
324 /* Defines and constants */
325 
326 #define SYS_STRERROR_SIZE 512
327 
328 /* Avoid printing the warning of protocol version mismatch too often */
329 #define PROTOVERSION_WARNING_TIMEOUT 600.0 /** Every 10 minutes */
330 static double protoversion_warning_time =
331     0.0; /** Timestamp of previous protoversion warning */
332 
333 /* Skip prepare for first ballot */
334 #ifdef ALWAYS_THREEPHASE
335 int const threephase = 1;
336 #else
337 int const threephase = 0;
338 #endif
339 
340 #include "xcom/retry.h"
341 
342 #ifdef NODE_0_IS_ARBITRATOR
343 int ARBITRATOR_HACK = 1;
344 #else
345 int ARBITRATOR_HACK = 0;
346 #endif
347 
348 static int const no_duplicate_payload = 1;
349 
350 /* Use buffered read when reading messages from the network */
351 static int use_buffered_read = 1;
352 
353 /* Used to handle OOM errors */
354 static unsigned short oom_abort = 0;
355 
356 /* Forward declarations */
357 long xcom_unique_long(void);
358 
359 static double wakeup_delay(double old);
360 static void note_snapshot(node_no node);
361 
362 /* Task types */
363 static int proposer_task(task_arg arg);
364 static int executor_task(task_arg arg);
365 static int sweeper_task(task_arg arg);
366 extern int alive_task(task_arg arg);
367 extern int cache_manager_task(task_arg arg);
368 extern int detector_task(task_arg arg);
369 
370 static int finished(pax_machine *p);
371 static int accepted(pax_machine *p);
372 static int started(pax_machine *p);
373 static synode_no first_free_synode(synode_no msgno);
374 static void free_forced_config_site_def();
375 static void activate_sweeper();
376 static void force_pax_machine(pax_machine *p, int enforcer);
377 static void handle_need_snapshot(linkage *reply_queue, pax_msg *pm);
378 static void handle_skip(site_def const *site, pax_machine *p, pax_msg *m);
379 
380 extern void bit_set_or(bit_set *x, bit_set const *y);
381 
382 /* Global variables */
383 
384 int xcom_shutdown = 0;  /* Xcom_Shutdown flag */
385 synode_no executed_msg; /* The message we are waiting to execute */
386 synode_no max_synode;   /* Max message number seen so far */
387 task_env *boot = NULL;
388 task_env *detector = NULL;
389 task_env *killer = NULL;
390 task_env *net_boot = NULL;
391 task_env *net_recover = NULL;
392 void *xcom_thread_input = 0;
393 
394 long xcom_debug_mask =
395     /* D_DETECT | */ D_FSM /* | D_FILEOP | D_CONS | D_BASE */ | D_TRANSPORT;
396 long xcom_dbg_stack[DBG_STACK_SIZE];
397 int xcom_dbg_stack_top = 0;
398 
399 static void init_proposers();
400 void initialize_lsn(uint64_t n);
401 
init_base_vars()402 void init_base_vars() {
403   xcom_shutdown = 0;          /* Xcom_Shutdown flag */
404   executed_msg = null_synode; /* The message we are waiting to execute */
405   max_synode = null_synode;   /* Max message number seen so far */
406   boot = NULL;
407   detector = NULL;
408   killer = NULL;
409   net_boot = NULL;
410   net_recover = NULL;
411   xcom_thread_input = 0;
412 }
413 
/* Task handles private to this file. init_tasks() resets executor, retry,
   sweeper, alive_t and cache_task through set_task(..., NULL) and calls
   init_proposers() for the proposer array. */
static task_env *executor = NULL;
static task_env *sweeper = NULL;
static task_env *retry = NULL;
static task_env *proposer[PROPOSERS]; /* One slot per proposer task */
static task_env *alive_t = NULL;
static task_env *cache_task = NULL;
420 
/* Unique id of this instance; assigned from new_id() in init_xcom_base() */
static uint32_t my_id = 0;

/** Return the unique id of this xcom instance. */
uint32_t get_my_xcom_id() {
  return my_id;
}
423 static synode_no current_message; /* Current message number */
424 static synode_no
425     last_config_modification_id; /*Last configuration change proposal*/
426 static uint64_t lsn = 0;         /* Current log sequence number */
427 
get_current_message()428 synode_no get_current_message() { return current_message; }
429 
430 static channel prop_input_queue; /* Proposer task input queue */
431 
432 extern int client_boot_done;
433 extern int netboot_ok;
434 
435 static linkage exec_wait = {
436     0, &exec_wait, &exec_wait}; /* Executor will wake up tasks sleeping here */
437 
438 linkage detector_wait = {0, &detector_wait,
439                          &detector_wait}; /* Detector sleeps here */
440 
/* Ids recorded by bury_site() and looked up by is_dead_site(). The array is
   used as a ring: n is the next slot to be written and wraps at MAX_DEAD.
   bury_site() never stores 0, so a 0 entry is a slot that has not been
   written yet. */
static struct {
  int n;
  unsigned long id[MAX_DEAD];
} dead_sites;
445 
get_max_synode()446 synode_no get_max_synode() { return max_synode; }
447 
is_latest_config(site_def const * const config)448 static bool_t is_latest_config(site_def const *const config) {
449   site_def const *const latest_config = get_site_def();
450   assert(latest_config != NULL);
451   return config == latest_config;
452 }
453 
454 /**
455  * Get the first pending configuration that reconfigures the event horizon.
456  *
457  * Retrieve the first pending site_def, i.e. with the smallest start synod that
458  * is greater than executed_msg, that reconfigures the event horizon.
459  */
first_event_horizon_reconfig()460 static site_def const *first_event_horizon_reconfig() {
461   site_def const *active_config = find_site_def(executed_msg);
462   xcom_event_horizon active_event_horizon = active_config->event_horizon;
463   site_def const *first_event_horizon_reconfig = NULL;
464   site_def const *next_config = NULL;
465   for (next_config = find_next_site_def(active_config->start);
466        next_config != NULL && first_event_horizon_reconfig == NULL;
467        next_config = find_next_site_def(next_config->start)) {
468     if (active_event_horizon != next_config->event_horizon) {
469       first_event_horizon_reconfig = next_config;
470     }
471   }
472   return first_event_horizon_reconfig;
473 }
474 
475 /**
476  * Get the latest pending configuration that reconfigures the event horizon.
477  *
478  * Retrieve the last pending site_def, i.e. with the greatest start synod that
479  * is greater than executed_msg, that reconfigures the event horizon.
480  */
latest_event_horizon_reconfig()481 static site_def const *latest_event_horizon_reconfig() {
482   site_def const *active_config = find_site_def(executed_msg);
483   xcom_event_horizon previous_event_horizon = active_config->event_horizon;
484   site_def const *last_event_horizon_reconfig = NULL;
485   site_def const *next_config = NULL;
486   for (next_config = find_next_site_def(active_config->start);
487        next_config != NULL;
488        next_config = find_next_site_def(next_config->start)) {
489     if (previous_event_horizon != next_config->event_horizon) {
490       previous_event_horizon = next_config->event_horizon;
491       last_event_horizon_reconfig = next_config;
492     }
493   }
494   return last_event_horizon_reconfig;
495 }
496 
497 /**
498  * Add the event horizon to the given base synod s.
499  *
500  * We are assuming right now that this function is used solely in the context of
501  * "we have received a reconfiguration command at synod s, when should it be
502  * scheduled to take effect?"
503  * The result of this function is *when* it should take effect.
504  *
505  * Common case: there are no configurations pending, or if there are, none of
506  * them reconfigure the event horizon. The common case result is:
507  *
508  *   s + event_horizon(active_config) + 1
509  *
510  *
511  * If an event horizon reconfiguration R is pending, it means that the command C
512  * proposed for synod s is concurrent with R, i.e., s falls in the interval
513  * ]proposed(R), start(R)[.
514  *
515  * In this situation we apply the command C proposed for synod s *after* taking
516  * into account R's event horizon.
517  *
518  * This means that the result is:
519  *
520  *   start(R) + event_horizon(R) + 1
521  */
522 /* purecov: begin deadcode */
/* Advance the message number of s by EVENT_HORIZON_MIN + 1. Only called by
   add_event_horizon() when PERMISSIVE_EH_ACTIVE_CONFIG is defined. */
static synode_no add_default_event_horizon(synode_no s) {
  synode_no shifted = s;
  shifted.msgno += EVENT_HORIZON_MIN + 1;
  return shifted;
}
527 /* purecov: end */
528 
/* Compute the synod at which a command proposed for synod s takes effect.
   With no pending event horizon reconfiguration (or when the active config
   is the latest one) that is s + event_horizon(active) + 1; otherwise it is
   start(R) + event_horizon(R) + 1 for the latest pending reconfiguration R. */
static synode_no add_event_horizon(synode_no s) {
  site_def const *active_config = find_site_def(executed_msg);
  site_def const *pending_config = NULL;

  if (!active_config) {
    /* This is initial boot or recovery, we have no config */
#ifdef PERMISSIVE_EH_ACTIVE_CONFIG
    return add_default_event_horizon(s);
#else
    /* We should always have an active config */
    /* purecov: begin deadcode */
    assert(active_config != NULL);
    return null_synode;
    /* purecov: end */
#endif
  }

  pending_config = latest_event_horizon_reconfig();
  if (is_latest_config(active_config) || pending_config == NULL) {
    /* Common case: nothing pending changes the event horizon */
    s.msgno = s.msgno + active_config->event_horizon + 1;
  } else {
    /* Apply the command after the pending reconfiguration's horizon */
    s.msgno = pending_config->start.msgno + pending_config->event_horizon + 1;
  }
  return s;
}
552 
553 /**
554    Set node group
555 */
set_group(uint32_t id)556 void set_group(uint32_t id) {
557   IFDBG(D_NONE, FN; STRLIT("changing group id of global variables ");
558         NDBG((unsigned long)id, lu););
559   /*	set_group_id(id); */
560   current_message.group_id = id;
561   executed_msg.group_id = id;
562   max_synode.group_id = id;
563 }
564 
bury_site(uint32_t id)565 static void bury_site(uint32_t id) {
566   if (id != 0) {
567     dead_sites.id[dead_sites.n % MAX_DEAD] = id;
568     dead_sites.n = (dead_sites.n + 1) % MAX_DEAD;
569   }
570 }
571 
is_dead_site(uint32_t id)572 static bool_t is_dead_site(uint32_t id) {
573   int i = 0;
574   for (i = 0; i < MAX_DEAD; i++) {
575     if (dead_sites.id[i] == id)
576       return TRUE;
577     else if (dead_sites.id[i] == 0)
578       return FALSE;
579   }
580   return FALSE;
581 }
582 
583 extern node_set *init_node_set(node_set *set, u_int n);
584 extern node_set *alloc_node_set(node_set *set, u_int n);
585 
586 #if 0
587 /* Find our previous message number. */
588 static synode_no decr_msgno(synode_no msgno)
589 {
590 	synode_no ret = msgno;
591 	ret.msgno--;
592 	ret.node = get_nodeno(find_site_def(ret)); /* In case site and node number has changed */
593 	return ret;
594 }
595 #endif
596 
597 /* Find our next message number. */
incr_msgno(synode_no msgno)598 static synode_no incr_msgno(synode_no msgno) {
599   synode_no ret = msgno;
600   ret.msgno++;
601   ret.node = get_nodeno(
602       find_site_def(ret)); /* In case site and node number has changed */
603   return ret;
604 }
605 
/* Step to the synode that follows synode: next node number within the same
   msgno, or node 0 of the next msgno once the node number reaches
   get_maxnodes() of the site found for synode. */
synode_no incr_synode(synode_no synode) {
  node_no const limit = get_maxnodes(find_site_def(synode));
  synode_no next = synode;

  next.node++;
  if (next.node >= limit) {
    next.msgno++;
    next.node = 0;
  }
  return next; /* Change this if we change message number type */
}
616 
/* Step to the synode that precedes synode: previous node number within the
   same msgno, or the last node (get_maxnodes() - 1) of the previous msgno
   when synode is at node 0. */
synode_no decr_synode(synode_no synode) {
  synode_no prev = synode;

  if (prev.node != 0) {
    prev.node--;
    return prev;
  }

  /* Wrap to the previous msgno; the site lookup uses the decremented msgno */
  prev.msgno--;
  prev.node = get_maxnodes(find_site_def(prev));
  prev.node--;
  return prev; /* Change this if we change message number type */
}
626 
/* Turn p into a learn message carrying the no_op value. */
static void skip_value(pax_msg *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode));
  p->msg_type = no_op;
  p->op = learn_op;
}
632 
633 /* Utilities and debug */
634 
635 #ifndef _WIN32
636 /* Ignore this signal */
ignoresig(int signum)637 static int ignoresig(int signum) {
638   struct sigaction act;
639   struct sigaction oldact;
640 
641   memset(&act, 0, sizeof(act));
642   act.sa_handler = SIG_IGN;
643   memset(&oldact, 0, sizeof(oldact));
644 
645   return sigaction(signum, &act, &oldact);
646 }
647 #else
648 #define SIGPIPE 0
/* _WIN32 variant: nothing is done with the signal; always reports success. */
static int ignoresig(int signum) {
  (void)signum; /* Unused in this variant */
  return 0;
}
650 #endif
651 
/* Tell whether p was modified recently: its last_modified stamp is set
   (non-zero) and lies less than BUILD_TIMEOUT + median_time() before
   task_now(). */
static int recently_active(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->learner.msg ? pax_op_to_str(p->learner.msg->op) : "NULL");
        NDBG(p->last_modified, f); NDBG(task_now(), f));
  if (p->last_modified == 0.0) {
    return 0; /* Never modified */
  }
  return (p->last_modified + BUILD_TIMEOUT + median_time()) > task_now();
}
659 
/* A pax_machine is finished when its learner holds a message whose op is
   learn_op or tiny_learn_op. */
static inline int finished(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->learner.msg ? pax_op_to_str(p->learner.msg->op) : "NULL"););
  if (!p->learner.msg) {
    return 0;
  }
  return p->learner.msg->op == learn_op ||
         p->learner.msg->op == tiny_learn_op;
}
666 
/* Externally visible wrapper around the file-local finished(). */
int pm_finished(pax_machine *p) {
  return finished(p);
}
668 
/* A pax_machine has accepted a value when its acceptor holds a message
   whose op is anything but initial_op. */
static inline int accepted(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->acceptor.msg ? pax_op_to_str(p->acceptor.msg->op) : "NULL"););
  if (!p->acceptor.msg) {
    return 0;
  }
  return p->acceptor.msg->op != initial_op;
}
674 
/* True if p has accepted a value and that value is of type no_op. */
static inline int accepted_noop(pax_machine *p) {
  IFDBG(D_NONE, FN; SYCEXP(p->synode); STRLIT(" op "); PTREXP(p);
        STRLIT(p->acceptor.msg ? pax_op_to_str(p->acceptor.msg->op) : "NULL"););
  if (!accepted(p)) {
    return 0;
  }
  return p->acceptor.msg->msg_type == no_op;
}
680 
/* True if pm is a no_op message and p has already accepted a no_op. */
static inline int noop_match(pax_machine *p, pax_msg *pm) {
  if (pm->msg_type != no_op) {
    return 0;
  }
  return accepted_noop(p);
}
684 
/* True if anything has happened to p: its op has left initial_op, a promise
   has been given, the proposer holds a message past initial_op, or a value
   has been accepted or learned. */
static inline int started(pax_machine *p) {
  if (p->op != initial_op) return 1;
  if (p->acceptor.promise.cnt > 0) return 1;
  if (p->proposer.msg && (p->proposer.msg->op != initial_op)) return 1;
  return accepted(p) || finished(p);
}
690 
/* Remember received_config_change as the last configuration change
   proposal (last_config_modification_id). */
void set_last_received_config(synode_no received_config_change) {
  last_config_modification_id = received_config_change;
}
694 
695 /* Definition of majority */
max_check(site_def const * site)696 static inline node_no max_check(site_def const *site) {
697 #ifdef MAXACCEPT
698   return MIN(get_maxnodes(site), MAXACCEPT);
699 #else
700   return get_maxnodes(site);
701 #endif
702 }
703 
704 static site_def *forced_config = 0;
/* Return the enforcer flag of p. */
static int is_forcing_node(pax_machine const *p) {
  return p->enforcer;
}
706 static int wait_forced_config = 0;
707 
/* Definition of majority

   Counts the bits set in nodeset among the first max_check(s) nodes and
   decides whether that count is sufficient.

   nodeset  bit set of the nodes that have answered
   s        site definition that supplies the number of nodes to check
   all      if non-zero, every checked node must have answered
   delay    only used when WAIT_FOR_ALL_FIRST is defined: if non-zero, wait
            for every node for which may_be_dead() is false
   force    if non-zero, the count must equal get_maxnodes(forced_config)

   Returns non-zero if the required number of nodes has answered. */
static inline int majority(bit_set const *nodeset, site_def const *s, int all,
                           int delay MY_ATTRIBUTE((unused)), int force) {
  node_no ok = 0;
  node_no i = 0;
  int retval = 0;
#ifdef WAIT_FOR_ALL_FIRST
  double sec = task_now();
#endif
  node_no max = max_check(s);

  /* IFDBG(D_NONE, FN; NDBG(max,lu); NDBG(all,d); NDBG(delay,d); NDBG(force,d));
   */

  /* Count nodes that have answered */
  for (i = 0; i < max; i++) {
    if (BIT_ISSET(i, nodeset)) {
      ok++;
    }
#ifdef WAIT_FOR_ALL_FIRST
    else {
      if (all) return 0; /* Delay until all nodes have answered */
      if (delay && !may_be_dead(s->detected, i, sec)) {
        return 0; /* Delay until all live nodes have answered */
      }
    }
#endif
  }

  /* If we are forcing messages, attempt to ensure consistency by
     requiring all remaining nodes to agree. Forced_config points to
     the config that should be used as acceptors in this
     case. Another possibility is to use the original config and
     count the number of live nodes, but since the force flag is
     being used only to force a new config, it seems safer to use
     the new config and no time-dependent info. Note that we are
     counting the answers based on the normal config, but use the
     number of nodes from forced_config. This is safe, since we can
     assume that the nodes that are not in forced_config will never
     answer. */

  if (force) {
    IFDBG(D_NONE, FN; STRLIT("force majority"); NDBG(ok, u); NDBG(max, u);
          NDBG(get_maxnodes(forced_config), u));
    return ok == get_maxnodes(forced_config);
  } else {
/* Have now seen answer from all live nodes */
/* The conditional below parses as
   all ? (ok == max) : ((ok > max / 2) || (arbitrator special case)) */
#ifdef NODE_0_IS_ARBITRATOR
    retval = all ? ok == max
                 : ok > max / 2 ||
                       (ARBITRATOR_HACK && (get_nodeno(s) == 0) && (2 == max));
#else
    retval = all ? ok == max : ok > max / 2 || (ARBITRATOR_HACK && (2 == max));
#endif
    /* 	IFDBG(D_NONE, FN; NDBG(max,lu); NDBG(all,d); NDBG(delay,d);
     * NDBG(retval,d)); */
    return retval;
  }
}
767 
/* Non-zero if the proposer message of pax_machine p has an `a` member whose
   consensus field is cons_all; 0 if `a` is NULL. Evaluates p more than once
   and dereferences (p)->proposer.msg without checking it. */
#define IS_CONS_ALL(p) \
  ((p)->proposer.msg->a ? (p)->proposer.msg->a->consensus == cons_all : 0)
770 
771 /* See if a majority of acceptors have answered our prepare */
prep_majority(site_def const * site,pax_machine * p)772 static int prep_majority(site_def const *site, pax_machine *p) {
773   int ok = 0;
774 
775   assert(p);
776   assert(p->proposer.prep_nodeset);
777   assert(p->proposer.msg);
778   /* IFDBG(D_NONE, FN; BALCEXP(p->proposer.bal)); */
779   ok = majority(p->proposer.prep_nodeset, site, IS_CONS_ALL(p),
780                 p->proposer.bal.cnt == 1,
781                 p->proposer.msg->force_delivery || p->force_delivery);
782   return ok;
783 }
784 
785 /* See if a majority of acceptors have answered our propose */
prop_majority(site_def const * site,pax_machine * p)786 static int prop_majority(site_def const *site, pax_machine *p) {
787   int ok = 0;
788 
789   assert(p);
790   assert(p->proposer.prop_nodeset);
791   assert(p->proposer.msg);
792   /* IFDBG(D_NONE, FN; BALCEXP(p->proposer.bal)); */
793   ok = majority(p->proposer.prop_nodeset, site, IS_CONS_ALL(p),
794                 p->proposer.bal.cnt == 1,
795                 p->proposer.msg->force_delivery || p->force_delivery);
796   return ok;
797 }
798 
799 /* Xcom thread */
800 
801 static site_def *executor_site = 0;
802 
get_executor_site()803 site_def const *get_executor_site() { return executor_site; }
get_executor_site_rw()804 site_def *get_executor_site_rw() { return executor_site; }
805 
806 static site_def *proposer_site = 0;
807 
get_proposer_site()808 site_def const *get_proposer_site() { return proposer_site; }
809 
810 /* delivered_msg may point to a no_op message, which will not actually be
811  * delivered */
812 static synode_no delivered_msg = NULL_SYNODE;
813 
get_delivered_msg()814 synode_no get_delivered_msg() { return delivered_msg; }
815 
/* last_delivered_msg is the last synode we actually delivered */
static synode_no last_delivered_msg = NULL_SYNODE;
/* Return the current value of last_delivered_msg (by value). */
synode_no get_last_delivered_msg() { return last_delivered_msg; }
819 
/*
  Reset the state owned by this file to its initial values: the shutdown
  flag, the synode trackers, the boot flags, the three wait queues, the
  executor/proposer site pointers and the lsn. Also obtains a fresh my_id
  from new_id() and pushes a NULL site definition.
*/
void init_xcom_base() {
  IFDBG(D_NONE, FN);
  xcom_shutdown = 0;
  /* All synode trackers start from the null synode. */
  current_message = null_synode;
  executed_msg = null_synode;
  delivered_msg = null_synode;
  last_delivered_msg = null_synode;
  max_synode = null_synode;
  client_boot_done = 0;
  netboot_ok = 0;

  xcom_recover_init();
  my_id = new_id();
  push_site_def(NULL);
  /*	update_servers(NULL); */
  xcom_cache_var_init();
  median_filter_init();
  /* The wait queues are all typed as lists of "task_env". */
  link_init(&exec_wait, TYPE_HASH("task_env"));
  link_init(&detector_wait, TYPE_HASH("task_env"));
  link_init(&connect_wait, TYPE_HASH("task_env"));
  executor_site = 0;
  proposer_site = 0;

  /** Reset lsn */
  initialize_lsn(0);
  IFDBG(D_NONE, FN);
}
847 
/* Set every task handle used by this file to NULL, including all proposer
   handles (via init_proposers()). No task is created here. */
static void init_tasks() {
  IFDBG(D_NONE, FN);
  set_task(&boot, NULL);
  set_task(&net_boot, NULL);
  set_task(&net_recover, NULL);
  set_task(&killer, NULL);
  set_task(&executor, NULL);
  set_task(&retry, NULL);
  set_task(&detector, NULL);
  init_proposers();
  set_task(&alive_t, NULL);
  set_task(&sweeper, NULL);
  set_task(&cache_task, NULL);
  IFDBG(D_NONE, FN);
}
863 
/* Initialize the xcom thread: ignore SIGPIPE (unless NO_SIGPIPE is defined),
   initialise base/site variables and the CRC table, seed the random number
   generator from task_now(), reset this file's state and task handles, and
   set up the proposer input queue, link list, task system and cache. */
void xcom_thread_init() {
#ifndef NO_SIGPIPE
  signal(SIGPIPE, SIG_IGN);
#endif
  init_base_vars();
  init_site_vars();
  init_crc32c();
  xcom_srand48((long int)task_now());

  init_xcom_base();
  init_tasks();

  /* Initialize input queue */
  channel_init(&prop_input_queue, TYPE_HASH("msg_link"));
  init_link_list();
  task_sys_init();

  init_cache();
}
884 
/* Empty the proposer input queue by handing prop_input_queue to
   empty_msg_channel(). */
static void empty_prop_input_queue() {
  empty_msg_channel(&prop_input_queue);
  IFDBG(D_NONE, FN; STRLIT("prop_input_queue empty"));
}
890 
/* De-initialize the xcom thread: empty the proposer input queue and the link
   free list, tear down the cache, garbage collect servers, then tear down
   the network cache and the xcom interface, in that order. */
void xcom_thread_deinit() {
  IFDBG(D_BUG, FN; STRLIT("Empty proposer input queue"));
  empty_prop_input_queue();
  IFDBG(D_BUG, FN; STRLIT("Empty link free list"));
  empty_link_free_list();
  IFDBG(D_BUG, FN; STRLIT("De-initialize cache"));
  deinit_cache();
  garbage_collect_servers();
  IFDBG(D_BUG, FN; STRLIT("De-initialize network cache"));
  deinit_network_cache();
  IFDBG(D_BUG, FN; STRLIT("De-initialize xcom_interface"));
  deinit_xcom_interface();
}
905 
/* Loop header that visits every proposer index 0 .. PROPOSERS-1. The macro
   declares `int i` itself, so it can appear only once per scope and the
   statement that follows it becomes the loop body. */
#define PROP_ITER \
  int i;          \
  for (i = 0; i < PROPOSERS; i++)
909 
init_proposers()910 static void init_proposers() {
911   PROP_ITER { set_task(&proposer[i], NULL); }
912 }
913 
create_proposers()914 static void create_proposers() {
915   PROP_ITER {
916     set_task(&proposer[i], task_new(proposer_task, int_arg(i), "proposer_task",
917                                     XCOM_THREAD_DEBUG));
918   }
919 }
920 
terminate_proposers()921 static void terminate_proposers() {
922   PROP_ITER { task_terminate(proposer[i]); }
923 }
924 
/* Release the site definition held in forced_config and clear the pointer so
   it is not used (or freed) again. */
static void free_forced_config_site_def() {
  free_site_def(forced_config);
  forced_config = NULL;
}
929 
#if TASK_DBUG_ON
/* Debug helper, compiled only when TASK_DBUG_ON is set: prints PROPOSERS and
   every proposer task handle. Does nothing unless XCOM_DEBUG_TRACE debugging
   is enabled. Declared unused so that not calling it raises no warning. */
static void dbg_proposers() MY_ATTRIBUTE((unused));
static void dbg_proposers() {
  GET_GOUT;
  if (!IS_XCOM_DEBUG_WITH(XCOM_DEBUG_TRACE)) return;
  NDBG(PROPOSERS, d);
  {
    /* Own scope, because PROP_ITER declares a variable. */
    PROP_ITER { PPUT(proposer[i]); }
  }
  PRINT_GOUT;
  FREE_GOUT;
}
#endif
943 
set_proposer_startpoint()944 static void set_proposer_startpoint() {
945   IFDBG(D_NONE, FN; STRLIT("changing current message"));
946   if (synode_gt(max_synode, get_current_message())) {
947     if (max_synode.msgno <= 1)
948       set_current_message(first_free_synode(max_synode));
949     else
950       set_current_message(incr_msgno(first_free_synode(max_synode)));
951   }
952   if (synode_gt(executed_msg, get_current_message())) {
953     set_current_message(first_free_synode(executed_msg));
954   }
955 }
956 
957 /* Task functions */
958 
/* Optional callbacks registered by the embedding application. All start out
   unset (0 / NULL) and are installed through the setters below. */
static xcom_state_change_cb xcom_run_cb = 0;
static xcom_state_change_cb xcom_terminate_cb = 0;
static xcom_state_change_cb xcom_comms_cb = 0;
static xcom_state_change_cb xcom_exit_cb = 0;
static xcom_state_change_cb xcom_expel_cb = 0;
static xcom_input_try_pop_cb xcom_try_pop_from_input_cb = NULL;

/* Register the "run" state-change callback. */
void set_xcom_run_cb(xcom_state_change_cb x) { xcom_run_cb = x; }

/* Register the callback that xcom_taskmain2() invokes with XCOM_COMMS_OK or
   XCOM_COMMS_ERROR. */
void set_xcom_comms_cb(xcom_state_change_cb x) { xcom_comms_cb = x; }
/* purecov: begin deadcode */
/* Register the "terminate" state-change callback. */
void set_xcom_terminate_cb(xcom_state_change_cb x) { xcom_terminate_cb = x; }
/* purecov: end */
/* Register the callback that xcom_taskmain2() invokes just before returning. */
void set_xcom_exit_cb(xcom_state_change_cb x) { xcom_exit_cb = x; }
973 
/* Optional recovery callbacks, each with its setter. All start out NULL. */

/* Invoked by xcom_taskmain2() after recovery_init_cb, before the task loop. */
static xcom_recovery_cb recovery_begin_cb = NULL;
/* purecov: begin deadcode */
void set_xcom_recovery_begin_cb(xcom_recovery_cb x) { recovery_begin_cb = x; }
/* purecov: end */

static xcom_recovery_cb recovery_restart_cb = NULL;
/* purecov: begin deadcode */
void set_xcom_recovery_restart_cb(xcom_recovery_cb x) {
  recovery_restart_cb = x;
}
/* purecov: end */

/* Invoked by xcom_taskmain2() once the server tasks have been created. */
static xcom_recovery_cb recovery_init_cb = NULL;
/* purecov: begin deadcode */
void set_xcom_recovery_init_cb(xcom_recovery_cb x) { recovery_init_cb = x; }
/* purecov: end */

static xcom_recovery_cb recovery_end_cb = NULL;
/* purecov: begin deadcode */
void set_xcom_recovery_end_cb(xcom_recovery_cb x) { recovery_end_cb = x; }
/* purecov: end */
995 
/* Register the "expel" state-change callback. */
void set_xcom_expel_cb(xcom_state_change_cb x) { xcom_expel_cb = x; }
997 
/* Register the callback that local_server() uses to pop pending requests
   from the input queue. local_server() asserts that it has been set. */
void set_xcom_input_try_pop_cb(xcom_input_try_pop_cb pop) {
  xcom_try_pop_from_input_cb = pop;
}
1001 
/* Client side of the signalling connection. NULL while no connection is open;
   xcom_input_signal() writes a single byte to it. */
static connection_descriptor *input_signal_connection = NULL;
1003 
1004 #ifndef XCOM_WITHOUT_OPENSSL
xcom_input_signal_connection_shutdown_ssl_wait_for_peer()1005 static bool_t xcom_input_signal_connection_shutdown_ssl_wait_for_peer() {
1006   int ssl_error_code = 0;
1007   do {
1008     char buf[1024];
1009     ssl_error_code = SSL_read(input_signal_connection->ssl_fd, buf, 1024);
1010   } while (ssl_error_code > 0);
1011 
1012   bool_t const successful =
1013       (SSL_get_error(input_signal_connection->ssl_fd, ssl_error_code) ==
1014        SSL_ERROR_ZERO_RETURN);
1015   return successful;
1016 }
xcom_input_signal_connection_shutdown_ssl()1017 static bool_t xcom_input_signal_connection_shutdown_ssl() {
1018   bool_t successful = FALSE;
1019 
1020   int ssl_error_code = SSL_shutdown(input_signal_connection->ssl_fd);
1021 
1022   bool_t const need_to_wait_for_peer_shutdown = (ssl_error_code == 0);
1023   bool_t const something_went_wrong = (ssl_error_code < 0);
1024   if (need_to_wait_for_peer_shutdown) {
1025     successful = xcom_input_signal_connection_shutdown_ssl_wait_for_peer();
1026     if (!successful) goto end;
1027   } else if (something_went_wrong) {
1028     goto end;
1029   }
1030 
1031   ssl_free_con(input_signal_connection);
1032   successful = TRUE;
1033 
1034 end:
1035   return successful;
1036 }
1037 #endif
1038 
xcom_input_new_signal_connection(char const * address,xcom_port port)1039 bool_t xcom_input_new_signal_connection(char const *address, xcom_port port) {
1040   bool_t const SUCCESSFUL = TRUE;
1041   bool_t const UNSUCCESSFUL = FALSE;
1042   assert(input_signal_connection == NULL);
1043 
1044   /* Try to connect. */
1045   input_signal_connection = xcom_open_client_connection(address, port);
1046   if (input_signal_connection == NULL) return UNSUCCESSFUL;
1047 
1048   /* Have the server handle the rest of this connection using a local_server
1049      task. */
1050   if (xcom_client_convert_into_local_server(input_signal_connection) == 1) {
1051     G_TRACE(
1052         "Converted the signalling connection handler into a local_server "
1053         "task on the client side.");
1054 #ifndef XCOM_WITHOUT_OPENSSL
1055     /* No more SSL in this connection. */
1056     {
1057       bool_t const using_ssl = (input_signal_connection->ssl_fd != NULL);
1058       if (using_ssl) {
1059         bool_t successful = xcom_input_signal_connection_shutdown_ssl();
1060         if (!successful) {
1061           G_ERROR(
1062               "Error shutting down SSL on XCom's signalling connection on the "
1063               "client side.");
1064           xcom_input_free_signal_connection();
1065           return UNSUCCESSFUL;
1066         }
1067       }
1068     }
1069 #endif
1070     return SUCCESSFUL;
1071   } else {
1072     G_DEBUG(
1073         "Error converting the signalling connection handler into a "
1074         "local_server task on the client side.");
1075     xcom_input_free_signal_connection();
1076     return UNSUCCESSFUL;
1077   }
1078 }
1079 static int64_t socket_write(connection_descriptor *wfd, void *_buf, uint32_t n);
xcom_input_signal()1080 bool_t xcom_input_signal() {
1081   bool_t successful = FALSE;
1082   if (input_signal_connection != NULL) {
1083     unsigned char tiny_buf[1] = {0};
1084     int64_t error_code = socket_write(input_signal_connection, tiny_buf, 1);
1085     successful = (error_code == 1);
1086   }
1087   return successful;
1088 }
xcom_input_free_signal_connection()1089 void xcom_input_free_signal_connection() {
1090   if (input_signal_connection != NULL) {
1091     xcom_close_client_connection(input_signal_connection);
1092     input_signal_connection = NULL;
1093   }
1094 }
1095 
#ifndef XCOM_WITHOUT_OPENSSL
/*
  Task-style helper that shuts down the SSL layer of con.

  con  connection whose ssl_fd is shut down
  buf  scratch buffer used while draining data from the peer
  n    size of buf
  ret  out: set to 1 on success; left at 0 on failure

  If SSL_shutdown() returns 0, data is read with task_read() until a read
  returns a non-positive count, and the shutdown only counts as successful
  when that last result maps to SSL_ERROR_ZERO_RETURN. On success the SSL
  state of con is released with ssl_free_con().

  NOTE(review): `*ret = 0` sits before TASK_BEGIN, so it presumably runs on
  every re-entry of the task function -- confirm against the task macros.
*/
static int local_server_shutdown_ssl(connection_descriptor *con, void *buf,
                                     int n, int *ret) {
  DECL_ENV
  int ssl_error_code;
  bool_t need_to_wait_for_peer_shutdown;
  bool_t something_went_wrong;
  int64_t nr_read;
  END_ENV;
  *ret = 0;
  TASK_BEGIN
  ep->ssl_error_code = SSL_shutdown(con->ssl_fd);
  ep->need_to_wait_for_peer_shutdown = (ep->ssl_error_code == 0);
  ep->something_went_wrong = (ep->ssl_error_code < 0);
  if (ep->need_to_wait_for_peer_shutdown) {
    /* Drain whatever the peer still sends. */
    do {
      TASK_CALL(task_read(con, buf, n, &ep->nr_read));
    } while (ep->nr_read > 0);
    ep->ssl_error_code = SSL_get_error(con->ssl_fd, ep->nr_read);
    ep->something_went_wrong = (ep->ssl_error_code != SSL_ERROR_ZERO_RETURN);
  }
  if (ep->something_went_wrong) TERMINATE;
  ssl_free_con(con);
  *ret = 1;
  FINALLY
  TASK_END;
}
#endif
1124 
/*
  Task serving the server side of the signalling connection.

  arg carries a heap-allocated connection_descriptor; it is copied into the
  task environment and the allocation is freed. With OpenSSL, the SSL layer
  of the connection is shut down first. The task then loops until
  xcom_shutdown is set: it waits for data on the connection, pops the list of
  pending requests through xcom_try_pop_from_input_cb, dispatches each one as
  a client_msg and replies to the request with the (at most one) reply that
  dispatch_op() produced. The loop also ends when a read returns 0 or a
  negative value. On exit the connection is shut down and the request
  pax_msg is released.
*/
int local_server(task_arg arg) {
  DECL_ENV
  connection_descriptor rfd;
  int ssl_shutdown_ret;
  unsigned char buf[1024]; /* arbitrary size */
  int64_t nr_read;
  xcom_input_request_ptr request;
  xcom_input_request_ptr next_request;
  pax_msg *request_pax_msg;
  pax_msg *reply_payload;
  linkage internal_reply_queue;
  msg_link *internal_reply;
  END_ENV;
  TASK_BEGIN
  assert(xcom_try_pop_from_input_cb != NULL);
  {
    /* Copy the descriptor into the task environment, then free the
       allocation that was passed in. */
    connection_descriptor *arg_rfd = (connection_descriptor *)get_void_arg(arg);
    ep->rfd = *arg_rfd;
    free(arg_rfd);
  }
  ep->ssl_shutdown_ret = 0;
  memset(ep->buf, 0, 1024);
  ep->nr_read = 0;
  ep->request = NULL;
  ep->next_request = NULL;
  ep->request_pax_msg = NULL;
  ep->reply_payload = NULL;
  link_init(&ep->internal_reply_queue, TYPE_HASH("msg_link"));
  ep->internal_reply = NULL;

#ifndef XCOM_WITHOUT_OPENSSL
  /* No more SSL in this connection. */
  if (ep->rfd.ssl_fd) {
    TASK_CALL(local_server_shutdown_ssl(&ep->rfd, ep->buf, 1024,
                                        &ep->ssl_shutdown_ret));
    if (ep->ssl_shutdown_ret != 1) {
      G_ERROR(
          "Error shutting down SSL on XCom's signalling connection on the "
          "server side.");
      TERMINATE;
    }
  }
#endif

  while (!xcom_shutdown) {
    /* Wait for signal that there is work to consume from the queue. */
    TASK_CALL(task_read(&ep->rfd, ep->buf, 1024, &ep->nr_read));
    if (ep->nr_read == 0) {
      /* purecov: begin inspected */
      G_WARNING("local_server: client closed the signalling connection?");
      break;
      /* purecov: end */
    } else if (ep->nr_read < 0) {
      /* purecov: begin inspected */
      IFDBG(D_NONE, FN; NDBG64(ep->nr_read));
      G_WARNING("local_server: error reading from the signalling connection?");
      break;
      /* purecov: end */
    }
    /* Pop, dispatch, and reply. */
    ep->request = xcom_try_pop_from_input_cb();
    while (ep->request != NULL) {
      /* Take ownership of the tail of the list, otherwise we lose it when we
         free ep->request. */
      ep->next_request = xcom_input_request_extract_next(ep->request);
      /* Use a fresh pax_msg for every request; the previous one is released
         by the replace. */
      unchecked_replace_pax_msg(&ep->request_pax_msg,
                                pax_msg_new_0(null_synode));
      assert(ep->request_pax_msg->refcnt == 1);
      ep->request_pax_msg->op = client_msg;
      /* Take ownership of the request's app_data, otherwise the app_data is
         freed with ep->request. */
      ep->request_pax_msg->a = xcom_input_request_extract_app_data(ep->request);
      ep->request_pax_msg->to = VOID_NODE_NO;
      /* Only force_config_type requests are delivered forcibly. */
      ep->request_pax_msg->force_delivery =
          (ep->request_pax_msg->a->body.c_t == force_config_type);
      dispatch_op(NULL, ep->request_pax_msg, &ep->internal_reply_queue);
      if (!link_empty(&ep->internal_reply_queue)) {
        ep->internal_reply =
            (msg_link *)(link_extract_first(&ep->internal_reply_queue));
        assert(ep->internal_reply->p);
        assert(ep->internal_reply->p->refcnt == 1);
        /* We are going to take ownership of the pax_msg which has the reply
           payload, so we bump its reference count so that it is not freed by
           msg_link_delete. */
        ep->reply_payload = ep->internal_reply->p;
        ep->reply_payload->refcnt++;
        msg_link_delete(&ep->internal_reply);
        /* There should only have been one reply. */
        assert(link_empty(&ep->internal_reply_queue));
      } else {
        ep->reply_payload = NULL;
      }
      /* Reply to the request. */
      xcom_input_request_reply(ep->request, ep->reply_payload);
      xcom_input_request_free(ep->request);
      ep->request = ep->next_request;
    }
  }
  FINALLY
  IFDBG(D_BUG, FN; STRLIT(" shutdown "); NDBG(ep->rfd.fd, d);
        NDBG(task_now(), f));
  /* Close the signalling connection. */
  shutdown_connection(&ep->rfd);
  unchecked_replace_pax_msg(&ep->request_pax_msg, NULL);
  IFDBG(D_NONE, FN; NDBG(xcom_shutdown, d));
  TASK_END;
}
1232 
/* True when the input-queue pop callback has been registered, which
   local_server() requires. */
static bool_t local_server_is_setup() {
  return xcom_try_pop_from_input_cb != NULL;
}
1236 
/*
  Main entry point of the xcom thread.

  Initialises the transport for listen_port, announces the TCP port, creates
  the tcp_server and tcp_reaper_task tasks, runs the recovery init/begin
  callbacks (when registered) and enters task_loop(). When the loop returns,
  or when the port cannot be announced, the shared cleanup section runs: SSL
  cleanup (with OpenSSL), xcom_thread_deinit(), the exit callback, and a
  reset of the debug mask and debug stack.

  If the port cannot be announced, the comms callback is invoked with
  XCOM_COMMS_ERROR and the terminate callback with 0; otherwise the comms
  callback is invoked with XCOM_COMMS_OK.

  Returns 1 on every path.
*/
int xcom_taskmain2(xcom_port listen_port) {
  init_xcom_transport(listen_port);

  IFDBG(D_BUG, FN; STRLIT("enter taskmain"));
  ignoresig(SIGPIPE);

  {
    /* Setup tcp_server socket */
    result tcp_fd = {0, 0};

    if ((tcp_fd = announce_tcp(listen_port)).val < 0) {
      /* purecov: begin inspected */
      IFDBG(D_BUG, FN; STRLIT("cannot annonunce tcp "); NDBG(listen_port, d));
      task_dump_err(tcp_fd.funerr);
      g_critical("Unable to announce tcp port %d. Port already in use?",
                 listen_port);
      if (xcom_comms_cb) {
        xcom_comms_cb(XCOM_COMMS_ERROR);
      }
      if (xcom_terminate_cb) {
        xcom_terminate_cb(0);
      }
      /* Nothing was started; skip straight to the shared cleanup. */
      goto cleanup;
      /* purecov: end */
    }

    if (xcom_comms_cb) {
      xcom_comms_cb(XCOM_COMMS_OK);
    }

    IFDBG(D_NONE, FN; STRLIT("Creating tasks"));
    /* task_new(generator_task, null_arg, "generator_task", XCOM_THREAD_DEBUG);
     */
    task_new(tcp_server, int_arg(tcp_fd.val), "tcp_server", XCOM_THREAD_DEBUG);
    task_new(tcp_reaper_task, null_arg, "tcp_reaper_task", XCOM_THREAD_DEBUG);
    IFDBG(D_BUG, FN; STRLIT("XCOM is listening on "); NPUT(listen_port, d));
  }

  if (recovery_init_cb) recovery_init_cb();

  if (recovery_begin_cb) recovery_begin_cb();

  /* Runs the tasks created above; cleanup follows once it returns. */
  task_loop();

cleanup:

#ifdef TASK_EVENT_TRACE
  dump_task_events();
#endif
#ifndef XCOM_WITHOUT_OPENSSL
  xcom_cleanup_ssl();
#endif

  xcom_thread_deinit();
  if (xcom_exit_cb) {
    xcom_exit_cb(0);
  }
  IFDBG(D_BUG, FN; STRLIT(" exit "); NDBG(xcom_dbg_stack_top, d);
        NDBG((unsigned)xcom_debug_mask, x));
  /* Reset debug state after the last debug output above. */
  xcom_debug_mask = 0;
  xcom_dbg_stack_top = 0;
  return 1;
}
1300 
1301 /* Paxos message construction and sending */
1302 
/* Initialize a message for sending: set its operation to op and make
   reply_to a copy of the message's own proposal ballot. */
static void prepare(pax_msg *p, pax_op op) {
  p->op = op;
  p->reply_to = p->proposal;
}
1308 
/* Initialize a prepare_msg: op becomes prepare_op and reply_to becomes the
   message's proposal. */
void init_prepare_msg(pax_msg *p) { prepare(p, prepare_op); }
1311 
/* Turn p into a prepare message and send it to the acceptors. Returns the
   result of send_to_acceptors(). */
static int prepare_msg(pax_msg *p) {
  init_prepare_msg(p);
  /* p->msg_type = normal; */
  return send_to_acceptors(p, "prepare_msg");
}
1317 
/* Initialize a noop_msg: p is set up as a prepare message whose msg_type is
   no_op. Returns p itself. */
pax_msg *create_noop(pax_msg *p) {
  init_prepare_msg(p);
  p->msg_type = no_op;
  return p;
}
1324 
/* Initialize a read_msg: msg_type normal, proposal.node set to this node's
   number in site, op read_op. reply_to is copied from the proposal after the
   node has been set. Returns p itself. */
static pax_msg *create_read(site_def const *site, pax_msg *p) {
  p->msg_type = normal;
  p->proposal.node = get_nodeno(site);
  prepare(p, read_op);
  return p;
}
1332 
/* Turn p into a skip_op message of type no_op and send it to all nodes.
   Returns the result of send_to_all(). */
static int skip_msg(pax_msg *p) {
  prepare(p, skip_op);
  IFDBG(D_NONE, FN; STRLIT("skipping message "); SYCEXP(p->synode));
  p->msg_type = no_op;
  return send_to_all(p, "skip_msg");
}
1339 
/* Stamp the first app_data element of p (if any) with the synode of p: the
   app_key receives msgno, node and group_id, and the element's own group_id
   is set to the synode's group_id as well. */
static void brand_app_data(pax_msg *p) {
  app_data_ptr data = p->a;

  if (!data) return;

  data->app_key.msgno = p->synode.msgno;
  data->app_key.node = p->synode.node;
  data->group_id = p->synode.group_id;
  data->app_key.group_id = data->group_id;
}
1347 
/* Return a copy of synode whose group_id has been replaced by my_id.
   Asserts that my_id is non-zero. */
static synode_no my_unique_id(synode_no synode) {
  assert(my_id != 0);
  /* Random number derived from node number and timestamp which uniquely defines
   * this instance */
  synode.group_id = my_id;
  return synode;
}
1355 
/* Assign synode as the unique_id of every element in the app_data list of
   msg. Does nothing when the list is empty. */
static void set_unique_id(pax_msg *msg, synode_no synode) {
  app_data_ptr item;

  for (item = msg->a; item; item = item->next) {
    item->unique_id = synode;
  }
}
1363 
/* Initialize p as a propose message: op accept_op, reply_to copied from the
   proposal, and the app_data (if any) branded with the synode of p. */
void init_propose_msg(pax_msg *p) {
  p->op = accept_op;
  p->reply_to = p->proposal;
  brand_app_data(p);
  /* set_unique_id(p, my_unique_id(synode)); */
}
1370 
/* Send the already initialized propose message p to the acceptors. Returns
   the result of send_to_acceptors(). */
static int send_propose_msg(pax_msg *p) {
  return send_to_acceptors(p, "propose_msg");
}
1374 
/* Initialize p as a propose message and send it to the acceptors. Returns
   the result of the send. */
static int propose_msg(pax_msg *p) {
  init_propose_msg(p);
  return send_propose_msg(p);
}
1379 
/* Mark p as a learn message. Its msg_type is normal when it carries app_data
   and no_op when it does not. */
static void set_learn_type(pax_msg *p) {
  p->op = learn_op;
  if (p->a) {
    p->msg_type = normal;
  } else {
    p->msg_type = no_op;
  }
}
1384 
1385 /* purecov: begin deadcode */
/* Initialize p as a learn message: set op/msg_type via set_learn_type(), copy
   the proposal into reply_to and brand the app_data with the synode of p. */
static void init_learn_msg(pax_msg *p) {
  set_learn_type(p);
  p->reply_to = p->proposal;
  brand_app_data(p);
}
1391 
/* Send the learn message p to all nodes of site. Returns the result of
   send_to_all_site(). */
static int send_learn_msg(site_def const *site, pax_msg *p) {
  IFDBG(D_NONE, FN; dbg_bitset(p->receivers, get_maxnodes(site)););
  return send_to_all_site(site, p, "learn_msg");
}
1396 /* purecov: end */
1397 
/*
  Build a tiny_learn_op message from p.

  The new message is produced by clone_pax_msg_no_app(p) and referenced once
  with ref_msg(); send_tiny_learn_msg() drops that reference. Its msg_type
  reflects whether the ORIGINAL message p carries app_data, and reply_to is
  the proposer ballot of pm.
*/
static pax_msg *create_tiny_learn_msg(pax_machine *pm, pax_msg *p) {
  pax_msg *tiny_learn_msg = clone_pax_msg_no_app(p);

  ref_msg(tiny_learn_msg);
  tiny_learn_msg->msg_type = p->a ? normal : no_op;
  tiny_learn_msg->op = tiny_learn_op;
  tiny_learn_msg->reply_to = pm->proposer.bal;
  brand_app_data(tiny_learn_msg);

  return tiny_learn_msg;
}
1409 
/* Send p to all nodes of site and then drop one reference to it with
   unref_msg(). Returns the result of send_to_all_site(). */
static int send_tiny_learn_msg(site_def const *site, pax_msg *p) {
  int retval = send_to_all_site(site, p, "tiny_learn_msg");
  unref_msg(&p);
  return retval;
}
1415 
1416 /* Proposer task */
1417 
/*
  Set up pax_machine p and message msg for a three-phase push of msgno.

  Clears the prepare nodeset, takes a new ballot owned by this node whose
  count is one more than the larger of the proposer ballot count and the
  acceptor promise count, and copies synode, ballot, message type and the
  machine's force_delivery flag into msg.
*/
void prepare_push_3p(site_def const *site, pax_machine *p, pax_msg *msg,
                     synode_no msgno, pax_msg_type msg_type) {
  IFDBG(D_NONE, FN; SYCEXP(msgno); NDBG(p->proposer.bal.cnt, d);
        NDBG(p->acceptor.promise.cnt, d));

  BIT_ZERO(p->proposer.prep_nodeset);

  /* New ballot: our node number, count above everything seen so far. */
  p->proposer.bal.node = get_nodeno(site);
  {
    int highest_cnt = MAX(p->proposer.bal.cnt, p->acceptor.promise.cnt);
    p->proposer.bal.cnt = highest_cnt + 1;
  }

  msg->synode = msgno;
  msg->proposal = p->proposer.bal;
  msg->msg_type = msg_type;
  msg->force_delivery = p->force_delivery;
}
1433 
/*
  Set up pax_machine p for a two-phase push of its proposer message.

  Clears the propose nodeset, resets the proposer ballot to count 0 owned by
  this node, and copies ballot, synode and the machine's force_delivery flag
  into the proposer message. Asserts that a proposer message exists.
*/
void prepare_push_2p(site_def const *site, pax_machine *p) {
  assert(p->proposer.msg);

  BIT_ZERO(p->proposer.prop_nodeset);
  IFDBG(D_NONE, FN; SYCEXP(p->synode));
  p->proposer.bal.cnt = 0;
  p->proposer.bal.node = get_nodeno(site);
  p->proposer.msg->proposal = p->proposer.bal;
  p->proposer.msg->synode = p->synode;
  p->proposer.msg->force_delivery = p->force_delivery;
}
1445 
/* Two-phase push: prepare p with prepare_push_2p() and send its proposer
   message as a propose message. */
static void push_msg_2p(site_def const *site, pax_machine *p) {
  prepare_push_2p(site, p);
  propose_msg(p->proposer.msg);
}
1450 
/*
  Three-phase push: prepare p and msg with prepare_push_3p() and send msg as
  a prepare message. While wait_forced_config is set, the pax machine is
  forced first. Asserts that msgno.msgno is non-zero and that p has a
  proposer message.
*/
static void push_msg_3p(site_def const *site, pax_machine *p, pax_msg *msg,
                        synode_no msgno, pax_msg_type msg_type) {
  if (wait_forced_config) {
    force_pax_machine(p, 1);
  }

  assert(msgno.msgno != 0);
  prepare_push_3p(site, p, msg, msgno, msg_type);
  assert(p->proposer.msg);
  prepare_msg(msg);
  IFDBG(D_NONE, FN; BALCEXP(msg->proposal); SYCEXP(msgno); STRLIT(" op ");
        STRLIT(pax_op_to_str(msg->op)));
}
1464 
/* Brand client message with unique ID: every app_data element of msg gets
   msgno with its group_id replaced by my_id. msgno must not be the null
   synode (asserted). */
static void brand_client_msg(pax_msg *msg, synode_no msgno) {
  assert(!synode_eq(msgno, null_synode));
  set_unique_id(msg, my_unique_id(msgno));
}
1470 
/* Attach a to msg, mark msg as a client_msg, wrap it in a new msg_link
   addressed to VOID_NODE_NO and put that link on the proposer input queue. */
void xcom_send(app_data_ptr a, pax_msg *msg) {
  IFDBG(D_NONE, FN; PTREXP(a); SYCEXP(a->app_key); SYCEXP(msg->synode));
  msg->a = a;
  msg->op = client_msg;

  msg_link *queued = msg_link_new(msg, VOID_NODE_NO);
  IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_pax_msg(msg)));
  channel_put(&prop_input_queue, &queued->l);
}
1481 
#define FNVSTART 0x811c9dc5

/* Fowler-Noll-Vo type multiplicative hash.
   Folds `length` bytes starting at buf into sum: for each byte, sum is
   multiplied by 0x01000193 (modulo 2^32) and then XORed with the byte.
   Returns sum unchanged when length is 0. */
static uint32_t fnv_hash(unsigned char *buf, size_t length, uint32_t sum) {
  unsigned char *cursor = buf;
  size_t remaining = length;

  while (remaining > 0) {
    sum *= (uint32_t)0x01000193;
    sum ^= (uint32_t)*cursor;
    cursor++;
    remaining--;
  }
  return sum;
}
1492 
/**
   Create a new (hopefully unique) ID. The basic idea is to create a hash from
   the host ID and a timestamp.

   The result is never 0 and never an ID for which is_dead_site() is true.
   The first candidate is the hash of the host ID followed by the timestamp.
   If that candidate is rejected, later candidates additionally hash in a
   retry counter. Without that, every retry would hash exactly the same
   input, produce the same rejected value, and loop forever.

   @return a non-zero ID that is not a known dead site
*/
uint32_t new_id() {
  long id = xcom_unique_long();
  double timestamp = task_now();
  uint32_t attempt = 0;
  uint32_t retval = 0;

  do {
    retval = fnv_hash((unsigned char *)&id, sizeof(id), 0);
    retval = fnv_hash((unsigned char *)&timestamp, sizeof(timestamp), retval);
    if (attempt != 0) {
      /* Retry: perturb the hash so a different candidate is produced. */
      retval = fnv_hash((unsigned char *)&attempt, sizeof(attempt), retval);
    }
    attempt++;
  } while (retval == 0 ||
           is_dead_site(retval)); /* Avoid returning 0 or already used site id */

  return retval;
}
1508 
getstart(app_data_ptr a)1509 static synode_no getstart(app_data_ptr a) {
1510   synode_no retval = null_synode;
1511   /* If a->group_id is null_id, we set the group id  from app_key.group_id,
1512    * which is hopefully not null_id. If it is, we're out of luck. */
1513   if (a && a->group_id == null_id) {
1514     /* purecov: begin deadcode */
1515     a->group_id = a->app_key.group_id; /* app_key may have valid group */
1516     /* purecov: end */
1517   }
1518   G_DEBUG("pid %d getstart group_id %x", xpid(), a->group_id);
1519   if (!a || a->group_id == null_id) {
1520     retval.group_id = new_id();
1521   } else {
1522     a->app_key.group_id = a->group_id;
1523     retval = a->app_key;
1524     if (get_site_def() &&
1525         retval.msgno > 1) { /* Special case for initial boot of site */
1526       /* Not valid until after event horizon has been passed */
1527       retval = add_event_horizon(retval);
1528     }
1529   }
1530   return retval;
1531 }
1532 
1533 /* purecov: begin deadcode */
get_default_start(app_data_ptr a)1534 synode_no get_default_start(app_data_ptr a) {
1535   synode_no retval = null_synode;
1536   /* If a->group_id is null_id, we set the group id  from app_key.group_id,
1537    * which is hopefully not null_id. If it is, we're out of luck. */
1538   if (a && a->group_id == null_id) {
1539     a->group_id = a->app_key.group_id; /* app_key may have valid group */
1540   }
1541   G_DEBUG("pid %d getstart group_id %x", xpid(), a->group_id);
1542   if (!a || a->group_id == null_id) {
1543     retval.group_id = new_id();
1544   } else {
1545     a->app_key.group_id = a->group_id;
1546     retval = a->app_key;
1547     if (retval.msgno > 1) { /* Special case for initial boot of site */
1548       /* Not valid until after event horizon has been passed */
1549       retval = add_default_event_horizon(retval);
1550     }
1551   }
1552   return retval;
1553 }
1554 /* purecov: end */
1555 
1556 /* purecov: begin deadcode */
/*
  Log the addresses of all nodes in site on one line, each followed by a
  space. Logs "no site" when site is NULL.

  The line is assembled in a fixed-size buffer with bounded writes. If the
  addresses do not fit, the output is truncated instead of overrunning the
  buffer.
*/
static void dump_xcom_node_names(site_def const *site) {
  u_int i;
  char buf[NSERVERS * 256]; /* Big enough in practice; writes are bounded */
  size_t used = 0;
  if (!site) {
    G_INFO("pid %d no site", xpid());
    return;
  }
  buf[0] = 0;
  for (i = 0; i < site->nodes.node_list_len; i++) {
    size_t const room = sizeof(buf) - used;
    int const written =
        snprintf(buf + used, room, "%s ", site->nodes.node_list_val[i].address);
    if (written < 0) {
      buf[used] = 0; /* Encoding error: keep what was assembled so far */
      break;
    }
    if ((size_t)written >= room) break; /* Truncated; snprintf terminated buf */
    used += (size_t)written;
  }
  G_INFO("pid %d node names %s", xpid(), buf);
}
1572 /* purecov: end */
1573 
/*
  Install site as a new site definition.

  site       definition to install; its event_horizon must be non-zero
             (asserted). Its nodeno and install_time fields are filled in.
  operation  passed on to update_servers().

  Steps: raise max_synode to site->start when the groups differ or the start
  is ahead; look up this node's index in the node list; push the site
  definition; set the current group; update the servers when the current
  site definition has any nodes; record the install time; log the result.
*/
void site_install_action(site_def *site, cargo_type operation) {
  IFDBG(D_NONE, FN; NDBG(get_nodeno(get_site_def()), u));
  assert(site->event_horizon);
  if (group_mismatch(site->start, max_synode) ||
      synode_gt(site->start, max_synode))
    set_max_synode(site->start);
  site->nodeno = xcom_find_node_index(&site->nodes);
  push_site_def(site);
  IFDBG(D_NONE, dump_xcom_node_names(site));
  IFDBG(D_BUG, FN; SYCEXP(site->start); SYCEXP(site->boot_key));
  IFDBG(D_BUG, FN; COPY_AND_FREE_GOUT(dbg_site_def(site)));
  set_group(get_group_id(site));
  /* get_site_def() is queried after the push above. */
  if (get_maxnodes(get_site_def())) {
    update_servers(site, operation);
  }
  site->install_time = task_now();
  G_INFO("pid %d Installed site start=" SY_FMT " boot_key=" SY_FMT
         " event_horizon=%" PRIu32
         " node %u chksum_node_list(&site->nodes) %" PRIu32,
         xpid(), SY_MEM(site->start), SY_MEM(site->boot_key),
         site->event_horizon, get_nodeno(site), chksum_node_list(&site->nodes));
  IFDBG(D_NONE, FN; NDBG(get_nodeno(site), u));
  IFDBG(D_NONE, FN; SYCEXP(site->start); SYCEXP(site->boot_key);
        NDBG(site->install_time, f));
  IFDBG(D_NONE, FN; NDBG(get_nodeno(site), u));
  ADD_DBG(
      D_BASE, add_event(EVENT_DUMP_PAD, string_arg("nodeno"));
      add_event(EVENT_DUMP_PAD, uint_arg(get_nodeno(site)));
      add_event(EVENT_DUMP_PAD, string_arg("site->boot_key"));
      add_synode_event(site->boot_key);
      /* add_event(EVENT_DUMP_PAD, uint_arg(chksum_node_list(&site->nodes))); */
  );
}
1607 
/* Create a new site definition from the node list carried in a, with the
   given start synode and with boot_key set to a->app_key. a must not be
   NULL. Returns the new site definition. */
static site_def *create_site_def_with_start(app_data_ptr a, synode_no start) {
  site_def *site = new_site_def();
  IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)););
  init_site_def(a->body.app_u_u.nodes.node_list_len,
                a->body.app_u_u.nodes.node_list_val, site);
  site->start = start;
  site->boot_key = a->app_key;
  return site;
}
1617 
install_ng_with_start(app_data_ptr a,synode_no start)1618 static site_def *install_ng_with_start(app_data_ptr a, synode_no start) {
1619   if (a) {
1620     site_def *site = create_site_def_with_start(a, start);
1621     site_install_action(site, a->body.c_t);
1622     return site;
1623   }
1624   return 0;
1625 }
1626 
install_node_group(app_data_ptr a)1627 site_def *install_node_group(app_data_ptr a) {
1628   ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
1629           add_synode_event(a->app_key););
1630   if (a)
1631     return install_ng_with_start(a, getstart(a));
1632   else
1633     return 0;
1634 }
1635 
/* Unconditionally set max_synode to synode (no check that it grows) and
   activate the sweeper. */
void set_max_synode(synode_no synode) {
  max_synode = synode; /* Track max synode number */
  IFDBG(D_NONE, FN; STRLIT("new "); SYCEXP(max_synode));
  activate_sweeper();
}
1641 
/* Return started(p) for the pax machine that hash_get() finds for s, or 0
   when there is no such machine. */
static int is_busy(synode_no s) {
  pax_machine *machine = hash_get(s);
  return machine ? started(machine) : 0;
}
1650 
match_my_msg(pax_msg * learned,pax_msg * mine)1651 bool_t match_my_msg(pax_msg *learned, pax_msg *mine) {
1652   IFDBG(D_NONE, FN; PTREXP(learned->a);
1653         if (learned->a) SYCEXP(learned->a->unique_id); PTREXP(mine->a);
1654         if (mine->a) SYCEXP(mine->a->unique_id););
1655   if (learned->a && mine->a) { /* Both have app data, see if data is mine */
1656     return synode_eq(learned->a->unique_id, mine->a->unique_id);
1657   } else if (!(learned->a || mine->a)) { /* None have app data, anything goes */
1658     return TRUE;
1659   } else { /* Definitely mismatch */
1660     return FALSE;
1661   }
1662 }
1663 
/*
 * Initialize the log sequence number (lsn).
 *
 * Overwrites the file-level lsn counter with n. assign_lsn() increments the
 * counter before handing a value out, so the first lsn assigned afterwards
 * is n + 1.
 */
void initialize_lsn(uint64_t n) { lsn = n; }
1668 
1669 /**
1670  * Assign the next log sequence number (lsn) for a message.
1671  *
1672  * Initial propose sets lsn to msgno of the max message number as safe starting
1673  * point, otherwise lsn shall be ever increasing. lsn ensures sender order known
1674  * on receiver side, as messages may arrive "out of order" due to
1675  * retransmission. We use max_synode instead of current_message to avoid any
1676  * conflict with lsn allocated by a previous instance of the node.
1677  */
assign_lsn()1678 static uint64_t assign_lsn() {
1679   if (lsn == 0) {
1680     initialize_lsn(max_synode.msgno);
1681   }
1682   lsn++;
1683   IFDBG(D_EXEC, NDBG64(lsn));
1684   return lsn;
1685 }
1686 
1687 /* purecov: begin deadcode */
check_lsn(app_data_ptr a)1688 static int check_lsn(app_data_ptr a) {
1689   while (a) {
1690     if (!a->lsn) return 0;
1691     a = a->next;
1692   }
1693   return 1;
1694 }
1695 /* purecov: end */
1696 
/* Forward declaration: proposer_task calls this when a push times out (in
   builds with DELIVERY_TIMEOUT defined). */
static void propose_noop(synode_no find, pax_machine *p);
1698 
1699 /**
1700  * Checks if the given synod s is outside the event horizon.
1701  *
1702  * Common case: there are no configurations pending, or if there are, none of
1703  * them reconfigure the event horizon. The common case threshold is:
1704  *
1705  *   last_executed_synod + event_horizon(active_config)
1706  *
1707  *
1708  * If an event horizon reconfiguration R is pending, it is possible that it
1709  * reduces the event horizon. In that case, it is possible that the threshold
1710  * above falls outside the new event horizon.
1711  *
1712  * For example, consider last_executed_synod = 42 and
1713  * event_horizon(active_config) = 10.
1714  * At this point this member participates in synods up to 52.
1715  * Now consider an event horizon reconfiguration that takes effect at synod 45,
1716  * which modifies the event horizon to 2. This means that when
1717  * last_executed_synod = 45, event_horizon(active_config) = 2. At this point
1718  * this member should only be able to participate in synods up to 47. The member
1719  * may have previously started processing messages directed to synods between 47
1720  * and 52, but will now ignore messages directed to those same synods.
1721  *
1722  * We do not want to start processing messages that will eventually fall out
1723  * of the event horizon. More importantly, the threshold above may not be safe
1724  * due to the exit logic of executor_task.
1725  *
1726  * When a node removes itself from the group on configuration C starting at
1727  * synod start(C), the exit logic relies on knowing *when* a majority has
1728  * executed synod start(C) - 1, i.e. the last message of the last configuration
1729  * to contain the leaving node.
1730  *
1731  * With a constant event horizon, we know that when synod
1732  * start(C) + event_horizon is learnt, it is because a majority already executed
1733  * or is ready to execute (and thus learned) synod start(C). This implies that a
1734  * majority already executed start(C) - 1.
1735  *
1736  * With a dynamic event horizon, we cannot be sure that when synod
1737  * start(C) + event_horizon(C) is learnt, a majority already executed or is
1738  * ready to execute synod start(C).
1739  * This is because it is possible for a new, smaller, event horizon to take
1740  * effect between start(C) and start(C) + event_horizon(C).
1741  * If that happens, the threshold above allows nodes to participate in synods
1742  * which are possibly beyond start(C) + event_horizon(C), which can lead to the
1743  * value of synod start(C) + event_horizon(C) being learnt without a majority
1744  * already having executed or being ready to execute synod start(C).
1745  *
1746  * In order to maintain the assumption made by the executor_task's exit logic,
1747  * when an event horizon reconfiguration R is pending we set the threshold to
1748  * the minimum between:
1749  *
1750  *   last_executed_synod + event_horizon(active_config)
1751  *
1752  * and:
1753  *
1754  *   start(R) - 1 + event_horizon(R)
1755  */
/* Common-case threshold: message number of executed_msg plus the given event
   horizon. */
static uint64_t too_far_threshold(xcom_event_horizon active_event_horizon) {
  return executed_msg.msgno + active_event_horizon;
}
1759 
too_far_threshold_new_event_horizon_pending(site_def const * new_config)1760 static uint64_t too_far_threshold_new_event_horizon_pending(
1761     site_def const *new_config) {
1762   uint64_t last_executed = executed_msg.msgno;
1763   /* compute normal threshold */
1764   uint64_t possibly_unsafe_threshold;
1765   site_def const *active_config = find_site_def(executed_msg);
1766   xcom_event_horizon active_event_horizon = active_config->event_horizon;
1767   possibly_unsafe_threshold = last_executed + active_event_horizon;
1768   /* compute threshold taking into account new event horizon */ {
1769     uint64_t maximum_safe_threshold;
1770     xcom_event_horizon new_event_horizon;
1771     uint64_t start_new_event_horizon = new_config->start.msgno;
1772     new_event_horizon = new_config->event_horizon;
1773     maximum_safe_threshold = start_new_event_horizon - 1 + new_event_horizon;
1774     /* use the minimum of both for safety */
1775     return MIN(possibly_unsafe_threshold, maximum_safe_threshold);
1776   }
1777 }
1778 
too_far(synode_no s)1779 static inline int too_far(synode_no s) {
1780   uint64_t threshold = 0;
1781   site_def const *active_config = find_site_def(executed_msg);
1782   if (active_config != NULL) {
1783     site_def const *pending_config = first_event_horizon_reconfig();
1784     bool_t const no_event_horizon_reconfig_pending = (pending_config == NULL);
1785     if (is_latest_config(active_config) || no_event_horizon_reconfig_pending) {
1786       threshold = too_far_threshold(active_config->event_horizon);
1787     } else {
1788       threshold = too_far_threshold_new_event_horizon_pending(pending_config);
1789     }
1790   } else {
1791     /* we have no configs, resort to default */
1792     threshold = too_far_threshold(EVENT_HORIZON_MIN);
1793   }
1794   return s.msgno >= threshold;
1795 }
1796 
/* Jump to label x, emitting a "goto <label>" debug line through IFDBG first.
   The expansion is a brace-enclosed compound statement (not a
   do { } while (0)), so a trailing ';' at the call site forms an extra empty
   statement. */
#define GOTO(x)                                 \
  {                                             \
    IFDBG(D_NONE, STRLIT("goto "); STRLIT(#x)); \
    goto x;                                     \
  }
1802 
/* Non-zero when the cargo type is view_msg. */
static inline int is_view(cargo_type x) { return x == view_msg; }
1804 
is_config(cargo_type x)1805 static inline int is_config(cargo_type x) {
1806   return x == unified_boot_type || x == add_node_type ||
1807          x == remove_node_type || x == set_event_horizon_type ||
1808          x == force_config_type;
1809 }
1810 
/* Forward declaration of the task that proposer_task invokes through
   TASK_CALL to obtain a pax_machine for a synode. */
static int wait_for_cache(pax_machine **pm, synode_no synode, double timeout);
/* Incremented by proposer_task each time it takes a client message off
   prop_input_queue. */
static int prop_started = 0;
/* Incremented by proposer_task each time it is done with a client message
   (at its `next` label). */
static int prop_finished = 0;
1814 
/* Send messages by fetching from the input queue and trying to get it accepted
   by a Paxos instance.

   Outline of one iteration of the outer loop:
   1. Take a client message from prop_input_queue. Unless it is a config or
      view message, keep taking further queued messages and chain their
      app_data onto it (batching), subject to AUTOBATCH, MAX_BATCH_SIZE and
      MAX_BATCH_APP_DATA.
   2. Assign one lsn to every app_data of the (possibly batched) message.
   3. retry_new: starting at current_message, step forward with incr_msgno()
      until a synode is found for which is_busy() is false, waiting on
      exec_wait whenever the next synode is too_far().
   4. Give up with delivery_failure when no site is known for the synode or
      this node has no node number in it.
   5. Otherwise publish the synode through set_current_message(), brand the
      client message with it and push it (2-phase or 3-phase) until the
      pax_machine is finished. If the learned value is not ours, go back to
      retry_new.
   6. next: record timing statistics and delete the client message.

   The FINALLY section runs at task end: it unlocks the pax_machine if one is
   set, releases the prepare message and reports delivery_failure for a client
   message that is still pending. */
static int proposer_task(task_arg arg) {
  DECL_ENV
  int self;             /* ID of this proposer task */
  pax_machine *p;       /* Pointer to Paxos instance */
  msg_link *client_msg; /* The client message we are trying to push */
  synode_no msgno;      /* Synode we are currently trying to fill */
  pax_msg *prepare_msg; /* Prepare message handed to push_msg_3p */
  double start_propose; /* task_now() when we started on this client message */
  double start_push;    /* task_now() of the most recent push */
  double delay;         /* Current wait interval, from wakeup_delay() */
  site_def const *site; /* Site definition looked up for msgno */
  size_t size;          /* Accumulated app_data_size() of the batch */
  size_t nr_batched_app_data; /* Number of app_data counted for the batch */
  END_ENV;

  TASK_BEGIN

  ep->self = get_int_arg(arg);
  ep->p = NULL;
  ep->client_msg = NULL;
  ep->prepare_msg = NULL;
  ep->start_propose = 0.0;
  ep->start_push = 0.0;
  ep->delay = 0.0;
  ep->msgno = current_message;
  ep->site = 0;
  ep->size = 0;
  ep->nr_batched_app_data = 0;

  IFDBG(D_NONE, FN; NDBG(ep->self, d); NDBG(task_now(), f));

  while (!xcom_shutdown) { /* Loop until no more work to do */
    /* Wait for client message */
    assert(!ep->client_msg);
    CHANNEL_GET(&prop_input_queue, &ep->client_msg, msg_link);
    prop_started++;
    IFDBG(D_NONE, FN; PTREXP(ep->client_msg->p->a); STRLIT("extracted ");
          SYCEXP(ep->client_msg->p->a->app_key));

    /* Grab rest of messages in queue as well, but never batch config messages,
     * which need a unique number */

    /* The batch is limited either by size or number of batched app_datas.
     * We limit the number of elements because the XDR deserialization
     * implementation is recursive, and batching too many app_datas will cause a
     * call stack overflow. */
    if (!is_config(ep->client_msg->p->a->body.c_t) &&
        !is_view(ep->client_msg->p->a->body.c_t)) {
      ep->size = app_data_size(ep->client_msg->p->a);
      ep->nr_batched_app_data = 1;
      while (AUTOBATCH && ep->size <= MAX_BATCH_SIZE &&
             ep->nr_batched_app_data <= MAX_BATCH_APP_DATA &&
             !link_empty(&prop_input_queue
                              .data)) { /* Batch payloads into single message */
        msg_link *tmp;
        app_data_ptr atmp;

        CHANNEL_GET(&prop_input_queue, &tmp, msg_link);
        atmp = tmp->p->a;
        /* The counters are updated before the limits are checked, so the
           candidate that breaks a limit is included in them even though it is
           put back on the queue. */
        ep->size += app_data_size(atmp);
        ep->nr_batched_app_data++;
        /* Abort batching if config or too big batch */
        if (is_config(atmp->body.c_t) || is_view(atmp->body.c_t) ||
            ep->nr_batched_app_data > MAX_BATCH_APP_DATA ||
            ep->size > MAX_BATCH_SIZE) {
          channel_put_front(&prop_input_queue, &tmp->l);
          break;
        }
        ADD_T_EV(seconds(), __FILE__, __LINE__, "batching");

        tmp->p->a = 0;                     /* Steal this payload */
        msg_link_delete(&tmp);             /* Get rid of the empty message */
        /* The stolen payload becomes the new head of the app_data list */
        atmp->next = ep->client_msg->p->a; /* Add to list of app_data */
        /* Disabled trace:
         * G_TRACE("Batching %s %s",
         *         cargo_type_to_str(ep->client_msg->p->a->body.c_t),
         *         cargo_type_to_str(atmp->body.c_t)); */
        ep->client_msg->p->a = atmp;
        IFDBG(D_NONE, FN; PTREXP(ep->client_msg->p->a); STRLIT("extracted ");
              SYCEXP(ep->client_msg->p->a->app_key));
      }
    }

    ep->start_propose = task_now();
    ep->delay = 0.0;

    assert(!ep->client_msg->p->a->chosen);

    /* It is a new message */

    assert(!synode_eq(current_message, null_synode));

    /* Assign a log sequence number only on initial propose */
    {
      uint64_t prop_lsn = assign_lsn();
      app_data_ptr ap = ep->client_msg->p->a;
      /* Assign to all app_data structs */
      while (ap) {
        ap->lsn = prop_lsn;
        ap = ap->next;
      }
    }
    DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
  retry_new:
    /* Find a free slot */

    assert(!synode_eq(current_message, null_synode));
    ep->msgno = current_message;
    proposer_site = find_site_def_rw(ep->msgno);
    ep->site = proposer_site;

    while (is_busy(ep->msgno)) {
      while (/* ! ep->client_msg->p->force_delivery &&  */ too_far(
          incr_msgno(ep->msgno))) { /* Too far ahead of executor */
        TIMED_TASK_WAIT(&exec_wait, 1.0);
        IFDBG(D_NONE, FN; SYCEXP(ep->msgno); TIMECEXP(ep->start_propose);
              TIMECEXP(ep->client_msg->p->a->expiry_time); TIMECEXP(task_now());

              NDBG(enough_live_nodes(ep->site), d));
#ifdef DELIVERY_TIMEOUT
        if ((ep->start_propose + ep->client_msg->p->a->expiry_time) <
                task_now() &&
            !enough_live_nodes(ep->site)) {
          /* Give up */
          DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
          IFDBG(D_NONE, FN; STRLIT("timeout -> delivery_failure"));
          deliver_to_app(NULL, ep->client_msg->p->a, delivery_failure);
          GOTO(next);
        }
#endif
      }
      ep->msgno = incr_msgno(ep->msgno);
      /* Refresh site to next msgno */
      proposer_site = find_site_def_rw(ep->msgno);
      ep->site = proposer_site;
    }
    assert(!synode_eq(ep->msgno, null_synode));

    /* See if we can do anything with this message */
    if (!ep->site || get_nodeno(ep->site) == VOID_NODE_NO) {
      /* Give up */
      DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
      IFDBG(D_NONE, FN; STRLIT("delivery_failure "); SYCEXP(ep->msgno);
            PTREXP(ep->site); NDBG(get_nodeno(ep->site), u));
      deliver_to_app(NULL, ep->client_msg->p->a, delivery_failure);
      GOTO(next);
    }
    IFDBG(D_NONE, FN; STRLIT("changing current message to ");
          SYCEXP(ep->msgno));
    set_current_message(ep->msgno);

    brand_client_msg(ep->client_msg->p, ep->msgno);

    for (;;) { /* Loop until the client message has been learned */
      /* Get a Paxos instance to send the client message */

      TASK_CALL(wait_for_cache(&ep->p, ep->msgno, 60));
      if (!ep->p) {
        G_MESSAGE("Could not get a pax_machine for msgno %lu. Retrying",
                  (unsigned long)ep->msgno.msgno);
        goto retry_new;
      }

      assert(ep->p);
      if (ep->client_msg->p->force_delivery)
        ep->p->force_delivery = ep->client_msg->p->force_delivery;
      {
        int MY_ATTRIBUTE((unused)) lock = lock_pax_machine(ep->p);
        assert(!lock);
      }

      /* Set the client message as current proposal */
      assert(ep->client_msg->p);
      replace_pax_msg(&ep->p->proposer.msg, clone_pax_msg(ep->client_msg->p));
      /* A NULL proposer message after the replace is treated as out of
         memory: the node is told to stop and this task terminates. */
      if (ep->p->proposer.msg == NULL) {
        g_critical(
            "Node %u has run out of memory while sending a message and "
            "will now exit.",
            get_nodeno(proposer_site));
        terminate_and_exit(); /* Tell xcom to stop */
        TERMINATE;
      }
      assert(ep->p->proposer.msg);
      PAX_MSG_SANITY_CHECK(ep->p->proposer.msg);

      /* Create the prepare message */
      unchecked_replace_pax_msg(&ep->prepare_msg,
                                pax_msg_new(ep->msgno, ep->site));
      IFDBG(D_NONE, FN; PTREXP(ep->client_msg->p->a); STRLIT("pushing ");
            SYCEXP(ep->msgno));
      IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_app_data(ep->prepare_msg->a)));

      /* Use 3 phase algorithm if threephase is set or we are forcing or we have
         already accepted something, which may happen if another node has timed
         out waiting for this node and proposed a no_op, which we have accepted.
       */
      if (threephase || ep->p->force_delivery || ep->p->acceptor.promise.cnt) {
        push_msg_3p(ep->site, ep->p, ep->prepare_msg, ep->msgno, normal);
      } else {
        push_msg_2p(ep->site, ep->p);
      }

      ep->start_push = task_now();

      while (!finished(ep->p)) { /* Try to get a value accepted */
        /* We will wake up periodically, and whenever a message arrives */
        TIMED_TASK_WAIT(&ep->p->rv, ep->delay = wakeup_delay(ep->delay));
        /* The machine no longer belongs to our synode, or our proposal has
           been cleared: start over with a new slot. */
        if (!synode_eq(ep->msgno, ep->p->synode) ||
            ep->p->proposer.msg == NULL) {
          IFDBG(D_NONE, FN; STRLIT("detected stolen state machine, retry"););
          /* NOTE(review): the lock taken with lock_pax_machine() above is not
             released on this path (the unlock call is disabled) - confirm
             that this is intended. */
          /* unlock_pax_machine(ep->p); */
          GOTO(retry_new); /* Need to break out of both loops,
                              and we have no "exit named loop"
                              construction */
        }
        assert(synode_eq(ep->msgno, ep->p->synode) && ep->p->proposer.msg);
        if (finished(ep->p)) break;
        {
          double now = task_now();
#ifdef DELIVERY_TIMEOUT
          if ((ep->start_propose + ep->client_msg->p->a->expiry_time) < now) {
            IFDBG(D_NONE, FN; STRLIT("timeout when pushing ");
                  SYCEXP(ep->msgno); SYCEXP(executed_msg));
            /* Proposing a no-op here is a last ditch effort to cancel the
            failed message. If any of the currently reachable nodes have
            participated in the failed consensus round, it is equivalent to
            retrying a final time, otherwise we could get a no-op
            accepted. Proposing a no-op is always harmless.
            Having a timeout on delivery and telling the client is really
            contrary to the spirit of
            Paxos, since we cannot guarantee that the message has not been
            delivered, but at the moment, MCM depends on it.
            Proposing a no-op here increases the probability that the outcome
            matches what we tell MCM about the outcome. */
            propose_noop(ep->msgno, ep->p);
            DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
            IFDBG(D_NONE, FN; STRLIT("timeout -> delivery_failure"));
            deliver_to_app(ep->p, ep->client_msg->p->a, delivery_failure);
            unlock_pax_machine(ep->p);
            GOTO(next);
          }
#endif
          /* Retries always use the 3 phase push, whichever variant was used
             for the first attempt. */
          if ((ep->start_push + ep->delay) <= now) {
            PAX_MSG_SANITY_CHECK(ep->p->proposer.msg);
            IFDBG(D_NONE, FN; STRLIT("retry pushing "); SYCEXP(ep->msgno));
            IFDBG(D_NONE, FN;
                  COPY_AND_FREE_GOUT(dbg_app_data(ep->prepare_msg->a)););
            IFDBG(D_NONE, BALCEXP(ep->p->proposer.bal);
                  BALCEXP(ep->p->acceptor.promise));
            push_msg_3p(ep->site, ep->p, ep->prepare_msg, ep->msgno, normal);
            ep->start_push = now;
          }
        }
      }
      /* When we get here, we know the value for this message number,
         but it may not be the value we tried to push,
         so loop until we have a successful push. */
      unlock_pax_machine(ep->p);
      IFDBG(D_NONE, FN; STRLIT(" found finished message "); SYCEXP(ep->msgno);
            STRLIT("seconds since last push ");
            NPUT(task_now() - ep->start_push, f); STRLIT("ep->client_msg ");
            COPY_AND_FREE_GOUT(dbg_pax_msg(ep->client_msg->p)););
      IFDBG(D_NONE, FN; STRLIT("ep->p->learner.msg ");
            COPY_AND_FREE_GOUT(dbg_pax_msg(ep->p->learner.msg)););
      if (match_my_msg(ep->p->learner.msg, ep->client_msg->p)) {
        break;
      } else
        GOTO(retry_new);
    }
  /* Done with this client message, whether it was delivered or given up on:
     feed the elapsed time to add_to_filter() and release the message. */
  next : {
    double now = task_now();
    double used = now - ep->start_propose;
    add_to_filter(used);
    prop_finished++;
    IFDBG(D_NONE, FN; STRLIT("completed ep->msgno "); SYCEXP(ep->msgno);
          NDBG(used, f); NDBG(median_time(), f);
          STRLIT("seconds since last push "); NDBG(now - ep->start_push, f););
    IFDBG(D_NONE, FN; STRLIT("ep->client_msg ");
          COPY_AND_FREE_GOUT(dbg_pax_msg(ep->client_msg->p)););
    if (ep->p) {
      IFDBG(D_NONE, FN; STRLIT("ep->p->learner.msg ");
            COPY_AND_FREE_GOUT(dbg_pax_msg(ep->p->learner.msg)););
    }
    msg_link_delete(&ep->client_msg);
  }
  }
  FINALLY
  IFDBG(D_BUG, FN; STRLIT("exit "); NDBG(ep->self, d); NDBG(task_now(), f));
  if (ep->p) {
    unlock_pax_machine(ep->p);
  }
  replace_pax_msg(&ep->prepare_msg, NULL);
  if (ep->client_msg) { /* If we get here with a client message, we have
                           failed to deliver */
    DBGOUT_ASSERT(check_lsn(ep->client_msg->p->a), STRLIT("NULL lsn"));
    IFDBG(D_NONE, FN;
          STRLIT("undelivered message at task end -> delivery_failure"));
    deliver_to_app(ep->p, ep->client_msg->p->a, delivery_failure);
    msg_link_delete(&ep->client_msg);
  }
  TASK_END;
}
2118 
2119 static xcom_proto constexpr first_protocol_that_ignores_intermediate_forced_configs_or_views =
2120     x_1_8;
2121 
should_ignore_forced_config_or_view(xcom_proto protocol_version)2122 static bool constexpr should_ignore_forced_config_or_view(
2123     xcom_proto protocol_version) {
2124   return protocol_version >=
2125          first_protocol_that_ignores_intermediate_forced_configs_or_views;
2126 }
2127 
leader(site_def const * s)2128 static node_no leader(site_def const *s) {
2129   node_no leader = 0;
2130   for (leader = 0; leader < get_maxnodes(s); leader++) {
2131     if (!may_be_dead(s->detected, leader, task_now())) return leader;
2132   }
2133   return 0;
2134 }
2135 
iamthegreatest(site_def const * s)2136 int iamthegreatest(site_def const *s) { return leader(s) == s->nodeno; }
2137 
execute_msg(site_def * site,pax_machine * pma,pax_msg * p)2138 void execute_msg(site_def *site, pax_machine *pma, pax_msg *p) {
2139   app_data_ptr a = p->a;
2140   IFDBG(D_EXEC, FN; COPY_AND_FREE_GOUT(dbg_pax_msg(p)););
2141   if (a) {
2142     switch (a->body.c_t) {
2143       case unified_boot_type:
2144       case force_config_type:
2145         deliver_config(a);
2146       case add_node_type:
2147       case remove_node_type:
2148         break;
2149       case app_type:
2150         IFDBG(D_NONE, FN; STRLIT(" learner.msg ");
2151               COPY_AND_FREE_GOUT(dbg_pax_msg(pma->learner.msg)););
2152         /* DBGOUT_ASSERT(check_lsn(a), STRLIT("NULL lsn")); */
2153         deliver_to_app(pma, a, delivery_ok);
2154         break;
2155       case view_msg:
2156         IFDBG(D_EXEC, FN; STRLIT(" global view ");
2157               COPY_AND_FREE_GOUT(dbg_pax_msg(pma->learner.msg)););
2158         if (site && site->global_node_set.node_set_len ==
2159                         a->body.app_u_u.present.node_set_len) {
2160           if ((p->force_delivery != 0) &&
2161               should_ignore_forced_config_or_view(site->x_proto)) {
2162             G_DEBUG(
2163                 "execute_msg: Ignoring a forced intermediate, pending "
2164                 "view_msg");
2165           } else {
2166             assert(site->global_node_set.node_set_len ==
2167                    a->body.app_u_u.present.node_set_len);
2168             copy_node_set(&a->body.app_u_u.present, &site->global_node_set);
2169             deliver_global_view_msg(site, p->synode);
2170             ADD_DBG(D_BASE,
2171                     add_event(EVENT_DUMP_PAD,
2172                               string_arg("deliver_global_view_msg p->synode"));
2173                     add_synode_event(p->synode););
2174           }
2175         }
2176         break;
2177       default:
2178         break;
2179     }
2180   }
2181   IFDBG(D_NONE, FN; SYCEXP(p->synode));
2182 }
2183 
2184 static void read_missing_values(int n);
2185 static void propose_missing_values(int n);
2186 
2187 #ifdef EXECUTOR_TASK_AGGRESSIVE_NO_OP
2188 /* With many nodes sending read_ops on instances that are not decided yet, it
2189  * may take a very long time until someone finally decides to start a new
2190  * consensus round. As the cost of a new proposal is not that great, it's
2191  * acceptable to go directly to proposing a no-op instead of first trying to get
2192  * the value with a read_op. An added benefit of this is that if more than one
2193  * node needs the result, they will get it all when the consensus round
2194  * finishes. */
find_value(site_def const * site,unsigned int * wait,int n)2195 static void find_value(site_def const *site, unsigned int *wait, int n) {
2196   IFDBG(D_NONE, FN; NDBG(*wait, d));
2197 
2198   if (get_nodeno(site) == VOID_NODE_NO) {
2199     read_missing_values(n);
2200     return;
2201   }
2202 
2203   if ((*wait) > 1 || /* Only leader will propose initially */
2204       ((*wait) > 0 && iamthegreatest(site)))
2205     propose_missing_values(n);
2206 
2207 #ifdef TASK_EVENT_TRACE
2208   if ((*wait) > 1) dump_task_events();
2209 #endif
2210   (*wait)++;
2211 }
2212 #else
find_value(site_def const * site,unsigned int * wait,int n)2213 static void find_value(site_def const *site, unsigned int *wait, int n) {
2214   IFDBG(D_NONE, FN; NDBG(*wait, d));
2215 
2216   if (get_nodeno(site) == VOID_NODE_NO) {
2217     read_missing_values(n);
2218     return;
2219   }
2220 
2221   switch (*wait) {
2222     case 0:
2223     case 1:
2224       read_missing_values(n);
2225       (*wait)++;
2226       break;
2227     case 2:
2228       if (iamthegreatest(site))
2229         propose_missing_values(n);
2230       else
2231         read_missing_values(n);
2232       (*wait)++;
2233       break;
2234     case 3:
2235       propose_missing_values(n);
2236       break;
2237     default:
2238       break;
2239   }
2240 }
2241 #endif /* EXECUTOR_TASK_AGGRESSIVE_NO_OP */
2242 
static void dump_debug_exec_state();

/* Task that waits until the pax_machine for msgno is finished.

   *p is first set from force_get_cache(msgno) and refreshed from
   get_cache(msgno) after every wait. While the machine is not finished, each
   round looks up the site for msgno, tries to obtain the value (see the
   variants below) and then waits on the machine's rv with a delay taken from
   wakeup_delay().

   When the site has no nodes (get_maxnodes() == 0) a message created with
   pax_msg_new() is passed to handle_skip() and the loop ends.
   NOTE(review): who owns that message after handle_skip() cannot be told from
   here - confirm it is not leaked.

   The parameter n is passed through to read_missing_values() /
   propose_missing_values(), directly or through find_value(). */
#ifdef PROPOSE_IF_LEADER
/* Variant in which the leader proposes right away, without going through
   find_value(), when the site has more than one node, the global node set
   does not contain msgno.node, and that node may be dead. */
int get_xcom_message(pax_machine **p, synode_no msgno, int n) {
  DECL_ENV
  unsigned int wait; /* Escalation counter handed to find_value() */
  double delay;      /* Current wait interval, from wakeup_delay() */
  site_def const *site;
  END_ENV;

  TASK_BEGIN

  ep->wait = 0;
  ep->delay = 0.0;
  *p = force_get_cache(msgno);
  ep->site = NULL;

  dump_debug_exec_state();
  while (!finished(*p)) {
    ep->site = find_site_def(msgno);
    /* The end of the world ?, fake message by skipping */
    if (get_maxnodes(ep->site) == 0) {
      pax_msg *msg = pax_msg_new(msgno, ep->site);
      handle_skip(ep->site, *p, msg);
      break;
    }
    IFDBG(D_NONE, FN; STRLIT(" not finished "); SYCEXP(msgno); PTREXP(*p);
          NDBG(ep->wait, u); SYCEXP(msgno));
    if (get_maxnodes(ep->site) > 1 && iamthegreatest(ep->site) &&
        ep->site->global_node_set.node_set_val &&
        !ep->site->global_node_set.node_set_val[msgno.node] &&
        may_be_dead(ep->site->detected, msgno.node, task_now())) {
      propose_missing_values(n);
    } else {
      find_value(ep->site, &ep->wait, n);
    }
    TIMED_TASK_WAIT(&(*p)->rv, ep->delay = wakeup_delay(ep->delay));
    *p = get_cache(msgno);
    dump_debug_exec_state();
  }

  FINALLY
  IFDBG(D_NONE, FN; SYCEXP(msgno); PTREXP(*p); NDBG(ep->wait, u);
        SYCEXP(msgno));
  TASK_END;
}
#else
/* Default variant: every round goes through find_value(). */
int get_xcom_message(pax_machine **p, synode_no msgno, int n) {
  DECL_ENV
  unsigned int wait; /* Escalation counter handed to find_value() */
  double delay;      /* Current wait interval, from wakeup_delay() */
  site_def const *site;
  END_ENV;

  TASK_BEGIN

  ep->wait = 0;
  ep->delay = 0.0;
  *p = force_get_cache(msgno);
  ep->site = NULL;

  dump_debug_exec_state();
  while (!finished(*p)) {
    ep->site = find_site_def(msgno);
    /* The end of the world ?, fake message by skipping */
    if (get_maxnodes(ep->site) == 0) {
      pax_msg *msg = pax_msg_new(msgno, ep->site);
      handle_skip(ep->site, *p, msg);
      break;
    }
    IFDBG(D_NONE, FN; STRLIT("before find_value"); SYCEXP(msgno); PTREXP(*p);
          NDBG(ep->wait, u); SYCEXP(msgno));
    find_value(ep->site, &ep->wait, n);
    IFDBG(D_NONE, FN; STRLIT("after find_value"); SYCEXP(msgno); PTREXP(*p);
          NDBG(ep->wait, u); SYCEXP(msgno));
    ep->delay = wakeup_delay(ep->delay);
    IFDBG(D_NONE, FN; NDBG(ep->delay, f));
    TIMED_TASK_WAIT(&(*p)->rv, ep->delay);
    *p = get_cache(msgno);
    dump_debug_exec_state();
  }

  FINALLY
  TASK_END;
}
#endif
2329 
set_executed_msg(synode_no msgno)2330 synode_no set_executed_msg(synode_no msgno) {
2331   IFDBG(D_EXEC, FN; STRLIT("changing executed_msg from "); SYCEXP(executed_msg);
2332         STRLIT(" to "); SYCEXP(msgno));
2333   if (group_mismatch(msgno, current_message) ||
2334       synode_gt(msgno, current_message)) {
2335     IFDBG(D_EXEC, FN; STRLIT("changing current message"));
2336     set_current_message(first_free_synode(msgno));
2337   }
2338 
2339   if (msgno.msgno > executed_msg.msgno) task_wakeup(&exec_wait);
2340 
2341   executed_msg = msgno;
2342   executor_site = find_site_def_rw(executed_msg);
2343   return executed_msg;
2344 }
2345 
first_free_synode(synode_no msgno)2346 static synode_no first_free_synode(synode_no msgno) {
2347   site_def const *site = find_site_def(msgno);
2348   synode_no retval = msgno;
2349   if (!site) {
2350     /* purecov: begin deadcode */
2351     site = get_site_def();
2352     IFDBG(D_NONE, FN; PTREXP(site); SYCEXP(msgno));
2353     assert(get_group_id(site) != 0);
2354     return site->start;
2355     /* purecov: end */
2356   }
2357   if (get_group_id(site) == 0) {
2358     IFDBG(D_NONE, FN; PTREXP(site); SYCEXP(msgno));
2359     if (site) {
2360       IFDBG(D_NONE, FN; SYCEXP(site->boot_key); SYCEXP(site->start);
2361             COPY_AND_FREE_GOUT(dbg_site_def(site)));
2362     }
2363   }
2364   assert(get_group_id(site) != 0);
2365   assert(!synode_eq(msgno, null_synode));
2366   if (retval.msgno == 0) retval.msgno = 1;
2367   retval.node = get_nodeno(site);
2368   if (synode_lt(retval, msgno))
2369     return incr_msgno(retval);
2370   else
2371     return retval;
2372 }
2373 
set_current_message(synode_no msgno)2374 synode_no set_current_message(synode_no msgno) {
2375   IFDBG(D_PROPOSE, FN; STRLIT("changing current_message from ");
2376         SYCEXP(current_message); STRLIT(" to "); SYCEXP(msgno));
2377   return current_message = msgno;
2378 }
2379 
/* Forward declaration; being static, the function is defined elsewhere in
   this translation unit. */
static void update_max_synode(pax_msg *p);
2381 
#if TASK_DBUG_ON
/* Debug-only progress report; does nothing unless XCOM_DEBUG_TRACE is enabled.

   _n     call counter owned by the caller; incremented on every call
   _old_n value of the counter when the last rate line was printed
   _old_t task_now() when the last rate line was printed

   Two kinds of output:
   - whenever the counter (before the increment) is a multiple of 5000: node
     number, time, counter, median_time() and executed_msg;
   - whenever more than 1.0 has passed on the task_now() clock since *_old_t:
     node number, time, counter and calls per time unit since the last rate
     line; *_old_t and *_old_n are then updated. */
static void perf_dbg(int *_n, int *_old_n, double *_old_t)
    MY_ATTRIBUTE((unused));
static void perf_dbg(int *_n, int *_old_n, double *_old_t) {
  int n = *_n;
  int old_n = *_old_n;
  double old_t = *_old_t;

  if (!IS_XCOM_DEBUG_WITH(XCOM_DEBUG_TRACE)) return;

  IFDBG(D_NONE, FN; SYCEXP(executed_msg));
  if (!(n % 5000)) {
    GET_GOUT;
    NDBG(get_nodeno(get_site_def()), u);
    NDBG(task_now(), f);
    NDBG(n, d);
    NDBG(median_time(), f);
    SYCEXP(executed_msg);
    PRINT_GOUT;
    FREE_GOUT;
  }
  /* Only the caller's counter is bumped; the local n used below still holds
     the value from before the increment. */
  (*_n)++;
  if (task_now() - old_t > 1.0) {
    GET_GOUT;
    NDBG(get_nodeno(get_site_def()), u);
    NDBG(task_now(), f);
    NDBG(n, d);
    NDBG((n - old_n) / (task_now() - old_t), f);
    PRINT_GOUT;
    FREE_GOUT;
    *_old_t = task_now();
    *_old_n = n;
  }
}
#endif
2417 
/* LOSER(x, site): with IGNORE_LOSERS defined, non-zero when the entry for
   x.node in site->global_node_set.node_set_val is zero. Neither site nor the
   array is checked for NULL, and x.node is not range-checked. Without
   IGNORE_LOSERS it is the constant 0. */
#ifdef IGNORE_LOSERS

static inline int LOSER(synode_no x, site_def const *site) {
  IFDBG(D_NONE, NEXP(x.node, u);
        NEXP(site->global_node_set.node_set_val[(x).node], d));
  return (!(site)->global_node_set.node_set_val[(x).node]);
}

#else
#define LOSER(x, site) 0
#endif
2429 
/*
 * Debug helper: print a trace line saying that synode x is being ignored as a
 * loser, together with the current max_synode. Compiles to an empty function
 * when TASK_DBUG_ON is not enabled.
 */
static void debug_loser(synode_no x) MY_ATTRIBUTE((unused));
#if defined(TASK_DBUG_ON) && TASK_DBUG_ON
static void debug_loser(synode_no x) {
  if (!IS_XCOM_DEBUG_WITH(XCOM_DEBUG_TRACE)) return;
  /* NOTE: the leading "1 ||" makes this condition always true, so the
   * msgno < 10 limit is currently disabled. */
  if (1 || x.msgno < 10) {
    GET_GOUT;
    NDBG(get_nodeno(find_site_def(x)), u);
    STRLIT(" ignoring loser ");
    SYCEXP(x);
    SYCEXP(max_synode);
    PRINT_GOUT;
    FREE_GOUT;
  }
}
#else
/* purecov: begin deadcode */
static void debug_loser(synode_no x MY_ATTRIBUTE((unused))) {}
/* purecov: end */
#endif
2449 
/* If the cache holds a learned message for synode, send a clone of it to node
 * `to` of `site`. Nothing is sent when there is no cached machine, no learned
 * message, or the clone could not be created. */
static void send_value(site_def const *site, node_no to, synode_no synode) {
  pax_machine *cached = get_cache(synode);
  pax_msg *copy = NULL;

  if (!cached || !cached->learner.msg) return;

  copy = clone_pax_msg(cached->learner.msg);
  if (copy == NULL) return;

  /* Hold a reference for the duration of the send. */
  ref_msg(copy);
  send_server_msg(site, to, copy);
  unref_msg(&copy);
}
2460 
2461 /**
2462  * Returns the message number where it is safe for nodes in previous
2463  * configuration to exit.
2464  *
2465  * @param start start synod of the next configuration
2466  * @param event_horizon event horizon of the next configuration
2467  */
compute_delay(synode_no start,xcom_event_horizon event_horizon)2468 static synode_no compute_delay(synode_no start,
2469                                xcom_event_horizon event_horizon) {
2470   start.msgno += event_horizon;
2471   return start;
2472 }
2473 
2474 /* Push messages to all nodes which were in the previous site, but not in this
2475  */
inform_removed(int index,int all)2476 static void inform_removed(int index, int all) {
2477   site_def **sites = 0;
2478   uint32_t site_count = 0;
2479   IFDBG(D_NONE, FN; NEXP(index, d));
2480   get_all_site_defs(&sites, &site_count);
2481   while (site_count > 1 && index >= 0 && (uint32_t)(index + 1) < site_count) {
2482     site_def *s = sites[index];
2483     site_def *ps = sites[index + 1];
2484 
2485     /* Compute diff and push messages */
2486     IFDBG(D_NONE, FN; NDBG(index, d); PTREXP(s); if (s) SYCEXP(s->boot_key);
2487           PTREXP(ps); if (ps) SYCEXP(ps->boot_key));
2488 
2489     if (s && ps) {
2490       node_no i = 0;
2491       IFDBG(D_NONE, FN; SYCEXP(s->boot_key); SYCEXP(s->start);
2492             SYCEXP(ps->boot_key); SYCEXP(ps->start));
2493       for (i = 0; i < ps->nodes.node_list_len; i++) { /* Loop over prev site */
2494         if (ps->nodeno != i &&
2495             !node_exists(&ps->nodes.node_list_val[i], &s->nodes)) {
2496           synode_no synode = s->start;
2497           synode_no end = max_synode;
2498           while (!synode_gt(synode, end)) { /* Loop over relevant messages */
2499             send_value(ps, i, synode);
2500             synode = incr_synode(synode);
2501           }
2502         }
2503       }
2504     }
2505     if (!all) /* Early exit if not all configs should be examined */
2506       break;
2507     index--;
2508   }
2509 }
2510 
backwards_compatible(xcom_event_horizon event_horizon)2511 static bool_t backwards_compatible(xcom_event_horizon event_horizon) {
2512   return event_horizon == EVENT_HORIZON_MIN;
2513 }
2514 
2515 static xcom_proto const first_event_horizon_aware_protocol = x_1_4;
2516 
reconfigurable_event_horizon(xcom_proto protocol_version)2517 static bool_t reconfigurable_event_horizon(xcom_proto protocol_version) {
2518   return protocol_version >= first_event_horizon_aware_protocol;
2519 }
2520 
add_node_unsafe_against_ipv4_old_nodes(app_data_ptr a)2521 static bool_t add_node_unsafe_against_ipv4_old_nodes(app_data_ptr a) {
2522   assert(a->body.c_t == add_node_type);
2523 
2524   {
2525     site_def const *latest_config = get_site_def();
2526     if (latest_config && latest_config->x_proto >= minimum_ipv6_version())
2527       return FALSE;
2528 
2529     {
2530       u_int const nr_nodes_to_add = a->body.app_u_u.nodes.node_list_len;
2531       node_address *nodes_to_add = a->body.app_u_u.nodes.node_list_val;
2532 
2533       u_int i;
2534       xcom_port node_port = 0;
2535       char node_addr[IP_MAX_SIZE];
2536 
2537       for (i = 0; i < nr_nodes_to_add; i++) {
2538         if (get_ip_and_port(nodes_to_add[i].address, node_addr, &node_port)) {
2539           G_ERROR(
2540               "Error parsing address from a joining node. Join operation "
2541               "will be "
2542               "rejected");
2543           return TRUE;
2544         }
2545 
2546         if (!is_node_v4_reachable(node_addr)) return TRUE;
2547       }
2548     }
2549 
2550     return FALSE;
2551   }
2552 }
2553 
2554 /**
2555  * Check if a node is compatible with the group's event horizon.
2556  *
2557  * A node is compatible with the group's configuration if:
2558  *
2559  *    a) The node supports event horizon reconfigurations, or
2560  *    b) The group's event horizon is, or is scheduled to be, the default event
2561  *       horizon.
2562  */
unsafe_against_event_horizon(node_address const * node)2563 static bool_t unsafe_against_event_horizon(node_address const *node) {
2564   site_def const *latest_config = get_site_def();
2565   xcom_proto node_max_protocol_version = node->proto.max_proto;
2566   bool_t const compatible =
2567       reconfigurable_event_horizon(node_max_protocol_version) ||
2568       backwards_compatible(latest_config->event_horizon);
2569 
2570   if (!compatible) {
2571     /*
2572      * The node that wants to join does not support event horizon
2573      * reconfigurations and the group's event horizon is, or is scheduled to
2574      * be, different from the default.
2575      * The node can not safely join the group so we deny its attempt to join.
2576      */
2577     G_INFO(
2578         "%s's request to join the group was rejected because the group's event "
2579         "horizon is, or will be %" PRIu32 " and %s only supports %" PRIu32,
2580         node->address, get_site_def()->event_horizon, node->address,
2581         EVENT_HORIZON_MIN);
2582     return TRUE;
2583   }
2584   return FALSE;
2585 }
2586 
add_node_unsafe_against_event_horizon(app_data_ptr a)2587 static bool_t add_node_unsafe_against_event_horizon(app_data_ptr a) {
2588   assert(a->body.c_t == add_node_type);
2589   {
2590     u_int nodes_len = a->body.app_u_u.nodes.node_list_len;
2591     node_address *nodes_to_add = a->body.app_u_u.nodes.node_list_val;
2592     u_int i;
2593     for (i = 0; i < nodes_len; i++) {
2594       if (unsafe_against_event_horizon(&nodes_to_add[i])) return TRUE;
2595     }
2596   }
2597   return FALSE;
2598 }
2599 
2600 /**
2601  * Reconfigure the group membership: add new member(s).
2602  *
2603  * It is possible that concurrent reconfigurations take effect between the time
2604  * this reconfiguration was proposed and now.
2605  *
2606  * Particularly, it is possible that any of the concurrent reconfigurations
2607  * modified the event horizon and that the new member(s) do not support event
2608  * horizon reconfigurations.
2609  *
2610  * We account for these situations by validating if adding the new members is
2611  * still possible under the current state.
2612  *
2613  * If it is not, this reconfiguration does not produce any effect, i.e. no new
2614  * configuration is installed.
2615  */
handle_add_node(app_data_ptr a)2616 site_def *handle_add_node(app_data_ptr a) {
2617   if (add_node_unsafe_against_event_horizon(a)) {
2618     /*
2619      * Note that the result of this function is only applicable to
2620      * unused and not-fully-implemented code paths where add_node_type is used
2621      * forcibly.
2622      * Should this fact change, this obviously does not work.
2623      */
2624     return NULL;
2625   }
2626   {
2627     site_def *site = clone_site_def(get_site_def());
2628     IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)););
2629     IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)););
2630     ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
2631             add_synode_event(a->app_key););
2632     assert(get_site_def());
2633     assert(site);
2634     add_site_def(a->body.app_u_u.nodes.node_list_len,
2635                  a->body.app_u_u.nodes.node_list_val, site);
2636     site->start = getstart(a);
2637     site->boot_key = a->app_key;
2638     site_install_action(site, a->body.c_t);
2639     return site;
2640   }
2641 }
2642 
/**
 * Result of checking if we can reconfigure the event horizon (see
 * allow_event_horizon()).
 *
 * We can reconfigure the event horizon if all group members support
 * reconfiguring the event horizon, and the new event horizon is in the domain
 * [EVENT_HORIZON_MIN, EVENT_HORIZON_MAX].
 *
 * We use the group's latest common XCom protocol as a proxy to decide if all
 * members support reconfiguring the event horizon.
 *
 * If the common protocol is at least version 5 (x_1_4) then all members run
 * compatible server instances.
 *
 * Otherwise there are older instances, and it follows that the event horizon
 * must be the default and cannot be reconfigured.
 */
enum allow_event_horizon_result {
  EVENT_HORIZON_ALLOWED,     /* The reconfiguration may proceed. */
  EVENT_HORIZON_INVALID,     /* Requested value is outside the domain. */
  EVENT_HORIZON_UNCHANGEABLE /* Some member cannot reconfigure it. */
};
typedef enum allow_event_horizon_result allow_event_horizon_result;
2665 
/*
 * Log a warning explaining why an event horizon reconfiguration was refused.
 *
 * error_code              reason for the refusal; EVENT_HORIZON_ALLOWED logs
 *                         nothing.
 * attempted_event_horizon the value that was requested.
 */
static void log_event_horizon_reconfiguration_failure(
    allow_event_horizon_result error_code,
    xcom_event_horizon attempted_event_horizon) {
  switch (error_code) {
    case EVENT_HORIZON_INVALID:
      /* Note the leading space in " because": without it the number and the
       * following word run together in the log message. */
      G_WARNING("The event horizon was not reconfigured to %" PRIu32
                " because its domain is [%" PRIu32 ", %" PRIu32 "]",
                attempted_event_horizon, xcom_get_minimum_event_horizon(),
                xcom_get_maximum_event_horizon());
      break;
    case EVENT_HORIZON_UNCHANGEABLE:
      G_WARNING("The event horizon was not reconfigured to %" PRIu32
                " because some of the group's members do not support "
                "reconfiguring the event horizon",
                attempted_event_horizon);
      break;
    case EVENT_HORIZON_ALLOWED:
      break;
  }
}
2686 
allow_event_horizon(xcom_event_horizon event_horizon)2687 static allow_event_horizon_result allow_event_horizon(
2688     xcom_event_horizon event_horizon) {
2689   if (event_horizon < EVENT_HORIZON_MIN || event_horizon > EVENT_HORIZON_MAX)
2690     return EVENT_HORIZON_INVALID;
2691 
2692   {
2693     const site_def *latest_config = get_site_def();
2694     if (!reconfigurable_event_horizon(latest_config->x_proto)) {
2695       assert(backwards_compatible(latest_config->event_horizon));
2696       return EVENT_HORIZON_UNCHANGEABLE;
2697     }
2698   }
2699   return EVENT_HORIZON_ALLOWED;
2700 }
2701 
unsafe_event_horizon_reconfiguration(app_data_ptr a)2702 static bool_t unsafe_event_horizon_reconfiguration(app_data_ptr a) {
2703   assert(a->body.c_t == set_event_horizon_type);
2704   {
2705     xcom_event_horizon new_event_horizon = a->body.app_u_u.event_horizon;
2706     bool_t result = FALSE;
2707     allow_event_horizon_result error_code;
2708     error_code = allow_event_horizon(new_event_horizon);
2709     switch (error_code) {
2710       case EVENT_HORIZON_INVALID:
2711       case EVENT_HORIZON_UNCHANGEABLE:
2712         log_event_horizon_reconfiguration_failure(error_code,
2713                                                   new_event_horizon);
2714         result = TRUE;
2715         break;
2716       case EVENT_HORIZON_ALLOWED:
2717         break;
2718     }
2719     return result;
2720   }
2721 }
2722 
are_there_dead_nodes_in_new_config(app_data_ptr a)2723 static bool_t are_there_dead_nodes_in_new_config(app_data_ptr a) {
2724   assert(a->body.c_t == force_config_type);
2725 
2726   {
2727     u_int nr_nodes_to_add = a->body.app_u_u.nodes.node_list_len;
2728     node_address *nodes_to_change = a->body.app_u_u.nodes.node_list_val;
2729     uint32_t i;
2730     G_DEBUG("Checking for dead nodes in Forced Configuration")
2731     for (i = 0; i < nr_nodes_to_add; i++) {
2732       node_no node = find_nodeno(get_site_def(), nodes_to_change[i].address);
2733 
2734       if (node == get_nodeno(get_site_def()))
2735         continue; /* No need to validate myself */
2736 
2737       if (node == VOID_NODE_NO) {
2738         G_ERROR(
2739             "%s is not in the current configuration."
2740             "Only members in the current configuration can be present"
2741             " in a forced configuration list",
2742             nodes_to_change[i].address)
2743         return TRUE;
2744       }
2745 
2746       if (may_be_dead(get_site_def()->detected, node, task_now())) {
2747         G_ERROR(
2748             "%s is suspected to be failed."
2749             "Only alive members in the current configuration should be present"
2750             " in a forced configuration list",
2751             nodes_to_change[i].address)
2752         return TRUE;
2753       }
2754     }
2755   }
2756 
2757   return FALSE;
2758 }
2759 
2760 /**
2761  * Reconfigure the event horizon.
2762  *
2763  * It is possible that concurrent reconfigurations take effect between the
2764  * time this reconfiguration was proposed and now.
2765  *
2766  * Particularly, it is possible that any of the concurrent reconfigurations
2767  * added a new member which does not support reconfiguring the event
2768  * horizon.
2769  *
2770  * We account for these situations by validating if the event horizon
2771  * reconfiguration is still possible under the current state.
2772  *
2773  * If it is not, this reconfiguration does not produce any effect, i.e. no
2774  * new configuration is installed.
2775  */
handle_event_horizon(app_data_ptr a)2776 bool_t handle_event_horizon(app_data_ptr a) {
2777   if (unsafe_event_horizon_reconfiguration(a)) return FALSE;
2778 
2779   {
2780     xcom_event_horizon new_event_horizon = a->body.app_u_u.event_horizon;
2781     const site_def *latest_config = get_site_def();
2782     site_def *new_config = clone_site_def(latest_config);
2783     IFDBG(D_NONE, FN; NDBG(new_event_horizon, u));
2784     IFDBG(D_NONE, FN; NDBG(new_event_horizon, u));
2785     ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
2786             add_synode_event(a->app_key););
2787     assert(get_site_def());
2788     assert(new_config);
2789     new_config->event_horizon = new_event_horizon;
2790     new_config->start = getstart(a);
2791     new_config->boot_key = a->app_key;
2792     site_install_action(new_config, a->body.c_t);
2793     G_INFO("The event horizon was reconfigured to %" PRIu32, new_event_horizon);
2794   }
2795   return TRUE;
2796 }
2797 
/* Drive the xcom state machine through x_fsm_terminate and then x_fsm_exit,
 * and finally invoke the expel callback with argument 0 if one is
 * registered. */
void terminate_and_exit() {
  IFDBG(D_NONE, FN;);
  ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
  XCOM_FSM(x_fsm_terminate, int_arg(0)); /* Tell xcom to stop */
  XCOM_FSM(x_fsm_exit, int_arg(0));      /* Tell xcom to exit */
  /* The callback is optional. */
  if (xcom_expel_cb) xcom_expel_cb(0);
}
2805 
/* Non-zero when the site has no nodes in its node list. */
static inline int is_empty_site(site_def const *s) {
  int const has_no_nodes = (s->nodes.node_list_len == 0);
  return has_no_nodes;
}
2809 
handle_remove_node(app_data_ptr a)2810 site_def *handle_remove_node(app_data_ptr a) {
2811   site_def *site = clone_site_def(get_site_def());
2812   IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(&a->body.app_u_u.nodes)));
2813   ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("a->app_key"));
2814           add_synode_event(a->app_key);
2815           add_event(EVENT_DUMP_PAD, string_arg("nodeno"));
2816           add_event(EVENT_DUMP_PAD, uint_arg(get_nodeno(site))););
2817 
2818   remove_site_def(a->body.app_u_u.nodes.node_list_len,
2819                   a->body.app_u_u.nodes.node_list_val, site);
2820   site->start = getstart(a);
2821   site->boot_key = a->app_key;
2822   site_install_action(site, a->body.c_t);
2823   return site;
2824 }
2825 
/*
 * Emit a debug message saying that a forced reconfiguration request is being
 * ignored, describing which kind of request it was.
 *
 * a           the ignored request; only reconfiguration cargo types produce
 *             a message, every other cargo type is silently skipped.
 * caller_name name of the calling function, used as the message prefix.
 */
static void log_ignored_forced_config(app_data_ptr a,
                                      char const *const caller_name) {
  switch (a->body.c_t) {
    case unified_boot_type:
      G_DEBUG("%s: Ignoring a forced intermediate, pending unified_boot",
              caller_name);
      break;
    case add_node_type:
      /* Only the first node of the list is named in the message. */
      G_DEBUG("%s: Ignoring a forced intermediate, pending add_node for %s",
              caller_name, a->body.app_u_u.nodes.node_list_val[0].address);
      break;
    case remove_node_type:
      /* Only the first node of the list is named in the message. */
      G_DEBUG("%s: Ignoring a forced intermediate, pending remove_node for %s",
              caller_name, a->body.app_u_u.nodes.node_list_val[0].address);
      break;
    case set_event_horizon_type:
      G_DEBUG(
          "%s: Ignoring a forced intermediate, pending set_event_horizon for "
          "%" PRIu32,
          caller_name, a->body.app_u_u.event_horizon);
      break;
    case force_config_type:
      G_DEBUG("%s: Ignoring a forced intermediate, pending force_config",
              caller_name);
      break;
    /* Every remaining cargo type is listed explicitly; there is no default
     * label. */
    case abort_trans:
    case app_type:
    case begin_trans:
    case convert_into_local_server_type:
    case disable_arbitrator:
    case enable_arbitrator:
    case exit_type:
    case get_event_horizon_type:
    case get_synode_app_data_type:
    case prepared_trans:
    case remove_reset_type:
    case reset_type:
    case set_cache_limit:
    case view_msg:
    case x_terminate_and_exit:
    case xcom_boot_type:
    case xcom_set_group:
      // Meaningless for any other `cargo_type`s. Ignore.
      break;
  }
}
2872 
handle_config(app_data_ptr a,bool const forced)2873 bool_t handle_config(app_data_ptr a, bool const forced) {
2874   assert(a->body.c_t == unified_boot_type ||
2875          a->next == NULL); /* Reconfiguration commands are not batched. */
2876   {
2877     bool_t success = FALSE;
2878     if (forced &&
2879         should_ignore_forced_config_or_view(get_executor_site()->x_proto)) {
2880       log_ignored_forced_config(a, "handle_config");
2881       goto end;
2882     }
2883     switch (a->body.c_t) {
2884       case unified_boot_type:
2885         success = (install_node_group(a) != NULL);
2886         assert(success);
2887         break;
2888       case add_node_type:
2889         /*
2890          * May fail if meanwhile the event horizon was reconfigured and the
2891          * node is incompatible.
2892          */
2893         success = (handle_add_node(a) != NULL);
2894         break;
2895       case remove_node_type:
2896         ADD_DBG(D_BASE,
2897                 add_event(EVENT_DUMP_PAD, string_arg("got remove_node_type"));)
2898         success = (handle_remove_node(a) != NULL);
2899         assert(success);
2900         break;
2901       case set_event_horizon_type:
2902         /* May fail if meanwhile an incompatible node joined. */
2903         success = handle_event_horizon(a);
2904         break;
2905       case force_config_type:
2906         success = (install_node_group(a) != NULL);
2907         assert(success);
2908         break;
2909       default:
2910         assert(FALSE); /* Boy oh boy, something is really wrong... */
2911         break;
2912     }
2913   end:
2914     return success;
2915   }
2916 }
2917 
/* Non-zero when the site's nodeno is something other than VOID_NODE_NO. */
static inline int is_member(site_def const *site) {
  int const has_node_number = (site->nodeno != VOID_NODE_NO);
  return has_node_number;
}
2921 
2922 /*
2923 Execute xcom message stream.
2924 
Beware of the exit logic in this task, which is both simple and
not so simple.  Consider three configs C1, C2 and C3.  C1 has two
nodes, A and B.  C2 has only node B.  C3 is empty.  A config with
message number N will be activated after a delay of (at least)
alpha messages, where alpha is the size of the pipeline (or the
event horizon).
2931 
So, C1.start = C1+alpha, and C2.start = C2+alpha. A, which is
removed from C1, cannot exit until a majority of nodes in the new
config C2 (in this case B) has learned all the messages from
config C1, which means all messages less than C2.start. How can A
know that a majority of C2 has learned those messages?
2937 
2938 If we denote the first message that is not yet decided (and  exe‐
2939 cuted)  by E, the proposers will not try to propose messages with
2940 number >= E+alpha, and all incoming  tcp  messages  with  message
2941 number  >=  E+alpha will be ignored.  E is incremented by the ex‐
2942 ecutor task, so all messages < E are known.  This means that when
2943 the value of E+alpha is known, all messages up to and including E
2944 are also known, although not all messages  E+1..E+alpha‐1  neces‐
2945 sarily are known.
2946 
2947 This  leads  to  the requirement that a node which is removed (A)
2948 needs to wait until it knows the value of  C2.start+alpha,  since
2949 by  then it knows that a majority of the nodes in C2 are ready to
2950 execute C2.start, which in turn implies that a majority of  nodes
2951 in  C2  knows  all  the values from config C1. Note that the last
2952 message that should be delivered to the  application  by  a  node
2953 that  is  leaving  C1 is C2.start‐1, which is the last message of
2954 C1.
2955 
2956 How does a node that is removed get to know values from the  next
2957 config?   There  are  two  ways, and we use both. First, the node
2958 that tries to exit can simply ask for the message.  get_xcom_mes‐
2959 sage()  will  do  this for all messages <= max_synode, but it may
2960 take some time.  Second, the nodes of C2 can  send  the  messages
2961 C2.start..C2.start+alpha  to  the  nodes  that are removed (nodes
2962 that are in C1 but not in C2).  inform_removed()  does  this.  We
2963 take  care to handle the case where configs are close enough that
2964 C0 < C1 <= C0+alpha by tracking the oldest config  that  contains
2965 nodes that are leaving.
2966 
2967 This  takes care of nodes leaving C1. What about nodes that leave
2968 C2? C3 is empty, so B, which is leaving C2, cannot wait for  mes‐
2969 sages  from  C3. But since C3 is empty, there is no need to wait.
2970 It can exit immediately after  having  executed  C3.start‐1,  the
2971 last message of C2. What if C3.start‐1 < C2.start+alpha? This can
2972 happen if C2 and C3 are close. In that case, B will exit before A
2973 gets the chance to learn C2.start+alpha, which will leave A hang‐
2974 ing forever. Clearly, we need to impose an additional constraint,
2975 that  C3.start must be greater than C2.start+alpha. This is taken
2976 care of by the special test for an empty config.
2977 
2978 Complicated and confusing? Not really, but there is a  clean  and
2979 simple  solution which has not been implemented yet, since it re‐
2980 quires more changes to the consensus logic.  If we  require  that
2981 for  the messages C2..C2.start‐1 we have a majority from both the
2982 nodes in C1 and the nodes in C2, the nodes not  in  C2  can  exit
2983 when  they  have  executed message C2.start‐1, since we then know
2984 that a majority of the nodes of C2 has agreed on  those  messages
2985 as  well,  so they do not depend on the nodes not in C2 any more.
2986 This holds even if C2 is empty.  Note that requiring  a  majority
2987 from  both  C1 and C2 is different from requiring a majority from
2988 C1+C2, which means that the proposer logic needs to consider  an‐
2989 swers  from  two  different sets of acceptors for those messages.
2990 Since acceptors are identified by their node number, and the node
2991 numbers  need  not be the same for both configs, we need to main‐
2992 tain a mapping between the nodes numbers of any  two  consecutive
2993 configs.  Alternatively,  we  could remove the node numbers alto‐
2994 gether, and always use a unique, unchanging ID for a  node,  like
2995 IP address + port.
2996 
2997 TODO:
2998 
2999 Move the delayed delivery logic into MCM-specific code, since it is
3000 only needed by MCM.  Is it still needed?
3001 
3002 Rewrite exit logic as FSM with more states. (RUN, EMPTY_EXIT,
3003 NOT_MEMBER_EXIT) to avoid unnecessary tests.
3004 
3005 */
3006 
/* FIFO which tracks the message numbers where we should deliver queued
   messages or inform the removed nodes.
   It is a fixed-capacity circular buffer of FIFO_SIZE entries. */
#define FIFO_SIZE 1000
static struct {
  int n;                /* Number of entries currently queued */
  int front;            /* Index of the oldest entry (next to extract) */
  int rear;             /* Index of the slot the next insert writes to */
  synode_no q[FIFO_SIZE];
} delay_fifo;
3017 
addone(int i)3018 static inline int addone(int i) { return ((i + 1) % FIFO_SIZE); }
3019 
3020 /* Is queue empty?  */
fifo_empty()3021 static inline int fifo_empty() { return delay_fifo.n <= 0; }
3022 
3023 /* Is queue full?  */
fifo_full()3024 static inline int fifo_full() { return delay_fifo.n >= FIFO_SIZE; }
3025 
3026 /* Insert in queue  */
fifo_insert(synode_no s)3027 static inline void fifo_insert(synode_no s) {
3028   if (!fifo_full()) {
3029     delay_fifo.n++;
3030     delay_fifo.q[delay_fifo.rear] = s;
3031     delay_fifo.rear = addone(delay_fifo.rear);
3032   }
3033 }
3034 
3035 /* Extract first from queue  */
fifo_extract()3036 static inline synode_no fifo_extract() {
3037   if (!fifo_empty()) {
3038     synode_no ret = delay_fifo.q[delay_fifo.front];
3039     delay_fifo.front = addone(delay_fifo.front);
3040     delay_fifo.n--;
3041     return ret;
3042   } else {
3043     return null_synode;
3044   }
3045 }
3046 
3047 /* Return first in queue, but do not dequeue  */
fifo_front()3048 static inline synode_no fifo_front() {
3049   if (!fifo_empty()) {
3050     return delay_fifo.q[delay_fifo.front];
3051   } else {
3052     return null_synode;
3053   }
3054 }
3055 
struct execute_context;
typedef struct execute_context execute_context;

/* A state of the executor: a function that acts on the context and may
 * replace xc->state to select the next state. */
typedef void (*exec_fp)(execute_context *xc);

/* State carried by the executor between state function invocations. */
struct execute_context {
  pax_machine *p; /* Machine whose learned message x_fetch examines */
  /* n, old_n, old_t: presumably the counters handed to perf_dbg() - TODO
   * confirm against the executor task. */
  int n;
  int old_n;
  double old_t;
  synode_no exit_synode;    /* executed_msg must reach this before exiting */
  synode_no delivery_limit; /* delivered_msg must reach this before exiting */
  exec_fp state;            /* Current state function */
  int exit_flag; /* To avoid state explosion */
  int inform_index; /* Site index passed to inform_removed(); incremented when
                       a trigger is queued, decremented when one is handled */
};

/* State functions and helpers of the executor, defined further down. */
static void dump_exec_state(execute_context *xc, long dbg);
static int x_check_exit(execute_context *xc);
static int x_check_execute_inform(execute_context *xc);
static void x_fetch(execute_context *xc);
static void x_execute(execute_context *xc);
static void x_check_increment_fetch(execute_context *xc);
static void x_check_increment_execute(execute_context *xc);
static void x_terminate(execute_context *xc);
3081 
/* Associates a state function with its printable name. */
struct fp_name {
  exec_fp fp;
  char const *name;
};

/* Build an fp_name initializer from a function identifier, using the
 * stringified identifier as the name. */
#define NAME(f) \
  { f, #f }

/* List of fp, name pairs, terminated by a {0, 0} sentinel */
static struct fp_name oblist[] = {
    NAME(x_fetch), NAME(x_execute), NAME(x_terminate), {0, 0}};
#undef NAME
3094 
3095 /* purecov: begin deadcode */
get_fp_name(exec_fp fp)3096 char const *get_fp_name(exec_fp fp) {
3097   struct fp_name *list = oblist;
3098   while (list->fp) {
3099     if (list->fp == fp) return list->name;
3100     list++;
3101   }
3102   return "no such fp";
3103 }
3104 /* purecov: end */
3105 
/*
 * Queue the synode that triggers the next inform/deliver step for the newly
 * installed config `site` and, when this node is not a member of `site`, arm
 * the exit logic in xc.
 *
 * Side effects visible here: may raise max_synode, may move site->start
 * forward when the site is empty, inserts one entry into the delay FIFO and
 * increments xc->inform_index.
 */
static void setup_exit_handling(execute_context *xc, site_def *site) {
  synode_no delay_until;
  if (is_member(site)) {
    delay_until = compute_delay(site->start, site->event_horizon);
  } else { /* Not in this site */
    /* See if site will be empty when we leave. If the new site
     * is empty, we should exit after having delivered the last
     * message from the old site. */

    /* Note limit of delivery. We should never deliver anything after the start
     * of the next site. */
    xc->delivery_limit = site->start;

    /* If we are not a member of the new site, we should exit
      after having seen enough messages beyond the end of the current site.
      This ensures that a majority of the next site will have agreed upon all
      messages that belong to the current site.
     */
    xc->exit_synode = compute_delay(site->start, site->event_horizon);
    if (is_empty_site(site)) {
      /* If site is empty, increase start to allow nodes to terminate before
       * start. This works as if there was a non-empty group after the
       * exit_synode, effectively allowing the majority of the current group to
       * agree on all messages up to exit_synode.
       */
      /* Note: delivery_limit and exit_synode above were computed from the
       * original start; only site->start itself is pushed out by two event
       * horizons here. */
      site->start = compute_delay(
          compute_delay(site->start, site->event_horizon), site->event_horizon);
    }
    if (!synode_lt(xc->exit_synode, max_synode)) {
      /* We need messages from the next site, so set max_synode accordingly. */
      set_max_synode(incr_synode(xc->exit_synode));
    }
    /* Note where we switch to execute and inform removed nodes */
    delay_until = xc->exit_synode;

    IFDBG(D_EXEC, FN; SYCEXP(delay_until); SYCEXP(executed_msg);
          SYCEXP(max_synode));
    IFDBG(D_EXEC, FN; SYCEXP(xc->exit_synode); SYCEXP(executed_msg);
          SYCEXP(max_synode));

    /* Note that we will exit */
    xc->exit_flag = 1;
  }

  /* Ensure that max_synode is greater than trigger for delivery
   */
  if (synode_gt(delay_until, max_synode))
    set_max_synode(incr_msgno(delay_until));
  /* Queue the trigger; x_check_execute_inform() consumes it once executed_msg
   * has reached it. */
  fifo_insert(delay_until);
  (xc->inform_index)++;

  /* If I am the leader, will propose no-ops until current max_synode
   */
}
3160 
/* Called immediately after we have got a new message.
   Handle config messages immediately.
   Terminate if no site is available after a successful reconfiguration.
   Otherwise, finish by calling x_check_increment_fetch(), which checks for
   exit and advances executed_msg. */
static void x_fetch(execute_context *xc) {
  /* Execute unified_boot immediately, but do not deliver site message
   * until we are ready to execute messages from the new site
   * definition. At that point we can be certain that a majority have
   * learned everything from the old site. */

  app_data *app = xc->p->learner.msg->a;
  /* Only act on config messages that come after the boot key of the current
   * site. */
  if (app && is_config(app->body.c_t) &&
      synode_gt(executed_msg, get_site_def()->boot_key)) /* Redo test */
  {
    site_def *site = 0;
    bool_t reconfiguration_successful =
        handle_config(app, (xc->p->learner.msg->force_delivery != 0));
    if (reconfiguration_successful) {
      /* If the reconfiguration failed then it does not have any
       * effect. What follows only makes sense if the reconfiguration
       * took effect. */
      set_last_received_config(executed_msg);
      garbage_collect_site_defs(delivered_msg);
      site = get_site_def_rw();
      if (site == 0) {
        /* Early return: executed_msg is not incremented on this path. */
        xc->state = x_terminate;
        return;
      }
      IFDBG(D_EXEC, FN; STRLIT("new config "); SYCEXP(site->boot_key););

      if (xc->exit_flag == 0) {
        /* We have not yet set the exit trigger */
        setup_exit_handling(xc, site);
      }
    }
  } else {
    IFDBG(D_EXEC, FN; SYCEXP(executed_msg); SYCEXP(get_site_def()->boot_key));
  }
  /* Check for exit and increment executed_msg */
  x_check_increment_fetch(xc);
}
3202 
3203 /* Push messages to nodes that have been removed.
3204    Signal switch to execute when nothing left to push by returning 1 */
x_check_execute_inform(execute_context * xc)3205 static int x_check_execute_inform(execute_context *xc) {
3206   IFDBG(D_EXEC, FN; SYCEXP(fifo_front()); SYCEXP(executed_msg);
3207         SYCEXP(xc->exit_synode); NDBG(xc->exit_flag, d));
3208   if (fifo_empty()) {
3209     return 1;
3210   } else if (!synode_lt(executed_msg, fifo_front())) {
3211     while (
3212         !fifo_empty() &&
3213         !synode_lt(executed_msg, fifo_front())) { /* More than one may match */
3214       inform_removed(xc->inform_index, 0);
3215       fifo_extract();
3216       (xc->inform_index)--;
3217     }
3218     garbage_collect_servers();
3219     return 1;
3220   }
3221   dump_exec_state(xc, D_EXEC);
3222   return 0;
3223 }
3224 
3225 /* Check for exit and return 1 if we should exit. */
x_check_exit(execute_context * xc)3226 static int x_check_exit(execute_context *xc) {
3227   /* See if we should exit when having seen this message */
3228   return (xc->exit_flag && !synode_lt(executed_msg, xc->exit_synode) &&
3229           !synode_lt(delivered_msg, xc->delivery_limit));
3230 }
3231 
3232 /* Terminate if we should exit, else increment executed_msg and see if we should
3233  * switch to execute */
x_check_increment_fetch(execute_context * xc)3234 static void x_check_increment_fetch(execute_context *xc) {
3235   if (x_check_exit(xc)) {
3236     xc->state = x_terminate;
3237   } else {
3238     SET_EXECUTED_MSG(incr_synode(executed_msg));
3239     if (x_check_execute_inform(xc)) {
3240       xc->state = x_execute;
3241     }
3242   }
3243 }
3244 
3245 /* Terminate if we should exit, else increment delivered_msg and see if we
3246  * should switch to fetch */
x_check_increment_execute(execute_context * xc)3247 static void x_check_increment_execute(execute_context *xc) {
3248   if (x_check_exit(xc)) {
3249     xc->state = x_terminate;
3250   } else {
3251     /* Increment delivered_msg and switch to fetch if delivered_msg equals
3252      * executed_msg; */
3253     delivered_msg = incr_synode(delivered_msg);
3254     if (synode_eq(delivered_msg, executed_msg)) {
3255       xc->state = x_fetch;
3256     }
3257   }
3258 }
3259 
3260 /* Deliver one message if it should be delivered. Switch state to see if
3261    we should exit */
x_execute(execute_context * xc)3262 static void x_execute(execute_context *xc) {
3263   site_def const *x_site = find_site_def(delivered_msg);
3264 
3265   IFDBG(D_EXEC, FN; SYCEXP(delivered_msg); SYCEXP(delivered_msg);
3266         SYCEXP(executed_msg); SYCEXP(xc->exit_synode); NDBG(xc->exit_flag, d));
3267   if (!is_cached(delivered_msg)) {
3268     /* purecov: begin deadcode */
3269 #ifdef TASK_EVENT_TRACE
3270     dump_task_events();
3271 #endif
3272     /* purecov: end */
3273   }
3274   assert(is_cached(delivered_msg) && "delivered_msg should have been cached");
3275   xc->p = get_cache(delivered_msg);
3276   if (LOSER(delivered_msg, x_site)) {
3277 #ifdef IGNORE_LOSERS
3278     IFDBG(D_EXEC, FN; debug_loser(delivered_msg); PTREXP(x_site);
3279           dbg_node_set(x_site->global_node_set));
3280 #endif
3281   } else if (xc->p->learner.msg->msg_type != no_op) {
3282     /* Avoid delivery after start if we should exit */
3283     if (xc->exit_flag == 0 || synode_lt(delivered_msg, xc->delivery_limit)) {
3284       /* IFDBG(D_EXEC, FN; NDBG(ep->state, d); STRLIT("executing ");
3285          SYCEXP(delivered_msg); SYCEXP(executed_msg);
3286               SYCEXP(xc->delivery_limit); NDBG(xc->exit_flag, d)); */
3287       last_delivered_msg = delivered_msg;
3288       execute_msg(find_site_def_rw(delivered_msg), xc->p, xc->p->learner.msg);
3289     }
3290   }
3291   /* Garbage collect old servers */
3292   if (synode_eq(delivered_msg, x_site->start)) {
3293     garbage_collect_servers();
3294   }
3295 #if defined(TASK_DBUG_ON) && TASK_DBUG_ON
3296   IFDBG(D_EXEC, perf_dbg(&xc->n, &xc->old_n, &xc->old_t));
3297 #endif
3298   /* Check for exit and increment delivered_msg */
3299   x_check_increment_execute(xc);
3300 }
3301 
3302 static execute_context *debug_xc;
3303 
dump_exec_state(execute_context * xc MY_ATTRIBUTE ((unused)),long dbg MY_ATTRIBUTE ((unused)))3304 static void dump_exec_state(execute_context *xc MY_ATTRIBUTE((unused)),
3305                             long dbg MY_ATTRIBUTE((unused))) {
3306   IFDBG(dbg, FN; SYCEXP(executed_msg); SYCEXP(delivered_msg);
3307         SYCEXP(max_synode); SYCEXP(last_delivered_msg); NDBG(delay_fifo.n, d);
3308         NDBG(delay_fifo.front, d); NDBG(delay_fifo.rear, d);
3309         SYCEXP(fifo_front()); SYCEXP(xc->exit_synode);
3310         SYCEXP(xc->delivery_limit); NDBG(xc->exit_flag, d);
3311         NDBG(xc->inform_index, d); NDBG(prop_started, d);
3312         NDBG(prop_finished, d););
3313 }
3314 
dump_debug_exec_state()3315 static void dump_debug_exec_state() {
3316   if (debug_xc) dump_exec_state(debug_xc, D_EXEC);
3317 }
3318 
3319 /* Terminate the excutor_task. */
x_terminate(execute_context * xc)3320 static void x_terminate(execute_context *xc) {
3321   dump_exec_state(xc, D_BUG);
3322   xc->state = 0;
3323 }
3324 
/* Task running the executor state machine.
   After initialising its execution context and the delay fifo it loops,
   calling the function stored in xc.state, until xcom_shutdown is set or a
   state function clears xc.state. On leaving the loop it informs removed
   nodes and, unless NO_DELAYED_TERMINATION is defined, waits
   TERMINATE_DELAY before calling terminate_and_exit(). */
static int executor_task(task_arg arg MY_ATTRIBUTE((unused))) {
  DECL_ENV
  execute_context xc;
  END_ENV;
  IFDBG(D_EXEC, FN; NDBG(stack->sp->state, d); SYCEXP(executed_msg););
  TASK_BEGIN

  /* Fresh execution context */
  ep->xc.p = NULL;
  ep->xc.n = 0;
  ep->xc.old_n = 0;
  ep->xc.old_t = task_now();
  ep->xc.exit_synode = null_synode;
  ep->xc.delivery_limit = null_synode;
  ep->xc.exit_flag = 0;
  ep->xc.inform_index = -1;

  /* Empty delay fifo */
  delay_fifo.n = 0;
  delay_fifo.front = 0;
  delay_fifo.rear = 0;

  /* Make the context reachable from dump_debug_exec_state() */
  debug_xc = &ep->xc;

  if (executed_msg.msgno == 0) executed_msg.msgno = 1;
  delivered_msg = executed_msg;
  ep->xc.state = x_fetch;
  executor_site = find_site_def_rw(executed_msg);

  /* State machine built on function pointers: every state function works on
     the execution context and selects the next state by storing the
     corresponding function in xc->state. */
  while (!xcom_shutdown && ep->xc.state != 0) {
    IFDBG(D_EXEC, FN; STRLIT(get_fp_name(ep->xc.state)););
    if (ep->xc.state != x_fetch) {
      ep->xc.state(&ep->xc);
    } else if (LOSER(executed_msg, executor_site)) {
      /* x_fetch is handled here because of the task macros */
      x_check_increment_fetch(&ep->xc); /* Just increment past losers */
    } else {
      TASK_CALL(get_xcom_message(&ep->xc.p, executed_msg, FIND_MAX));
      IFDBG(D_EXEC, FN; STRLIT("got message "); SYCEXP(ep->xc.p->synode);
            COPY_AND_FREE_GOUT(dbg_app_data(ep->xc.p->learner.msg->a)));
      x_fetch(&ep->xc);
    }
  }

  /* Inform all removed nodes before we exit */
  ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
  inform_removed(ep->xc.inform_index, 1);
  dump_exec_state(&ep->xc, D_BUG);

#ifndef NO_DELAYED_TERMINATION
  IFDBG(D_EXEC, FN; STRLIT("delayed terminate and exit"));

  /* Wait to allow messages to propagate */
  TASK_DELAY(TERMINATE_DELAY);

  /* Start termination of xcom */
  terminate_and_exit();
#endif

  FINALLY
  dump_exec_state(&ep->xc, D_BUG);
  IFDBG(D_BUG, FN; STRLIT(" shutdown "); SYCEXP(executed_msg);
        NDBG(task_now(), f));
  TASK_END;
}
3393 
get_sweep_start()3394 static synode_no get_sweep_start() {
3395   synode_no find = executed_msg;
3396   find.node = get_nodeno(find_site_def(find));
3397   if (find.node < executed_msg.node) {
3398     find = incr_msgno(find);
3399   }
3400   return find;
3401 }
3402 
/* Task that marks unused message slots as skipped.
   Starting at get_sweep_start() it walks ep->find forward with incr_msgno()
   while ep->find is below max_synode and not too_far(). A slot is skipped
   (op set to skip_op and skip_msg() called) when its machine is cached, is
   not marked force_delivery, is not busy, has no promise, has no accepted
   message and is not finished. After each pass the task deactivates itself
   and repeats until xcom_shutdown is set. */
static int sweeper_task(task_arg arg MY_ATTRIBUTE((unused))) {
  DECL_ENV
  synode_no find;
  END_ENV;

  TASK_BEGIN

  ep->find = get_sweep_start();

  while (!xcom_shutdown) {
    ep->find.group_id =
        executed_msg.group_id; /* In case group id has changed */
#ifndef AGGRESSIVE_SWEEP
    while (!is_only_task()) {
      TASK_YIELD;
    }
#endif
    ADD_DBG(D_NONE, add_event(EVENT_DUMP_PAD, string_arg("sweeper ready"));
            add_synode_event(executed_msg););

    while (synode_lt(ep->find, max_synode) && !too_far(ep->find)) {
      pax_machine *pm = 0;
      ADD_DBG(D_NONE,
              add_event(EVENT_DUMP_PAD, string_arg("sweeper examining"));
              add_synode_event(ep->find););
      if (ep->find.node == VOID_NODE_NO) {
        /* Recompute the start position once executed_msg has moved past
           ep->find; give up for this pass if we still have no node number */
        if (synode_gt(executed_msg, ep->find)) {
          ep->find = get_sweep_start();
        }
        if (ep->find.node == VOID_NODE_NO) goto deactivate;
      }
      pm = get_cache(ep->find);
      /* pm is tested for NULL below, so the trace must not dereference it
         unconditionally */
      ADD_DBG(D_CONS,
              add_event(EVENT_DUMP_PAD, string_arg("sweeper checking"));
              add_synode_event(ep->find);
              if (pm) add_event(EVENT_DUMP_PAD,
                                string_arg(pax_op_to_str(pm->op)));
              add_event(EVENT_DUMP_PAD, string_arg("pm"));
              add_event(EVENT_DUMP_PAD, void_arg(pm)););
      if (pm && !pm->force_delivery) { /* We want full 3 phase Paxos for
                                          forced messages */
        ADD_DBG(
            D_CONS, add_event(EVENT_DUMP_PAD, string_arg("sweeper checking"));
            add_synode_event(ep->find);
            add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(pm->op)));
            add_event(EVENT_DUMP_PAD, string_arg("is_busy_machine"));
            add_event(EVENT_DUMP_PAD, int_arg(is_busy_machine(pm)));
            add_event(EVENT_DUMP_PAD, string_arg("pm->acceptor.promise.cnt"));
            add_event(EVENT_DUMP_PAD, int_arg(pm->acceptor.promise.cnt));
            add_event(EVENT_DUMP_PAD, string_arg("finished(pm)"));
            add_event(EVENT_DUMP_PAD, int_arg(finished(pm)));
            add_event(EVENT_DUMP_PAD, string_arg("pm->acceptor.msg"));
            add_event(EVENT_DUMP_PAD, void_arg(pm->acceptor.msg)););
        if (!is_busy_machine(pm) && pm->acceptor.promise.cnt == 0 &&
            !pm->acceptor.msg && !finished(pm)) {
          pm->op = skip_op;
          ADD_DBG(D_CONS,
                  add_event(EVENT_DUMP_PAD, string_arg("sweeper skipping"));
                  add_synode_event(ep->find); add_event(
                      EVENT_DUMP_PAD, string_arg(pax_op_to_str(pm->op))););
          skip_msg(pax_msg_new(ep->find, find_site_def(ep->find)));
          IFDBG(D_NONE, FN; STRLIT("skipping "); SYCEXP(ep->find));
        }
      }
      ep->find = incr_msgno(ep->find);
    }
  deactivate:
    TASK_DEACTIVATE;
  }
  FINALLY
  IFDBG(D_BUG, FN; STRLIT(" shutdown sweeper "); SYCEXP(executed_msg);
        NDBG(task_now(), f));
  TASK_END;
}
3486 
/* Compute the next wakeup delay from the previous one.
   With old == 0.0 the delay is 0.1 + 5*m + m*xcom_drand48(), where m is
   median_time(), replaced by 0.1 when it is zero or above 0.3. Otherwise the
   previous delay is multiplied by 1.4142136 (exponential backoff).
   The result is divided by 1.31415926 until it no longer exceeds the
   maximum threshold (1.0 with EXECUTOR_TASK_AGGRESSIVE_NO_OP, else 3.0). */
static double wakeup_delay(double old) {
#ifdef EXECUTOR_TASK_AGGRESSIVE_NO_OP
  double const maximum_threshold = 1.0;
#else
  double const maximum_threshold = 3.0;
#endif /* EXECUTOR_TASK_AGGRESSIVE_NO_OP */
  double retval = 0.0;

  if (0.0 != old) {
    retval = old * 1.4142136; /* Exponential backoff */
  } else {
    double m = median_time();
    if (m == 0.0 || m > 0.3) m = 0.1;
    retval = 0.1 + 5.0 * m + m * xcom_drand48();
  }

  while (retval > maximum_threshold) {
    retval /= 1.31415926;
  }
  return retval;
}
3507 
/* Propose a no-op for synode find using pax_machine p.
   A new message for find replaces p->proposer.msg and is turned into a
   no-op by create_noop(). A clone of it is then handed to push_msg_3p();
   if cloning fails, only a debug message is logged. */
static void propose_noop(synode_no find, pax_machine *p) {
  site_def const *site = find_site_def(find);
  pax_msg *clone = NULL;

  IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(executed_msg));
  assert(!too_far(find));

  /* Prepare to send a noop */
  replace_pax_msg(&p->proposer.msg, pax_msg_new(find, site));
  assert(p->proposer.msg);
  create_noop(p->proposer.msg);

  clone = clone_pax_msg(p->proposer.msg);
  if (clone == NULL) {
    /* purecov: begin inspected */
    G_DEBUG("Unable to propose NoOp due to an OOM error.");
    /* purecov: end */
    return;
  }
  push_msg_3p(site, p, clone, find, no_op);
}
3527 
/* Send a read request for synode find.
   Nothing is sent when no site definition is found for find. If find.node
   equals our node number in that site, or if we have no node number, the
   request goes to all other nodes through send_to_others(); otherwise it is
   sent through send_to_someone(). */
static void send_read(synode_no find) {
  site_def const *site = find_site_def(find);

  IFDBG(D_NONE, FN; NDBG(get_maxnodes(site), u); NDBG(get_nodeno(site), u););
  ADD_DBG(D_CONS, add_event(EVENT_DUMP_PAD, string_arg("find"));
          add_synode_event(find); add_event(EVENT_DUMP_PAD, string_arg("site"));
          add_event(EVENT_DUMP_PAD, void_arg((void *)find_site_def_rw(find)));
          add_event(EVENT_DUMP_PAD, string_arg("get_nodeno(site)"));
          add_event(EVENT_DUMP_PAD, uint_arg(get_nodeno(site))););

  if (site) {
    /* See if node number matches ours */
    int const own_slot = (find.node == get_nodeno(site));
    pax_msg *pm = pax_msg_new(find, site);

    /* Prepare to send a read_op */
    ref_msg(pm);
    create_read(site, pm);

    if (own_slot) {
      /* Node number matches our own number, ask all the others */
      send_to_others(site, pm, "send_read");
    } else {
      IFDBG(D_NONE, FN; SYCEXP(find););

      IFDBG(D_NONE, FN; NDBG(get_maxnodes(site), u); NDBG(get_nodeno(site), u);
            PTREXP(pm));
      if (get_nodeno(site) == VOID_NODE_NO) {
        /* If we have no node number, ask all the others */
        send_to_others(site, pm, "send_read");
      } else {
        /* Ask a random node */
        send_to_someone(site, pm, "send_read");
      }
    }
    unref_msg(&pm);
  }
}
3570 
3571 /* Find missing values */
3572 
/* Return non-zero if a no-op may be proposed for machine p: p must be a
   forcing node or not recently active, and in addition neither finished nor
   busy. */
static int ok_to_propose(pax_machine *p) {
  int retval = 0;

  if (is_forcing_node(p) || !recently_active(p)) {
    retval = !finished(p) && !is_busy_machine(p);
  }
  IFDBG(D_NONE, FN; NDBG(p->synode.node, u); NDBG(recently_active(p), d);
        NDBG(finished(p), d); NDBG(is_busy_machine(p), d); NDBG(retval, d));
  return retval;
}
3580 
read_missing_values(int n)3581 static void read_missing_values(int n) {
3582   synode_no find = executed_msg;
3583   synode_no end = max_synode;
3584   int i = 0;
3585 
3586   IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end));
3587   if (synode_gt(executed_msg, max_synode) ||
3588       synode_eq(executed_msg, null_synode))
3589     return;
3590 
3591   while (!synode_gt(find, end) && i < n && !too_far(find)) {
3592     pax_machine *p = force_get_cache(find);
3593     ADD_DBG(D_NONE, add_synode_event(find); add_synode_event(end);
3594             add_event(EVENT_DUMP_PAD, string_arg("active "));
3595             add_event(EVENT_DUMP_PAD, int_arg(recently_active(p)));
3596             add_event(EVENT_DUMP_PAD, string_arg("finished  "));
3597             add_event(EVENT_DUMP_PAD, int_arg(finished(p)));
3598             add_event(EVENT_DUMP_PAD, string_arg("busy "));
3599             add_event(EVENT_DUMP_PAD, int_arg(is_busy_machine(p))););
3600     IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end); NDBG(recently_active(p), d);
3601           NDBG(finished(p), d); NDBG(is_busy_machine(p), d));
3602     if (!recently_active(p) && !finished(p) && !is_busy_machine(p)) {
3603       send_read(find);
3604     }
3605     find = incr_synode(find);
3606     i++;
3607   }
3608 }
3609 
propose_missing_values(int n)3610 static void propose_missing_values(int n) {
3611   synode_no find = executed_msg;
3612   synode_no end = max_synode;
3613   int i = 0;
3614 
3615   IFDBG(D_NONE, FN; NDBG(get_maxnodes(get_site_def()), u); SYCEXP(find);
3616         SYCEXP(end));
3617   if (synode_gt(executed_msg, max_synode) ||
3618       synode_eq(executed_msg, null_synode))
3619     return;
3620 
3621   IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end));
3622   i = 0;
3623   while (!synode_gt(find, end) && i < n && !too_far(find)) {
3624     pax_machine *p = force_get_cache(find);
3625     if (wait_forced_config) {
3626       force_pax_machine(p, 1);
3627     }
3628     IFDBG(D_NONE, FN; NDBG(ok_to_propose(p), d); TIMECEXP(task_now());
3629           TIMECEXP(p->last_modified); SYCEXP(find));
3630     if (get_nodeno(find_site_def(find)) == VOID_NODE_NO) break;
3631     if (ok_to_propose(p)) {
3632       propose_noop(find, p);
3633     }
3634     find = incr_synode(find);
3635     i++;
3636   }
3637 }
3638 
3639 /* Propose a noop for the range find..end */
request_values(synode_no find,synode_no end)3640 void request_values(synode_no find, synode_no end) {
3641   IFDBG(D_NONE, FN; SYCEXP(find); SYCEXP(end););
3642   while (!synode_gt(find, end) && !too_far(find)) {
3643     pax_machine *p = get_cache(find);
3644     site_def const *site = find_site_def(find);
3645     if (get_nodeno(site) == VOID_NODE_NO) break;
3646     if (!finished(p) && !is_busy_machine(p)) {
3647       /* Prepare to send a noop */
3648       replace_pax_msg(&p->proposer.msg, pax_msg_new(find, site));
3649       assert(p->proposer.msg);
3650       create_noop(p->proposer.msg);
3651 
3652       IFDBG(D_NONE, FN; STRLIT("propose "); SYCEXP(find););
3653       push_msg_3p(site, p, pax_msg_new(find, site), find, no_op);
3654     }
3655     find = incr_synode(find);
3656   }
3657 }
3658 
3659 /* Message handlers */
3660 
/*
Reply to the sender of a message.
Avoid using the outbound TCP connection to the node that sent the message, since
it is simpler and safer to always use the same TCP connection as the one the
message arrived on. We then know that the answer will always go to the same
client (and the same instance of that client) that sent the request.

reply_msg(m) dispatches m locally when its sender is the local node and
otherwise links it into reply_queue. It relies on `site` and `reply_queue`
being visible where the macro is expanded.
*/
#define reply_msg(m)                                              \
  {                                                               \
    if (is_local_node((m)->from, site)) {                         \
      dispatch_op(site, (m), NULL);                               \
    } else {                                                      \
      link_into(&(msg_link_new((m), (m)->from)->l), reply_queue); \
    }                                                             \
  }

/* Declare a local `reply` and initialise it through CLONE_PAX_MSG from x.
   Expands to a declaration, so it has to appear where declarations are
   allowed. */
#define CREATE_REPLY(x)  \
  pax_msg *reply = NULL; \
  CLONE_PAX_MSG(reply, x)

/* Send the local `reply` with reply_msg() and then drop our reference to it.
   Wrapped in do { } while (0) so that the two steps stay together when the
   macro is the body of an unbraced if. */
#define SEND_REPLY                  \
  do {                              \
    reply_msg(reply);               \
    replace_pax_msg(&reply, NULL);  \
  } while (0)
3684 
safe_app_data_copy(pax_msg ** target,app_data_ptr source)3685 bool_t safe_app_data_copy(pax_msg **target, app_data_ptr source) {
3686   copy_app_data(&(*target)->a, source);
3687   if ((*target)->a == NULL && source != NULL) {
3688     oom_abort = 1;
3689     replace_pax_msg(target, NULL);
3690     return FALSE;
3691   }
3692   return TRUE;
3693 }
3694 
/* Build a learn message for synode from the value learned in p.
   The reply is created from pm with CREATE_REPLY and receives the proposal,
   message type and a copy of the app data of p->learner.msg. Returns NULL if
   copying the app data fails.
   NOTE(review): reply is dereferenced right after CREATE_REPLY without a
   NULL check - confirm CLONE_PAX_MSG cannot leave it NULL. */
static pax_msg *create_learn_msg_for_ignorant_node(pax_machine *p, pax_msg *pm,
                                                   synode_no synode) {
  pax_msg *learned = p->learner.msg;
  CREATE_REPLY(pm);
  IFDBG(D_NONE, FN; SYCEXP(synode));

  reply->synode = synode;
  reply->proposal = learned->proposal;
  reply->msg_type = learned->msg_type;
  safe_app_data_copy(&reply, learned->a);
  if (reply != NULL) {
    set_learn_type(reply);
  }
  return reply;
}
3707 
/* Send the value learned in p for synode back to the sender of pm.
   Nothing is sent if the learn message cannot be created.
   site and reply_queue are used by the SEND_REPLY macro. */
static void teach_ignorant_node(site_def const *site, pax_machine *p,
                                pax_msg *pm, synode_no synode,
                                linkage *reply_queue) {
  pax_msg *reply = create_learn_msg_for_ignorant_node(p, pm, synode);
  /* SEND_REPLY is a multi-statement macro, so it must be braced */
  if (reply != NULL) {
    SEND_REPLY;
  }
}
3714 
3715 /* Handle incoming read */
handle_read(site_def const * site,pax_machine * p,linkage * reply_queue,pax_msg * pm)3716 static void handle_read(site_def const *site, pax_machine *p,
3717                         linkage *reply_queue, pax_msg *pm) {
3718   IFDBG(D_NONE, FN; BALCEXP(pm->proposal); BALCEXP(p->acceptor.promise);
3719         if (p->acceptor.msg) BALCEXP(p->acceptor.msg->proposal);
3720         STRLIT("type "); STRLIT(pax_msg_type_to_str(pm->msg_type)));
3721 
3722   if (finished(p)) { /* We have learned a value */
3723     teach_ignorant_node(site, p, pm, pm->synode, reply_queue);
3724   }
3725 }
3726 
/* Build the answer to a prepare for synode.
   If p has accepted a value, the reply is an ack_prepare_op carrying the
   proposal, message type and a copy of the app data of the accepted
   message; the reply becomes NULL if that copy fails. Otherwise the reply is
   an ack_prepare_empty_op. */
static pax_msg *create_ack_prepare_msg(pax_machine *p, pax_msg *pm,
                                       synode_no synode) {
  CREATE_REPLY(pm);
  reply->synode = synode;

  if (!accepted(p)) {
    IFDBG(D_NONE, FN; STRLIT(" no value synode "); SYCEXP(synode));
    reply->op = ack_prepare_empty_op;
  } else { /* We have accepted a value */
    reply->proposal = p->acceptor.msg->proposal;
    reply->msg_type = p->acceptor.msg->msg_type;
    IFDBG(D_NONE, FN; STRLIT(" already accepted value "); SYCEXP(synode));
    reply->op = ack_prepare_op;
    safe_app_data_copy(&reply, p->acceptor.msg->a);
  }
  return reply;
}
3743 
/* Acceptor side of a prepare for synode.
   Returns a learn message if p has already learned a value. Otherwise, if
   the proposal in pm is greater than the current promise, or noop_match()
   holds, the machine is touched, the promise is raised when the proposal is
   greater, and an ack_prepare reply is returned. Returns NULL when the
   prepare is not answered. */
pax_msg *handle_simple_prepare(pax_machine *p, pax_msg *pm, synode_no synode) {
  int greater = 0;

  if (finished(p)) { /* We have learned a value */
    IFDBG(D_NONE, FN; SYCEXP(synode); BALCEXP(pm->proposal);
          NDBG(finished(p), d));
    return create_learn_msg_for_ignorant_node(p, pm, synode);
  }

  /* Paxos acceptor phase 1 decision */
  greater = gt_ballot(pm->proposal, p->acceptor.promise);
  IFDBG(D_NONE, FN; SYCEXP(synode); BALCEXP(pm->proposal); NDBG(greater, d));
  if (!greater && !noop_match(p, pm)) {
    return NULL;
  }

  p->last_modified = task_now();
  if (greater) {
    p->acceptor.promise = pm->proposal; /* promise to not accept any less */
  }
  return create_ack_prepare_msg(p, pm, synode);
}
3765 
3766 /* Handle incoming prepare */
handle_prepare(site_def const * site,pax_machine * p,linkage * reply_queue,pax_msg * pm)3767 static void handle_prepare(site_def const *site, pax_machine *p,
3768                            linkage *reply_queue, pax_msg *pm) {
3769   ADD_DBG(D_CONS, add_synode_event(p->synode);
3770           add_event(EVENT_DUMP_PAD, string_arg("pm->from"));
3771           add_event(EVENT_DUMP_PAD, uint_arg(pm->from));
3772           add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(pm->op)));
3773           add_event(EVENT_DUMP_PAD, string_arg("proposal"));
3774           add_ballot_event(pm->proposal);
3775           add_event(EVENT_DUMP_PAD, string_arg("promise"));
3776           add_ballot_event(p->acceptor.promise););
3777   IFDBG(D_NONE, FN; BALCEXP(pm->proposal); BALCEXP(p->acceptor.promise);
3778         if (p->acceptor.msg) BALCEXP(p->acceptor.msg->proposal);
3779         STRLIT("type "); STRLIT(pax_msg_type_to_str(pm->msg_type)));
3780 
3781   {
3782     pax_msg *reply = handle_simple_prepare(p, pm, pm->synode);
3783     if (reply != NULL) SEND_REPLY;
3784   }
3785 }
3786 
check_propose(site_def const * site,pax_machine * p)3787 bool_t check_propose(site_def const *site, pax_machine *p) {
3788   IFDBG(D_NONE, FN; SYCEXP(p->synode);
3789         COPY_AND_FREE_GOUT(dbg_machine_nodeset(p, get_maxnodes(site))););
3790   PAX_MSG_SANITY_CHECK(p->proposer.msg);
3791   {
3792     bool_t can_propose = FALSE;
3793     if (prep_majority(site, p)) {
3794       p->proposer.msg->proposal = p->proposer.bal;
3795       BIT_ZERO(p->proposer.prop_nodeset);
3796       p->proposer.msg->synode = p->synode;
3797       init_propose_msg(p->proposer.msg);
3798       p->proposer.sent_prop = p->proposer.bal;
3799       can_propose = TRUE;
3800     }
3801     return can_propose;
3802   }
3803 }
3804 
check_learn(site_def const * site,pax_machine * p)3805 static pax_msg *check_learn(site_def const *site, pax_machine *p) {
3806   IFDBG(D_NONE, FN; SYCEXP(p->synode);
3807         COPY_AND_FREE_GOUT(dbg_machine_nodeset(p, get_maxnodes(site))););
3808   PAX_MSG_SANITY_CHECK(p->proposer.msg);
3809   {
3810     pax_msg *learn_msg = NULL;
3811     if (get_nodeno(site) != VOID_NODE_NO && prop_majority(site, p)) {
3812       p->proposer.msg->synode = p->synode;
3813       if (p->proposer.msg->receivers) free_bit_set(p->proposer.msg->receivers);
3814       p->proposer.msg->receivers = clone_bit_set(p->proposer.prep_nodeset);
3815       BIT_SET(get_nodeno(site), p->proposer.msg->receivers);
3816       if (no_duplicate_payload) {
3817         learn_msg = create_tiny_learn_msg(p, p->proposer.msg);
3818       } else {
3819         /* purecov: begin deadcode */
3820         init_learn_msg(p->proposer.msg);
3821         learn_msg = p->proposer.msg;
3822         /* purecov: end */
3823       }
3824       p->proposer.sent_learn = p->proposer.bal;
3825     }
3826     return learn_msg;
3827   }
3828 }
3829 
/* Record m as the learned value of p.
   The app data of m, if any, is marked as chosen, and m becomes both the
   accepted and the learned message of p. The cache size accounting is then
   updated and the cache is given a chance to shrink.
   site is only referenced by the debug macros. */
static void do_learn(site_def const *site MY_ATTRIBUTE((unused)),
                     pax_machine *p, pax_msg *m) {
  ADD_DBG(D_CONS, add_synode_event(p->synode);
          add_event(EVENT_DUMP_PAD, string_arg("m->from"));
          add_event(EVENT_DUMP_PAD, uint_arg(m->from));
          add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op)));
          add_event(EVENT_DUMP_PAD, string_arg("proposal"));
          add_ballot_event(m->proposal);
          add_event(EVENT_DUMP_PAD, string_arg("promise"));
          add_ballot_event(p->acceptor.promise););
  IFDBG(D_NONE, FN; SYCEXP(p->synode); SYCEXP(m->synode);
        dbg_bitset(m->receivers, get_maxnodes(site)););

  if (m->a) {
    m->a->chosen = TRUE;
  }
  replace_pax_msg(&p->acceptor.msg, m);
  replace_pax_msg(&p->learner.msg, m);

  /* Memory used by client data is tracked in the cache. Since instances
     that are still being decided are not counted, the added memory only
     needs to be computed here, when the outcome of a consensus round is
     recorded. */
  add_cache_size(p);

  /* Shrink the cache size if necessary */
  shrink_cache();
}
3856 
handle_simple_ack_prepare(site_def const * site,pax_machine * p,pax_msg * m)3857 bool_t handle_simple_ack_prepare(site_def const *site, pax_machine *p,
3858                                  pax_msg *m) {
3859   if (get_nodeno(site) != VOID_NODE_NO)
3860     BIT_SET(m->from, p->proposer.prep_nodeset);
3861 
3862   {
3863     bool_t can_propose = FALSE;
3864     if (m->op == ack_prepare_op &&
3865         gt_ballot(m->proposal, p->proposer.msg->proposal)) { /* greater */
3866       replace_pax_msg(&p->proposer.msg, m);
3867       assert(p->proposer.msg);
3868     }
3869     if (gt_ballot(m->reply_to, p->proposer.sent_prop)) {
3870       can_propose = check_propose(site, p);
3871     }
3872     return can_propose;
3873   }
3874 }
3875 
3876 /* Other node has already accepted a value */
handle_ack_prepare(site_def const * site,pax_machine * p,pax_msg * m)3877 static void handle_ack_prepare(site_def const *site, pax_machine *p,
3878                                pax_msg *m) {
3879   ADD_DBG(D_CONS, add_synode_event(p->synode);
3880           add_event(EVENT_DUMP_PAD, string_arg("m->from"));
3881           add_event(EVENT_DUMP_PAD, uint_arg(m->from));
3882           add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op))););
3883   assert(m);
3884   IFDBG(D_NONE, FN; if (p->proposer.msg) BALCEXP(p->proposer.msg->proposal);
3885         BALCEXP(p->proposer.bal); BALCEXP(m->reply_to);
3886         BALCEXP(p->proposer.sent_prop); SYCEXP(m->synode));
3887   /*
3888     If the node is preparing a Noop for another node's slot, it is possible
3889     that the leader of the slot has since proposed a value. Hence, there is
3890     no need to move forward if we know that the value has been accepted. This
3891     also prevents changing the size of a learned pax_machine, which would
3892     cause inconsistent reporting of memory usage in P_S.
3893   */
3894   if (finished(p)) return;
3895 
3896   if (m->from != VOID_NODE_NO &&
3897       eq_ballot(p->proposer.bal, m->reply_to)) { /* answer to my prepare */
3898     bool_t can_propose = handle_simple_ack_prepare(site, p, m);
3899     if (can_propose) send_propose_msg(p->proposer.msg);
3900   }
3901 }
3902 
3903 /* #define AUTO_MSG(p,synode) {if(!(p)){replace_pax_msg(&(p),
3904  * pax_msg_new(synode, site));} */
3905 
/* Build an ack_accept_op reply for synode, created from m with
   CREATE_REPLY. */
static pax_msg *create_ack_accept_msg(pax_msg *m, synode_no synode) {
  CREATE_REPLY(m);
  reply->synode = synode;
  reply->op = ack_accept_op;
  return reply;
}
3912 
/* Acceptor side of an accept for synode.
   Returns a learn message if p has already learned a value. Otherwise m is
   accepted when the current promise is not greater than the proposal of m,
   or when noop_match() holds: the machine is touched, m becomes the accepted
   message and an ack_accept reply is returned. Returns NULL when m is
   rejected. */
pax_msg *handle_simple_accept(pax_machine *p, pax_msg *m, synode_no synode) {
  if (finished(p)) { /* We have learned a value */
    return create_learn_msg_for_ignorant_node(p, m, synode);
  }

  /* Paxos acceptor phase 2 decision */
  if (gt_ballot(p->acceptor.promise, m->proposal) && !noop_match(p, m)) {
    return NULL;
  }

  IFDBG(D_NONE, FN; SYCEXP(m->synode); STRLIT("accept ");
        BALCEXP(m->proposal));
  p->last_modified = task_now();
  replace_pax_msg(&p->acceptor.msg, m);
  return create_ack_accept_msg(m, synode);
}
3928 
/*
  Handle an incoming accept request: accept the value if our promise is not
  greater than the proposal.

  The decision itself is taken by handle_simple_accept(); this wrapper adds
  the debug tracing and sends whatever reply that function produced.

  site         passed through to the SEND_REPLY macro
  p            Paxos state machine for the instance the message refers to
  reply_queue  passed through to the SEND_REPLY macro
  m            the incoming accept message
*/
static void handle_accept(site_def const *site, pax_machine *p,
                          linkage *reply_queue, pax_msg *m) {
  IFDBG(D_NONE, FN; BALCEXP(p->acceptor.promise); BALCEXP(m->proposal);
        STREXP(pax_msg_type_to_str(m->msg_type)));
  PAX_MSG_SANITY_CHECK(m);
  ADD_DBG(D_CONS, add_synode_event(p->synode);
          add_event(EVENT_DUMP_PAD, string_arg("m->from"));
          add_event(EVENT_DUMP_PAD, uint_arg(m->from));
          add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op)));
          add_event(EVENT_DUMP_PAD, string_arg("proposal"));
          add_ballot_event(m->proposal);
          add_event(EVENT_DUMP_PAD, string_arg("promise"));
          add_ballot_event(p->acceptor.promise););

  {
    /* NULL means the request was refused and nothing is sent back. */
    pax_msg *reply = handle_simple_accept(p, m, m->synode);
    /* SEND_REPLY is a macro operating on the local named `reply`; it
       presumably also uses `site` and `reply_queue`, which are otherwise
       unused here -- TODO confirm against the macro definition. */
    if (reply != NULL) SEND_REPLY;
  }
}
3949 
3950 /* Handle answer to accept */
handle_simple_ack_accept(site_def const * site,pax_machine * p,pax_msg * m)3951 pax_msg *handle_simple_ack_accept(site_def const *site, pax_machine *p,
3952                                   pax_msg *m) {
3953   pax_msg *learn_msg = NULL;
3954   if (get_nodeno(site) != VOID_NODE_NO && m->from != VOID_NODE_NO &&
3955       eq_ballot(p->proposer.bal, m->reply_to)) { /* answer to my accept */
3956     BIT_SET(m->from, p->proposer.prop_nodeset);
3957     if (gt_ballot(m->proposal, p->proposer.sent_learn)) {
3958       learn_msg = check_learn(site, p);
3959     }
3960   }
3961   return learn_msg;
3962 }
/*
  Handle an incoming ack_accept message: trace it, let
  handle_simple_ack_accept() decide whether a learn message is due, and send
  that message using the sender matching its op.
*/
static void handle_ack_accept(site_def const *site, pax_machine *p,
                              pax_msg *m) {
  pax_msg *learn_msg = NULL;

  ADD_DBG(D_CONS, add_synode_event(p->synode);
          add_event(EVENT_DUMP_PAD, string_arg("m->from"));
          add_event(EVENT_DUMP_PAD, uint_arg(m->from));
          add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(m->op))););
  IFDBG(D_NONE, FN; SYCEXP(m->synode); BALCEXP(p->proposer.bal);
        BALCEXP(p->proposer.sent_learn); BALCEXP(m->proposal);
        BALCEXP(m->reply_to););
  IFDBG(D_NONE, FN; SYCEXP(p->synode);
        if (p->acceptor.msg) BALCEXP(p->acceptor.msg->proposal);
        BALCEXP(p->proposer.bal); BALCEXP(m->reply_to););

  learn_msg = handle_simple_ack_accept(site, p, m);
  if (learn_msg == NULL) return; /* Nothing to announce yet */

  if (learn_msg->op == tiny_learn_op) {
    send_tiny_learn_msg(site, learn_msg);
    return;
  }

  /* purecov: begin deadcode */
  assert(learn_msg->op == learn_op);
  send_learn_msg(site, learn_msg);
  /* purecov: end */
}
3990 
/* Forward declaration; handle_learn() calls this before its definition. */
static void activate_sweeper();

/*
  Handle an incoming tiny learn message p for the Paxos instance pm.

  A tiny learn is matched against the message this node has already accepted.
  If the ballots are equal, the accepted message is turned into a learn_op and
  handed to handle_learn(). Otherwise (no accepted message, or a different
  ballot) a read is sent for the synode.
*/
void handle_tiny_learn(site_def const *site, pax_machine *pm, pax_msg *p) {
  assert(p->msg_type != no_op);

  /* Nothing accepted locally: request a read for the synode. */
  if (!pm->acceptor.msg) {
    send_read(p->synode);
    IFDBG(D_NONE, FN; STRLIT("tiny_learn"); SYCEXP(p->synode);
          BALCEXP(p->proposal));
    return;
  }

  /* Accepted something, but not with the ballot that was learned. */
  if (!eq_ballot(pm->acceptor.msg->proposal, p->proposal)) {
    send_read(p->synode);
    IFDBG(D_NONE, FN; STRLIT("tiny_learn"); SYCEXP(p->synode);
          BALCEXP(pm->acceptor.msg->proposal); BALCEXP(p->proposal));
    return;
  }

  /* The accepted message is the learned one: promote it and learn it. */
  pm->acceptor.msg->op = learn_op;
  pm->last_modified = task_now();
  update_max_synode(p);
  handle_learn(site, pm, pm->acceptor.msg);
}
4013 
/*
  Mark the Paxos instance p as forced.

  When this node is the forcing node (enforcer != 0) and p is not already
  marked as such, the proposer ballot count is raised by a large step so that
  it outranks ordinary ballots.
*/
static void force_pax_machine(pax_machine *p, int enforcer) {
  if (enforcer && !p->enforcer) {
    /* p->proposer.bal.cnt may be -1, hence the clamp to 0. Taking a third of
       the remaining headroom gives a large increment without overflowing. */
    int32_t const delta = (INT32_MAX - MAX(p->proposer.bal.cnt, 0)) / 3;
    p->proposer.bal.cnt += delta;
  }
  p->enforcer = enforcer;
  p->force_delivery = 1;
}
4026 
4027 /* Configure all messages in interval start, end to be forced */
force_interval(synode_no start,synode_no end,int enforcer)4028 static void force_interval(synode_no start, synode_no end, int enforcer) {
4029   while (!synode_gt(start, end)) {
4030     pax_machine *p = get_cache(start);
4031     if (get_nodeno(find_site_def(start)) == VOID_NODE_NO) break;
4032 
4033     /* The forcing node will call force_interval twice, first when
4034     the new config is originally installed, and again when it
4035     receives it as an xcom message. start may be the same, but
4036     end will be greater the second time, since it is calculated
4037     based on the message number of the incoming config. Since the forcing
4038     node is the one responsible for delivering all messages until the
4039     start of the new site, it is important that all instances belonging to
4040     the old site are correctly marked. */
4041 
4042     if (p->enforcer) enforcer = 1; /* Extend to new instances */
4043     force_pax_machine(p, enforcer);
4044 
4045     /* Old nodesets are null and void */
4046     BIT_ZERO(p->proposer.prep_nodeset);
4047     BIT_ZERO(p->proposer.prop_nodeset);
4048     start = incr_synode(start);
4049   }
4050 }
4051 
/*
  Install s as the forced configuration and force every message currently in
  the pipeline.

  s         site definition to record in forced_config
  enforcer  non-zero when this node is the forcing node; passed on to
            force_interval()
*/
static void start_force_config(site_def *s, int enforcer) {
  /* End of the interval covered by the event horizon of the new config. */
  synode_no end = add_event_horizon(s->boot_key);

  IFDBG(D_NONE, FN; SYCEXP(executed_msg); SYCEXP(end));
  /* Make sure max_synode reaches at least that far. */
  if (synode_gt(end, max_synode)) set_max_synode(end);

  /* Drop any previously stored forced config before recording the new one. */
  free_forced_config_site_def();
  wait_forced_config = 0;
  forced_config = s;
  force_interval(executed_msg, max_synode,
                 enforcer); /* Force everything in the pipeline */
}
4064 
/*
  Learn this value.

  Records the learned message m in the Paxos instance p (unless p is already
  finished), reacts to the special payloads unified_boot_type and forced
  configuration changes, and finally wakes up any task waiting on p->rv.

  site  site definition used for learning and for debug output
  p     Paxos state machine of the instance being learned
  m     the learn message
*/
void handle_learn(site_def const *site, pax_machine *p, pax_msg *m) {
  IFDBG(D_NONE, FN; STRLIT("proposer nodeset ");
        dbg_bitset(p->proposer.prop_nodeset, get_maxnodes(site)););
  IFDBG(D_NONE, FN; STRLIT("receivers ");
        dbg_bitset(m->receivers, get_maxnodes(site)););
  IFDBG(D_NONE, FN; NDBG(task_now(), f); SYCEXP(p->synode);
        COPY_AND_FREE_GOUT(dbg_app_data(m->a)););

  PAX_MSG_SANITY_CHECK(m);
  p->last_modified = task_now();
  if (!finished(p)) { /* Avoid re-learn */
    activate_sweeper();
    do_learn(site, p, m);
    /* Check for special messages */
    if (m->a && m->a->body.c_t == unified_boot_type) {
      IFDBG(D_NONE, FN; STRLIT("Got unified_boot "); SYCEXP(p->synode);
            SYCEXP(m->synode););
      XCOM_FSM(x_fsm_net_boot, void_arg(m->a));
    }
    /* See if someone is forcing a new config */
    if (m->force_delivery && m->a) {
      IFDBG(D_NONE, FN; STRLIT("Got forced config "); SYCEXP(p->synode);
            SYCEXP(m->synode););
      /* Configure all messages from executed_msg until start of new config
         as forced messages so they will eventually be finished */
      /* Immediately install this new config */
      /* All cases below pass enforcer == 0 to start_force_config(). */
      switch (m->a->body.c_t) {
        case add_node_type:
          /* purecov: begin deadcode */
          /* NOTE(review): the result of find_site_def() is dereferenced
             without a NULL check -- confirm it cannot be NULL here. */
          if (should_ignore_forced_config_or_view(
                  find_site_def(p->synode)->x_proto)) {
            log_ignored_forced_config(m->a, "handle_learn");
          } else {
            start_force_config(clone_site_def(handle_add_node(m->a)), 0);
          }
          break;
        /* purecov: end */
        case remove_node_type:
          /* purecov: begin deadcode */
          /* NOTE(review): same unchecked find_site_def() result as above. */
          if (should_ignore_forced_config_or_view(
                  find_site_def(p->synode)->x_proto)) {
            log_ignored_forced_config(m->a, "handle_learn");
          } else {
            start_force_config(clone_site_def(handle_remove_node(m->a)), 0);
          }
          break;
        /* purecov: end */
        case force_config_type:
          start_force_config(clone_site_def(install_node_group(m->a)), 0);
          break;
        default:
          /* Other payload types carry no forced configuration. */
          break;
      }
    }
  }

  /* Always wake up waiters, even if the value had been learned before. */
  task_wakeup(&p->rv);
}
4124 
4125 /* Skip this value */
handle_skip(site_def const * site,pax_machine * p,pax_msg * m)4126 static void handle_skip(site_def const *site, pax_machine *p, pax_msg *m) {
4127   /* IFDBG(D_NONE, FN;); */
4128   /* IFDBG(D_NONE, FN; NDBG(task_now(),f); SYCEXP(p->msg->synode)); */
4129   if (!finished(p)) {
4130     p->last_modified = task_now();
4131     skip_value(m);
4132     do_learn(site, p, m);
4133   }
4134   /* IFDBG(D_NONE, FN; STRLIT("taskwakeup "); SYCEXP(p->msg->synode)); */
4135   task_wakeup(&p->rv);
4136 }
4137 
/*
  Queue a message received from a client for proposal.
  Messages that are NULL or carry no app data are silently discarded.
*/
static void handle_client_msg(pax_msg *p) {
  msg_link *ml = NULL;

  /* Discard invalid message */
  if (p == NULL || p->a == NULL) return;

  ml = msg_link_new(p, VOID_NODE_NO);

  /* Put it in the proposer queue */
  ADD_T_EV(task_now(), __FILE__, __LINE__, "handle_client_msg");
  channel_put(&prop_input_queue, &ml->l);
}
4149 
4150 #ifdef ACCEPT_SITE_TEST
4151 /* See if we should process an incoming ping from a node.
4152    The purpose is to avoid doing recovery from a node with an obsolete site
4153    definition */
accept_site(site_def const * site)4154 static int accept_site(site_def const *site) {
4155   site_def *mysite = (site_def *)get_site_def();
4156 
4157   if (site) {
4158     if (!mysite) {
4159       site_def *prev = (site_def *)find_prev_site_def(site->boot_key);
4160       IFDBG(
4161           D_NONE, FN; PTREXP(site); PTREXP(mysite); PTREXP(prev);
4162           SYCEXP(site->boot_key); if (prev) { SYCEXP(prev->boot_key); });
4163       if (!prev) {
4164         /** alive when no site, no known previous definition, and present in
4165          * new is accepted */
4166         return (site->boot_key.group_id == 0
4167                     ? 1
4168                     : (xcom_find_node_index((node_list *)&site->nodes) !=
4169                        VOID_NODE_NO));
4170       } else {
4171         /** alive when no site, a previous definition of groupid is known, but
4172          * is older than site def, is accepted */
4173         return synode_gt(site->boot_key, prev->boot_key);
4174       }
4175     } else {
4176       IFDBG(D_NONE, FN; PTREXP(site); PTREXP(mysite); SYCEXP(site->boot_key);
4177             SYCEXP(mysite->boot_key));
4178       if (get_group_id(site) != get_group_id(mysite)) {
4179         /** alive from different site should never be accepted */
4180         return 0;
4181       } else {
4182         /** alive from same site should be accepted if boot_key is larger than
4183          * mine */
4184         node_no my_nodeno = xcom_find_node_index((node_list *)&mysite->nodes);
4185         node_no site_nodeno = xcom_find_node_index((node_list *)&site->nodes);
4186         return (synode_gt(site->boot_key, mysite->boot_key) &&
4187                 ((my_nodeno != VOID_NODE_NO) || (site_nodeno != VOID_NODE_NO)));
4188       }
4189     }
4190   }
4191   /** Always accept a NULL site */
4192   IFDBG(D_NONE, FN; PTREXP(site));
4193   return 1;
4194 }
4195 #endif
4196 
4197 /* Handle incoming "need boot" message. */
4198 /* purecov: begin deadcode */
handle_boot(site_def const * site,linkage * reply_queue,pax_msg * p)4199 static inline void handle_boot(site_def const *site, linkage *reply_queue,
4200                                pax_msg *p) {
4201   /* This should never be TRUE, but validate it instead of asserting. */
4202   if (site == NULL || site->nodes.node_list_len < 1) {
4203     G_DEBUG(
4204         "handle_boot: Received an unexpected need_boot_op when site == NULL or "
4205         "site->nodes.node_list_len < 1");
4206     return;
4207   }
4208 
4209   if (ALWAYS_HANDLE_NEED_BOOT || should_handle_need_boot(site, p)) {
4210     handle_need_snapshot(reply_queue, p);
4211   } else {
4212     G_DEBUG(
4213         "Ignoring a need_boot_op message from an XCom incarnation that does "
4214         "not belong to the group.");
4215   }
4216 }
4217 /* purecov: end */
4218 
should_handle_need_boot(site_def const * site,pax_msg * p)4219 bool_t should_handle_need_boot(site_def const *site, pax_msg *p) {
4220   bool_t should_handle = FALSE;
4221   bool_t const sender_advertises_identity =
4222       (p->a != NULL && p->a->body.c_t == xcom_boot_type);
4223 
4224   /*
4225    If the message advertises the sender's identity, check if it matches the
4226    membership information.
4227 
4228    The sender's identity may not match if, e.g.:
4229 
4230      a. The member was already removed, or
4231      b. It is a new incarnation of a crashed member that is yet to be removed.
4232 
4233    ...or some other reason.
4234 
4235    If it is due to reason (b), we do not want to boot the sender because XCom
4236    only implements a simple fail-stop model. Allowing the sender to rejoin the
4237    group without going through the full remove+add node path could violate
4238    safety because the sender does not remember any previous Paxos acceptances it
4239    acknowledged before crashing.
4240    Since the pre-crash incarnation may have accepted a value for a given synod
4241    but the post-crash incarnation has forgotten that fact, the post-crash
4242    incarnation will fail to propagate the previously accepted value to a higher
4243    ballot. Since majorities can overlap on a single node, if the overlap node
4244    is the post-crash incarnation which has forgotten about the previously
4245    accepted value, a higher ballot proposer may get a different value accepted,
4246    leading to conflicting values to be accepted for different proposers, which
4247    is a violation of the safety properties of the Paxos protocol.
4248 
4249    If the sender does not advertise its identity, we boot it unconditionally.
4250    This is for backwards compatibility.
4251   */
4252   if (sender_advertises_identity) {
4253     bool_t const sender_advertises_one_identity =
4254         (p->a->body.app_u_u.nodes.node_list_len == 1);
4255 
4256     /* Defensively accept only messages with a single identity. */
4257     if (sender_advertises_one_identity) {
4258       node_address *sender_identity = p->a->body.app_u_u.nodes.node_list_val;
4259 
4260       should_handle = node_exists_with_uid(sender_identity, &site->nodes);
4261     }
4262   } else {
4263     should_handle = TRUE;
4264   }
4265 
4266   return should_handle;
4267 }
4268 
/*
  Turn p into a need_boot_op message.
  When identity is not NULL, it is attached to the message as a one-element
  node list in an xcom_boot_type payload.
*/
void init_need_boot_op(pax_msg *p, node_address *identity) {
  p->op = need_boot_op;

  if (identity == NULL) return; /* Nothing to advertise */

  p->a = new_app_data();
  p->a->body.c_t = xcom_boot_type;
  init_node_list(1, identity, &p->a->body.app_u_u.nodes);
}
4277 
4278 #define PING_GATHERING_TIME_WINDOW 5.0
4279 #define PINGS_GATHERED_BEFORE_CONNECTION_SHUTDOWN 3
4280 
pre_process_incoming_ping(site_def const * site,pax_msg const * pm,int has_client_already_booted,double current_time)4281 int pre_process_incoming_ping(site_def const *site, pax_msg const *pm,
4282                               int has_client_already_booted,
4283                               double current_time) {
4284   // Yes... it is a ping for me, boot is done and it is a are_you_alive_op
4285   // This means that something wrong is not right...
4286   int did_shutdown = 0;
4287 
4288   if ((pm->from != get_nodeno(site)) && has_client_already_booted &&
4289       (pm->op == are_you_alive_op)) {
4290     G_DEBUG(
4291         "Received a ping to myself. This means that something must be wrong in "
4292         "a bi-directional connection")
4293     // Going to kill the connection for that node...
4294     if (site && (pm->from < site->nodes.node_list_len)) {
4295       // This is not the first ping received in the last 5 seconds...
4296       if (site->servers[pm->from]->last_ping_received >
4297           (current_time - PING_GATHERING_TIME_WINDOW)) {
4298         site->servers[pm->from]->number_of_pings_received++;
4299       } else {  // First ping since at least more than 5 seconds...
4300         site->servers[pm->from]->number_of_pings_received = 1;
4301       }
4302 
4303       site->servers[pm->from]->last_ping_received = current_time;
4304 
4305       // If we keep on receiving periodical pings... lets kill the connection
4306       if (is_connected(&site->servers[pm->from]->con) &&
4307           site->servers[pm->from]->number_of_pings_received ==
4308               PINGS_GATHERED_BEFORE_CONNECTION_SHUTDOWN) {
4309         shutdown_connection(&site->servers[pm->from]->con);
4310         G_WARNING(
4311             "Shutting down an outgoing connection. This happens because "
4312             "something might be wrong on a bi-directional connection to node "
4313             "%s:%d. Please check the connection status to this member",
4314             site->servers[pm->from]->srv, site->servers[pm->from]->port);
4315         did_shutdown = 1;
4316       }
4317     }
4318   }
4319 
4320   return did_shutdown;
4321 }
4322 
/* Time at which this node last answered a ping with a need_boot_op. */
static double sent_alive = 0.0;

/*
  Handle incoming alive message.

  The ping is first run through pre_process_incoming_ping(). If this node has
  not booted yet, and did not ask for a boot within the last second, it
  answers the ping with a need_boot_op carrying its own identity.

  No answer is sent when:
    - the ping comes from this node itself, or has identical from/to fields;
    - the ping advertises an identity that is missing, or that does not
      exist in the current configuration;
    - the ping belongs to a dead site.

  site         site definition the ping refers to; may be NULL
  reply_queue  used by the SEND_REPLY macro
  pm           the incoming ping
*/
static inline void handle_alive(site_def const *site, linkage *reply_queue,
                                pax_msg *pm) {
  pre_process_incoming_ping(site, pm, client_boot_done, task_now());

  if (client_boot_done || !(task_now() - sent_alive > 1.0)) /* Already done? */
    return;

#ifdef ACCEPT_SITE_TEST
  if (!accept_site(site)) return;
#endif

  /* Avoid responding to own ping */
  if (pm->from == get_nodeno(site) || pm->from == pm->to) return;

  /*
   This code will check if the ping is intended to us.
   If the encoded node does not exist in the current configuration,
   we avoid sending need_boot_op, since it must be from a different
   reincarnation of this node.
   */
  if (site && pm->a && pm->a->body.c_t == xcom_boot_type) {
    IFDBG(D_NONE, FN;
          COPY_AND_FREE_GOUT(dbg_list(&pm->a->body.app_u_u.nodes)););

    /*
     The node list comes from the network. An empty list carries no identity
     to check, and reading its first element would be an invalid access, so
     such a ping is ignored.
     */
    if (pm->a->body.app_u_u.nodes.node_list_len < 1 ||
        pm->a->body.app_u_u.nodes.node_list_val == NULL)
      return;

    if (!node_exists_with_uid(&pm->a->body.app_u_u.nodes.node_list_val[0],
                              &get_site_def()->nodes))
      return;
  }

  if (is_dead_site(pm->group_id)) return; /* Avoid dealing with zombies */

  {
    CREATE_REPLY(pm);
    init_need_boot_op(reply, cfg_app_xcom_get_identity());
    sent_alive = task_now();
    SEND_REPLY;
  }
  IFDBG(D_NONE, FN; STRLIT("sent need_boot_op"););
}
4364 
/*
  Update the global max_synode from the synode information carried by p.

  Messages from dead sites are ignored. If either our site or max_synode has
  no group id yet, max_synode is simply set to p->synode. Otherwise it only
  moves forward, and only for messages of the same group.
*/
static void update_max_synode(pax_msg *p) {
  if (is_dead_site(p->group_id)) return;

  if (get_group_id(get_site_def()) == 0 || max_synode.group_id == 0) {
    set_max_synode(p->synode);
    return;
  }

  if (max_synode.group_id != p->synode.group_id) return;

  if (synode_gt(p->synode, max_synode)) set_max_synode(p->synode);
  if (synode_gt(p->max_synode, max_synode)) set_max_synode(p->max_synode);
}
4378 
/* Message dispatch */
/* printf-style format for a ballot, and the matching argument list:
   BAL_MEM(x) expands to the cnt and node members of x, in the order
   expected by BAL_FMT. */
#define BAL_FMT "ballot {cnt %d node %d}"
#define BAL_MEM(x) (x).cnt, (x).node

/* Incremented by dispatch_op() each time it handles a client_msg. */
static int clicnt = 0;
4384 
xcom_get_minimum_event_horizon()4385 xcom_event_horizon xcom_get_minimum_event_horizon() {
4386   return EVENT_HORIZON_MIN;
4387 }
4388 
xcom_get_maximum_event_horizon()4389 xcom_event_horizon xcom_get_maximum_event_horizon() {
4390   return EVENT_HORIZON_MAX;
4391 }
4392 
4393 /**
4394  * Retrieves the latest event horizon.
4395  *
4396  * There is no specific reason for this method to return the latest event
4397  * horizon instead of the current one. Both would be acceptable results of
4398  * this function, but we had to make a decision of one over the other.
4399  *
4400  * @param[out] event_horizon the latest event horizon
4401  * @retval REQUEST_FAIL XCom is not initialized yet
4402  * @retval REQUEST_OK function was successful and event_horizon contains the
4403  *                    latest event horizon
4404  */
xcom_get_event_horizon(xcom_event_horizon * event_horizon)4405 static client_reply_code xcom_get_event_horizon(
4406     xcom_event_horizon *event_horizon) {
4407   site_def const *latest_config = get_site_def();
4408   if (latest_config == NULL) return REQUEST_FAIL;
4409   *event_horizon = latest_config->event_horizon;
4410   return REQUEST_OK;
4411 }
4412 
allow_add_node(app_data_ptr a)4413 static u_int allow_add_node(app_data_ptr a) {
4414   /* Get information on the current site definition */
4415   const site_def *new_site_def = get_site_def();
4416   const site_def *valid_site_def = find_site_def(executed_msg);
4417 
4418   /* Get information on the nodes to be added */
4419   u_int nr_nodes_to_add = a->body.app_u_u.nodes.node_list_len;
4420   node_address *nodes_to_change = a->body.app_u_u.nodes.node_list_val;
4421 
4422   if (add_node_unsafe_against_event_horizon(a)) return 0;
4423 
4424   if (add_node_unsafe_against_ipv4_old_nodes(a)) {
4425     G_MESSAGE(
4426         "This server is unable to join the group as the NIC used is configured "
4427         "with IPv6 only and there are members in the group that are unable to "
4428         "communicate using IPv6, only IPv4.Please configure this server to "
4429         "join the group using an IPv4 address instead.");
4430     return 0;
4431   }
4432 
4433   {
4434     u_int i;
4435     for (i = 0; i < nr_nodes_to_add; i++) {
4436       if (node_exists(&nodes_to_change[i], &new_site_def->nodes) ||
4437           node_exists(&nodes_to_change[i], &valid_site_def->nodes)) {
4438         /*
4439         We are simply ignoring the attempt to add a node to the
4440         group when there is an old incarnation of it, meaning
4441         that the node has crashed and restarted so fastly that
4442         nobody has noticed that it has gone.
4443 
4444         In XCOM, the group is not automatically reconfigured
4445         and it is possible to start reusing a node that has
4446         crashed and restarted without reconfiguring the group
4447         by adding the node back to it.
4448 
4449         However, this operation may be unsafe because XCOM
4450         does not implement a crash-recovery model and nodes
4451         suffer from amnesia after restarting the service. In
4452         other words this may lead to inconsistency issues in
4453         the paxos protocol.
4454 
4455         Unfortunately, preventing that a node is added back
4456         to the system where there is an old incarnation will
4457         not fix this problem since other changes are required.
4458         */
4459         G_MESSAGE(
4460             "Old incarnation found while trying to "
4461             "add node %s %.*s.",
4462             nodes_to_change[i].address, nodes_to_change[i].uuid.data.data_len,
4463             nodes_to_change[i].uuid.data.data_val);
4464         return 0;
4465       }
4466     }
4467   }
4468 
4469   return 1;
4470 }
4471 
allow_remove_node(app_data_ptr a)4472 static u_int allow_remove_node(app_data_ptr a) {
4473   /* Get information on the current site definition */
4474   const site_def *new_site_def = get_site_def();
4475 
4476   /* Get information on the nodes to be added */
4477   u_int nodes_len = a->body.app_u_u.nodes.node_list_len;
4478   node_address *nodes_to_change = a->body.app_u_u.nodes.node_list_val;
4479 
4480   u_int i;
4481   for (i = 0; i < nodes_len; i++) {
4482     if (!node_exists_with_uid(&nodes_to_change[i], &new_site_def->nodes)) {
4483       /*
4484       If the UID does not exist, then 1) the node has already been
4485       removed or 2) it has reincarnated.
4486       */
4487       /* purecov: begin inspected */
4488       if (node_exists(&nodes_to_change[i], &new_site_def->nodes)) {
4489         /*
4490         We also cannot allow an upper-layer to remove a new incarnation
4491         of a node when it tries to remove an old one.
4492         */
4493         G_MESSAGE(
4494             "New incarnation found while trying to "
4495             "remove node %s %.*s.",
4496             nodes_to_change[i].address, nodes_to_change[i].uuid.data.data_len,
4497             nodes_to_change[i].uuid.data.data_val);
4498       } else {
4499         /* The node has already been removed, so we block the request */
4500         G_MESSAGE(
4501             "Node has already been removed: "
4502             "%s %.*s.",
4503             nodes_to_change[i].address, nodes_to_change[i].uuid.data.data_len,
4504             nodes_to_change[i].uuid.data.data_val);
4505       }
4506       return 0;
4507       /* purecov: end */
4508     }
4509   }
4510 
4511   return 1;
4512 }
4513 
4514 /**
4515  * Logs the fact that an add/remove node request is aimed at another group.
4516  *
4517  * @param a a pointer to the app_data of the configuration command
4518  * @param message_fmt a formatted message to log, containing a single %s that
4519  * will be replaced by the node's address
4520  */
log_cfgchange_wrong_group(app_data_ptr a,const char * const message_fmt)4521 static void log_cfgchange_wrong_group(app_data_ptr a,
4522                                       const char *const message_fmt) {
4523   u_int const nr_nodes = a->body.app_u_u.nodes.node_list_len;
4524   u_int i;
4525   for (i = 0; i < nr_nodes; i++) {
4526     char const *const address = a->body.app_u_u.nodes.node_list_val[i].address;
4527     G_WARNING(message_fmt, address);
4528   }
4529 }
4530 
4531 /**
4532  * Validates if a configuration command can be executed.
4533  * Checks whether the configuration command is aimed at the correct group.
4534  * Checks whether the configuration command pertains to a node reincarnation.
4535  *
4536  * @param p a pointer to the pax_msg of the configuration command
4537  * @retval REQUEST_OK if the reconfiguration command can be executed
4538  * @retval REQUEST_RETRY if XCom is still booting
4539  * @retval REQUEST_FAIL if the configuration command cannot be executed
4540  */
can_execute_cfgchange(pax_msg * p)4541 static client_reply_code can_execute_cfgchange(pax_msg *p) {
4542   app_data_ptr a = p->a;
4543 
4544   if (executed_msg.msgno <= 2) return REQUEST_RETRY;
4545 
4546   if (a && a->group_id != 0 && a->group_id != executed_msg.group_id) {
4547     switch (a->body.c_t) {
4548       case add_node_type:
4549         log_cfgchange_wrong_group(
4550             a,
4551             "The request to add %s to the group has been rejected because it "
4552             "is aimed at another group");
4553         break;
4554       case remove_node_type:
4555         log_cfgchange_wrong_group(
4556             a,
4557             "The request to remove %s from the group has been rejected because "
4558             "it is aimed at another group");
4559         break;
4560       case force_config_type:
4561         G_WARNING(
4562             "The request to force the group membership has been rejected "
4563             "because it is aimed at another group");
4564         break;
4565       default:
4566         assert(0 &&
4567                "A cargo_type different from {add_node_type, remove_node_type, "
4568                "force_config_type} should not have hit this code path");
4569     }
4570     return REQUEST_FAIL;
4571   }
4572 
4573   if (a && a->body.c_t == add_node_type && !allow_add_node(a))
4574     return REQUEST_FAIL;
4575 
4576   if (a && a->body.c_t == remove_node_type && !allow_remove_node(a))
4577     return REQUEST_FAIL;
4578 
4579   if (a && a->body.c_t == set_event_horizon_type &&
4580       unsafe_event_horizon_reconfiguration(a))
4581     return REQUEST_FAIL;
4582 
4583   if (a && a->body.c_t == force_config_type &&
4584       are_there_dead_nodes_in_new_config(a))
4585     return REQUEST_FAIL;
4586 
4587   return REQUEST_OK;
4588 }
4589 
activate_sweeper()4590 static void activate_sweeper() {
4591   if (sweeper) {
4592     ADD_DBG(D_CONS, add_event(EVENT_DUMP_PAD,
4593                               string_arg("sweeper activated max_synode"));
4594             add_synode_event(max_synode););
4595     task_activate(sweeper);
4596   }
4597 }
4598 
/* NOTE(review): not referenced in this part of the file; presumably the
   synode at which a configuration starts -- confirm against its users. */
static synode_no start_config = NULL_SYNODE;
4600 
/*
  Answer a client's get_event_horizon request.

  The reply is an xcom_client_reply whose cli_err holds the result of
  xcom_get_event_horizon() and whose event_horizon field holds the value on
  success.

  site         used by the SEND_REPLY macro
  p            the client request
  reply_queue  used by the SEND_REPLY macro
*/
void dispatch_get_event_horizon(site_def const *site, pax_msg *p,
                                linkage *reply_queue) {
  CREATE_REPLY(p); /* introduces the local `reply` */
  IFDBG(D_NONE, FN; STRLIT("Got get_event_horizon from client");
        SYCEXP(p->synode););
  reply->op = xcom_client_reply;
  reply->cli_err = xcom_get_event_horizon(&reply->event_horizon);
  SEND_REPLY;
}
4610 
4611 /*
4612  * Log the result of the get_synode_app_data command.
4613  */
log_get_synode_app_data_failure(xcom_get_synode_app_data_result error_code)4614 static void log_get_synode_app_data_failure(
4615     xcom_get_synode_app_data_result error_code) {
4616   switch (error_code) {
4617     case XCOM_GET_SYNODE_APP_DATA_OK:
4618       break;
4619     case XCOM_GET_SYNODE_APP_DATA_ERROR:
4620       G_DEBUG("Could not reply successfully to request for synode data.");
4621       break;
4622     case XCOM_GET_SYNODE_APP_DATA_NOT_CACHED:
4623       G_DEBUG(
4624           "Could not reply successfully to request for synode data because "
4625           "some of the requested synodes are no longer cached.");
4626       break;
4627     case XCOM_GET_SYNODE_APP_DATA_NOT_DECIDED:
4628       G_DEBUG(
4629           "Could not reply successfully to request for synode data because "
4630           "some of the requested synodes are still undecided.");
4631       break;
4632     case XCOM_GET_SYNODE_APP_DATA_NO_MEMORY:
4633       G_DEBUG(
4634           "Could not reply successfully to request for synode data because "
4635           "memory could not be allocated.");
4636       break;
4637   }
4638 }
4639 
/*
  Answer a client's get_synode_app_data request.

  The synodes listed in the request are looked up with
  xcom_get_synode_app_data(). The reply is an xcom_client_reply with cli_err
  set to REQUEST_OK on success, or to REQUEST_FAIL (after logging the reason)
  for any other result.

  site         used by the SEND_REPLY macro
  p            the client request; p->a is dereferenced without a check, so
               the caller must pass a message that carries app data
  reply_queue  used by the SEND_REPLY macro
*/
void dispatch_get_synode_app_data(site_def const *site, pax_msg *p,
                                  linkage *reply_queue) {
  IFDBG(D_NONE, FN; STRLIT("Got get_synode_app_data from client");
        SYCEXP(p->synode););

  {
    CREATE_REPLY(p); /* introduces the local `reply` */
    reply->op = xcom_client_reply;

    {
      xcom_get_synode_app_data_result error_code;
      error_code = xcom_get_synode_app_data(&p->a->body.app_u_u.synodes,
                                            &reply->requested_synode_app_data);
      /* Map the detailed result onto the client-visible reply code. */
      switch (error_code) {
        case XCOM_GET_SYNODE_APP_DATA_OK:
          reply->cli_err = REQUEST_OK;
          break;
        case XCOM_GET_SYNODE_APP_DATA_NOT_CACHED:
        case XCOM_GET_SYNODE_APP_DATA_NOT_DECIDED:
        case XCOM_GET_SYNODE_APP_DATA_NO_MEMORY:
        case XCOM_GET_SYNODE_APP_DATA_ERROR:
          reply->cli_err = REQUEST_FAIL;
          log_get_synode_app_data_failure(error_code);
          break;
      }

      SEND_REPLY;
    }
  }
}
4670 
/* Forward declaration of a file-local function. */
static int can_send_snapshot();
4672 
dispatch_op(site_def const * site,pax_msg * p,linkage * reply_queue)4673 pax_msg *dispatch_op(site_def const *site, pax_msg *p, linkage *reply_queue) {
4674   pax_machine *pm = NULL;
4675   site_def *dsite = find_site_def_rw(p->synode);
4676   int in_front = too_far(p->synode);
4677 
4678   if (p->force_delivery) {
4679     /* Ensure that forced message can be processed */
4680     in_front = 0;
4681   }
4682 
4683   if (dsite && p->op != client_msg && is_server_connected(dsite, p->from)) {
4684     /* Wake up the detector task if this node was previously marked as
4685      * potentially failed. */
4686     if (!note_detected(dsite, p->from)) task_wakeup(&detector_wait);
4687     update_delivered(dsite, p->from, p->delivered_msg);
4688   }
4689 
4690   IFDBG(D_NONE, FN; STRLIT("incoming message ");
4691         COPY_AND_FREE_GOUT(dbg_pax_msg(p)););
4692   ADD_DBG(D_NONE, add_synode_event(p->synode);
4693           add_event(EVENT_DUMP_PAD, string_arg("p->from"));
4694           add_event(EVENT_DUMP_PAD, uint_arg(p->from));
4695           add_event(EVENT_DUMP_PAD, string_arg("in_front"));
4696           add_event(EVENT_DUMP_PAD, int_arg(in_front));
4697           add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(p->op))););
4698 
4699   switch (p->op) {
4700     case client_msg:
4701       clicnt++;
4702       if (p->a && (p->a->body.c_t == exit_type)) {
4703         /* purecov: begin deadcode */
4704         IFDBG(D_NONE, FN; STRLIT("Got exit from client"); SYCEXP(p->synode););
4705         bury_site(get_group_id(get_site_def()));
4706         ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4707         terminate_and_exit();
4708         break;
4709         /* purecov: end */
4710       }
4711       if (p->a && (p->a->body.c_t == reset_type)) {
4712         /* purecov: begin deadcode */
4713         IFDBG(D_NONE, FN; STRLIT("Got reset from client"); SYCEXP(p->synode););
4714         bury_site(get_group_id(get_site_def()));
4715         ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4716         XCOM_FSM(x_fsm_terminate, int_arg(0));
4717         break;
4718         /* purecov: end */
4719       }
4720       if (p->a && (p->a->body.c_t == remove_reset_type)) {
4721         /* purecov: begin deadcode */
4722         IFDBG(D_NONE, FN; STRLIT("Got remove_reset from client");
4723               SYCEXP(p->synode););
4724         ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4725         XCOM_FSM(x_fsm_terminate, int_arg(0));
4726         break;
4727         /* purecov: end */
4728       }
4729       if (p->a && (p->a->body.c_t == enable_arbitrator)) {
4730         CREATE_REPLY(p);
4731         IFDBG(D_NONE, FN; STRLIT("Got enable_arbitrator from client");
4732               SYCEXP(p->synode););
4733         ARBITRATOR_HACK = 1;
4734         reply->op = xcom_client_reply;
4735         reply->cli_err = REQUEST_OK;
4736         SEND_REPLY;
4737         break;
4738       }
4739       if (p->a && (p->a->body.c_t == disable_arbitrator)) {
4740         CREATE_REPLY(p);
4741         IFDBG(D_NONE, FN; STRLIT("Got disable_arbitrator from client");
4742               SYCEXP(p->synode););
4743         ARBITRATOR_HACK = 0;
4744         reply->op = xcom_client_reply;
4745         reply->cli_err = REQUEST_OK;
4746         SEND_REPLY;
4747         break;
4748       }
4749       if (p->a && (p->a->body.c_t == set_cache_limit)) {
4750         CREATE_REPLY(p);
4751         IFDBG(D_NONE, FN; STRLIT("Got set_cache_limit from client");
4752               SYCEXP(p->synode););
4753         if (the_app_xcom_cfg) {
4754           set_max_cache_size(p->a->body.app_u_u.cache_limit);
4755           reply->cli_err = REQUEST_OK;
4756         } else {
4757           reply->cli_err = REQUEST_FAIL;
4758         }
4759         reply->op = xcom_client_reply;
4760         SEND_REPLY;
4761         break;
4762       }
4763       if (p->a && (p->a->body.c_t == x_terminate_and_exit)) {
4764         /* purecov: begin deadcode */
4765         CREATE_REPLY(p);
4766         IFDBG(D_NONE, FN; STRLIT("Got terminate_and_exit from client");
4767               SYCEXP(p->synode););
4768         reply->op = xcom_client_reply;
4769         reply->cli_err = REQUEST_OK;
4770         SEND_REPLY;
4771         /*
4772           The function frees sites which is used by SEND_REPLY,
4773           so it should be called after SEND_REPLY.
4774         */
4775         IFDBG(D_NONE, FN; STRLIT("terminate_and_exit"));
4776         ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
4777         terminate_and_exit();
4778         break;
4779         /* purecov: end */
4780       }
4781       if (p->a && (p->a->body.c_t == get_event_horizon_type)) {
4782         dispatch_get_event_horizon(site, p, reply_queue);
4783         break;
4784       }
4785       if (p->a && (p->a->body.c_t == get_synode_app_data_type)) {
4786         dispatch_get_synode_app_data(site, p, reply_queue);
4787         break;
4788       }
4789       if (p->a && (p->a->body.c_t == add_node_type ||
4790                    p->a->body.c_t == remove_node_type ||
4791                    p->a->body.c_t == force_config_type ||
4792                    p->a->body.c_t == set_event_horizon_type)) {
4793         client_reply_code cli_err;
4794         CREATE_REPLY(p);
4795         reply->op = xcom_client_reply;
4796         reply->cli_err = cli_err = can_execute_cfgchange(p);
4797         SEND_REPLY;
4798         if (cli_err != REQUEST_OK) {
4799           break;
4800         }
4801       }
4802       if (p->a && p->a->body.c_t == unified_boot_type) {
4803         IFDBG(D_NONE, FN; STRLIT("Got unified_boot from client");
4804               SYCEXP(p->synode););
4805         IFDBG(D_NONE, FN;
4806               COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4807         IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4808         XCOM_FSM(x_fsm_net_boot, void_arg(p->a));
4809       }
4810       if (p->a && p->a->body.c_t == add_node_type) {
4811         IFDBG(D_NONE, FN; STRLIT("Got add_node from client");
4812               SYCEXP(p->synode););
4813         IFDBG(D_NONE, FN;
4814               COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4815         IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4816         assert(get_site_def());
4817       }
4818       if (p->a && p->a->body.c_t == remove_node_type) {
4819         IFDBG(D_NONE, FN; STRLIT("Got remove_node from client");
4820               SYCEXP(p->synode););
4821         IFDBG(D_NONE, FN;
4822               COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4823         IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4824         assert(get_site_def());
4825       }
4826       if (p->a && p->a->body.c_t == set_event_horizon_type) {
4827         IFDBG(D_NONE, FN; STRLIT("Got set_event_horizon from client");
4828               SYCEXP(p->synode););
4829         IFDBG(D_NONE, FN; NDBG(p->a->body.app_u_u.event_horizon, u));
4830         IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4831         assert(get_site_def());
4832       }
4833       if (p->a && p->a->body.c_t == force_config_type) {
4834         IFDBG(D_NONE, FN; STRLIT("Got new force config from client");
4835               SYCEXP(p->synode););
4836         IFDBG(D_NONE, FN;
4837               COPY_AND_FREE_GOUT(dbg_list(&p->a->body.app_u_u.nodes)););
4838         IFDBG(D_NONE, STRLIT("handle_client_msg "); NDBG(p->a->group_id, x));
4839         assert(get_site_def());
4840         XCOM_FSM(x_fsm_force_config, void_arg(p->a));
4841       }
4842       handle_client_msg(p);
4843       break;
4844     case initial_op:
4845       break;
4846     case read_op:
4847       pm = get_cache(p->synode);
4848       assert(pm);
4849 
4850       handle_read(site, pm, reply_queue, p);
4851       break;
4852     case prepare_op:
4853       pm = get_cache(p->synode);
4854       assert(pm);
4855       if (p->force_delivery) pm->force_delivery = 1;
4856       IFDBG(D_NONE, FN; dbg_pax_msg(p));
4857 
4858       /*
4859        We can only be a productive Paxos Acceptor if we have been booted, i.e.
4860        added to the group and received an up-to-date snapshot from some member.
4861 
4862        We do not allow non-booted members to participate in Paxos because they
4863        might be a reincarnation of a member that crashed and was then brought up
4864        without having gone through the remove+add node path.
4865        Since the pre-crash incarnation may have accepted a value for a given
4866        synod but the post-crash incarnation has forgotten that fact, the
4867        post-crash incarnation will fail to propagate the previously accepted
4868        value to a higher ballot. Since majorities can overlap on a single node,
4869        if the overlap node is the post-crash incarnation which has forgotten
4870        about the previously accepted value, the higher ballot proposer may get
4871        a different value accepted, leading to conflicting values to be accepted
4872        for different proposers, which is a violation of the safety requirements
4873        of the Paxos protocol.
4874       */
4875       if (ALWAYS_HANDLE_CONSENSUS || client_boot_done) {
4876         handle_prepare(site, pm, reply_queue, p);
4877       }
4878       break;
4879     case ack_prepare_op:
4880     case ack_prepare_empty_op:
4881       if (in_front || !is_cached(p->synode)) break;
4882       pm = get_cache(p->synode);
4883       if (p->force_delivery) pm->force_delivery = 1;
4884       if (!pm->proposer.msg) break;
4885       assert(pm && pm->proposer.msg);
4886       handle_ack_prepare(site, pm, p);
4887       break;
4888     case accept_op:
4889       pm = get_cache(p->synode);
4890       assert(pm);
4891       if (p->force_delivery) pm->force_delivery = 1;
4892       IFDBG(D_NONE, FN; dbg_pax_msg(p));
4893 
4894       /*
4895        We can only be a productive Paxos Acceptor if we have been booted, i.e.
4896        added to the group and received an up-to-date snapshot from some member.
4897 
4898        We do not allow non-booted members to participate in Paxos because they
4899        might be a reincarnation of a member that crashed and was then brought up
4900        without having gone through the remove+add node path.
4901        Since the pre-crash incarnation may have accepted a value for a given
4902        synod but the post-crash incarnation has forgotten that fact, the
4903        post-crash incarnation will fail to propagate the previously accepted
4904        value to a higher ballot. Since majorities can overlap on a single node,
4905        if the overlap node is the post-crash incarnation which has forgotten
4906        about the previously accepted value, the higher ballot proposer may get
4907        a different value accepted, leading to conflicting values to be accepted
4908        for different proposers, which is a violation of the safety requirements
4909        of the Paxos protocol.
4910       */
4911       if (ALWAYS_HANDLE_CONSENSUS || client_boot_done) {
4912         handle_alive(site, reply_queue, p);
4913 
4914         handle_accept(site, pm, reply_queue, p);
4915       }
4916       break;
4917     case ack_accept_op:
4918       if (in_front || !is_cached(p->synode)) break;
4919       pm = get_cache(p->synode);
4920       if (p->force_delivery) pm->force_delivery = 1;
4921       if (!pm->proposer.msg) break;
4922       assert(pm && pm->proposer.msg);
4923       handle_ack_accept(site, pm, p);
4924       break;
4925     case recover_learn_op:
4926       IFDBG(D_NONE, FN; STRLIT("recover_learn_op receive "); SYCEXP(p->synode));
4927       pm = get_cache(p->synode);
4928       assert(pm);
4929       if (p->force_delivery) pm->force_delivery = 1;
4930       update_max_synode(p);
4931       {
4932         IFDBG(D_NONE, FN; STRLIT("recover_learn_op learn "); SYCEXP(p->synode));
4933         p->op = learn_op;
4934         handle_learn(site, pm, p);
4935       }
4936       break;
4937     case learn_op:
4938     learnop:
4939       pm = get_cache(p->synode);
4940       assert(pm);
4941       if (p->force_delivery) pm->force_delivery = 1;
4942       update_max_synode(p);
4943       handle_learn(site, pm, p);
4944       break;
4945     case tiny_learn_op:
4946       if (p->msg_type == no_op) goto learnop;
4947       pm = get_cache(p->synode);
4948       assert(pm);
4949       if (p->force_delivery) pm->force_delivery = 1;
4950       handle_tiny_learn(site, pm, p);
4951       break;
4952     case skip_op:
4953       pm = get_cache(p->synode);
4954       assert(pm);
4955       if (p->force_delivery) pm->force_delivery = 1;
4956       handle_skip(site, pm, p);
4957       break;
4958     case i_am_alive_op:
4959       /* Update max_synode, but use only p->max_synode, ignore p->synode */
4960       if (!is_dead_site(p->group_id)) {
4961         if (max_synode.group_id == p->synode.group_id &&
4962             synode_gt(p->max_synode, max_synode)) {
4963           set_max_synode(p->max_synode);
4964         }
4965       }
4966       handle_alive(site, reply_queue, p);
4967       break;
4968     case are_you_alive_op:
4969       handle_alive(site, reply_queue, p);
4970       break;
4971     case need_boot_op:
4972       /* purecov: begin deadcode */
4973       /* Only in run state. Test state and do it here because we need to use
4974        * reply queue */
4975       if (can_send_snapshot() &&
4976           !synode_eq(get_site_def()->boot_key, null_synode)) {
4977         handle_boot(site, reply_queue, p);
4978       }
4979       /* Wake senders waiting to connect, since new node has appeared */
4980       wakeup_sender();
4981       break;
4982     /* purecov: end */
4983     case gcs_snapshot_op:
4984       /* Avoid duplicate snapshots and snapshots from zombies */
4985       IFDBG(D_BUG, FN; SYCEXP(executed_msg););
4986       IFDBG(D_BUG, FN; SYCEXP(start_config););
4987       if (!synode_eq(start_config, get_highest_boot_key(p->gcs_snap)) &&
4988           !is_dead_site(p->group_id)) {
4989         update_max_synode(p);
4990         /* For incoming messages, note delivery of snapshot from sender node */
4991         note_snapshot(p->from);
4992         XCOM_FSM(x_fsm_snapshot, void_arg(p->gcs_snap));
4993       }
4994       break;
4995     case die_op:
4996       /* assert("die horribly" == "need coredump"); */
4997       {
4998         GET_GOUT;
4999         FN;
5000         STRLIT("die_op ");
5001         SYCEXP(executed_msg);
5002         SYCEXP(delivered_msg);
5003         SYCEXP(p->synode);
5004         SYCEXP(p->delivered_msg);
5005         SYCEXP(p->max_synode);
5006         PRINT_GOUT;
5007         FREE_GOUT;
5008       }
5009       /*
5010       If the message with the number in  the  incoming  die_op  message
5011       already  has  been  executed  (delivered),  then it means that we
5012       actually got consensus on it, since otherwise we would  not  have
5013       delivered it.Such a situation could arise if one of the nodes has
5014       expelled the message from its cache, but others have not. So when
5015       sending  out  a  request, we might get two different answers, one
5016       indicating that we are too far behind  and  should  restart,  and
5017       another  with  the  actual  consensus value. If the value arrives
5018       first, we will deliver it, and then the die_op may arrive  later.
5019       But  it this case it does not matter, since we got what we needed
5020       anyway. It is only a partial guard against exiting without really
5021       needing  it  of course, since the die_op may arrive first, and we
5022       do not wait for a die_op from all the other nodes.  We  could  do
5023       that  with  some extra housekeeping in the pax_machine (a new bit
5024       vector), but I am not convinced that it is worth the effort.
5025       */
5026       if (!synode_lt(p->synode, executed_msg)) {
5027         ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("terminating"));)
5028         g_critical("Node %u is unable to get message {%x %" PRIu64
5029                    " %u}, since the group is too far "
5030                    "ahead. Node will now exit.",
5031                    get_nodeno(site), SY_MEM(p->synode));
5032         terminate_and_exit();
5033       }
5034     default:
5035       break;
5036   }
5037   if (oom_abort) {
5038     g_critical("Node %u has run out of memory and will now exit.",
5039                get_nodeno(site));
5040     terminate_and_exit();
5041   }
5042   return (p);
5043 }
5044 
5045 /* Acceptor-learner task */
/*
  SERIALIZE_REPLY(msg): address msg back to the sender of the message being
  processed (to/from are the swapped from/to of ep->p), stamp it with the
  current delivered_msg and max_synode, and serialize it into ep->buf and
  ep->buflen using the protocol version of ep->rfd.

  Both macros below expect a task environment pointer named `ep` with the
  fields p, rfd, buf and buflen to be in scope. They expand to several plain
  statements (no do { } while (0) wrapper), so they must not be used as the
  unbraced body of an if/else or loop.
*/
5046 #define SERIALIZE_REPLY(msg)                \
5047   msg->to = ep->p->from;                    \
5048   msg->from = ep->p->to;                    \
5049   msg->delivered_msg = get_delivered_msg(); \
5050   msg->max_synode = get_max_synode();       \
5051   serialize_msg(msg, ep->rfd.x_proto, &ep->buflen, &ep->buf);
5052 
/*
  WRITE_REPLY: if a serialized reply is pending (ep->buflen != 0), write
  ep->buf to ep->rfd through TASK_CALL(task_write(...)), account for it in
  send_count/send_bytes under the op of the incoming message ep->p, and free
  the buffer. ep->buf is reset to NULL in every case.

  Comments at the call sites treat this as a potentially blocking call after
  which plain locals have lost their value.
  NOTE(review): the byte count stored in `sent` is not examined.
*/
5053 #define WRITE_REPLY                                                    \
5054   if (ep->buflen) {                                                    \
5055     int64_t sent;                                                      \
5056     IFDBG(D_TRANSPORT, FN; STRLIT("task_write "); NDBG(ep->rfd.fd, d); \
5057           NDBG(ep->buflen, u));                                        \
5058     TASK_CALL(task_write(&ep->rfd, ep->buf, ep->buflen, &sent));       \
5059     send_count[ep->p->op]++;                                           \
5060     send_bytes[ep->p->op] += ep->buflen;                               \
5061     X_FREE(ep->buf);                                                   \
5062   }                                                                    \
5063   ep->buf = NULL;
5064 
/**
  Make *target refer to srv, adjusting reference counts on both servers.

  The new server is referenced before the old one is unreferenced, so the call
  is safe when srv and *target are the same object. Either may be NULL.

  @param target  slot holding the currently cached server pointer
  @param srv     server to store in the slot, or NULL to clear it
*/
static inline void update_srv(server **target, server *srv) {
  server *outgoing;

  if (srv != NULL) {
    srv_ref(srv);
  }

  outgoing = *target;
  if (outgoing != NULL) {
    srv_unref(outgoing);
  }

  *target = srv;
}
5070 
5071 /* A message is harmless if it cannot change the outcome of a consensus round.
5072  * learn_op does change the value, but we trust that the sender has correctly
5073  * derived the value from a majority of the acceptors, so in that sense it is
5074  * harmless. */
harmless(pax_msg const * p)5075 static int harmless(pax_msg const *p) {
5076   if (p->synode.msgno == 0) return 1;
5077   switch (p->op) {
5078     case i_am_alive_op:
5079     case are_you_alive_op:
5080     case need_boot_op:
5081     case gcs_snapshot_op:
5082     case learn_op:
5083     case recover_learn_op:
5084     case tiny_learn_op:
5085     case die_op:
5086       return 1;
5087     default:
5088       return 0;
5089   }
5090 }
5091 
/**
  Task helper: wait until get_cache() yields a pax_machine for synode, or
  until more than timeout seconds have passed since the first attempt.

  While no machine is available the task waits on exec_wait in slices of
  0.5 seconds, so the executor gets a chance to make progress.

  @param[out] pm       receives the machine, or NULL if the wait timed out
  @param      synode   synode whose machine is wanted
  @param      timeout  maximum time to keep retrying, in the units of
                       task_now() (presumably seconds - confirm)
  @return value produced by the TASK_BEGIN/TASK_END macros; it does not
          encode success, callers inspect *pm instead
*/
static int wait_for_cache(pax_machine **pm, synode_no synode, double timeout) {
  DECL_ENV
  double now;
  END_ENV;

  TASK_BEGIN
  ep->now = task_now();
  while ((*pm = get_cache(synode)) == NULL) {
    /* Wait for executor to make progress */
    TIMED_TASK_WAIT(&exec_wait, 0.5);
    if (task_now() - ep->now > timeout) {
      /* Timeout. Store NULL explicitly: the caller passes the address of a
         plain local, and comments elsewhere in this file note that locals
         lose their value across a blocking call, so the NULL written by the
         loop condition before the wait cannot be relied upon here. */
      *pm = NULL;
      break;
    }
  }
  FINALLY
  TASK_END;
}
5107 
5108 /*
5109   Verify if we need to poll the cache before calling dispatch_op.
5110   Avoid waiting for a machine if it is not going to be used.
5111  */
should_poll_cache(pax_op op)5112 static bool_t should_poll_cache(pax_op op) {
5113   if (op == die_op || op == gcs_snapshot_op || op == initial_op ||
5114       op == client_msg)
5115     return FALSE;
5116   return TRUE;
5117 }
5118 
/**
  Task that serves one incoming connection: it reads messages from the file
  descriptor passed in arg, hands them to dispatch_op() and writes the
  resulting replies back on the same connection.

  Outline, in source order:
   - allocate the read buffer, make the socket non-blocking, set nodelay and
     wait until the socket is readable;
   - when xcom_use_ssl() is true, run SSL_accept(), retrying while it reports
     WANT_READ/WANT_WRITE; any other failure closes the connection and
     terminates the task;
   - loop until xcom_shutdown is set or a read reports n <= 0: read a message,
     drop it if its op is out of range, hand the connection to a local_server
     task on request, answer read/prepare/accept messages that carry an
     invalid node number with a no-op learn, dispatch messages that are
     harmless, cached or not behind, and otherwise possibly answer with
     die_op;
   - FINALLY: empty the reply queue, release the message, shut the connection
     down, free the buffers and drop the cached server reference.

  State that must survive a blocking call lives in the DECL_ENV block and is
  reached through ep->; the in-code comments note that plain locals lose
  their value across such calls.

  @param arg  task argument carrying the accepted file descriptor (read with
              get_int_arg)
  @return value produced by the TASK_BEGIN/TASK_END macros
*/
acceptor_learner_task(task_arg arg)5119 int acceptor_learner_task(task_arg arg) {
5120   DECL_ENV
5121   connection_descriptor rfd;
5122   srv_buf *in_buf;
5123 
5124   pax_msg *p;
5125   u_int buflen;
5126   char *buf;
5127   linkage reply_queue;
5128   int errors;
5129   server *srv;
5130   site_def const *site;
5131   int behind;
5132   END_ENV;
5133 
5134   TASK_BEGIN
5135 
       /* NOTE(review): the calloc result is not checked before ep->in_buf is
          passed to buffered_read_msg() below. */
5136   ep->in_buf = (srv_buf *)calloc(1, sizeof(srv_buf));
5137 
5138   ep->rfd.fd = get_int_arg(arg);
5139 #ifndef XCOM_WITHOUT_OPENSSL
5140   ep->rfd.ssl_fd = 0;
5141 #endif
5142   ep->p = NULL;
5143   ep->buflen = 0;
5144   ep->buf = NULL;
5145   ep->errors = 0;
5146   ep->srv = 0;
5147   ep->behind = FALSE;
5148 
5149   /* We have a connection, make socket non-blocking and wait for request */
5150   unblock_fd(ep->rfd.fd);
5151   set_nodelay(ep->rfd.fd);
5152   wait_io(stack, ep->rfd.fd, 'r');
5153   TASK_YIELD;
5154 
5155 #ifndef XCOM_WITHOUT_OPENSSL
5156   if (xcom_use_ssl()) {
5157     ep->rfd.ssl_fd = SSL_new(server_ctx);
5158     SSL_set_fd(ep->rfd.ssl_fd, ep->rfd.fd);
5159 
5160     {
5161       int ret_ssl;
5162       int err;
5163       ERR_clear_error();
5164       ret_ssl = SSL_accept(ep->rfd.ssl_fd);
5165       err = SSL_get_error(ep->rfd.ssl_fd, ret_ssl);
5166 
           /* Retry the handshake for as long as OpenSSL only asks for more
              I/O. ret_ssl and err are plain locals; both are assigned again
              after each TASK_YIELD before they are read. */
5167       while (ret_ssl != SSL_SUCCESS) {
5168         if (err == SSL_ERROR_WANT_READ) {
5169           wait_io(stack, ep->rfd.fd, 'r');
5170         } else if (err == SSL_ERROR_WANT_WRITE) {
5171           wait_io(stack, ep->rfd.fd, 'w');
5172         } else { /* Some other error, give up */
5173           break;
5174         }
5175         TASK_YIELD;
5176         SET_OS_ERR(0);
5177         G_DEBUG("acceptor learner accept retry fd %d", ep->rfd.fd);
5178         ERR_clear_error();
5179         ret_ssl = SSL_accept(ep->rfd.ssl_fd);
5180         err = SSL_get_error(ep->rfd.ssl_fd, ret_ssl);
5181       }
5182 
5183       if (ret_ssl != SSL_SUCCESS) {
5184         ssl_free_con(&ep->rfd);
5185         close_connection(&ep->rfd);
5186         TERMINATE;
5187       }
5188     }
5189 
5190   } else {
5191     ep->rfd.ssl_fd = 0;
5192   }
5193 #endif
5194   set_connected(&ep->rfd, CON_FD);
5195   link_init(&ep->reply_queue, TYPE_HASH("msg_link"));
5196 
     /* Target of the `goto again` issued after answering an invalid node
        number; the loop condition is evaluated again from here. */
5197 again:
5198   while (!xcom_shutdown) {
5199     int64_t n;
5200     ep->site = 0;
         /* Every iteration reads into a freshly allocated message. */
5201     unchecked_replace_pax_msg(&ep->p, pax_msg_new_0(null_synode));
5202 
5203     if (use_buffered_read) {
5204       TASK_CALL(buffered_read_msg(&ep->rfd, ep->in_buf, ep->p, ep->srv, &n));
5205     } else {
5206       TASK_CALL(read_msg(&ep->rfd, ep->p, ep->srv, &n));
5207     }
5208     ADD_DBG(D_NONE, add_synode_event(ep->p->synode);
5209             add_event(EVENT_DUMP_PAD, string_arg("ep->p->from"));
5210             add_event(EVENT_DUMP_PAD, uint_arg(ep->p->from));
5211             add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(ep->p->op))););
5212 
         /* NOTE(review): ep->p->op is inspected here and in the range check
            below before the read result n is tested. */
5213     if (ep->srv && !ep->srv->invalid && ((int)ep->p->op != (int)client_msg) &&
5214         is_connected(&ep->srv->con))
5215       server_detected(ep->srv);
5216 
5217     if (((int)ep->p->op < (int)client_msg || ep->p->op > LAST_OP)) {
5218       /* invalid operation, ignore message */
5219       delete_pax_msg(ep->p);
5220       ep->p = NULL;
5221       TASK_YIELD;
5222       continue;
5223     }
         /* A non-positive read result ends the loop and leads to the cleanup
            under FINALLY. */
5224     if (n <= 0) {
5225       break;
5226     }
5227     ep->site = find_site_def(ep->p->synode);
5228 
5229     /* Handle this connection on a local_server task instead of this
5230        acceptor_learner_task task. */
5231     if (ep->p->op == client_msg && ep->p->a &&
5232         ep->p->a->body.c_t == convert_into_local_server_type) {
5233       if (local_server_is_setup()) {
5234         /* Launch local_server task to handle this connection. */
5235         {
               /* The new task receives its own heap copy of the connection
                  descriptor.
                  NOTE(review): the malloc result is not checked before it is
                  written through. */
5236           connection_descriptor *con =
5237               (connection_descriptor *)malloc(sizeof(connection_descriptor));
5238           *con = ep->rfd;
5239           task_new(local_server, void_arg(con), "local_server",
5240                    XCOM_THREAD_DEBUG);
5241         }
5242       }
5243       /* Reply to client:
5244          - OK if local_server task is setup, or
5245          - FAIL otherwise. */
5246       {
5247         CREATE_REPLY(ep->p);
5248         reply->op = xcom_client_reply;
5249         reply->cli_err = local_server_is_setup() ? REQUEST_OK : REQUEST_FAIL;
5250         SERIALIZE_REPLY(reply);
5251         replace_pax_msg(&reply, NULL);
5252       }
5253       WRITE_REPLY;
5254       delete_pax_msg(ep->p);
5255       ep->p = NULL;
5256       if (local_server_is_setup()) {
5257         /* Relinquish ownership of the connection. It is now owned by the
5258            launched local_server task. */
5259         reset_connection(&ep->rfd);
5260       }
5261       /* Terminate this task. */
5262       TERMINATE;
5263     }
5264 
5265     /*
5266       Getting a pointer to the server needs to be done after we have
5267       received a message, since without having received a message, we
5268       cannot know who it is from. We could peek at the message and de‐
5269       serialize the message number and from field, but since the server
5270       does not change, it should be sufficient to cache the server in
5271       the acceptor_learner task. A cleaner solution would have been to
5272       move the timestamps out of the server object, and have a map in‐
5273       dexed by IP/port or UUID to track the timestamps, since this is
5274       common to both the sender_task, reply_handler_task,  and the ac‐
5275       ceptor_learner_task.
5276     */
5277     update_srv(&ep->srv, get_server(ep->site, ep->p->from));
5278     ep->p->refcnt = 1; /* Refcnt from other end is void here */
5279     IFDBG(D_NONE, FN; NDBG(ep->rfd.fd, d); NDBG(task_now(), f);
5280           COPY_AND_FREE_GOUT(dbg_pax_msg(ep->p)););
5281     receive_count[ep->p->op]++;
5282     receive_bytes[ep->p->op] += (uint64_t)n + MSG_HDR_SIZE;
5283     {
           /* ep->behind is only recomputed when the site reports at least one
              node; otherwise the value left by an earlier message on this
              connection (initially FALSE) is reused. */
5284       if (get_maxnodes(ep->site) > 0) {
5285         ep->behind = ep->p->synode.msgno < delivered_msg.msgno;
5286       }
5287       ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("before dispatch "));
5288               add_synode_event(ep->p->synode);
5289               add_event(EVENT_DUMP_PAD, string_arg("ep->p->from"));
5290               add_event(EVENT_DUMP_PAD, uint_arg(ep->p->from));
5291               add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(ep->p->op)));
5292               add_event(EVENT_DUMP_PAD,
5293                         string_arg(pax_msg_type_to_str(ep->p->msg_type)));
5294               add_event(EVENT_DUMP_PAD, string_arg("is_cached(ep->p->synode)"));
5295               add_event(EVENT_DUMP_PAD, int_arg(is_cached(ep->p->synode)));
5296               add_event(EVENT_DUMP_PAD, string_arg("behind"));
5297               add_event(EVENT_DUMP_PAD, int_arg(ep->behind)););
5298       /* Special treatment to see if synode number is valid. Return no-op if
5299        * not. */
5300       if (ep->p->op == read_op || ep->p->op == prepare_op ||
5301           ep->p->op == accept_op) {
5302         if (ep->site) {
5303           ADD_DBG(
5304               D_BASE, add_event(EVENT_DUMP_PAD, string_arg("ep->p->synode"));
5305               add_synode_event(ep->p->synode);
5306               add_event(EVENT_DUMP_PAD, string_arg("ep->site->start"));
5307               add_synode_event(ep->site->start); add_event(
5308                   EVENT_DUMP_PAD, string_arg("ep->site->nodes.node_list_len"));
5309               add_event(EVENT_DUMP_PAD,
5310                         uint_arg(ep->site->nodes.node_list_len)););
5311           if (ep->p->synode.node >= ep->site->nodes.node_list_len) {
5312             {
5313               CREATE_REPLY(ep->p);
5314               create_noop(reply);
5315               set_learn_type(reply);
5316               SERIALIZE_REPLY(reply);
5317               delete_pax_msg(reply); /* Deallocate BEFORE potentially blocking
5318                                         call which will lose value of reply */
5319             }
5320             WRITE_REPLY;
5321             goto again;
5322           }
5323         }
5324       }
5325       /* Reject any message that might compromise the integrity of a consensus
5326        * instance. We do this by not processing any message which may change the
5327        * outcome if the consensus instance has been evicted from the cache */
5328       if (harmless(ep->p) ||          /* Harmless message */
5329           is_cached(ep->p->synode) || /* Already in cache */
5330           (!ep->behind)) { /* Guard against cache pollution from other nodes */
5331 
5332         if (should_poll_cache(ep->p->op)) {
               /* NOTE(review): pm is a plain local rather than an ep-> field,
                  and wait_for_cache() may block. Confirm that wait_for_cache()
                  stores to *pm on the activation that finally returns;
                  otherwise the test below reads an indeterminate value. */
5333           pax_machine *pm;
5334           TASK_CALL(wait_for_cache(&pm, ep->p->synode, 10));
5335           if (!pm) continue; /* Could not get a machine, discarding message. */
5336         }
5337 
5338         dispatch_op(ep->site, ep->p, &ep->reply_queue);
5339 
5340         /* Send replies on same fd */
5341         while (!link_empty(&ep->reply_queue)) {
5342           {
5343             msg_link *reply =
5344                 (msg_link *)(link_extract_first(&ep->reply_queue));
5345             IFDBG(D_DISPATCH, FN; PTREXP(reply);
5346                   COPY_AND_FREE_GOUT(dbg_linkage(&ep->reply_queue));
5347                   COPY_AND_FREE_GOUT(dbg_msg_link(reply));
5348                   COPY_AND_FREE_GOUT(dbg_pax_msg(reply->p)););
5349             assert(reply->p);
5350             assert(reply->p->refcnt > 0);
5351             IFDBG(D_DISPATCH, FN; STRLIT("serialize "); PTREXP(reply));
5352             SERIALIZE_REPLY(reply->p);
5353             msg_link_delete(&reply); /* Deallocate BEFORE potentially blocking
5354                                         call which will lose value of reply */
5355           }
5356           WRITE_REPLY;
5357         }
5358       } else {
5359         IFDBG(D_EXEC, FN; STRLIT("rejecting ");
5360               STRLIT(pax_op_to_str(ep->p->op)); NDBG(ep->p->from, d);
5361               NDBG(ep->p->to, d); SYCEXP(ep->p->synode);
5362               BALCEXP(ep->p->proposal));
             /* A rejected message whose synode was removed from the cache is
                answered with die_op, provided the site has nodes. */
5363         if (/* xcom_booted() && */ ep->behind) {
5364           if (/*ep->p->op == prepare_op && */ was_removed_from_cache(
5365               ep->p->synode)) {
5366             IFDBG(D_NONE, FN; STRLIT("send_die ");
5367                   STRLIT(pax_op_to_str(ep->p->op)); NDBG(ep->p->from, d);
5368                   NDBG(ep->p->to, d); SYCEXP(ep->p->synode);
5369                   BALCEXP(ep->p->proposal));
5370             if (get_maxnodes(ep->site) > 0) {
5371               {
5372                 pax_msg *np = NULL;
5373                 np = pax_msg_new(ep->p->synode, ep->site);
5374                 np->op = die_op;
5375                 SERIALIZE_REPLY(np);
5376                 IFDBG(D_NONE, FN; STRLIT("sending die_op to node ");
5377                       NDBG(np->to, d); SYCEXP(executed_msg); SYCEXP(max_synode);
5378                       SYCEXP(np->synode));
5379                 delete_pax_msg(np); /* Deallocate BEFORE potentially blocking
5380                                    call which will lose value of np */
5381               }
5382               WRITE_REPLY;
5383             }
5384           }
5385         }
5386       }
5387     }
5388     /* TASK_YIELD; */
5389   }
5390 
5391   FINALLY
5392   IFDBG(D_BUG, FN; STRLIT(" shutdown "); NDBG(ep->rfd.fd, d);
5393         NDBG(task_now(), f));
       /* The suc test guards against a reply queue that was never set up by
          link_init(), e.g. when the task terminated during the SSL handshake
          (presumably the environment starts out zeroed - confirm). */
5394   if (ep->reply_queue.suc && !link_empty(&ep->reply_queue))
5395     empty_msg_list(&ep->reply_queue);
5396   unchecked_replace_pax_msg(&ep->p, NULL);
5397   shutdown_connection(&ep->rfd);
5398   IFDBG(D_NONE, FN; NDBG(xcom_shutdown, d));
5399   if (ep->buf) X_FREE(ep->buf);
5400   free(ep->in_buf);
5401 
5402   /* Unref srv to avoid leak */
5403   update_srv(&ep->srv, 0);
5404 
5405   IFDBG(D_BUG, FN; STRLIT(" shutdown completed"); NDBG(ep->rfd.fd, d);
5406         NDBG(task_now(), f));
5407   TASK_END;
5408 }
5409 
5410 /* Reply handler task */
5411 
5412 static void server_handle_need_snapshot(server *srv, site_def const *s,
5413                                         node_no node);
5414 
/*
 * Task that receives messages on the connection of one server and hands them
 * to the dispatcher. It runs until xcom_shutdown becomes true.
 *
 * arg: carries the server pointer (read with get_void_arg). The task takes a
 *      reference on the server with srv_ref() and releases it with
 *      srv_unref() in the FINALLY section.
 */
int reply_handler_task(task_arg arg) {
  DECL_ENV
  server *s;      /* Server whose connection this task reads from */
  pax_msg *reply; /* Message buffer for the message being received */
  double dtime;   /* Current delay between checks for a connection */
  END_ENV;

  TASK_BEGIN

  ep->dtime = INITIAL_CONNECT_WAIT; /* Initial wait is short, to avoid
                                       unnecessary waiting */
  ep->s = (server *)get_void_arg(arg);
  srv_ref(ep->s);
  ep->reply = NULL;

  while (!xcom_shutdown) {
    /* Wait, with a growing delay capped at MAX_CONNECT_WAIT, until the
       server's connection is up */
    while (!is_connected(&ep->s->con)) {
      IFDBG(D_NONE, FN; STRLIT("waiting for connection"));
      TASK_DELAY(ep->dtime);
      if (xcom_shutdown) {
        TERMINATE;
      }
      ep->dtime *= CONNECT_WAIT_INCREASE; /* Increase wait time for next try */
      if (ep->dtime > MAX_CONNECT_WAIT) {
        ep->dtime = MAX_CONNECT_WAIT;
      }
    }
    /* Connected: reset the delay so the next reconnect starts short again */
    ep->dtime = INITIAL_CONNECT_WAIT;
    {
      int64_t n;
      /* Start every read with a fresh, empty message */
      unchecked_replace_pax_msg(&ep->reply, pax_msg_new_0(null_synode));

      ADD_DBG(D_NONE, add_event(EVENT_DUMP_PAD, string_arg("ep->s->con.fd"));
              add_event(EVENT_DUMP_PAD, int_arg(ep->s->con.fd)););
      TASK_CALL(read_msg(&ep->s->con, ep->reply, ep->s, &n));
      ADD_DBG(D_NONE, add_event(EVENT_DUMP_PAD, string_arg("ep->s->con.fd"));
              add_event(EVENT_DUMP_PAD, int_arg(ep->s->con.fd)););
      ep->reply->refcnt = 1; /* Refcnt from other end is void here */
      if (n <= 0) {
        /* read_msg reported n <= 0: close the connection and go back to
           waiting for it to be re-established */
        shutdown_connection(&ep->s->con);
        continue;
      }
      /* Per-operation byte statistics, including the message header */
      receive_bytes[ep->reply->op] += (uint64_t)n + MSG_HDR_SIZE;
    }
    IFDBG(D_NONE, FN; NDBG(ep->s->con.fd, d); NDBG(task_now(), f);
          COPY_AND_FREE_GOUT(dbg_pax_msg(ep->reply)););
    receive_count[ep->reply->op]++;

    ADD_DBG(D_NONE, add_synode_event(ep->reply->synode);
            add_event(EVENT_DUMP_PAD, string_arg("ep->reply->from"));
            add_event(EVENT_DUMP_PAD, uint_arg(ep->reply->from));
            add_event(EVENT_DUMP_PAD, string_arg(pax_op_to_str(ep->reply->op)));
            add_event(EVENT_DUMP_PAD, string_arg("get_site_def()->boot_key"));
            add_synode_event(get_site_def()->boot_key););
    /* Special test for need_snapshot, since node and site may not be consistent
     */
    if (ep->reply->op == need_boot_op &&
        !synode_eq(get_site_def()->boot_key, null_synode)) {
      pax_msg *p = ep->reply;

      ADD_DBG(D_BASE,
              add_event(EVENT_DUMP_PAD,
                        string_arg("calling server_handle_need_snapshot")););
      if (should_handle_need_boot(find_site_def(p->synode), p)) {
        server_handle_need_snapshot(ep->s, find_site_def(p->synode), p->from);
        /* Wake senders waiting to connect, since new node has appeared */
        wakeup_sender();
      } else {
        /* Request rejected: flag the server so that the branch below stops
           dispatching messages from this connection */
        ep->s->invalid = 1;
      }
    } else {
      /* We only handle messages from this connection if the server is valid. */
      if (ep->s->invalid == 0)
        dispatch_op(find_site_def(ep->reply->synode), ep->reply, NULL);
    }
    TASK_YIELD;
  }

  FINALLY
  /* Cleanup: release the message, close the connection, detach this task
     from the server and drop the reference taken at startup */
  replace_pax_msg(&ep->reply, NULL);

  shutdown_connection(&ep->s->con);
  ep->s->reply_handler = NULL;
  IFDBG(D_BUG, FN; STRLIT(" shutdown "); NDBG(ep->s->con.fd, d);
        NDBG(task_now(), f));
  srv_unref(ep->s);

  TASK_END;
}
5504 
5505 /* purecov: begin deadcode */
/* Suspend the caller for the given number of whole seconds, using the
   platform's native sleep primitive. */
static inline void xcom_sleep(unsigned int seconds) {
#if !defined(_WIN32)
  sleep(seconds);
#else
  /* The Windows Sleep() call counts in milliseconds, so scale the value */
  DWORD const millis = (DWORD)seconds * 1000;
  Sleep(millis);
#endif
}
5513 /* purecov: end */
5514 
5515 /*
5516  * Get a unique long as the basis for XCom group id creation.
5517  *
5518  * NOTE:
5519  * As there is no gethostid() on win, we use seconds since epoch instead,
5520  * so it might fail if you try simultaneous create sites at the same second.
5521  */
long xcom_unique_long(void) {
#if !defined(_WIN32)
  /* Mix the host id with the process id */
  long const host_id = gethostid();

  return host_id ^ getpid();
#else
  /* No gethostid() here: mix the current time with the process id instead */
  __time64_t now;

  _time64(&now);
  return (long)(now ^ GetCurrentProcessId());
#endif
}
5532 
init_config_with_group(app_data * a,node_list * nl,cargo_type type,uint32_t group_id)5533 app_data_ptr init_config_with_group(app_data *a, node_list *nl, cargo_type type,
5534                                     uint32_t group_id) {
5535   init_app_data(a);
5536   a->app_key.group_id = a->group_id = group_id;
5537   a->body.c_t = type;
5538   init_node_list(nl->node_list_len, nl->node_list_val, &a->body.app_u_u.nodes);
5539   return a;
5540 }
5541 
init_set_event_horizon_msg(app_data * a,uint32_t group_id,xcom_event_horizon event_horizon)5542 app_data_ptr init_set_event_horizon_msg(app_data *a, uint32_t group_id,
5543                                         xcom_event_horizon event_horizon) {
5544   init_app_data(a);
5545   a->app_key.group_id = a->group_id = group_id;
5546   a->body.c_t = set_event_horizon_type;
5547   a->body.app_u_u.event_horizon = event_horizon;
5548   return a;
5549 }
5550 
init_get_event_horizon_msg(app_data * a,uint32_t group_id)5551 app_data_ptr init_get_event_horizon_msg(app_data *a, uint32_t group_id) {
5552   init_app_data(a);
5553   a->app_key.group_id = a->group_id = group_id;
5554   a->body.c_t = get_event_horizon_type;
5555   return a;
5556 }
5557 
init_app_msg(app_data * a,char * payload,u_int payload_size)5558 app_data_ptr init_app_msg(app_data *a, char *payload, u_int payload_size) {
5559   init_app_data(a);
5560   a->body.c_t = app_type;
5561   a->body.app_u_u.data.data_val = payload; /* Takes ownership of payload. */
5562   a->body.app_u_u.data.data_len = payload_size;
5563   return a;
5564 }
5565 
init_terminate_command(app_data * a)5566 app_data_ptr init_terminate_command(app_data *a) {
5567   init_app_data(a);
5568   a->body.c_t = x_terminate_and_exit;
5569   return a;
5570 }
5571 
init_get_synode_app_data_msg(app_data * a,uint32_t group_id,synode_no_array * const synodes)5572 static app_data_ptr init_get_synode_app_data_msg(
5573     app_data *a, uint32_t group_id, synode_no_array *const synodes) {
5574   init_app_data(a);
5575   a->app_key.group_id = a->group_id = group_id;
5576   a->body.c_t = get_synode_app_data_type;
5577   /* Move synodes (as in C++ move semantics) into a->body.app_u_u.synodes. */
5578   synode_array_move(&a->body.app_u_u.synodes, synodes);
5579   return a;
5580 }
5581 
init_set_cache_size_msg(app_data * a,uint64_t cache_limit)5582 app_data_ptr init_set_cache_size_msg(app_data *a, uint64_t cache_limit) {
5583   init_app_data(a);
5584   a->body.c_t = set_cache_limit;
5585   a->body.app_u_u.cache_limit = cache_limit;
5586   return a;
5587 }
5588 
init_convert_into_local_server_msg(app_data * a)5589 app_data_ptr init_convert_into_local_server_msg(app_data *a) {
5590   init_app_data(a);
5591   a->body.c_t = convert_into_local_server_type;
5592   return a;
5593 }
5594 
/* Wrap `gcs_snap` in a gcs_snapshot_op message and send it through `srv` to
   `node`, using the node number and group id of site `s` as sender
   information. */
static void server_send_snapshot(server *srv, site_def const *s,
                                 gcs_snapshot *gcs_snap, node_no node) {
  pax_msg *snap_msg = pax_msg_new(gcs_snap->log_start, get_site_def());

  /* Hold a reference for the duration of the send */
  ref_msg(snap_msg);

  snap_msg->gcs_snap = gcs_snap;
  snap_msg->op = gcs_snapshot_op;
  send_msg(srv, s->nodeno, node, get_group_id(s), snap_msg);

  unref_msg(&snap_msg);
}
5604 
/* Send every finished, cached log entry from `push` up to and including
   get_max_synode() to `node` through `srv`, each as a recover_learn_op.
   Does nothing when there is no server or no current site. */
static void server_push_log(server *srv, synode_no push, node_no node) {
  site_def const *s = get_site_def();

  if (!srv || !s) return;

  for (; !synode_gt(push, get_max_synode()); push = incr_synode(push)) {
    pax_machine *machine;
    pax_msg *copy;

    if (!is_cached(push)) continue;

    machine = get_cache_no_touch(push, FALSE);
    if (!pm_finished(machine)) continue;

    /* Send a clone, not the original: the pax_machine may be re-used while
       the message is still being sent */
    copy = clone_pax_msg(machine->learner.msg);
    if (copy == NULL) continue;

    ref_msg(copy);
    copy->op = recover_learn_op;
    IFDBG(D_NONE, FN; PTREXP(srv); PTREXP(s););
    send_msg(srv, s->nodeno, node, get_group_id(s), copy);
    unref_msg(&copy);
  }
}
5628 
5629 /* purecov: begin deadcode */
/*
 * Queue every finished, cached log entry from `push` up to and including
 * get_max_synode() on `reply_queue`, each as a recover_learn_op message.
 *
 * push:        first synode to consider
 * reply_queue: list that the new msg_link entries are linked into
 *
 * An entry whose message cannot be cloned is skipped, matching what
 * server_push_log() does.
 */
static void reply_push_log(synode_no push, linkage *reply_queue) {
  while (!synode_gt(push, get_max_synode())) {
    if (is_cached(push)) {
      /* Need to clone message here since pax_machine may be re-used while
       * message is sent */
      pax_machine *p = get_cache_no_touch(push, FALSE);
      if (pm_finished(p)) {
        pax_msg *reply = clone_pax_msg(p->learner.msg);
        /* clone_pax_msg may return NULL (server_push_log checks for it);
           without this guard reply->op below would dereference NULL */
        if (reply != NULL) {
          ref_msg(reply);
          reply->op = recover_learn_op;
          {
            msg_link *msg_x = msg_link_new(reply, reply->from);
            IFDBG(D_NONE, FN; PTREXP(msg_x));
            link_into(&(msg_x->l), reply_queue);
          }
          /* NOTE(review): releasing through replace_pax_msg() and then
             calling unref_msg() on the same pointer looks redundant; kept
             as in the original because neither helper is visible here --
             confirm before simplifying */
          replace_pax_msg(&reply, NULL);
          unref_msg(&reply);
        }
      }
    }
    push = incr_synode(push);
  }
}
5652 /* purecov: end */
5653 
/* Application snapshot callbacks.
   get_app_snap_cb is called by create_snapshot(), when set, to fill in an
   application snapshot blob and return the synode that goes with it.
   handle_app_snap_cb is called by handle_x_snapshot() with the application
   part of a received snapshot and the log range of that snapshot. */
static app_snap_getter get_app_snap_cb;
static app_snap_handler handle_app_snap_cb;
5656 
create_snapshot()5657 static gcs_snapshot *create_snapshot() {
5658   gcs_snapshot *gs = 0;
5659   if (get_app_snap_cb) {
5660     /* purecov: begin deadcode */
5661     blob app_snap = {
5662         {0,
5663          0}}; /* Initialize in case get_app_snap_cb does not assign a value */
5664     synode_no app_lsn = get_app_snap_cb(&app_snap);
5665 
5666     /* We have a valid callback, abort if it did not return anything */
5667     if (app_snap.data.data_len == 0) {
5668       ADD_DBG(D_BASE,
5669               add_event(EVENT_DUMP_PAD, string_arg("no data, return")););
5670       return 0;
5671     }
5672     gs = export_config();
5673     if (!gs) return 0;
5674     ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("export config ok")););
5675     gs->app_snap = app_snap;
5676     IFDBG(D_BUG, FN; SYCEXP(app_lsn); SYCEXP(gs->log_start);
5677           SYCEXP(gs->log_end));
5678 
5679     /* Set starting point of log to match the snapshot */
5680     /* If we have a valid synode from application snapshot, see if it should be
5681      * used */
5682     if (!synode_eq(null_synode, app_lsn)) {
5683       /* If log_start is null_synode, always use valid synode from application
5684        * snapshot */
5685       if (synode_eq(null_synode, gs->log_start) ||
5686           !synode_gt(app_lsn, gs->log_start)) {
5687         gs->log_start = app_lsn;
5688         IFDBG(D_BUG, FN; STRLIT("using "); SYCEXP(app_lsn));
5689       }
5690     }
5691     ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("gs->log_start"));
5692             add_synode_event(gs->log_start);
5693             add_event(EVENT_DUMP_PAD, string_arg("gs->log_end"));
5694             add_synode_event(gs->log_end););
5695     /* purecov: end */
5696   } else {
5697     gs = export_config();
5698     if (!gs) return 0;
5699     ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("export config ok")););
5700     if (!synode_eq(null_synode, last_config_modification_id)) {
5701       /* No valid valid synode from application snapshot, use
5702        * last_config_modification_id if not null_synode */
5703       gs->log_start = last_config_modification_id;
5704       IFDBG(D_BUG, FN; STRLIT("using "); SYCEXP(last_config_modification_id));
5705     }
5706     IFDBG(D_BUG, FN; SYCEXP(gs->log_start); SYCEXP(gs->log_end));
5707     ADD_DBG(D_BASE, add_event(EVENT_DUMP_PAD, string_arg("gs->log_start"));
5708             add_synode_event(gs->log_start);
5709             add_event(EVENT_DUMP_PAD, string_arg("gs->log_end"));
5710             add_synode_event(gs->log_end););
5711   }
5712   IFDBG(D_BUG, FN; SYCEXP(gs->log_start); SYCEXP(gs->log_end));
5713   return gs;
5714 }
5715 
5716 /* purecov: begin deadcode */
/* Answer a snapshot request `pm` on `reply_queue`: queue a gcs_snapshot_op
   reply carrying a fresh snapshot, queue the log from the snapshot's
   log_start onwards, and send the global view. Does nothing if no snapshot
   can be created. */
static void handle_need_snapshot(linkage *reply_queue, pax_msg *pm) {
  gcs_snapshot *gs = create_snapshot();
  pax_msg *reply;

  if (!gs) return;

  /* The reply starts out as a copy of the request */
  reply = clone_pax_msg(pm);
  ref_msg(reply);
  reply->gcs_snap = gs;
  reply->op = gcs_snapshot_op;
  {
    msg_link *msg_x = msg_link_new(reply, reply->from);
    IFDBG(D_NONE, FN; PTREXP(msg_x));
    link_into(&(msg_x->l), reply_queue);
  }
  unref_msg(&reply);
  IFDBG(D_NONE, FN; STRLIT("sent snapshot"););

  reply_push_log(gs->log_start, reply_queue);
  send_global_view();
}
5735 /* purecov: end */
5736 
/* Task of the xcom FSM timer started by start_x_timer(); NULL when no timer
   is registered (cleared by stop_x_timer() and by the timer task itself) */
static task_env *x_timer = NULL;
5738 
5739 /* Timer for use with the xcom FSM. Will deliver x_fsm_timeout */
static int xcom_timer(task_arg arg) {
  DECL_ENV
  double t; /* Delay before x_fsm_timeout is delivered, taken from arg */
  END_ENV;

  TASK_BEGIN

  ep->t = get_double_arg(arg);
  TASK_DELAY(ep->t);
  /* Delay elapsed: deliver the timeout, passing the delay as argument */
  XCOM_FSM(x_fsm_timeout, double_arg(ep->t));
  FINALLY
  /* Unregister only if x_timer still refers to this task (`stack` is
     presumably the running task, supplied by the task macros -- confirm) */
  if (stack == x_timer) set_task(&x_timer, NULL);
  IFDBG(D_BUG, FN; STRLIT(" timeout "));
  TASK_END;
}
5755 
5756 /* Stop the xcom FSM timer */
stop_x_timer()5757 static void stop_x_timer() {
5758   if (x_timer) {
5759     task_terminate(x_timer);
5760     set_task(&x_timer, NULL);
5761   }
5762 }
5763 
5764 /* Start the xcom FSM timer */
start_x_timer(double t)5765 static void start_x_timer(double t) {
5766   stop_x_timer();
5767   set_task(&x_timer, task_new(xcom_timer, double_arg(t), "xcom_timer",
5768                               XCOM_THREAD_DEBUG));
5769 }
5770 
5771 /* Deliver x_fsm_complete to xcom FSM */
5772 /* purecov: begin deadcode */
/* One-shot task body: delivers x_fsm_complete (with a null argument) to the
   xcom FSM and then ends. arg is unused. */
static int x_fsm_completion_task(task_arg arg) {
  DECL_ENV
  int dummy; /* Not used by the code below */
  END_ENV;

  TASK_BEGIN

      (void)
  arg;
  XCOM_FSM(x_fsm_complete, null_arg);
  FINALLY
  IFDBG(D_BUG, FN; STRLIT(" delivered "));
  TASK_END;
}
5787 /* purecov: end */
5788 
5789 /* Send x_fsm_complete to xcom FSM in the context of the xcom thread. The
5790  * calling thread and the xcom thread must be in a rendezvous. Using a task to
5791  * deliver a message is an abstraction inversion, but it's the simplest solution
5792  * until we get a proper queue-based communication system going. */
5793 /* purecov: begin deadcode */
send_x_fsm_complete()5794 void send_x_fsm_complete() {
5795   task_new(x_fsm_completion_task, null_arg, "x_fsm_completion_task",
5796            XCOM_THREAD_DEBUG);
5797 }
5798 /* purecov: end */
5799 
/* Answer a snapshot request from `node` through `srv`: send a fresh
   snapshot, push the log from the snapshot's log_start onwards, and send the
   global view. Does nothing if no snapshot can be created. */
static void server_handle_need_snapshot(server *srv, site_def const *s,
                                        node_no node) {
  gcs_snapshot *gs = create_snapshot();

  if (!gs) return;

  server_send_snapshot(srv, s, gs, node);
  IFDBG(D_NONE, FN; STRLIT("sent snapshot"););

  /* NOTE(review): gs was attached to the message sent above, whose local
     reference has been dropped by now; reading gs->log_start here assumes the
     snapshot is still alive -- confirm */
  server_push_log(srv, gs->log_start, node);
  send_global_view();
}
5811 
/* Names of the xcom actions as strings. X is temporarily defined as the
   stringizing operator, so each X(...) in x_actions (presumably an X-macro
   list of the xcom_actions values -- confirm) expands to a string literal. */
#define X(b) #b
const char *xcom_actions_name[] = {x_actions};
#undef X
5815 
/* snapshots[i] is set to 1 by note_snapshot() for node i and cleared for all
   nodes by reset_snapshot_mask() */
static int snapshots[NSERVERS];
5817 
5818 /* Note that we have received snapshot from node */
/* Record that a snapshot has been received from `node`.
   VOID_NODE_NO is ignored, and so is any node number that does not fit in
   the snapshots table (NSERVERS entries), since storing it would write
   outside the array. */
static void note_snapshot(node_no node) {
  if (node != VOID_NODE_NO && node < (node_no)NSERVERS) {
    snapshots[node] = 1;
  }
}
5824 
5825 /* Reset set of received snapshots */
reset_snapshot_mask()5826 static void reset_snapshot_mask() {
5827   int i;
5828   for (i = 0; i < NSERVERS; i++) {
5829     snapshots[i] = 0;
5830   }
5831 }
5832 
5833 /* See if we have got a snapshot from every node */
got_all_snapshots()5834 static int got_all_snapshots() {
5835   node_no i;
5836   node_no max = get_maxnodes(get_site_def());
5837   if (0 == max) {
5838     return 0;
5839   }
5840   for (i = 0; i < max; i++) {
5841     if (!snapshots[i]) {
5842       return 0;
5843     }
5844   }
5845   return 1;
5846 }
5847 
/* Log range of the snapshot installed most recently by handle_x_snapshot();
   better_snapshot() compares incoming snapshots against these, and
   xcom_fsm_snapshot_wait_enter() resets both to null_synode */
static synode_no log_start_max; /* Initialized by xcom_fsm */
static synode_no log_end_max;   /* Initialized by xcom_fsm */
5850 
5851 /* See if this snapshot is better than what we already have */
5852 /* purecov: begin deadcode */
/* Returns 1 if `gcs` beats what is installed, else 0. Snapshots are ranked
   by boot key first, then by log_start, then by log_end. */
static int better_snapshot(gcs_snapshot *gcs) {
  synode_no boot_key = config_max_boot_key(gcs);

  /* A newer boot key wins outright; an older one loses outright */
  if (synode_gt(boot_key, get_site_def()->boot_key)) return 1;
  if (!synode_eq(boot_key, get_site_def()->boot_key)) return 0;

  /* Same boot key: a later log start wins */
  if (synode_gt(gcs->log_start, log_start_max)) return 1;

  /* Same log start as well: a later log end wins */
  return synode_eq(gcs->log_start, log_start_max) &&
         synode_gt(gcs->log_end, log_end_max);
}
5861 /* purecov: end */
5862 
5863 /* Install snapshot */
/*
 * handle_x_snapshot: make `gcs` the current state.
 *
 * Imports the configuration from the snapshot, passes the application part
 * to handle_app_snap_cb (when one is installed), sets max synode and the
 * executed message from the snapshot's log range, remembers that range in
 * log_start_max/log_end_max, and records the highest boot key of the
 * snapshot as the last received config.
 *
 * gcs: snapshot to install; its log_end is overwritten with log_start when
 *      this node is not a member of the imported site.
 */
static void handle_x_snapshot(gcs_snapshot *gcs) {
  import_config(gcs);
  if (get_nodeno(get_site_def()) == VOID_NODE_NO) {
    IFDBG(D_BUG, FN; STRLIT("Not member of site, not executing log"));
    gcs->log_end =
        gcs->log_start; /* Avoid executing log if not member of site */
  }
  /* handle_app_snap_cb is a file-level pointer that starts out NULL; guard
     the call like the other callbacks in this file so that a missing
     application handler does not crash the installation */
  if (handle_app_snap_cb) {
    handle_app_snap_cb(&gcs->app_snap, gcs->log_start, gcs->log_end);
  }
  set_max_synode(gcs->log_end);
  /* Execution resumes at the message right after log_start */
  set_executed_msg(incr_synode(gcs->log_start));
  log_start_max = gcs->log_start;
  log_end_max = gcs->log_end;

  set_last_received_config(get_highest_boot_key(gcs));

  IFDBG(D_BUG, FN; SYCEXP(gcs->log_start); SYCEXP(gcs->log_end);
        SYCEXP(last_config_modification_id); SYCEXP(executed_msg););
}
5882 
5883 /* Note that we have received snapshot, and install if better than old */
5884 /* purecov: begin deadcode */
static void update_best_snapshot(gcs_snapshot *gcs) {
  /* Install when there is no site yet, or when gcs ranks higher than what
     is already installed (the ranking is only consulted if a site exists) */
  int const install = (get_site_def() == 0) || better_snapshot(gcs);

  if (install) {
    handle_x_snapshot(gcs);
  }
}
5890 /* purecov: end */
5891 
5892 /* Send need_boot_op to all nodes in current config */
5893 /* purecov: begin deadcode */
send_need_boot()5894 static void send_need_boot() {
5895   pax_msg *p = pax_msg_new_0(null_synode);
5896   ref_msg(p);
5897   p->synode = get_site_def()->start;
5898   p->op = need_boot_op;
5899   send_to_all_except_self(get_site_def(), p, "need_boot_op");
5900   unref_msg(&p);
5901 }
5902 /* purecov: end */
5903 
5904 /* Set log_end of snapshot based on log_end in snapshot and max synode */
void set_log_end(gcs_snapshot *gcs) {
  /* log_end only ever moves forward: leave it alone unless the current max
     synode is greater */
  if (!synode_gt(get_max_synode(), gcs->log_end)) return;

  gcs->log_end = get_max_synode();
}
5910 
/* Declared ahead of its definition so that xcom_fsm_fp can take a pointer to
   it */
struct xcom_fsm_state;
typedef struct xcom_fsm_state xcom_fsm_state;

/* Function pointer corresponding to a state. Return 1 if execution should
 * continue, 0 otherwise */
typedef int (*xcom_fsm_fp)(xcom_actions action, task_arg fsmargs,
                           xcom_fsm_state *ctxt);

/* Function pointer and name */
struct xcom_fsm_state {
  xcom_fsm_fp state_fp;   /* Function implementing the state */
  char const *state_name; /* Name of that function, as a string */
};

/* Initializer for an xcom_fsm_state: function s plus its stringized name */
#define X_FSM_STATE(s) \
  { s, #s }
/* Make function s the state of the FSM. Expects a variable named `ctxt`
   (pointer to xcom_fsm_state) in the scope where it is used. */
#define SET_X_FSM_STATE(s) \
  do {                     \
    ctxt->state_fp = s;    \
    ctxt->state_name = #s; \
  } while (0)
5932 
5933 /* The state functions/thunks */
5934 static int xcom_fsm_init(xcom_actions action, task_arg fsmargs,
5935                          xcom_fsm_state *ctxt);
5936 static int xcom_fsm_start_enter(xcom_actions action, task_arg fsmargs,
5937                                 xcom_fsm_state *ctxt);
5938 static int xcom_fsm_start(xcom_actions action, task_arg fsmargs,
5939                           xcom_fsm_state *ctxt);
5940 static int xcom_fsm_snapshot_wait_enter(xcom_actions action, task_arg fsmargs,
5941                                         xcom_fsm_state *ctxt);
5942 static int xcom_fsm_snapshot_wait(xcom_actions action, task_arg fsmargs,
5943                                   xcom_fsm_state *ctxt);
5944 static int xcom_fsm_recover_wait_enter(xcom_actions action, task_arg fsmargs,
5945                                        xcom_fsm_state *ctxt);
5946 static int xcom_fsm_recover_wait(xcom_actions action, task_arg fsmargs,
5947                                  xcom_fsm_state *ctxt);
5948 static int xcom_fsm_run_enter(xcom_actions action, task_arg fsmargs,
5949                               xcom_fsm_state *ctxt);
5950 static int xcom_fsm_run(xcom_actions action, task_arg fsmargs,
5951                         xcom_fsm_state *ctxt);
5952 
5953 /* You are in a twisting maze of little functions ... */
5954 
5955 /* init state */
static int xcom_fsm_init(xcom_actions action, task_arg fsmargs,
                         xcom_fsm_state *ctxt) {
  /* Neither the action nor its argument matters in this state */
  (void)fsmargs;
  (void)action;

  IFDBG(D_NONE, FN;);

  /* Set up the basic xcom data, then move on to start_enter right away */
  xcom_thread_init();
  SET_X_FSM_STATE(xcom_fsm_start_enter);

  return 1; /* Keep executing in the new state */
}
5966 
5967 /* start_enter state */
static int xcom_fsm_start_enter(xcom_actions action, task_arg fsmargs,
                                xcom_fsm_state *ctxt) {
  /* Neither the action nor its argument matters in this state */
  (void)fsmargs;
  (void)action;

  /* Alternative, more verbose debug mask:
   * push_dbg(D_DETECT | D_FSM | D_FILEOP | D_CONS | D_BASE | D_TRANSPORT);
   */
  push_dbg(D_FSM);
  IFDBG(D_NONE, FN; STRLIT("state x_start"););

  /* Begin from a clean slate: no queued proposals, no snapshots noted, no
     config received */
  empty_prop_input_queue();
  reset_snapshot_mask();
  set_last_received_config(null_synode);

  SET_X_FSM_STATE(xcom_fsm_start);
  return 1; /* Keep executing in the new state */
}
5983 
/* Handle x_fsm_net_boot: install the node group passed in `fsmargs` and, if
   this node is a member of the resulting site, switch to run_enter.
   Returns 1 when the switch happens, otherwise `cont` unchanged. */
static int handle_fsm_net_boot(task_arg fsmargs, xcom_fsm_state *ctxt,
                               int cont) {
  app_data *a = (app_data *)get_void_arg(fsmargs);
  synode_no start;

  install_node_group(a);
  if (!is_member(get_site_def())) return cont;

  empty_prop_input_queue();

  /* Execution begins at the start of the site; msgno 0 may happen during
     initial boot and is bumped to 1 */
  start = get_site_def()->start;
  if (start.msgno == 0) {
    start.msgno = 1;
  }
  set_executed_msg(start);

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_run_enter);
  return 1;
}
6003 
/* Handle x_fsm_snapshot in the start state: install the snapshot passed in
   `fsmargs`, notify the recovery callbacks that are set, and switch to
   run_enter. Always returns 1. */
static int handle_fsm_snapshot(task_arg fsmargs, xcom_fsm_state *ctxt) {
  gcs_snapshot *gcs = (gcs_snapshot *)get_void_arg(fsmargs);

  empty_prop_input_queue();
  set_log_end(gcs);
  handle_x_snapshot(gcs);

  /* Restart the recovery manager, if there is one */
  if (recovery_restart_cb) {
    recovery_restart_cb();
  }

  /* Under control of the recovery manager, recovery_begin_cb and
   * recovery_end_cb are the rendezvous points with it */
  if (recovery_begin_cb) {
    recovery_begin_cb();
  }
  if (recovery_end_cb) {
    recovery_end_cb();
  }

  /* Getting here means we are recovering from another node: one snapshot is
   * enough, so do not wait for more and go straight to the run state */
  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_run_enter);
  return 1;
}
6029 
6030 /* purecov: begin deadcode */
/* Handle x_fsm_snapshot_wait in the start state: arm the FSM timer with
   SNAPSHOT_WAIT_TIME and switch to snapshot_wait_enter. Always returns 1. */
static int handle_fsm_snapshot_wait(xcom_fsm_state *ctxt) {
  empty_prop_input_queue();

  /* The timer bounds how long we wait for a snapshot */
  start_x_timer(SNAPSHOT_WAIT_TIME);

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_snapshot_wait_enter);

  return 1;
}
6038 /* purecov: end */
6039 
/* Handle x_fsm_exit in the start state: terminate all tasks, release site
   definitions and servers, and flag xcom as shut down. */
static void handle_fsm_exit() {
  /* Xcom is finished when we get here */
  push_dbg(D_BUG);
  bury_site(get_group_id(get_site_def()));
  task_terminate_all(); /* Kill, kill, kill, kill, kill, kill. This is
                           the end. */

  /* init_xcom_base(); */ /* Reset shared variables */
  init_tasks();           /* Reset task variables */
  free_site_defs();
  free_forced_config_site_def();
  wait_forced_config = 0;
  garbage_collect_servers();
  IFDBG(D_NONE, FN; STRLIT("shutting down"));
  xcom_shutdown = 1; /* Loops testing xcom_shutdown stop from here on */
  start_config = null_synode;
  G_DEBUG("Exiting xcom thread");
}
6058 
6059 /* start state */
static int xcom_fsm_start(xcom_actions action, task_arg fsmargs,
                          xcom_fsm_state *ctxt) {
  /* 0 on the very first call only; any later x_fsm_init re-initializes the
     cache */
  static int need_init_cache = 0;
  int cont = 0; /* Becomes 1 if execution should continue in a new state */

  if (action == x_fsm_init) {
    xcom_shutdown = 0;
    sent_alive = 0.0;
    oom_abort = 0;
    if (need_init_cache) init_cache();
  } else if (action == x_fsm_net_boot) {
    cont = handle_fsm_net_boot(fsmargs, ctxt, cont);
  } else if (action == x_fsm_snapshot) {
    cont = handle_fsm_snapshot(fsmargs, ctxt);
  }
  /* x_fsm_snapshot_wait is the entry point for the initial recovery after
   * process start when running under an external recovery manager; in that
   * case we are called from the recovery manager thread. */
  /* purecov: begin deadcode */
  else if (action == x_fsm_snapshot_wait) {
    cont = handle_fsm_snapshot_wait(ctxt);
  }
  /* purecov: end */
  else if (action == x_fsm_exit) {
    handle_fsm_exit();
  }
  /* Every other action is ignored in this state */

  need_init_cache = 1;
  return cont;
}
6101 
6102 /* snapshot_wait_enter state */
6103 /* purecov: begin deadcode */
static int xcom_fsm_snapshot_wait_enter(xcom_actions action, task_arg fsmargs,
                                        xcom_fsm_state *ctxt) {
  /* Neither the action nor its argument matters in this state */
  (void)fsmargs;
  (void)action;

  push_dbg(D_DETECT | D_FSM | D_FILEOP | D_CONS | D_BASE | D_TRANSPORT);
  IFDBG(D_NONE, FN; STRLIT("state x_snapshot_wait"););

  /* Forget the log range of any snapshot seen before */
  log_start_max = null_synode;
  log_end_max = null_synode;

  SET_X_FSM_STATE(xcom_fsm_snapshot_wait);
  return 0; /* Stop here and wait for the next action */
}
6115 /* purecov: end */
6116 
6117 /* purecov: begin deadcode */
/* Handle x_fsm_local_snapshot: consider the local snapshot passed in
   `fsmargs`, ask the other nodes for theirs, and switch to
   recover_wait_enter. Always returns 1. */
static int handle_local_snapshot(task_arg fsmargs, xcom_fsm_state *ctxt) {
  gcs_snapshot *local_snap = (gcs_snapshot *)get_void_arg(fsmargs);

  update_best_snapshot(local_snap);

  /* For local recovery the node number is only known once the snapshot has
   * been processed, so read it from site_def now */
  note_snapshot(get_site_def()->nodeno);
  send_need_boot();

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_recover_wait_enter);
  return 1;
}
6128 /* purecov: end */
6129 
6130 /* purecov: begin deadcode */
/* Handle x_fsm_snapshot while waiting for snapshots: consider the snapshot
   from another node passed in `fsmargs`, ask the other nodes for theirs, and
   switch to recover_wait_enter. Always returns 1. */
static int handle_snapshot(task_arg fsmargs, xcom_fsm_state *ctxt) {
  gcs_snapshot *remote_snap = (gcs_snapshot *)get_void_arg(fsmargs);

  set_log_end(remote_snap);
  update_best_snapshot(remote_snap);

  /* A site exists now. Mark the local snapshot as processed even though none
   * was seen: if we got here, no local snapshot will ever arrive, and this
   * keeps the test in got_all_snapshots() simple */
  note_snapshot(get_site_def()->nodeno);
  send_need_boot();

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_recover_wait_enter);
  return 1;
}
6146 /* purecov: end */
6147 
6148 /* snapshot_wait state */
6149 /* purecov: begin deadcode */
static int xcom_fsm_snapshot_wait(xcom_actions action, task_arg fsmargs,
                                  xcom_fsm_state *ctxt) {
  /* x_fsm_local_snapshot means we are called from the recovery manager
   * thread */
  if (action == x_fsm_local_snapshot) {
    return handle_local_snapshot(fsmargs, ctxt);
  }

  if (action == x_fsm_snapshot) {
    return handle_snapshot(fsmargs, ctxt);
  }

  if (action == x_fsm_timeout) {
    /* No snapshot became available in time. Under control of the recovery
     * manager, recovery_end_cb is the rendezvous point with it */
    if (recovery_end_cb) recovery_end_cb();
    pop_dbg();
    SET_X_FSM_STATE(xcom_fsm_start_enter);
    return 1;
  }

  /* Any other action is ignored in this state */
  return 0;
}
6175 /* purecov: end */
6176 
6177 /* recover_wait_enter state */
6178 /* purecov: begin deadcode */
static int xcom_fsm_recover_wait_enter(xcom_actions action, task_arg fsmargs,
                                       xcom_fsm_state *ctxt) {
  /* Neither the action nor its argument matters in this state */
  (void)fsmargs;
  (void)action;

  push_dbg(D_DETECT | D_FSM | D_FILEOP | D_CONS | D_BASE | D_TRANSPORT);
  IFDBG(D_NONE, FN; STRLIT("state x_recover_wait"););

  /* If nothing is left to wait for, say so with a message, so that the
   * transition happens in the context of the xcom thread */
  if (got_all_snapshots()) {
    send_x_fsm_complete();
  }

  SET_X_FSM_STATE(xcom_fsm_recover_wait);
  return 0; /* Stop here and wait for the next action */
}
6193 /* purecov: end */
6194 
6195 /* recover_wait state */
6196 /* purecov: begin deadcode */
static int xcom_fsm_recover_wait(xcom_actions action, task_arg fsmargs,
                                 xcom_fsm_state *ctxt) {
  switch (action) {
    case x_fsm_snapshot: {
      /* One more snapshot arrived: keep it if it is the best so far */
      gcs_snapshot *gcs = (gcs_snapshot *)get_void_arg(fsmargs);
      set_log_end(gcs);
      update_best_snapshot(gcs);
      break;
    }

    case x_fsm_timeout:
    case x_fsm_complete:
      /* The wait is over, either by timeout or because every node has sent
       * a snapshot. Under control of the recovery manager, recovery_end_cb
       * is the rendezvous point with it */
      if (recovery_end_cb) recovery_end_cb();
      pop_dbg();
      SET_X_FSM_STATE(xcom_fsm_run_enter);
      return 1;

    default:
      break;
  }

  /* Still waiting: if all snapshots are in, send a message so that the
   * transition happens in the context of the xcom thread */
  if (got_all_snapshots()) {
    send_x_fsm_complete();
  }
  return 0;
}
6220 /* purecov: end */
6221 
6222 /* run_enter state */
/* Entry actions for the run state: record the boot key as start_config, make
   sure executed_msg belongs to a known site, stop the FSM timer, call
   xcom_run_cb if set, and start the proposers and the executor, sweeper,
   detector, alive and cache manager tasks. Always returns 1 with the state
   set to xcom_fsm_run. action and fsmargs are unused. */
static int xcom_fsm_run_enter(xcom_actions action, task_arg fsmargs,
                              xcom_fsm_state *ctxt) {
  (void)action;
  (void)fsmargs;
  start_config = get_site_def()->boot_key;

  /* Final sanity check of executed_msg */
  if (find_site_def(executed_msg) == 0) {
    /* No site_def matches executed_msg, set it to site->start */
    set_executed_msg(get_site_def()->start);
  }

  IFDBG(D_NONE, FN; STRLIT("state x_run"););
  IFDBG(D_BUG, FN; SYCEXP(executed_msg););
  IFDBG(D_BUG, FN; SYCEXP(start_config););
  stop_x_timer(); /* A timer armed while waiting for snapshots is obsolete */
  if (xcom_run_cb) xcom_run_cb(0);
  client_boot_done = 1;
  netboot_ok = 1;
  set_proposer_startpoint();
  create_proposers();
  /* Start the tasks of the run state; handle_fsm_terminate() stops the same
     set again */
  set_task(&executor, task_new(executor_task, null_arg, "executor_task",
                               XCOM_THREAD_DEBUG));
  set_task(&sweeper,
           task_new(sweeper_task, null_arg, "sweeper_task", XCOM_THREAD_DEBUG));
  set_task(&detector, task_new(detector_task, null_arg, "detector_task",
                               XCOM_THREAD_DEBUG));
  set_task(&alive_t,
           task_new(alive_task, null_arg, "alive_task", XCOM_THREAD_DEBUG));
  set_task(&cache_task, task_new(cache_manager_task, null_arg,
                                 "cache_manager_task", XCOM_THREAD_DEBUG));

  push_dbg(D_FSM /* | D_EXEC | D_BASE | D_TRANSPORT */);
  SET_X_FSM_STATE(xcom_fsm_run);
  return 1;
}
6259 
/*
 * Handle x_fsm_terminate while in the run state.
 *
 * Clears the boot flags, restarts the proposer bookkeeping, terminates every
 * background task and clears its handle, resets the shared xcom state, and
 * finally notifies xcom_terminate_cb (if set) with the integer carried in
 * fsmargs. The FSM is moved to xcom_fsm_start_enter.
 *
 * Returns 1 so that the trampoline immediately invokes the new state.
 */
static int handle_fsm_terminate(task_arg fsmargs, xcom_fsm_state *ctxt) {
/* Terminate a task and reset the variable that refers to it. */
#define TERMINATE_AND_CLEAR(task_var) \
  do {                                \
    task_terminate(task_var);         \
    set_task(&(task_var), NULL);      \
  } while (0)

  dump_debug_exec_state();

  client_boot_done = 0;
  netboot_ok = 0;
  oom_abort = 0;

  terminate_proposers();
  init_proposers();

  TERMINATE_AND_CLEAR(executor);
  TERMINATE_AND_CLEAR(sweeper);
  TERMINATE_AND_CLEAR(detector);
  TERMINATE_AND_CLEAR(alive_t);
  TERMINATE_AND_CLEAR(cache_task);

#undef TERMINATE_AND_CLEAR

  init_xcom_base(); /* Reset shared variables */
  free_site_defs();
  free_forced_config_site_def();
  wait_forced_config = 0;
  garbage_collect_servers();

  if (xcom_terminate_cb) {
    xcom_terminate_cb(get_int_arg(fsmargs));
  }

  pop_dbg();
  SET_X_FSM_STATE(xcom_fsm_start_enter);
  return 1;
}
6288 
/*
 * Handle x_fsm_force_config while in the run state.
 *
 * Builds a site definition from the app_data carried in fsmargs, starting at
 * executed_msg and using executed_msg as its boot key, invalidates servers
 * relative to the current site definition, and starts the forced
 * configuration. wait_forced_config is raised because the forced config has
 * not yet arrived.
 */
static void handle_fsm_force_config(task_arg fsmargs) {
  app_data *forced_app = (app_data *)get_void_arg(fsmargs);
  site_def *forced_site = create_site_def_with_start(forced_app, executed_msg);

  forced_site->boot_key = executed_msg;
  invalidate_servers(get_site_def(), forced_site);
  start_force_config(forced_site, 1);

  /* Note that forced config has not yet arrived */
  wait_forced_config = 1;
}
6298 
6299 /* run state */
xcom_fsm_run(xcom_actions action,task_arg fsmargs,xcom_fsm_state * ctxt)6300 static int xcom_fsm_run(xcom_actions action, task_arg fsmargs,
6301                         xcom_fsm_state *ctxt) {
6302   switch (action) {
6303     case x_fsm_terminate:
6304       return handle_fsm_terminate(fsmargs, ctxt);
6305 
6306     /* purecov: begin deadcode */
6307     case x_fsm_need_snapshot:
6308       IFDBG(D_NONE, STRLIT("got snapshot request in x_run state"));
6309       break;
6310       /* purecov: end */
6311 
6312     case x_fsm_force_config:
6313       handle_fsm_force_config(fsmargs);
6314       break;
6315 
6316     default:
6317       break;
6318   }
6319   return 0;
6320 }
6321 
6322 /* Trampoline which loops calling thunks pointed to by ctxt.state_fp until 0 is
6323  * returned. Return pointer to ctxt. */
xcom_fsm_impl(xcom_actions action,task_arg fsmargs)6324 xcom_fsm_state *xcom_fsm_impl(xcom_actions action, task_arg fsmargs) {
6325   static xcom_fsm_state ctxt = X_FSM_STATE(xcom_fsm_init);
6326 
6327   G_DEBUG("%f pid %d xcom_id %x state %s action %s", seconds(), xpid(),
6328           get_my_xcom_id(), ctxt.state_name, xcom_actions_name[action]);
6329   ADD_DBG(D_FSM, add_event(EVENT_DUMP_PAD, string_arg("state"));
6330           add_event(EVENT_DUMP_PAD, string_arg(ctxt.state_name));
6331           add_event(EVENT_DUMP_PAD, string_arg("action"));
6332           add_event(EVENT_DUMP_PAD, string_arg(xcom_actions_name[action]));
6333           add_event(EVENT_DUMP_PAD, string_arg("executed_msg"));
6334           add_synode_event(executed_msg););
6335 #ifdef TASK_EVENT_TRACE
6336   dump_task_events();
6337 #endif
6338   /* Crank the state machine until it stops */
6339   IFDBG(D_BUG, FN; STREXP(ctxt.state_name); STREXP(xcom_actions_name[action]));
6340   while (ctxt.state_fp(action, fsmargs, &ctxt)) {
6341     IFDBG(D_BUG, FN; STREXP(ctxt.state_name);
6342           STREXP(xcom_actions_name[action]));
6343   }
6344   return &ctxt;
6345 }
6346 
6347 /* Call FSM trampoline and return state name of resulting state */
xcom_fsm(xcom_actions action,task_arg fsmargs)6348 char const *xcom_fsm(xcom_actions action, task_arg fsmargs) {
6349   xcom_fsm_state *s = xcom_fsm_impl(action, fsmargs);
6350   return s->state_name;
6351 }
6352 
6353 /* See if we can send a snapshot to another node */
6354 /* purecov: begin deadcode */
can_send_snapshot()6355 static int can_send_snapshot() {
6356   xcom_fsm_state *state = xcom_fsm_impl(x_fsm_need_snapshot, null_arg);
6357   return state->state_fp == xcom_fsm_run;
6358 }
6359 /* purecov: end */
6360 
/* Install x as the application snapshot handler callback
 * (stored in handle_app_snap_cb). */
void set_app_snap_handler(app_snap_handler x) {
  handle_app_snap_cb = x;
}
6362 
6363 /* purecov: begin deadcode */
/* Install x as the application snapshot getter callback
 * (stored in get_app_snap_cb). */
void set_app_snap_getter(app_snap_getter x) {
  get_app_snap_cb = x;
}
6365 /* purecov: end */
6366 
checked_create_socket(int domain,int type,int protocol)6367 static result checked_create_socket(int domain, int type, int protocol) {
6368   result retval = {0, 0};
6369   int nr_attempts = 1005;
6370 
6371   do {
6372     SET_OS_ERR(0);
6373     retval.val = (int)socket(domain, type, protocol);
6374     retval.funerr = to_errno(GET_OS_ERR);
6375     if (nr_attempts % 10 == 0) xcom_sleep(1);
6376   } while (--nr_attempts && retval.val == -1 &&
6377            (from_errno(retval.funerr) == SOCK_EAGAIN));
6378 
6379   if (retval.val == -1) {
6380     task_dump_err(retval.funerr);
6381 #if defined(_WIN32)
6382     G_MESSAGE("Socket creation failed with error %d.", retval.funerr);
6383 #else
6384     G_MESSAGE("Socket creation failed with error %d - %s.", retval.funerr,
6385               strerror(retval.funerr));
6386 #endif
6387   }
6388   return retval;
6389 }
6390 
6391 /* Read max n bytes from socket fd into buffer buf */
socket_read(connection_descriptor * rfd,void * buf,int n)6392 static result socket_read(connection_descriptor *rfd, void *buf, int n) {
6393   result ret = {0, 0};
6394 
6395   assert(n >= 0);
6396 
6397   do {
6398     ret = con_read(rfd, buf, n);
6399     task_dump_err(ret.funerr);
6400   } while (ret.val < 0 && can_retry_read(ret.funerr));
6401   return ret;
6402 }
6403 
6404 /* Read exactly n bytes from socket fd into buffer buf */
socket_read_bytes(connection_descriptor * rfd,char * p,uint32_t n)6405 static int64_t socket_read_bytes(connection_descriptor *rfd, char *p,
6406                                  uint32_t n) {
6407   uint32_t left = n;
6408   char *bytes = p;
6409 
6410   result nread = {0, 0};
6411 
6412   while (left > 0) {
6413     /*
6414       socket_read just reads no more than INT_MAX bytes. We should not pass
6415       a length more than INT_MAX to it.
6416     */
6417     int r = (int)MIN(left, INT_MAX);
6418 
6419     nread = socket_read(rfd, bytes, r);
6420     if (nread.val == 0) {
6421       return 0;
6422     } else if (nread.val < 0) {
6423       return -1;
6424     } else {
6425       bytes += nread.val;
6426       left -= (uint32_t)nread.val;
6427     }
6428   }
6429   assert(left == 0);
6430   return n;
6431 }
6432 
6433 /* Write n bytes from buffer buf to socket fd */
socket_write(connection_descriptor * wfd,void * _buf,uint32_t n)6434 static int64_t socket_write(connection_descriptor *wfd, void *_buf,
6435                             uint32_t n) {
6436   char *buf = (char *)_buf;
6437   result ret = {0, 0};
6438 
6439   uint32_t total; /* Keeps track of number of bytes written so far */
6440 
6441   total = 0;
6442   while (total < n) {
6443     int w = (int)MIN(n - total, INT_MAX);
6444 
6445     while ((ret = con_write(wfd, buf + total, w)).val < 0 &&
6446            can_retry_write(ret.funerr)) {
6447       task_dump_err(ret.funerr);
6448       IFDBG(D_NONE, FN; STRLIT("retry "); NEXP(total, d); NEXP(n, d));
6449     }
6450     if (ret.val <= 0) { /* Something went wrong */
6451       task_dump_err(ret.funerr);
6452       return -1;
6453     } else {
6454       total += (uint32_t)ret.val; /* Add number of bytes written to total */
6455     }
6456   }
6457   IFDBG(D_TRANSPORT, FN; NEXP(total, u); NEXP(n, u));
6458   assert(total == n);
6459   return (total);
6460 }
6461 
xcom_close_socket(int * sock)6462 static inline result xcom_close_socket(int *sock) {
6463   result res = {0, 0};
6464   if (*sock != -1) {
6465     IFDBG(D_FILEOP, FN; STRLIT("closing socket "); NDBG(*sock, d));
6466     do {
6467       SET_OS_ERR(0);
6468       res.val = CLOSESOCKET(*sock);
6469       res.funerr = to_errno(GET_OS_ERR);
6470     } while (res.val == -1 && from_errno(res.funerr) == SOCK_EINTR);
6471     *sock = -1;
6472   }
6473   return res;
6474 }
6475 
xcom_shut_close_socket(int * sock)6476 static inline result xcom_shut_close_socket(int *sock) {
6477   result res = {0, 0};
6478   if (*sock >= 0) {
6479     shutdown_socket(sock);
6480     res = xcom_close_socket(sock);
6481   }
6482   return res;
6483 }
6484 
/* Mark the connection attempt as failed and jump to the cleanup label.
 * Expects an int named ret_fd and a label named end in the enclosing
 * function. Wrapped in do { } while (0) so that the macro behaves as a single
 * statement, e.g. as the body of an unbraced if. */
#define CONNECT_FAIL \
  do {               \
    ret_fd = -1;     \
    goto end;        \
  } while (0)
6488 
6489 /*
6490 
6491 */
6492 
6493 /**
6494   @brief Retreives a node IPv4 address, if it exists.
6495 
6496   If a node is v4 reachable, means one of two:
6497   - The raw address is V4
6498   - a name was resolved to a V4/V6 address
6499 
6500   If the later is the case, we are going to prefer the first v4
6501   address in the list, since it is the common language between
6502   old and new version. If you want exclusive V6, please configure your
6503   DNS server to serve V6 names
6504 
6505   @param retrieved a previously retrieved struct addrinfo
6506   @return struct addrinfo* An addrinfo of the first IPv4 address. Else it will
6507                            return the entry parameter.
6508  */
does_node_have_v4_address(struct addrinfo * retrieved)6509 struct addrinfo *does_node_have_v4_address(struct addrinfo *retrieved) {
6510   struct addrinfo *cycle = NULL;
6511 
6512   int v4_reachable = is_node_v4_reachable_with_info(retrieved);
6513 
6514   if (v4_reachable) {
6515     cycle = retrieved;
6516     while (cycle) {
6517       if (cycle->ai_family == AF_INET) {
6518         return cycle;
6519       }
6520       cycle = cycle->ai_next;
6521     }
6522   }
6523 
6524   /* If something goes really wrong... we fallback to avoid crashes */
6525   return retrieved;
6526 }
6527 
/**
  Connect a socket to an address, giving up after a timeout.

  The socket is switched to non-blocking mode, connect() is issued, and if the
  connection is still in progress poll() waits for the socket to become
  writable. Before returning through the end label the socket is switched back
  to blocking mode.

  @param fd        socket to connect
  @param sock_addr address to connect to
  @param sock_size size of sock_addr
  @param timeout   value handed to poll() as its timeout (milliseconds)
  @return fd on success, -1 on failure. -1 is also returned if the socket
          cannot be made non-blocking (in which case nothing else is done) or
          cannot be switched back to blocking mode.
 */
static int timed_connect_msec(int fd, struct sockaddr *sock_addr,
                              socklen_t sock_size, int timeout) {
  int ret_fd = fd; /* Value to return; CONNECT_FAIL resets it to -1 */
  int syserr;
  int sysret;
  struct pollfd fds;

  /* We only wait for the socket to become writable */
  fds.fd = fd;
  fds.events = POLLOUT;
  fds.revents = 0;

  /* Set non-blocking */
  if (unblock_fd(fd) < 0) return -1;

  /* Trying to connect with timeout */
  SET_OS_ERR(0);
  sysret = connect(fd, sock_addr, sock_size);

  /* If connect() did not report an error the connection is already
   * established and the whole poll section is skipped. */
  if (is_socket_error(sysret)) {
    syserr = GET_OS_ERR;
    /* If the error is SOCK_EWOULDBLOCK or SOCK_EINPROGRESS or SOCK_EALREADY,
     * wait. */
    switch (syserr) {
      case SOCK_EWOULDBLOCK:
      case SOCK_EINPROGRESS:
      case SOCK_EALREADY:
        break;
      default:
        /* Any other error is final */
        G_DEBUG(
            "connect - Error connecting "
            "(socket=%d, error=%d).",
            fd, GET_OS_ERR);
        CONNECT_FAIL;
    }

    SET_OS_ERR(0);
    IFDBG(D_TRANSPORT, FN; STRLIT("poll - Starting. "); NEXP(timeout, d);
          NEXP(sysret, d));
    /* Restart poll() when it fails with SOCK_EINTR or SOCK_EINPROGRESS; any
     * other failure ends the loop with sysret < 0. */
    while ((sysret = poll(&fds, 1, timeout)) < 0) {
      syserr = GET_OS_ERR;
      if (syserr != SOCK_EINTR && syserr != SOCK_EINPROGRESS) break;
      SET_OS_ERR(0);
    }
    IFDBG(D_TRANSPORT, FN; STRLIT("poll - Finished. "); NEXP(timeout, d);
          NEXP(sysret, d));

    /* poll() returned 0: nothing happened on the socket within the timeout */
    if (sysret == 0) {
      /* NOTE(review): the value logged as "error" here is sysret, which is
       * always 0 in this branch. */
      G_DEBUG(
          "Timed out while waiting for connection to be established! "
          "Cancelling connection attempt. (socket= %d, error=%d)",
          fd, sysret);
      /* G_WARNING("poll - Timeout! Cancelling connection..."); */
      CONNECT_FAIL;
    }

    if (is_socket_error(sysret)) {
      G_DEBUG(
          "poll - Error while connecting! "
          "(socket= %d, error=%d)",
          fd, GET_OS_ERR);
      CONNECT_FAIL;
    }

    /* poll() reported an event: inspect it. None of the checks below returns
     * early; each failed check just marks the attempt as failed. */
    {
      int socket_errno = 0;
      socklen_t socket_errno_len = sizeof(socket_errno);

      if ((fds.revents & POLLOUT) == 0) {
        IFDBG(D_NONE, FN; STRLIT("POLLOUT not set - Socket failure!"););
        ret_fd = -1;
      }

      if (fds.revents & (POLLERR | POLLHUP | POLLNVAL)) {
        IFDBG(D_NONE, FN;
              STRLIT("POLLERR | POLLHUP | POLLNVAL set - Socket failure!"););
        ret_fd = -1;
      }
      /* Fetch the pending error of the socket (SO_ERROR); a non-zero value
       * means that the connection attempt failed. */
      if (getsockopt(fd, SOL_SOCKET, SO_ERROR, (xcom_buf *)&socket_errno,
                     &socket_errno_len) != 0) {
        G_DEBUG("getsockopt socket %d failed.", fd);
        ret_fd = -1;
      } else {
        if (socket_errno != 0) {
          G_DEBUG("Connection to socket %d failed with error %d.", fd,
                  socket_errno);
          ret_fd = -1;
        }
      }
    }
  }

end:
  /* Set blocking */
  SET_OS_ERR(0);
  if (block_fd(fd) < 0) {
    G_DEBUG(
        "Unable to set socket back to blocking state. "
        "(socket=%d, error=%d).",
        fd, GET_OS_ERR);
    return -1;
  }
  return ret_fd;
}
6631 
/* Connect fd to sock_addr using the default timeout of 10000 milliseconds.
 * Returns what timed_connect_msec() returns: fd on success, -1 on failure. */
static int timed_connect(int fd, struct sockaddr *sock_addr,
                         socklen_t sock_size) {
  enum { default_timeout_msec = 10000 };

  return timed_connect_msec(fd, sock_addr, sock_size, default_timeout_msec);
}
6636 
6637 /* purecov: begin deadcode */
/* Connect fd to sock_addr with a timeout given in seconds.
 *
 * The timeout is converted to milliseconds for timed_connect_msec(). The
 * conversion saturates at INT_MAX / INT_MIN instead of overflowing, since
 * signed overflow is undefined behaviour.
 *
 * Returns what timed_connect_msec() returns: fd on success, -1 on failure. */
int timed_connect_sec(int fd, struct sockaddr *sock_addr, socklen_t sock_size,
                      int timeout) {
  int timeout_msec;

  if (timeout > INT_MAX / 1000) {
    timeout_msec = INT_MAX;
  } else if (timeout < INT_MIN / 1000) {
    timeout_msec = INT_MIN;
  } else {
    timeout_msec = timeout * 1000;
  }

  return timed_connect_msec(fd, sock_addr, sock_size, timeout_msec);
}
6642 /* purecov: end */
6643 
6644 /* Connect to server on given port */
6645 #ifndef XCOM_WITHOUT_OPENSSL
connect_xcom(char const * server,xcom_port port,int use_ssl)6646 static connection_descriptor *connect_xcom(char const *server, xcom_port port,
6647                                            int use_ssl) {
6648 #else
6649 static connection_descriptor *connect_xcom(char const *server, xcom_port port) {
6650 #endif
6651   result fd = {0, 0};
6652   result ret = {0, 0};
6653   connection_descriptor *cd = NULL;
6654   char buf[SYS_STRERROR_SIZE];
6655 
6656   IFDBG(D_NONE, FN; STREXP(server); NEXP(port, d));
6657   G_DEBUG("connecting to %s %d", server, port);
6658 
6659   {
6660     struct addrinfo *addr = NULL, *from_ns = NULL;
6661 
6662     char buffer[20];
6663     sprintf(buffer, "%d", port);
6664 
6665     checked_getaddrinfo(server, buffer, 0, &from_ns);
6666 
6667     if (from_ns == NULL) {
6668       /* purecov: begin inspected */
6669       G_ERROR("Error retrieving server information.");
6670       goto end;
6671       /* purecov: end */
6672     }
6673 
6674     addr = does_node_have_v4_address(from_ns);
6675 
6676     /* Create socket after knowing the family that we are dealing with
6677        getaddrinfo returns a list of possible addresses. We will alays default
6678        to the first one in the list, which is V4 if applicable.
6679      */
6680     if ((fd = checked_create_socket(addr->ai_family, SOCK_STREAM, IPPROTO_TCP))
6681             .val < 0) {
6682       /* purecov: begin inspected */
6683       G_ERROR(
6684           "Error creating socket in local GR->GCS connection to address %s.",
6685           server);
6686       goto end;
6687       /* purecov: end */
6688     }
6689 
6690     /* Connect socket to address */
6691 
6692     SET_OS_ERR(0);
6693 
6694     if (timed_connect(fd.val, addr->ai_addr, (socklen_t)addr->ai_addrlen) ==
6695         -1) {
6696       fd.funerr = to_errno(GET_OS_ERR);
6697       G_DEBUG(
6698           "Connecting socket to address %s in port %d failed with error %d - "
6699           "%s.",
6700           server, port, fd.funerr, strerr_msg(buf, sizeof(buf), fd.funerr));
6701       xcom_close_socket(&fd.val);
6702       goto end;
6703     }
6704     {
6705       int peer = 0;
6706       /* Sanity check before return */
6707       SET_OS_ERR(0);
6708       {
6709         socklen_t ai_addrlen = (socklen_t)addr->ai_addrlen;
6710         ret.val = peer = xcom_getpeername(fd.val, addr->ai_addr, &ai_addrlen);
6711       }
6712       ret.funerr = to_errno(GET_OS_ERR);
6713       if (peer >= 0) {
6714         ret = set_nodelay(fd.val);
6715         if (ret.val < 0) {
6716           /* purecov: begin inspected */
6717           task_dump_err(ret.funerr);
6718           xcom_shut_close_socket(&fd.val);
6719 #if defined(_WIN32)
6720           G_DEBUG(
6721               "Setting node delay failed  while connecting to %s with error "
6722               "%d.",
6723               server, ret.funerr);
6724 #else
6725           G_DEBUG(
6726               "Setting node delay failed  while connecting to %s with error %d "
6727               "- "
6728               "%s.",
6729               server, ret.funerr, strerror(ret.funerr));
6730 #endif
6731           goto end;
6732           /* purecov: end */
6733         }
6734         G_DEBUG("client connected to %s %d fd %d", server, port, fd.val);
6735       } else {
6736         /* Something is wrong */
6737         /* purecov: begin inspected */
6738         socklen_t errlen = sizeof(ret.funerr);
6739         IFDBG(D_NONE, FN; STRLIT("xcom_getpeername failed"););
6740         if (ret.funerr) {
6741           IFDBG(D_NONE, FN; NEXP(from_errno(ret.funerr), d);
6742                 STRLIT(strerror(from_errno(ret.funerr))));
6743         }
6744         getsockopt(fd.val, SOL_SOCKET, SO_ERROR, (xcom_buf *)&ret.funerr,
6745                    &errlen);
6746         if (ret.funerr == 0) {
6747           ret.funerr = to_errno(SOCK_ECONNREFUSED);
6748         }
6749         xcom_shut_close_socket(&fd.val);
6750 #if defined(_WIN32)
6751         G_DEBUG(
6752             "Getting the peer name failed while connecting to server %s with "
6753             "error %d.",
6754             server, ret.funerr);
6755 #else
6756         G_DEBUG(
6757             "Getting the peer name failed while connecting to server %s with "
6758             "error %d -%s.",
6759             server, ret.funerr, strerror(ret.funerr));
6760 #endif
6761         goto end;
6762         /* purecov: end */
6763       }
6764 
6765 #ifndef XCOM_WITHOUT_OPENSSL
6766       if (use_ssl && xcom_use_ssl()) {
6767         SSL *ssl = SSL_new(client_ctx);
6768         G_DEBUG("Trying to connect using SSL.")
6769         SSL_set_fd(ssl, fd.val);
6770 
6771         ERR_clear_error();
6772         ret.val = SSL_connect(ssl);
6773         ret.funerr = to_ssl_err(SSL_get_error(ssl, ret.val));
6774 
6775         if (ret.val != SSL_SUCCESS) {
6776           /* purecov: begin inspected */
6777           G_MESSAGE("Error connecting using SSL %d %d.", ret.funerr,
6778                     SSL_get_error(ssl, ret.val));
6779           task_dump_err(ret.funerr);
6780           SSL_shutdown(ssl);
6781           SSL_free(ssl);
6782           xcom_shut_close_socket(&fd.val);
6783 
6784           goto end;
6785           /* purecov: end */
6786         }
6787         IFDBG(D_NONE, FN; STRLIT("ssl connected to "); STRLIT(server);
6788               NDBG(port, d); NDBG(fd.val, d); PTREXP(ssl));
6789 
6790         if (ssl_verify_server_cert(ssl, server)) {
6791           /* purecov: begin inspected */
6792           G_MESSAGE("Error validating certificate and peer.");
6793           task_dump_err(ret.funerr);
6794           SSL_shutdown(ssl);
6795           SSL_free(ssl);
6796           xcom_shut_close_socket(&fd.val);
6797 
6798           goto end;
6799           /* purecov: end */
6800         }
6801 
6802         cd = new_connection(fd.val, ssl);
6803         set_connected(cd, CON_FD);
6804         G_DEBUG("Success connecting using SSL.")
6805 
6806         goto end;
6807       } else {
6808         cd = new_connection(fd.val, 0);
6809         set_connected(cd, CON_FD);
6810 
6811         goto end;
6812       }
6813 #else
6814       {
6815         cd = new_connection(fd.val);
6816         set_connected(cd, CON_FD);
6817 
6818         goto end;
6819       }
6820 #endif
6821     }
6822 
6823   end:
6824     if (from_ns) freeaddrinfo(from_ns);
6825   }
6826   return cd;
6827 }
6828 
6829 connection_descriptor *xcom_open_client_connection(char const *server,
6830                                                    xcom_port port) {
6831 #ifndef XCOM_WITHOUT_OPENSSL
6832   return connect_xcom(server, port, TRUE);
6833 #else
6834   return connect_xcom(server, port);
6835 #endif
6836 }
6837 
6838 /* Send a protocol negotiation message on connection con */
6839 static int xcom_send_proto(connection_descriptor *con, xcom_proto x_proto,
6840                            x_msg_type x_type, unsigned int tag) {
6841   char buf[MSG_HDR_SIZE];
6842   memset(buf, 0, MSG_HDR_SIZE);
6843 
6844   if (con->fd >= 0) {
6845     con->snd_tag = tag;
6846     write_protoversion(VERS_PTR((unsigned char *)buf), x_proto);
6847     put_header_1_0((unsigned char *)buf, 0, x_type, tag);
6848     {
6849       int sent;
6850       sent = (int)socket_write(con, buf, MSG_HDR_SIZE);
6851       if (con->fd < 0) {
6852         return -1;
6853       }
6854       return sent;
6855     }
6856   } else {
6857     return -1;
6858   }
6859 }
6860 
6861 static int xcom_recv_proto(connection_descriptor *rfd, xcom_proto *x_proto,
6862                            x_msg_type *x_type, unsigned int *tag) {
6863   int n;
6864   unsigned char header_buf[MSG_HDR_SIZE];
6865   uint32_t msgsize;
6866 
6867   /* Read length field, protocol version, and checksum */
6868   n = (int)socket_read_bytes(rfd, (char *)header_buf, MSG_HDR_SIZE);
6869 
6870   if (n != MSG_HDR_SIZE) {
6871     IFDBG(D_NONE, FN; NDBG(n, d));
6872     return -1;
6873   }
6874 
6875   *x_proto = read_protoversion(VERS_PTR(header_buf));
6876   get_header_1_0(header_buf, &msgsize, x_type, tag);
6877 
6878   return n;
6879 }
6880 
/* Tag sent by the client in its protocol negotiation request. The reply is
 * only accepted if it carries the same tag. */
enum { TAG_START = 313 };
6882 
6883 /**
6884  * @brief Checks if a given app_data is from a given cargo_type.
6885  *
6886  * @param a the app_data
6887  * @param t the cargo type
6888  * @return int TRUE (1) if app_data a is from cargo_type t
6889  */
6890 
6891 static inline int is_cargo_type(app_data_ptr a, cargo_type t) {
6892   return a ? (a->body.c_t == t) : 0;
6893 }
6894 
6895 /**
6896  * @brief Retrieves the address that was used in the add_node request
6897  *
6898  * @param a app data containing the node to add
6899  * @param member address we used to present ourselves to other nodes
6900  * @return char* a pointer to the address being added.
6901  */
6902 static char *get_add_node_address(app_data_ptr a, unsigned int *member) {
6903   char *retval = NULL;
6904   if (!is_cargo_type(a, add_node_type)) return NULL;
6905 
6906   if ((*member) < a->body.app_u_u.nodes.node_list_len) {
6907     retval = a->body.app_u_u.nodes.node_list_val[(*member)].address;
6908     (*member)++;
6909   }
6910 
6911   return retval;
6912 }
6913 
6914 int is_node_v4_reachable_with_info(struct addrinfo *retrieved_addr_info) {
6915   int v4_reachable = 0;
6916 
6917   /* Verify if we are reachable either by V4 and by V6 with the provided
6918      address. */
6919   struct addrinfo *my_own_information_loop = NULL;
6920 
6921   my_own_information_loop = retrieved_addr_info;
6922   while (!v4_reachable && my_own_information_loop) {
6923     if (my_own_information_loop->ai_family == AF_INET) {
6924       v4_reachable = 1;
6925     }
6926     my_own_information_loop = my_own_information_loop->ai_next;
6927   }
6928 
6929   return v4_reachable;
6930 }
6931 
/* Resolve node_address and tell whether the result contains an IPv4 address.
 * Returns 1 if it does, 0 if it does not or if the resolution yields
 * nothing. The resolver result is released before returning. */
int is_node_v4_reachable(char *node_address) {
  struct addrinfo *resolved = NULL;
  int reachable;

  checked_getaddrinfo(node_address, NULL, NULL, &resolved);
  if (resolved == NULL) {
    return 0;
  }

  reachable = is_node_v4_reachable_with_info(resolved);
  freeaddrinfo(resolved);

  return reachable;
}
6950 
6951 int are_we_allowed_to_upgrade_to_v6(app_data_ptr a) {
6952   /* This should the address we used to present ourselves to other nodes. */
6953   unsigned int list_member = 0;
6954   char *added_node = NULL;
6955 
6956   int is_v4_reachable = 0;
6957   while ((added_node = get_add_node_address(a, &list_member)) != NULL) {
6958     xcom_port my_own_port;
6959     char my_own_address[IP_MAX_SIZE];
6960     int ip_and_port_error =
6961         get_ip_and_port(added_node, my_own_address, &my_own_port);
6962 
6963     if (ip_and_port_error) {
6964       G_DEBUG("Error retrieving IP and Port information");
6965       return 0;
6966     }
6967 
6968     /* Verify if we are reachable either by V4 and by V6 with the provided
6969        address.
6970        This means that the other side won't be able to contact us since we
6971        do not provide a public V4 address */
6972     if (!(is_v4_reachable = is_node_v4_reachable(my_own_address))) {
6973       G_ERROR(
6974           "Unable to add node to a group of older nodes. Please "
6975           "reconfigure "
6976           "you local address to an IPv4 address or configure your DNS to "
6977           "provide "
6978           "an IPv4 address");
6979       return 0;
6980     }
6981   }
6982 
6983   return is_v4_reachable;
6984 }
6985 
6986 int64_t xcom_send_client_app_data(connection_descriptor *fd, app_data_ptr a,
6987                                   int force) {
6988   pax_msg *msg = pax_msg_new(null_synode, 0);
6989   uint32_t buflen = 0;
6990   char *buf = 0;
6991   int64_t retval = 0;
6992   int serialized = 0;
6993 
6994   if (!proto_done(fd)) {
6995     xcom_proto x_proto;
6996     x_msg_type x_type;
6997     unsigned int tag;
6998     retval = xcom_send_proto(fd, my_xcom_version, x_version_req, TAG_START);
6999     G_DEBUG("client sent negotiation request for protocol %d", my_xcom_version);
7000     if (retval < 0) goto end;
7001     retval = xcom_recv_proto(fd, &x_proto, &x_type, &tag);
7002     if (retval < 0) goto end;
7003     if (tag != TAG_START) {
7004       retval = -1;
7005       goto end;
7006     }
7007     if (x_type != x_version_reply) {
7008       retval = -1;
7009       goto end;
7010     }
7011 
7012     if (x_proto == x_unknown_proto) {
7013       G_DEBUG("no common protocol, returning error");
7014       retval = -1;
7015       goto end;
7016     }
7017 
7018     /* This code will check if, in case of an upgrade if:
7019        - We are a node able to speak IPv6.
7020        - If we are connecting to a group that does not speak IPv6.
7021        - If our address is IPv4-compatible in order for the old group to be able
7022        to contact us back. */
7023     if (is_cargo_type(a, add_node_type) && x_proto < minimum_ipv6_version() &&
7024         !are_we_allowed_to_upgrade_to_v6(a)) {
7025       retval = -1;
7026       goto end;
7027     }
7028 
7029     G_DEBUG("client connection will use protocol version %d", x_proto);
7030     IFDBG(D_NONE, STRLIT("client connection will use protocol version ");
7031           NDBG(x_proto, u); STRLIT(xcom_proto_to_str(x_proto)));
7032     fd->x_proto = x_proto;
7033     set_connected(fd, CON_PROTO);
7034   }
7035   msg->a = a;
7036   msg->to = VOID_NODE_NO;
7037   msg->op = client_msg;
7038   msg->force_delivery = force;
7039 
7040   serialized = serialize_msg(msg, fd->x_proto, &buflen, &buf);
7041   if (serialized) {
7042     retval = socket_write(fd, buf, buflen);
7043     if (buflen != retval) {
7044       IFDBG(D_NONE, FN; STRLIT("write failed "); NDBG(fd->fd, d);
7045             NDBG(buflen, d); NDBG64(retval));
7046     }
7047   } else {
7048     /* Failed to serialize, set retval accordingly. */
7049     retval = -1;
7050   }
7051   X_FREE(buf);
7052 end:
7053   msg->a = 0; /* Do not deallocate a */
7054   XCOM_XDR_FREE(xdr_pax_msg, msg);
7055   return retval;
7056 }
7057 
7058 /* purecov: begin tested */
7059 /*
7060  * Tested by TEST_F(XComMultinodeSmokeTest,
7061  * 3_nodes_member_crashes_with_dieop_and_joins_again_immediately) GCS smoke test
7062  */
7063 int64_t xcom_client_send_die(connection_descriptor *fd) {
7064   uint32_t buflen = 0;
7065   char *buf = 0;
7066   int64_t retval = 0;
7067   app_data a;
7068   pax_msg *msg = pax_msg_new(null_synode, 0);
7069 
7070   if (!proto_done(fd)) {
7071     xcom_proto x_proto;
7072     x_msg_type x_type;
7073     unsigned int tag;
7074     retval = xcom_send_proto(fd, my_xcom_version, x_version_req, TAG_START);
7075     G_DEBUG("client sent negotiation request for protocol %d", my_xcom_version);
7076     if (retval < 0) goto end;
7077     retval = xcom_recv_proto(fd, &x_proto, &x_type, &tag);
7078     if (retval < 0) goto end;
7079     if (tag != TAG_START) {
7080       retval = -1;
7081       goto end;
7082     }
7083     if (x_type != x_version_reply) {
7084       retval = -1;
7085       goto end;
7086     }
7087 
7088     if (x_proto == x_unknown_proto) {
7089       G_DEBUG("no common protocol, returning error");
7090       retval = -1;
7091       goto end;
7092     }
7093     G_DEBUG("client connection will use protocol version %d", x_proto);
7094     IFDBG(D_NONE, STRLIT("client connection will use protocol version ");
7095           NDBG(x_proto, u); STRLIT(xcom_proto_to_str(x_proto)));
7096     fd->x_proto = x_proto;
7097     set_connected(fd, CON_PROTO);
7098   }
7099   init_app_data(&a);
7100   a.body.c_t = app_type;
7101   msg->a = &a;
7102   msg->op = die_op;
7103   /*
7104     Set the msgno to a value that ensures the die_op will be processed by
7105     XCom when it is received (it needs to be higher than the msgno of the
7106     executed_msg, otherwise XCom will simply ignore it).
7107    */
7108   msg->synode.msgno = UINT64_MAX;
7109 
7110   serialize_msg(msg, fd->x_proto, &buflen, &buf);
7111   if (buflen) {
7112     retval = socket_write(fd, buf, buflen);
7113     if (buflen != retval) {
7114       IFDBG(D_NONE, FN; STRLIT("write failed "); NDBG(fd->fd, d);
7115             NDBG(buflen, d); NDBG64(retval));
7116     }
7117     X_FREE(buf);
7118   }
7119   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7120 end:
7121   msg->a = 0;
7122   XCOM_XDR_FREE(xdr_pax_msg, msg);
7123   return retval > 0 && retval == buflen ? 1 : 0;
7124 }
7125 /* purecov: end */
7126 
7127 /* purecov: begin deadcode */
7128 int64_t xcom_client_send_data(uint32_t size, char *data,
7129                               connection_descriptor *fd) {
7130   app_data a;
7131   int64_t retval = 0;
7132   init_app_data(&a);
7133   a.body.c_t = app_type;
7134   a.body.app_u_u.data.data_len = size;
7135   a.body.app_u_u.data.data_val = data;
7136   retval = xcom_send_client_app_data(fd, &a, 0);
7137   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7138   return retval;
7139 }
7140 /* purecov: end */
7141 
7142 #ifndef _WIN32
7143 #include <arpa/inet.h>
7144 #include <netinet/in.h>
7145 #include <sys/socket.h>
7146 #endif
7147 
7148 /* Output warning in log periodically if we receive messages
7149 with a protocol version that does not match our own */
7150 /* purecov: begin inspected */
7151 void warn_protoversion_mismatch(connection_descriptor *rfd) {
7152   struct sockaddr_storage sock_addr;
7153   socklen_t sock_size = sizeof(sock_addr);
7154 
7155   if (task_now() - protoversion_warning_time > PROTOVERSION_WARNING_TIMEOUT) {
7156     if (0 ==
7157         xcom_getpeername(rfd->fd, (struct sockaddr *)&sock_addr, &sock_size)) {
7158       char buf[INET6_ADDRSTRLEN + 1];
7159       struct sockaddr_in *s4 = (struct sockaddr_in *)&sock_addr;
7160       struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&sock_addr;
7161       char const *ok;
7162 
7163       memset((void *)buf, 0, sizeof(buf));
7164       if (sock_addr.ss_family == AF_INET) {
7165         ok = inet_ntop(sock_addr.ss_family, (void *)&s4->sin_addr, buf,
7166                        sizeof(buf));
7167       } else {
7168         ok = inet_ntop(sock_addr.ss_family, (void *)&s6->sin6_addr, buf,
7169                        sizeof(buf));
7170       }
7171       if (ok) {
7172         G_WARNING(
7173             "Detected incorrect xcom protocol version in connection from %s "
7174             "indicates "
7175             "missing cleanup of, or incorrect, xcom group definition on remote "
7176             "host. Please upgrade the process running on %s to a compatible "
7177             "version or stop it.",
7178             buf, buf);
7179         protoversion_warning_time = task_now();
7180       }
7181     }
7182   }
7183 }
7184 /* purecov: end */
7185 
7186 static pax_msg *socket_read_msg(connection_descriptor *rfd, pax_msg *p)
7187 /* Should buffer reads as well */
7188 {
7189   int64_t n;
7190   char *bytes;
7191   unsigned char header_buf[MSG_HDR_SIZE];
7192   xcom_proto x_version;
7193   uint32_t msgsize;
7194   x_msg_type x_type;
7195   unsigned int tag;
7196   int deserialize_ok = 0;
7197 
7198   bytes = NULL;
7199 
7200   /* Read version, length, type, and tag */
7201   n = socket_read_bytes(rfd, (char *)header_buf, MSG_HDR_SIZE);
7202 
7203   if (n <= 0) {
7204     IFDBG(D_NONE, FN; NDBG64(n));
7205     return 0;
7206   }
7207   assert(n == MSG_HDR_SIZE);
7208   x_version = (xcom_proto)get_32(VERS_PTR(header_buf));
7209 /* Check the protocol version before doing anything else */
7210 #ifdef XCOM_PARANOID
7211   assert(check_protoversion(x_version, rfd->x_proto));
7212 #endif
7213   if (!check_protoversion(x_version, rfd->x_proto)) {
7214     /* purecov: begin inspected */
7215     warn_protoversion_mismatch(rfd);
7216     return 0;
7217     /* purecov: end */
7218   }
7219 
7220   /* OK, we can grok this version */
7221 
7222   get_header_1_0(header_buf, &msgsize, &x_type, &tag);
7223 
7224   /* Allocate buffer space for message */
7225   bytes = (char *)calloc(1, msgsize);
7226 
7227   /* Read message */
7228   n = socket_read_bytes(rfd, bytes, msgsize);
7229 
7230   if (n > 0) {
7231     /* Deserialize message */
7232     deserialize_ok = deserialize_msg(p, rfd->x_proto, bytes, msgsize);
7233     IFDBG(D_NONE, FN; STRLIT(" deserialized message"));
7234   }
7235   /* Deallocate buffer */
7236   X_FREE(bytes);
7237   if (n <= 0 || deserialize_ok == 0) {
7238     IFDBG(D_NONE, FN; NDBG64(n));
7239     return 0;
7240   }
7241   return (p);
7242 }
7243 
7244 int xcom_close_client_connection(connection_descriptor *connection) {
7245   int retval = 0;
7246 
7247 #ifndef XCOM_WITHOUT_OPENSSL
7248   if (connection->ssl_fd) {
7249     SSL_shutdown(connection->ssl_fd);
7250     ssl_free_con(connection);
7251   }
7252 #endif
7253   retval = xcom_shut_close_socket(&connection->fd).val;
7254   free(connection);
7255   return retval;
7256 }
7257 
7258 /* purecov: begin deadcode */
7259 int xcom_client_boot(connection_descriptor *fd, node_list *nl,
7260                      uint32_t group_id) {
7261   app_data a;
7262   int retval = 0;
7263   retval = (int)xcom_send_client_app_data(
7264       fd, init_config_with_group(&a, nl, unified_boot_type, group_id), 0);
7265   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7266   return retval;
7267 }
7268 /* purecov: end */
7269 
/* Outcome of sending a client request and waiting for the reply, as reported
   by xcom_send_app_wait_and_get. */
typedef enum xcom_send_app_wait_result {
  SEND_REQUEST_FAILED = 0,    /* sending the request reported an error */
  RECEIVE_REQUEST_FAILED = 1, /* no reply could be read */
  REQUEST_BOTCHED = 2,        /* reply carried an unexpected reply code */
  RETRIES_EXCEEDED = 3,       /* every attempt was answered REQUEST_RETRY */
  REQUEST_OK_RECEIVED = 4,    /* reply carried REQUEST_OK */
  REQUEST_FAIL_RECEIVED = 5   /* reply carried REQUEST_FAIL */
} xcom_send_app_wait_result;
7279 
7280 /**
7281  * Send a message and wait for response.
7282  *
7283  * The caller is reponsible for freeing p after calling this function,
7284  * i.e. xdr_free((xdrproc_t)xdr_pax_msg, (char *)p)
7285  */
7286 static xcom_send_app_wait_result xcom_send_app_wait_and_get(
7287     connection_descriptor *fd, app_data *a, int force, pax_msg *p) {
7288   int retval = 0;
7289   int retry_count = 10; /* Same as 'connection_attempts' */
7290   pax_msg *rp = 0;
7291 
7292   do {
7293     retval = (int)xcom_send_client_app_data(fd, a, force);
7294     memset(p, 0, sizeof(*p)); /* before return so caller can free p */
7295     if (retval < 0) return SEND_REQUEST_FAILED;
7296     rp = socket_read_msg(fd, p);
7297     if (rp) {
7298       client_reply_code cli_err = rp->cli_err;
7299       switch (cli_err) {
7300         case REQUEST_OK:
7301           return REQUEST_OK_RECEIVED;
7302         case REQUEST_FAIL:
7303 
7304           G_DEBUG("cli_err %d", cli_err);
7305           return REQUEST_FAIL_RECEIVED;
7306         case REQUEST_RETRY:
7307           G_DEBUG("cli_err %d", cli_err);
7308           if (retry_count > 1) xdr_free((xdrproc_t)xdr_pax_msg, (char *)p);
7309           xcom_sleep(1);
7310           break;
7311         default:
7312           G_WARNING("client protocol botched");
7313           return REQUEST_BOTCHED;
7314       }
7315     } else {
7316       G_WARNING("read failed");
7317       return RECEIVE_REQUEST_FAILED;
7318     }
7319   } while (--retry_count);
7320   /* Timeout after REQUEST_RETRY has been received 'retry_count' times */
7321   G_MESSAGE(
7322       "Request failed: maximum number of retries (10) has been exhausted.");
7323   return RETRIES_EXCEEDED;
7324 }
7325 
7326 int xcom_send_app_wait(connection_descriptor *fd, app_data *a, int force) {
7327   pax_msg p;
7328   int result = 0;
7329   xcom_send_app_wait_result res = xcom_send_app_wait_and_get(fd, a, force, &p);
7330   switch (res) {
7331     case SEND_REQUEST_FAILED:
7332     case RECEIVE_REQUEST_FAILED:
7333     case REQUEST_BOTCHED:
7334     case RETRIES_EXCEEDED:
7335     case REQUEST_FAIL_RECEIVED:
7336       result = 0;
7337       break;
7338     case REQUEST_OK_RECEIVED:
7339       result = 1;
7340       break;
7341   }
7342   xdr_free((xdrproc_t)xdr_pax_msg, (char *)&p);
7343   return result;
7344 }
7345 
7346 int xcom_send_cfg_wait(connection_descriptor *fd, node_list *nl,
7347                        uint32_t group_id, cargo_type ct, int force) {
7348   app_data a;
7349   int retval = 0;
7350   IFDBG(D_NONE, FN; COPY_AND_FREE_GOUT(dbg_list(nl)););
7351   retval = xcom_send_app_wait(fd, init_config_with_group(&a, nl, ct, group_id),
7352                               force);
7353   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7354   return retval;
7355 }
7356 
7357 int xcom_client_add_node(connection_descriptor *fd, node_list *nl,
7358                          uint32_t group_id) {
7359   u_int i;
7360   for (i = 0; i < nl->node_list_len; i++) {
7361     assert(nl->node_list_val[i].proto.max_proto > x_unknown_proto);
7362   }
7363   return xcom_send_cfg_wait(fd, nl, group_id, add_node_type, 0);
7364 }
7365 
7366 int xcom_client_remove_node(connection_descriptor *fd, node_list *nl,
7367                             uint32_t group_id) {
7368   return xcom_send_cfg_wait(fd, nl, group_id, remove_node_type, 0);
7369 }
7370 
7371 /* purecov: begin deadcode */
7372 int xcom_client_get_event_horizon(connection_descriptor *fd, uint32_t group_id,
7373                                   xcom_event_horizon *event_horizon) {
7374   pax_msg p;
7375   app_data a;
7376   int result = 0;
7377 
7378   xcom_send_app_wait_result res = xcom_send_app_wait_and_get(
7379       fd, init_get_event_horizon_msg(&a, group_id), 0, &p);
7380 
7381   switch (res) {
7382     case RECEIVE_REQUEST_FAILED:
7383     case REQUEST_BOTCHED:
7384     case RETRIES_EXCEEDED:
7385     case SEND_REQUEST_FAILED:
7386     case REQUEST_FAIL_RECEIVED:
7387       result = 0;
7388       break;
7389     case REQUEST_OK_RECEIVED:
7390       *event_horizon = p.event_horizon;
7391       result = 1;
7392       break;
7393   }
7394 
7395   xdr_free((xdrproc_t)xdr_pax_msg, (char *)&p);
7396   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7397 
7398   return result;
7399 }
7400 /* purecov: end */
7401 
7402 /* purecov: begin deadcode */
7403 int xcom_client_set_event_horizon(connection_descriptor *fd, uint32_t group_id,
7404                                   xcom_event_horizon event_horizon) {
7405   app_data a;
7406   int retval = 0;
7407   retval = xcom_send_app_wait(
7408       fd, init_set_event_horizon_msg(&a, group_id, event_horizon), 0);
7409   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7410   return retval;
7411 }
7412 /* purecov: end */
7413 
7414 int xcom_client_get_synode_app_data(connection_descriptor *const fd,
7415                                     uint32_t group_id,
7416                                     synode_no_array *const synodes,
7417                                     synode_app_data_array *const reply) {
7418   bool_t const success = TRUE;
7419   bool_t const failure = FALSE;
7420   bool_t result = failure;
7421   pax_msg p;
7422   app_data a;
7423   u_int const nr_synodes_requested = synodes->synode_no_array_len;
7424 
7425   /* This call moves, as in C++ move semantics, synodes into app_data a. */
7426   init_get_synode_app_data_msg(&a, group_id, synodes);
7427 
7428   {
7429     xcom_send_app_wait_result res = xcom_send_app_wait_and_get(fd, &a, 0, &p);
7430     switch (res) {
7431       case RECEIVE_REQUEST_FAILED:
7432       case REQUEST_BOTCHED:
7433       case RETRIES_EXCEEDED:
7434       case SEND_REQUEST_FAILED:
7435       case REQUEST_FAIL_RECEIVED: {
7436         G_TRACE(
7437             "xcom_client_get_synode_app_data: XCom did not have the required "
7438             "%u "
7439             "synodes.",
7440             nr_synodes_requested);
7441         break;
7442       }
7443       case REQUEST_OK_RECEIVED: {
7444         u_int const nr_synodes_received =
7445             p.requested_synode_app_data.synode_app_data_array_len;
7446         G_TRACE(
7447             "xcom_client_get_synode_app_data: Got %u synode payloads, we asked "
7448             "for %u.",
7449             nr_synodes_received, nr_synodes_requested);
7450 
7451         /* This should always be TRUE.
7452          * But rather than asserting it, let's treat an unexpected number of
7453          * synode payloads in the reply as a failure. */
7454         if (nr_synodes_received == nr_synodes_requested) {
7455           /* Move (as in C++ move semantics) into reply */
7456           synode_app_data_array_move(reply, &p.requested_synode_app_data);
7457           result = success;
7458         }
7459         break;
7460       }
7461     }
7462   }
7463 
7464   xdr_free((xdrproc_t)xdr_pax_msg, (char *)&p);
7465   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7466 
7467   return result;
7468 }
7469 
7470 #ifdef NOTDEF
/* Not completely implemented; these need to be handled properly
   when received as a client message in dispatch_op.
   They should have a separate opcode from normal add/remove,
   like force_config_type */
7475 int xcom_client_force_add_node(connection_descriptor *, node_list *nl,
7476                                uint32_t group_id) {
7477   return xcom_send_cfg_wait(fd, nl, group_id, add_node_type, 1);
7478 }
7479 
7480 int xcom_client_force_remove_node(connection_descriptor *, node_list *nl,
7481                                   uint32_t group_id) {
7482   return xcom_send_cfg_wait(fd, nl, group_id, remove_node_type, 1);
7483 }
7484 #endif
7485 
7486 int xcom_client_force_config(connection_descriptor *fd, node_list *nl,
7487                              uint32_t group_id) {
7488   return xcom_send_cfg_wait(fd, nl, group_id, force_config_type, 1);
7489 }
7490 
7491 /* purecov: begin deadcode */
7492 int xcom_client_enable_arbitrator(connection_descriptor *fd) {
7493   app_data a;
7494   int retval = 0;
7495   init_app_data(&a);
7496   a.body.c_t = enable_arbitrator;
7497   retval = xcom_send_app_wait(fd, &a, 0);
7498   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7499   return retval;
7500 }
7501 /* purecov: end */
7502 
7503 /* purecov: begin deadcode */
7504 int xcom_client_disable_arbitrator(connection_descriptor *fd) {
7505   app_data a;
7506   int retval = 0;
7507   init_app_data(&a);
7508   a.body.c_t = disable_arbitrator;
7509   retval = xcom_send_app_wait(fd, &a, 0);
7510   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7511   return retval;
7512 }
7513 /* purecov: end */
7514 
7515 /* purecov: begin deadcode */
7516 int xcom_client_terminate_and_exit(connection_descriptor *fd) {
7517   app_data a;
7518   int retval = 0;
7519   init_app_data(&a);
7520   a.body.c_t = x_terminate_and_exit;
7521   retval = xcom_send_app_wait(fd, &a, 0);
7522   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7523   return retval;
7524 }
7525 /* purecov: end */
7526 
7527 /* purecov: begin deadcode */
7528 int xcom_client_set_cache_limit(connection_descriptor *fd,
7529                                 uint64_t cache_limit) {
7530   app_data a;
7531   int retval = 0;
7532   init_app_data(&a);
7533   a.body.c_t = set_cache_limit;
7534   a.body.app_u_u.cache_limit = cache_limit;
7535   retval = xcom_send_app_wait(fd, &a, 0);
7536   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7537   return retval;
7538 }
7539 /* purecov: end */
7540 
7541 int xcom_client_convert_into_local_server(connection_descriptor *const fd) {
7542   app_data a;
7543   int retval = 0;
7544   retval = xcom_send_app_wait(fd, init_convert_into_local_server_msg(&a), 0);
7545   xdr_free((xdrproc_t)xdr_app_data, (char *)&a);
7546   return retval;
7547 }
7548