1 /* Copyright (c) 2018, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software Foundation,
21    51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
22 
23 #include "group_partition_handling.h"
24 #include "plugin_psi.h"
25 #include "plugin.h"
26 #include <mysql/group_replication_priv.h>
27 
28 using std::string;
29 
launch_handler_thread(void * arg)30 static void *launch_handler_thread(void* arg)
31 {
32   Group_partition_handling *handler= (Group_partition_handling*) arg;
33   handler->partition_thread_handler();
34   return 0;
35 }
36 
37 Group_partition_handling::
Group_partition_handling(Shared_writelock * shared_stop_lock,ulong unreachable_timeout)38 Group_partition_handling(Shared_writelock *shared_stop_lock,
39                          ulong unreachable_timeout)
40   : member_in_partition(false),
41     thread_running(false), partition_handling_aborted(false),
42     partition_handling_terminated(false),
43     timeout_on_unreachable(unreachable_timeout),
44     shared_stop_write_lock(shared_stop_lock)
45 {
46   mysql_mutex_init(key_GR_LOCK_group_part_handler_run, &run_lock,
47                    MY_MUTEX_INIT_FAST);
48   mysql_mutex_init(key_GR_LOCK_group_part_handler_abort,
49                    &trx_termination_aborted_lock,
50                    MY_MUTEX_INIT_FAST);
51 
52   mysql_cond_init(key_GR_COND_group_part_handler_run, &run_cond);
53   mysql_cond_init(key_GR_COND_group_part_handler_abort,
54                   &trx_termination_aborted_cond);
55 
56 }
57 
~Group_partition_handling()58 Group_partition_handling::~Group_partition_handling()
59 {
60   mysql_mutex_destroy(&run_lock);
61   mysql_cond_destroy(&run_cond);
62   mysql_mutex_destroy(&trx_termination_aborted_lock);
63   mysql_cond_destroy(&trx_termination_aborted_cond);
64 }
65 
66 void
update_timeout_on_unreachable(ulong unreachable_timeout)67 Group_partition_handling::update_timeout_on_unreachable(ulong unreachable_timeout)
68 {
69   timeout_on_unreachable= unreachable_timeout;
70 }
71 
get_timeout_on_unreachable()72 ulong Group_partition_handling::get_timeout_on_unreachable()
73 {
74   return timeout_on_unreachable;
75 }
76 
is_member_on_partition()77 bool Group_partition_handling::is_member_on_partition()
78 {
79   return member_in_partition;
80 }
81 
is_partition_handler_running()82 bool Group_partition_handling::is_partition_handler_running()
83 {
84   return thread_running;
85 }
86 
is_partition_handling_terminated()87 bool Group_partition_handling::is_partition_handling_terminated()
88 {
89   return partition_handling_terminated;
90 }
91 
kill_transactions_and_leave()92 void Group_partition_handling::kill_transactions_and_leave()
93 {
94   DBUG_ENTER("Group_partition_handling::kill_transactions_and_leave");
95 
96   log_message(MY_ERROR_LEVEL,
97               "This member could not reach a majority of the members for more "
98               "than %ld seconds. The member will now leave the group as instructed "
99               "by the group_replication_unreachable_majority_timeout option.",
100                timeout_on_unreachable);
101 
102   /*
103     Suspend the applier for the uncommon case of a network restore happening
104     when this termination process is ongoing.
105     Don't care if an error is returned because the applier failed.
106   */
107   applier_module->add_suspension_packet();
108 
109   group_member_mgr->update_member_status(local_member_info->get_uuid(),
110                                          Group_member_info::MEMBER_ERROR);
111 
112   bool set_read_mode= false;
113   Gcs_operations::enum_leave_state state= gcs_module->leave();
114 
115   std::stringstream ss;
116   plugin_log_level log_severity= MY_WARNING_LEVEL;
117   switch (state)
118   {
119     case Gcs_operations::ERROR_WHEN_LEAVING:
120       ss << "Unable to confirm whether the server has left the group or not. "
121             "Check performance_schema.replication_group_members to check group membership information.";
122       log_severity= MY_ERROR_LEVEL;
123       set_read_mode= true;
124       break;
125     case Gcs_operations::ALREADY_LEAVING:
126       ss << "Skipping leave operation: concurrent attempt to leave the group is on-going."; /* purecov: inspected */
127       break; /* purecov: inspected */
128     case Gcs_operations::ALREADY_LEFT:
129       ss << "Skipping leave operation: member already left the group."; /* purecov: inspected */
130       break; /* purecov: inspected */
131     case Gcs_operations::NOW_LEAVING:
132       set_read_mode= true;
133       ss << "The server was automatically set into read only mode after an error was detected.";
134       log_severity= MY_ERROR_LEVEL;
135       break;
136   }
137   log_message(log_severity, ss.str().c_str());
138 
139   /*
140     If true it means:
141     1) The plugin is stopping and waiting on some transactions to finish.
142        No harm in unblocking them first cutting the stop command time
143     2) There was an error in the applier and the plugin will leave the group.
144        No problem, both processes will try to kill the transactions and set the
145        read mode to true.
146   */
147   bool already_locked= shared_stop_write_lock->try_grab_write_lock();
148 
149   //kill pending transactions
150   blocked_transaction_handler->unblock_waiting_transactions();
151 
152   if (!already_locked)
153     shared_stop_write_lock->release_write_lock();
154 
155   if (set_read_mode)
156     enable_server_read_mode(PSESSION_INIT_THREAD);
157 
158   if (exit_state_action_var == EXIT_STATE_ACTION_ABORT_SERVER)
159   {
160     abort_plugin_process("Fatal error during execution of Group Replication");
161   }
162 
163   DBUG_VOID_RETURN;
164 }
165 
abort_partition_handler_if_running()166 bool Group_partition_handling::abort_partition_handler_if_running()
167 {
168   DBUG_ENTER("Group_partition_handling::abort_partition_handler_if_running");
169 
170   // if someone tried to cancel it, we are no longer in a partition.
171   member_in_partition= false;
172 
173   /*
174     This check is safe to invoke as the start method and abort method are only
175     invoked in GCS serialized operations.
176   */
177   if (thread_running)
178     terminate_partition_handler_thread();
179 
180   DBUG_RETURN(partition_handling_terminated);
181 }
182 
launch_partition_handler_thread()183 int Group_partition_handling::launch_partition_handler_thread()
184 {
185   DBUG_ENTER("Group_partition_handling::launch_partition_handler_thread");
186 
187   member_in_partition= true;
188 
189   //If the timeout is set to 0 do nothing
190   if (!timeout_on_unreachable)
191      return 0;
192 
193   mysql_mutex_lock(&run_lock);
194 
195   partition_handling_aborted= false;
196 
197   if(thread_running)
198   {
199     mysql_mutex_unlock(&run_lock); /* purecov: inspected */
200     DBUG_RETURN(0);                /* purecov: inspected */
201   }
202 
203   if (mysql_thread_create(key_GR_THD_group_partition_handler,
204                           &partition_trx_handler_pthd,
205                           get_connection_attrib(),
206                           launch_handler_thread,
207                           (void*)this))
208   {
209     DBUG_RETURN(1); /* purecov: inspected */
210   }
211 
212   while (!thread_running)
213   {
214     DBUG_PRINT("sleep",("Waiting for the partition handler thread to start"));
215     mysql_cond_wait(&run_cond, &run_lock);
216   }
217   mysql_mutex_unlock(&run_lock);
218 
219   DBUG_RETURN(0);
220 }
221 
terminate_partition_handler_thread()222 int Group_partition_handling::terminate_partition_handler_thread()
223 {
224   DBUG_ENTER("Group_partition_handling::terminate_partition_handler_thread");
225 
226   mysql_mutex_lock(&run_lock);
227 
228   if (!thread_running)
229   {
230     mysql_mutex_unlock(&run_lock);
231     DBUG_RETURN(0);
232   }
233 
234   mysql_mutex_lock(&trx_termination_aborted_lock);
235   partition_handling_aborted= true;
236   mysql_cond_broadcast(&trx_termination_aborted_cond);
237   mysql_mutex_unlock(&trx_termination_aborted_lock);
238 
239   ulong stop_wait_timeout= TRANSACTION_KILL_TIMEOUT;
240 
241   while (thread_running)
242   {
243     DBUG_PRINT("loop", ("killing group replication partition handler thread"));
244 
245     struct timespec abstime;
246     set_timespec(&abstime, 2);
247 #ifndef NDEBUG
248     int error=
249 #endif
250       mysql_cond_timedwait(&run_cond, &run_lock, &abstime);
251     if (stop_wait_timeout >= 2)
252     {
253       stop_wait_timeout= stop_wait_timeout - 2;
254     }
255       /* purecov: begin inspected */
256     else if (thread_running) // quit waiting
257     {
258       mysql_mutex_unlock(&run_lock);
259       DBUG_RETURN(1);
260     }
261     /* purecov: inspected */
262     assert(error == ETIMEDOUT || error == 0);
263   }
264 
265   assert(!thread_running);
266 
267   mysql_mutex_unlock(&run_lock);
268 
269   DBUG_RETURN(0);
270 }
271 
partition_thread_handler()272 int Group_partition_handling::partition_thread_handler()
273 {
274   DBUG_ENTER("Group_partition_handling::partition_thread_handler");
275 
276   mysql_mutex_lock(&run_lock);
277   thread_running= true;
278   mysql_cond_broadcast(&run_cond);
279   mysql_mutex_unlock(&run_lock);
280 
281   struct timespec abstime;
282   bool timeout= false;
283 
284   longlong timeout_remaining_time= timeout_on_unreachable;
285 
286   mysql_mutex_lock(&trx_termination_aborted_lock);
287   while (!timeout && !partition_handling_aborted)
288   {
289     set_timespec(&abstime, 2);
290     mysql_cond_timedwait(&trx_termination_aborted_cond,
291                          &trx_termination_aborted_lock, &abstime);
292 
293     timeout_remaining_time -= 2;
294     timeout= (timeout_remaining_time <= 0);
295   }
296 
297   mysql_mutex_unlock(&trx_termination_aborted_lock);
298 
299   if (!partition_handling_aborted)
300   {
301     partition_handling_terminated= true;
302     kill_transactions_and_leave();
303   }
304 
305   mysql_mutex_lock(&run_lock);
306   thread_running= false;
307   mysql_cond_broadcast(&run_cond);
308   mysql_mutex_unlock(&run_lock);
309 
310   DBUG_RETURN(0);
311 }
312