1 /* Copyright (c) 2018, 2021, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software Foundation,
21 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
22
23 #include "group_partition_handling.h"
24 #include "plugin_psi.h"
25 #include "plugin.h"
26 #include <mysql/group_replication_priv.h>
27
28 using std::string;
29
launch_handler_thread(void * arg)30 static void *launch_handler_thread(void* arg)
31 {
32 Group_partition_handling *handler= (Group_partition_handling*) arg;
33 handler->partition_thread_handler();
34 return 0;
35 }
36
37 Group_partition_handling::
Group_partition_handling(Shared_writelock * shared_stop_lock,ulong unreachable_timeout)38 Group_partition_handling(Shared_writelock *shared_stop_lock,
39 ulong unreachable_timeout)
40 : member_in_partition(false),
41 thread_running(false), partition_handling_aborted(false),
42 partition_handling_terminated(false),
43 timeout_on_unreachable(unreachable_timeout),
44 shared_stop_write_lock(shared_stop_lock)
45 {
46 mysql_mutex_init(key_GR_LOCK_group_part_handler_run, &run_lock,
47 MY_MUTEX_INIT_FAST);
48 mysql_mutex_init(key_GR_LOCK_group_part_handler_abort,
49 &trx_termination_aborted_lock,
50 MY_MUTEX_INIT_FAST);
51
52 mysql_cond_init(key_GR_COND_group_part_handler_run, &run_cond);
53 mysql_cond_init(key_GR_COND_group_part_handler_abort,
54 &trx_termination_aborted_cond);
55
56 }
57
~Group_partition_handling()58 Group_partition_handling::~Group_partition_handling()
59 {
60 mysql_mutex_destroy(&run_lock);
61 mysql_cond_destroy(&run_cond);
62 mysql_mutex_destroy(&trx_termination_aborted_lock);
63 mysql_cond_destroy(&trx_termination_aborted_cond);
64 }
65
66 void
update_timeout_on_unreachable(ulong unreachable_timeout)67 Group_partition_handling::update_timeout_on_unreachable(ulong unreachable_timeout)
68 {
69 timeout_on_unreachable= unreachable_timeout;
70 }
71
get_timeout_on_unreachable()72 ulong Group_partition_handling::get_timeout_on_unreachable()
73 {
74 return timeout_on_unreachable;
75 }
76
is_member_on_partition()77 bool Group_partition_handling::is_member_on_partition()
78 {
79 return member_in_partition;
80 }
81
is_partition_handler_running()82 bool Group_partition_handling::is_partition_handler_running()
83 {
84 return thread_running;
85 }
86
is_partition_handling_terminated()87 bool Group_partition_handling::is_partition_handling_terminated()
88 {
89 return partition_handling_terminated;
90 }
91
kill_transactions_and_leave()92 void Group_partition_handling::kill_transactions_and_leave()
93 {
94 DBUG_ENTER("Group_partition_handling::kill_transactions_and_leave");
95
96 log_message(MY_ERROR_LEVEL,
97 "This member could not reach a majority of the members for more "
98 "than %ld seconds. The member will now leave the group as instructed "
99 "by the group_replication_unreachable_majority_timeout option.",
100 timeout_on_unreachable);
101
102 /*
103 Suspend the applier for the uncommon case of a network restore happening
104 when this termination process is ongoing.
105 Don't care if an error is returned because the applier failed.
106 */
107 applier_module->add_suspension_packet();
108
109 group_member_mgr->update_member_status(local_member_info->get_uuid(),
110 Group_member_info::MEMBER_ERROR);
111
112 bool set_read_mode= false;
113 Gcs_operations::enum_leave_state state= gcs_module->leave();
114
115 std::stringstream ss;
116 plugin_log_level log_severity= MY_WARNING_LEVEL;
117 switch (state)
118 {
119 case Gcs_operations::ERROR_WHEN_LEAVING:
120 ss << "Unable to confirm whether the server has left the group or not. "
121 "Check performance_schema.replication_group_members to check group membership information.";
122 log_severity= MY_ERROR_LEVEL;
123 set_read_mode= true;
124 break;
125 case Gcs_operations::ALREADY_LEAVING:
126 ss << "Skipping leave operation: concurrent attempt to leave the group is on-going."; /* purecov: inspected */
127 break; /* purecov: inspected */
128 case Gcs_operations::ALREADY_LEFT:
129 ss << "Skipping leave operation: member already left the group."; /* purecov: inspected */
130 break; /* purecov: inspected */
131 case Gcs_operations::NOW_LEAVING:
132 set_read_mode= true;
133 ss << "The server was automatically set into read only mode after an error was detected.";
134 log_severity= MY_ERROR_LEVEL;
135 break;
136 }
137 log_message(log_severity, ss.str().c_str());
138
139 /*
140 If true it means:
141 1) The plugin is stopping and waiting on some transactions to finish.
142 No harm in unblocking them first cutting the stop command time
143 2) There was an error in the applier and the plugin will leave the group.
144 No problem, both processes will try to kill the transactions and set the
145 read mode to true.
146 */
147 bool already_locked= shared_stop_write_lock->try_grab_write_lock();
148
149 //kill pending transactions
150 blocked_transaction_handler->unblock_waiting_transactions();
151
152 if (!already_locked)
153 shared_stop_write_lock->release_write_lock();
154
155 if (set_read_mode)
156 enable_server_read_mode(PSESSION_INIT_THREAD);
157
158 if (exit_state_action_var == EXIT_STATE_ACTION_ABORT_SERVER)
159 {
160 abort_plugin_process("Fatal error during execution of Group Replication");
161 }
162
163 DBUG_VOID_RETURN;
164 }
165
abort_partition_handler_if_running()166 bool Group_partition_handling::abort_partition_handler_if_running()
167 {
168 DBUG_ENTER("Group_partition_handling::abort_partition_handler_if_running");
169
170 // if someone tried to cancel it, we are no longer in a partition.
171 member_in_partition= false;
172
173 /*
174 This check is safe to invoke as the start method and abort method are only
175 invoked in GCS serialized operations.
176 */
177 if (thread_running)
178 terminate_partition_handler_thread();
179
180 DBUG_RETURN(partition_handling_terminated);
181 }
182
launch_partition_handler_thread()183 int Group_partition_handling::launch_partition_handler_thread()
184 {
185 DBUG_ENTER("Group_partition_handling::launch_partition_handler_thread");
186
187 member_in_partition= true;
188
189 //If the timeout is set to 0 do nothing
190 if (!timeout_on_unreachable)
191 return 0;
192
193 mysql_mutex_lock(&run_lock);
194
195 partition_handling_aborted= false;
196
197 if(thread_running)
198 {
199 mysql_mutex_unlock(&run_lock); /* purecov: inspected */
200 DBUG_RETURN(0); /* purecov: inspected */
201 }
202
203 if (mysql_thread_create(key_GR_THD_group_partition_handler,
204 &partition_trx_handler_pthd,
205 get_connection_attrib(),
206 launch_handler_thread,
207 (void*)this))
208 {
209 DBUG_RETURN(1); /* purecov: inspected */
210 }
211
212 while (!thread_running)
213 {
214 DBUG_PRINT("sleep",("Waiting for the partition handler thread to start"));
215 mysql_cond_wait(&run_cond, &run_lock);
216 }
217 mysql_mutex_unlock(&run_lock);
218
219 DBUG_RETURN(0);
220 }
221
terminate_partition_handler_thread()222 int Group_partition_handling::terminate_partition_handler_thread()
223 {
224 DBUG_ENTER("Group_partition_handling::terminate_partition_handler_thread");
225
226 mysql_mutex_lock(&run_lock);
227
228 if (!thread_running)
229 {
230 mysql_mutex_unlock(&run_lock);
231 DBUG_RETURN(0);
232 }
233
234 mysql_mutex_lock(&trx_termination_aborted_lock);
235 partition_handling_aborted= true;
236 mysql_cond_broadcast(&trx_termination_aborted_cond);
237 mysql_mutex_unlock(&trx_termination_aborted_lock);
238
239 ulong stop_wait_timeout= TRANSACTION_KILL_TIMEOUT;
240
241 while (thread_running)
242 {
243 DBUG_PRINT("loop", ("killing group replication partition handler thread"));
244
245 struct timespec abstime;
246 set_timespec(&abstime, 2);
247 #ifndef NDEBUG
248 int error=
249 #endif
250 mysql_cond_timedwait(&run_cond, &run_lock, &abstime);
251 if (stop_wait_timeout >= 2)
252 {
253 stop_wait_timeout= stop_wait_timeout - 2;
254 }
255 /* purecov: begin inspected */
256 else if (thread_running) // quit waiting
257 {
258 mysql_mutex_unlock(&run_lock);
259 DBUG_RETURN(1);
260 }
261 /* purecov: inspected */
262 assert(error == ETIMEDOUT || error == 0);
263 }
264
265 assert(!thread_running);
266
267 mysql_mutex_unlock(&run_lock);
268
269 DBUG_RETURN(0);
270 }
271
partition_thread_handler()272 int Group_partition_handling::partition_thread_handler()
273 {
274 DBUG_ENTER("Group_partition_handling::partition_thread_handler");
275
276 mysql_mutex_lock(&run_lock);
277 thread_running= true;
278 mysql_cond_broadcast(&run_cond);
279 mysql_mutex_unlock(&run_lock);
280
281 struct timespec abstime;
282 bool timeout= false;
283
284 longlong timeout_remaining_time= timeout_on_unreachable;
285
286 mysql_mutex_lock(&trx_termination_aborted_lock);
287 while (!timeout && !partition_handling_aborted)
288 {
289 set_timespec(&abstime, 2);
290 mysql_cond_timedwait(&trx_termination_aborted_cond,
291 &trx_termination_aborted_lock, &abstime);
292
293 timeout_remaining_time -= 2;
294 timeout= (timeout_remaining_time <= 0);
295 }
296
297 mysql_mutex_unlock(&trx_termination_aborted_lock);
298
299 if (!partition_handling_aborted)
300 {
301 partition_handling_terminated= true;
302 kill_transactions_and_leave();
303 }
304
305 mysql_mutex_lock(&run_lock);
306 thread_running= false;
307 mysql_cond_broadcast(&run_cond);
308 mysql_mutex_unlock(&run_lock);
309
310 DBUG_RETURN(0);
311 }
312