1 /*
2 * Copyright (C) 2011 Andrea Mazzoleni
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include "portable.h"
19
20 #include "support.h"
21 #include "elem.h"
22 #include "state.h"
23 #include "parity.h"
24 #include "handle.h"
25 #include "io.h"
26 #include "raid/raid.h"
27
28 /****************************************************************************/
29 /* hash */
30
/**
 * Hash the REP and CHG blocks of all the mapped disks in the
 * block range from ::blockstart (included) to ::blockmax (excluded).
 *
 * Each of these blocks is read from its file and hashed.
 * For a REP block the computed hash is compared with the one stored
 * in the block, and a mismatch is counted as a data error and sets ::skip_sync.
 * For a CHG block the computed hash is stored in the block, the block
 * becomes REP, and the state is marked as needing a write.
 *
 * Missing, not accessible or changed files are counted as file errors
 * and their blocks are skipped. Any other close, open or read failure
 * stops the processing, closes the handles and sets ::skip_sync.
 *
 * ::skip_sync is only ever set to 1 here, never cleared.
 *
 * Returns 0 if no error was counted, -1 otherwise.
 */
static int state_hash_process(struct snapraid_state* state, block_off_t blockstart, block_off_t blockmax, int* skip_sync)
{
	struct snapraid_handle* handle;
	unsigned diskmax;
	block_off_t i;
	unsigned j;
	void* buffer;
	void* buffer_alloc;
	data_off_t countsize;
	block_off_t countpos;
	block_off_t countmax;
	int ret;
	unsigned error; /* file errors */
	unsigned silent_error; /* data errors, meaning hash mismatches */
	unsigned io_error; /* EIO errors */
	char esc_buffer[ESC_MAX];

	/* maps the disks to handles */
	handle = handle_mapping(state, &diskmax);

	/* buffer for reading */
	/* ::buffer is the pointer to use, ::buffer_alloc the one to free */
	buffer = malloc_nofail_direct(state->block_size, &buffer_alloc);
	if (!state->opt.skip_self)
		mtest_vector(1, state->block_size, &buffer);

	error = 0;
	silent_error = 0;
	io_error = 0;

	/* first count the number of blocks to process */
	/* this pass reads nothing, it only computes the total for the progress report */
	countmax = 0;
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_disk* disk = handle[j].disk;

		/* if no disk, nothing to check */
		if (!disk)
			continue;

		for (i = blockstart; i < blockmax; ++i) {
			struct snapraid_block* block;
			unsigned block_state;

			block = fs_par2block_find(disk, i);

			/* get the state of the block */
			block_state = block_state_get(block);

			/* process REP and CHG blocks */
			if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
				continue;

			++countmax;
		}
	}

	/* drop until now */
	state_usage_waste(state);

	countsize = 0;
	countpos = 0;

	/* if the progress setup returns 0, skip the processing and go directly to the summary */
	if (!state_progress_begin(state, blockstart, blockmax, countmax))
		goto end;

	/* second pass, with the same selection of the counting one */
	/* the disks are processed one after the other, each one for the whole block range */
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_disk* disk = handle[j].disk;

		/* if no disk, nothing to check */
		if (!disk)
			continue;

		for (i = blockstart; i < blockmax; ++i) {
			snapraid_info info;
			int rehash;
			struct snapraid_block* block;
			int read_size;
			unsigned char hash[HASH_MAX];
			unsigned block_state;
			struct snapraid_file* file;
			block_off_t file_pos;

			block = fs_par2block_find(disk, i);

			/* get the state of the block */
			block_state = block_state_get(block);

			/* process REP and CHG blocks */
			if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
				continue;

			/* get the file of this block */
			file = fs_par2file_get(disk, i, &file_pos);

			/* get block specific info */
			info = info_get(&state->infoarr, i);

			/* if we have to use the old hash */
			rehash = info_get_rehash(info);

			/* until now is misc */
			state_usage_misc(state);

			/* if the file is different than the current one, close it */
			if (handle[j].file != 0 && handle[j].file != file) {
				/* keep a pointer at the file we are going to close for error reporting */
				struct snapraid_file* report = handle[j].file;
				ret = handle_close(&handle[j]);
				if (ret == -1) {
					/* LCOV_EXCL_START */
					/* This one is really an unexpected error, because we are only reading */
					/* and closing a descriptor should never fail */
					if (errno == EIO) {
						log_tag("error:%u:%s:%s: Close EIO error. %s\n", i, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
						log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
						log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
						log_fatal("Stopping at block %u\n", i);
						++io_error;
						goto bail;
					}

					log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
					log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
					log_fatal("Stopping at block %u\n", i);
					++error;
					goto bail;
					/* LCOV_EXCL_STOP */
				}
			}

			/* NOTE(review): handle_open() is called for every block, even when the handle */
			/* already refers at ::file; presumably it does nothing in such case, to be confirmed in handle.c */
			ret = handle_open(&handle[j], file, state->file_mode, log_error, 0);
			if (ret == -1) {
				/* EIO stops everything, ENOENT and EACCES skip only this block, */
				/* and any other error stops everything */
				if (errno == EIO) {
					/* LCOV_EXCL_START */
					log_tag("error:%u:%s:%s: Open EIO error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
					log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
					log_fatal("Stopping at block %u\n", i);
					++io_error;
					goto bail;
					/* LCOV_EXCL_STOP */
				}

				if (errno == ENOENT) {
					log_tag("error:%u:%s:%s: Open ENOENT error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
					log_error("Missing file '%s'.\n", handle[j].path);
					log_error("WARNING! You cannot modify data disk during a sync.\n");
					log_error("Rerun the sync command when finished.\n");
					++error;
					/* if the file is missing, it means that it was removed during sync */
					/* this isn't a serious error, so we skip this block, and continue with others */
					continue;
				}

				if (errno == EACCES) {
					log_tag("error:%u:%s:%s: Open EACCES error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
					log_error("No access at file '%s'.\n", handle[j].path);
					log_error("WARNING! Please fix the access permission in the data disk.\n");
					log_error("Rerun the sync command when finished.\n");
					++error;
					/* this isn't a serious error, so we skip this block, and continue with others */
					continue;
				}

				/* LCOV_EXCL_START */
				log_tag("error:%u:%s:%s: Open error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
				log_fatal("WARNING! Unexpected open error in a data disk, it isn't possible to sync.\n");
				log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
				log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}

			/* check if the file is changed */
			/* comparing the stat info of the handle with the size, time and inode recorded in ::file */
			if (handle[j].st.st_size != file->size
				|| handle[j].st.st_mtime != file->mtime_sec
				|| STAT_NSEC(&handle[j].st) != file->mtime_nsec
				|| handle[j].st.st_ino != file->inode
			) {
				log_tag("error:%u:%s:%s: Unexpected attribute change\n", i, disk->name, esc_tag(file->sub, esc_buffer));
				/* report only the first difference found, in the order size, time, inode */
				if (handle[j].st.st_size != file->size) {
					log_error("Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle[j].path, file->size, (uint64_t)handle[j].st.st_size);
				} else if (handle[j].st.st_mtime != file->mtime_sec
					|| STAT_NSEC(&handle[j].st) != file->mtime_nsec) {
					log_error("Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle[j].path, file->mtime_sec, file->mtime_nsec, (uint64_t)handle[j].st.st_mtime, STAT_NSEC(&handle[j].st));
				} else {
					log_error("Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", file->inode, (uint64_t)handle[j].st.st_ino, handle[j].path);
				}
				log_error("WARNING! You cannot modify files during a sync.\n");
				log_error("Rerun the sync command when finished.\n");
				++error;
				/* if the file is changed, it means that it was modified during sync */
				/* this isn't a serious error, so we skip this block, and continue with others */
				continue;
			}

			/* read the block, the returned size is the one hashed and accounted later */
			read_size = handle_read(&handle[j], file_pos, buffer, state->block_size, log_fatal, 0);
			if (read_size == -1) {
				/* LCOV_EXCL_START */
				if (errno == EIO) {
					log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
					log_fatal("DANGER! Unexpected input/output read error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that disk '%s' is sane and that file '%s' can be read.\n", disk->dir, handle[j].path);
					log_fatal("Stopping at block %u\n", i);
					++io_error;
					goto bail;
				}

				log_tag("error:%u:%s:%s: Read error at position %u. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
				log_fatal("WARNING! Unexpected read error in a data disk, it isn't possible to sync.\n");
				log_fatal("Ensure that file '%s' can be read.\n", handle[j].path);
				log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}

			/* until now is disk */
			state_usage_disk(state, handle, &j, 1);

			state_usage_file(state, disk, file);

			countsize += read_size;

			/* now compute the hash */
			/* with the previous hash kind and seed if the block is marked for rehash, */
			/* otherwise with the current ones */
			if (rehash) {
				memhash(state->prevhash, state->prevhashseed, hash, buffer, read_size);
			} else {
				memhash(state->hash, state->hashseed, hash, buffer, read_size);
			}

			/* until now is hash */
			state_usage_hash(state);

			if (block_state == BLOCK_STATE_REP) {
				/* compare the hash */
				if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
					log_tag("error:%u:%s:%s: Unexpected data change\n", i, disk->name, esc_tag(file->sub, esc_buffer));
					log_error("Data change at file '%s' at position '%u'\n", handle[j].path, file_pos);
					log_error("WARNING! Unexpected data modification of a file without parity!\n");

					if (file_flag_has(file, FILE_IS_COPY)) {
						log_error("This file was detected as a copy of another file with the same name, size,\n");
						log_error("and timestamp, but the file data isn't matching the assumed copy.\n");
						log_error("If this is a false positive, and the files are expected to be different,\n");
						log_error("you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
					} else {
						log_error("Try removing the file from the array and rerun the 'sync' command!\n");
					}

					/* block sync to allow a recovery before overwriting */
					/* the parity needed to make such recovery */
					*skip_sync = 1; /* avoid to run the next sync */

					/* the block is not counted as processed, and the progress is not updated */
					++silent_error;
					continue;
				}
			} else {
				/* the only other case is BLOCK_STATE_CHG */
				assert(block_state == BLOCK_STATE_CHG);

				/* copy the hash in the block */
				memcpy(block->hash, hash, BLOCK_HASH_SIZE);

				/* and mark the block as hashed */
				block_state_set(block, BLOCK_STATE_REP);

				/* mark the state as needing write */
				state->need_write = 1;
			}

			/* count the number of processed block */
			++countpos;

			/* progress */
			/* NOTE(review): this break leaves only the inner loop on the blocks, */
			/* and the outer loop continues with the next disk; confirm that it's intended */
			if (state_progress(state, 0, i, countpos, countmax, countsize)) {
				/* LCOV_EXCL_START */
				*skip_sync = 1; /* avoid to run the next sync */
				break;
				/* LCOV_EXCL_STOP */
			}
		}

		/* close the last file in the disk */
		/* here the errors are reported using ::blockmax as block position */
		if (handle[j].file != 0) {
			/* keep a pointer at the file we are going to close for error reporting */
			struct snapraid_file* report = handle[j].file;
			ret = handle_close(&handle[j]);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				/* This one is really an unexpected error, because we are only reading */
				/* and closing a descriptor should never fail */
				if (errno == EIO) {
					log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
					log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
					log_fatal("Stopping at block %u\n", blockmax);
					++io_error;
					goto bail;
				}

				log_tag("error:%u:%s:%s: Close error. %s\n", blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
				log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
				log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
				log_fatal("Stopping at block %u\n", blockmax);
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
		}
	}

end:
	/* normal completion, reached also when the progress setup returns 0 */
	state_progress_end(state, countpos, countmax, countsize);

	/* note that at this point no io_error is possible */
	/* because at the first one we bail out */
	assert(io_error == 0);

	if (error || io_error || silent_error) {
		msg_status("\n");
		msg_status("%8u file errors\n", error);
		msg_status("%8u io errors\n", io_error);
		msg_status("%8u data errors\n", silent_error);
	} else {
		/* print the result only if processed something */
		if (countpos != 0)
			msg_status("Everything OK\n");
	}

	if (error)
		log_fatal("WARNING! Unexpected file errors!\n");

	log_tag("hash_summary:error_file:%u\n", error);

	/* proceed without bailing out */
	goto finish;

bail:
	/* on bail, don't run the next sync */
	*skip_sync = 1;

	/* close files left open */
	/* all the handles are closed, and ::i is still the block at which we bailed out */
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_file* file = handle[j].file;
		struct snapraid_disk* disk = handle[j].disk;
		ret = handle_close(&handle[j]);
		if (ret == -1) {
			log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
			log_fatal("DANGER! Unexpected close error in a data disk.\n");
			++error;
			/* continue, as we are already exiting */
		}
	}

finish:
	/* common cleanup for both the normal and the bail path */
	free(handle);
	free(buffer_alloc);

	/* any kind of counted error makes the call fail */
	if (error + io_error + silent_error != 0)
		return -1;
	return 0;
}
394
395 /****************************************************************************/
396 /* sync */
397
/**
 * Sync plan to use.
 * It tells which disks to look at, and how, when selecting the blocks to sync.
 */
struct snapraid_plan {
	unsigned handle_max; /**< Number of entries in ::handle_map. */
	struct snapraid_handle* handle_map; /**< Handles of the disks. The entries with a null disk are skipped. */
	int force_full; /**< If different than 0, all the blocks are treated as having an invalid parity. */
};
406
/**
 * A block that failed the hash check, or that was deleted.
 */
struct failed_struct {
	unsigned index; /**< Index of the failed block. The sync code stores here the disk index. It's the key used by failed_compare_by_index(). */
	unsigned size; /**< Size of the block. */

	struct snapraid_block* block; /**< The failed block, or BLOCK_DELETED for a deleted block */
};
416
417 /**
418 * Comparison function for sorting by index.
419 */
failed_compare_by_index(const void * void_a,const void * void_b)420 int failed_compare_by_index(const void* void_a, const void* void_b)
421 {
422 const struct failed_struct* a = void_a;
423 const struct failed_struct* b = void_b;
424
425 if (a->index < b->index)
426 return -1;
427 if (a->index > b->index)
428 return 1;
429 return 0;
430 }
431
/**
 * Buffer for storing the new hashes.
 * The sync code allocates one of them for each disk.
 */
struct snapraid_rehash {
	unsigned char hash[HASH_MAX]; /**< Storage for the new hash. */
	struct snapraid_block* block; /**< Block of the new hash. The sync code sets it at 0 when there is no rehash to do. */
};
439
440 /**
441 * Check if we have to process the specified block index ::i.
442 */
block_is_enabled(struct snapraid_plan * plan,block_off_t i)443 static int block_is_enabled(struct snapraid_plan* plan, block_off_t i)
444 {
445 unsigned j;
446 int one_invalid;
447 int one_valid;
448
449 /* for each disk */
450 one_invalid = 0;
451 one_valid = 0;
452 for (j = 0; j < plan->handle_max; ++j) {
453 struct snapraid_block* block;
454 struct snapraid_disk* disk = plan->handle_map[j].disk;
455
456 /* if no disk, nothing to check */
457 if (!disk)
458 continue;
459
460 block = fs_par2block_find(disk, i);
461
462 if (block_has_file(block))
463 one_valid = 1;
464
465 if (block_has_invalid_parity(block) || plan->force_full)
466 one_invalid = 1;
467 }
468
469 /* if none valid or none invalid, we don't need to update */
470 if (!one_invalid || !one_valid)
471 return 0;
472
473 return 1;
474 }
475
/**
 * Data reader callback, passed to io_init() by state_sync_process().
 *
 * Reads the block at task->position of the disk of the worker's handle
 * into task->buffer.
 *
 * If the disk position is not used, or if the block has no file,
 * the buffer is filled with zeros and the task ends as TASK_STATE_DONE.
 * Otherwise the file of the block is opened, its size, time and inode are
 * checked against the recorded ones, and the block is read.
 *
 * The result is stored in task->state:
 * - TASK_STATE_DONE: the buffer is filled. If a file was read, also
 *   task->file, task->file_pos, task->read_size and task->path are set.
 * - TASK_STATE_ERROR_CONTINUE: file missing, not accessible, or with changed attributes.
 * - TASK_STATE_IOERROR_CONTINUE: EIO error when reading.
 * - TASK_STATE_IOERROR: EIO error when closing or opening.
 * - TASK_STATE_ERROR: any other close, open or read error.
 */
static void sync_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
{
	struct snapraid_io* io = worker->io;
	struct snapraid_state* state = io->state;
	struct snapraid_handle* handle = worker->handle;
	struct snapraid_disk* disk = handle->disk;
	block_off_t blockcur = task->position;
	unsigned char* buffer = task->buffer;
	int ret;
	char esc_buffer[ESC_MAX];

	/* if the disk position is not used */
	if (!disk) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the block */
	/* note that task->block is set also if the block is skipped just after */
	task->block = fs_par2block_find(disk, blockcur);

	/* if the block has no file, meaning that it's EMPTY or DELETED, */
	/* it doesn't participate in the new parity computation */
	if (!block_has_file(task->block)) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the file of this block */
	task->file = fs_par2file_get(disk, blockcur, &task->file_pos);

	/* if the file is different than the current one, close it */
	if (handle->file != 0 && handle->file != task->file) {
		/* keep a pointer at the file we are going to close for error reporting */
		struct snapraid_file* report = handle->file;
		ret = handle_close(handle);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			/* This one is really an unexpected error, because we are only reading */
			/* and closing a descriptor should never fail */
			if (errno == EIO) {
				log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
				log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
				log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
				log_fatal("Stopping at block %u\n", blockcur);
				task->state = TASK_STATE_IOERROR;
				return;
			}

			log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
			log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
			log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
			log_fatal("Stopping at block %u\n", blockcur);
			task->state = TASK_STATE_ERROR;
			return;
			/* LCOV_EXCL_STOP */
		}
	}

	/* NOTE(review): handle_open() is called for every block, even when the handle */
	/* already refers at task->file; presumably it does nothing in such case, to be confirmed in handle.c */
	ret = handle_open(handle, task->file, state->file_mode, log_error, 0);
	if (ret == -1) {
		/* EIO and the generic errors stop the sync, */
		/* ENOENT and EACCES only skip this block */
		if (errno == EIO) {
			/* LCOV_EXCL_START */
			log_tag("error:%u:%s:%s: Open EIO error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
			log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to sync.\n");
			log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
			log_fatal("Stopping at block %u\n", blockcur);
			task->state = TASK_STATE_IOERROR;
			return;
			/* LCOV_EXCL_STOP */
		}

		if (errno == ENOENT) {
			log_tag("error:%u:%s:%s: Open ENOENT error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
			log_error("Missing file '%s'.\n", handle->path);
			log_error("WARNING! You cannot modify data disk during a sync.\n");
			log_error("Rerun the sync command when finished.\n");
			/* if the file is missing, it means that it was removed during sync */
			/* this isn't a serious error, so we skip this block, and continue with others */
			task->state = TASK_STATE_ERROR_CONTINUE;
			return;
		}

		if (errno == EACCES) {
			log_tag("error:%u:%s:%s: Open EACCES error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
			log_error("No access at file '%s'.\n", handle->path);
			log_error("WARNING! Please fix the access permission in the data disk.\n");
			log_error("Rerun the sync command when finished.\n");
			/* this isn't a serious error, so we skip this block, and continue with others */
			task->state = TASK_STATE_ERROR_CONTINUE;
			return;
		}

		/* LCOV_EXCL_START */
		log_tag("error:%u:%s:%s: Open error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
		log_fatal("WARNING! Unexpected open error in a data disk, it isn't possible to sync.\n");
		log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
		log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
		task->state = TASK_STATE_ERROR;
		return;
		/* LCOV_EXCL_STOP */
	}

	/* check if the file is changed */
	/* comparing the stat info of the handle with the size, time and inode recorded in the file */
	if (handle->st.st_size != task->file->size
		|| handle->st.st_mtime != task->file->mtime_sec
		|| STAT_NSEC(&handle->st) != task->file->mtime_nsec
		|| handle->st.st_ino != task->file->inode
	) {
		log_tag("error:%u:%s:%s: Unexpected attribute change\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer));
		/* report only the first difference found, in the order size, time, inode */
		if (handle->st.st_size != task->file->size) {
			log_error("Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle->path, task->file->size, (uint64_t)handle->st.st_size);
		} else if (handle->st.st_mtime != task->file->mtime_sec
			|| STAT_NSEC(&handle->st) != task->file->mtime_nsec) {
			log_error("Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle->path, task->file->mtime_sec, task->file->mtime_nsec, (uint64_t)handle->st.st_mtime, STAT_NSEC(&handle->st));
		} else {
			log_error("Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", task->file->inode, (uint64_t)handle->st.st_ino, handle->path);
		}
		log_error("WARNING! You cannot modify files during a sync.\n");
		log_error("Rerun the sync command when finished.\n");
		/* if the file is changed, it means that it was modified during sync */
		/* this isn't a serious error, so we skip this block, and continue with others */
		task->state = TASK_STATE_ERROR_CONTINUE;
		return;
	}

	/* read the block, and keep the read size in the task for the caller */
	task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
	if (task->read_size == -1) {
		/* LCOV_EXCL_START */
		/* an EIO error on read is reported as continuable, any other read error is not */
		if (errno == EIO) {
			log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
			log_error("Input/Output error in file '%s' at position '%u'\n", handle->path, task->file_pos);
			task->state = TASK_STATE_IOERROR_CONTINUE;
			return;
		}

		log_tag("error:%u:%s:%s: Read error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
		log_fatal("WARNING! Unexpected read error in a data disk, it isn't possible to sync.\n");
		log_fatal("Ensure that file '%s' can be read.\n", handle->path);
		log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
		task->state = TASK_STATE_ERROR;
		return;
		/* LCOV_EXCL_STOP */
	}

	/* store the path of the opened file */
	/* it's stored only here, after a successful read */
	pathcpy(task->path, sizeof(task->path), handle->path);

	task->state = TASK_STATE_DONE;
}
629
sync_parity_writer(struct snapraid_worker * worker,struct snapraid_task * task)630 static void sync_parity_writer(struct snapraid_worker* worker, struct snapraid_task* task)
631 {
632 struct snapraid_io* io = worker->io;
633 struct snapraid_state* state = io->state;
634 struct snapraid_parity_handle* parity_handle = worker->parity_handle;
635 unsigned level = parity_handle->level;
636 block_off_t blockcur = task->position;
637 unsigned char* buffer = task->buffer;
638 int ret;
639
640 /* write parity */
641 ret = parity_write(parity_handle, blockcur, buffer, state->block_size);
642 if (ret == -1) {
643 /* LCOV_EXCL_START */
644 if (errno == EIO) {
645 log_tag("parity_error:%u:%s: Write EIO error. %s\n", blockcur, lev_config_name(level), strerror(errno));
646 log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(level), blockcur);
647 task->state = TASK_STATE_IOERROR_CONTINUE;
648 return;
649 }
650
651 log_tag("parity_error:%u:%s: Write error. %s\n", blockcur, lev_config_name(level), strerror(errno));
652 log_fatal("WARNING! Unexpected write error in the %s disk, it isn't possible to sync.\n", lev_name(level));
653 log_fatal("Ensure that disk '%s' has some free space available.\n", lev_config_name(level));
654 log_fatal("Stopping at block %u\n", blockcur);
655 task->state = TASK_STATE_ERROR;
656 return;
657 /* LCOV_EXCL_STOP */
658 }
659
660 task->state = TASK_STATE_DONE;
661 }
662
state_sync_process(struct snapraid_state * state,struct snapraid_parity_handle * parity_handle,block_off_t blockstart,block_off_t blockmax)663 static int state_sync_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax)
664 {
665 struct snapraid_io io;
666 struct snapraid_plan plan;
667 struct snapraid_handle* handle;
668 void* rehandle_alloc;
669 struct snapraid_rehash* rehandle;
670 unsigned diskmax;
671 block_off_t blockcur;
672 unsigned j;
673 void* zero_alloc;
674 void** zero;
675 void* copy_alloc;
676 void** copy;
677 unsigned buffermax;
678 data_off_t countsize;
679 block_off_t countpos;
680 block_off_t countmax;
681 block_off_t autosavedone;
682 block_off_t autosavelimit;
683 block_off_t autosavemissing;
684 int ret;
685 unsigned error;
686 unsigned silent_error;
687 unsigned io_error;
688 time_t now;
689 struct failed_struct* failed;
690 int* failed_map;
691 unsigned l;
692 unsigned* waiting_map;
693 unsigned waiting_mac;
694 char esc_buffer[ESC_MAX];
695 bit_vect_t* block_enabled;
696
697 /* the sync process assumes that all the hashes are correct */
698 /* including the ones from CHG and DELETED blocks */
699 assert(state->clear_past_hash != 0);
700
701 /* get the present time */
702 now = time(0);
703
704 /* maps the disks to handles */
705 handle = handle_mapping(state, &diskmax);
706
707 /* rehash buffers */
708 rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);
709
710 /* we need 1 * data + 1 * parity */
711 buffermax = diskmax + state->level;
712
713 /* initialize the io threads */
714 io_init(&io, state, state->opt.io_cache, buffermax, sync_data_reader, handle, diskmax, 0, sync_parity_writer, parity_handle, state->level);
715
716 /* allocate the copy buffer */
717 copy = malloc_nofail_vector_align(diskmax, diskmax, state->block_size, ©_alloc);
718
719 /* allocate and fill the zero buffer */
720 zero = malloc_nofail_align(state->block_size, &zero_alloc);
721 memset(zero, 0, state->block_size);
722 raid_zero(zero);
723
724 failed = malloc_nofail(diskmax * sizeof(struct failed_struct));
725 failed_map = malloc_nofail(diskmax * sizeof(unsigned));
726
727 /* possibly waiting disks */
728 waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
729 waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));
730
731 error = 0;
732 silent_error = 0;
733 io_error = 0;
734
735 msg_progress("Selecting...\n");
736
737 /* first count the number of blocks to process */
738 countmax = 0;
739 plan.handle_max = diskmax;
740 plan.handle_map = handle;
741 plan.force_full = state->opt.force_full;
742 block_enabled = calloc_nofail(1, bit_vect_size(blockmax)); /* preinitialize to 0 */
743 for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
744 if (!block_is_enabled(&plan, blockcur))
745 continue;
746 bit_vect_set(block_enabled, blockcur);
747 ++countmax;
748 }
749
750 /* compute the autosave size for all disk, even if not read */
751 /* this makes sense because the speed should be almost the same */
752 /* if the disks are read in parallel */
753 autosavelimit = state->autosave / (diskmax * state->block_size);
754 autosavemissing = countmax; /* blocks to do */
755 autosavedone = 0; /* blocks done */
756
757 /* drop until now */
758 state_usage_waste(state);
759
760 countsize = 0;
761 countpos = 0;
762
763 msg_progress("Syncing...\n");
764
765 /* start all the worker threads */
766 io_start(&io, blockstart, blockmax, block_enabled);
767
768 if (!state_progress_begin(state, blockstart, blockmax, countmax))
769 goto end;
770
771 while (1) {
772 unsigned failed_count;
773 int error_on_this_block;
774 int silent_error_on_this_block;
775 int io_error_on_this_block;
776 int fixed_error_on_this_block;
777 int parity_needs_to_be_updated;
778 int parity_going_to_be_updated;
779 snapraid_info info;
780 int rehash;
781 void** buffer;
782 int writer_error[IO_WRITER_ERROR_MAX];
783
784 /* go to the next block */
785 blockcur = io_read_next(&io, &buffer);
786 if (blockcur >= blockmax)
787 break;
788
789 /* until now is scheduling */
790 state_usage_sched(state);
791
792 /* one more block processed for autosave */
793 ++autosavedone;
794 --autosavemissing;
795
796 /* by default process the block, and skip it if something goes wrong */
797 error_on_this_block = 0;
798 silent_error_on_this_block = 0;
799 io_error_on_this_block = 0;
800 fixed_error_on_this_block = 0;
801
802 /* keep track of the number of failed blocks */
803 failed_count = 0;
804
805 /* get block specific info */
806 info = info_get(&state->infoarr, blockcur);
807
808 /* if we have to use the old hash */
809 rehash = info_get_rehash(info);
810
811 /* if the parity requires to be updated */
812 /* It could happens that all the blocks are EMPTY/BLK and CHG but with the hash */
813 /* still matching because the specific CHG block was not modified. */
814 /* In such case, we can avoid to update parity, because it would be the same as before */
815 /* Note that CHG/DELETED blocks already present in the content file loaded */
816 /* have the hash cleared (::clear_past_hash flag), and then they won't never match the hash. */
817 /* We are treating only CHG blocks created at runtime. */
818 parity_needs_to_be_updated = state->opt.force_full || state->opt.force_parity_update;
819
820 /* if the parity is going to be updated */
821 parity_going_to_be_updated = 0;
822
823 /* if the block is marked as bad, we force the parity update */
824 /* because the bad block may be the result of a wrong parity */
825 if (info_get_bad(info))
826 parity_needs_to_be_updated = 1;
827
828 /* for each disk, process the block */
829 for (j = 0; j < diskmax; ++j) {
830 struct snapraid_task* task;
831 int read_size;
832 unsigned char hash[HASH_MAX];
833 struct snapraid_block* block;
834 unsigned block_state;
835 struct snapraid_disk* disk;
836 struct snapraid_file* file;
837 block_off_t file_pos;
838 unsigned diskcur;
839
840 /* until now is misc */
841 state_usage_misc(state);
842
843 task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);
844
845 /* until now is disk */
846 state_usage_disk(state, handle, waiting_map, waiting_mac);
847
848 /* get the results */
849 disk = task->disk;
850 block = task->block;
851 file = task->file;
852 file_pos = task->file_pos;
853 read_size = task->read_size;
854
855 /* by default no rehash in case of "continue" */
856 rehandle[diskcur].block = 0;
857
858 /* if the disk position is not used */
859 if (!disk)
860 continue;
861
862 state_usage_file(state, disk, file);
863
864 /* get the state of the block */
865 block_state = block_state_get(block);
866
867 /* if the block has invalid parity, */
868 /* we have to take care of it in case of recover */
869 if (block_has_invalid_parity(block)) {
870 /* store it in the failed set, because */
871 /* the parity may be still computed with the previous content */
872 failed[failed_count].index = diskcur;
873 failed[failed_count].size = state->block_size;
874 failed[failed_count].block = block;
875 ++failed_count;
876
877 /* if the block has invalid parity, we have to update the parity */
878 /* to include this block change */
879 /* This also apply to CHG blocks, but we are going to handle */
880 /* later this case to do the updates only if really needed */
881 if (block_state != BLOCK_STATE_CHG)
882 parity_needs_to_be_updated = 1;
883
884 /* note that DELETE blocks are skipped in the next check */
885 /* and we have to store them in the failed blocks */
886 /* before skipping */
887
888 /* follow */
889 }
890
891 /* if the block is not used */
892 if (!block_has_file(block))
893 continue;
894
895 /* handle error conditions */
896 if (task->state == TASK_STATE_IOERROR) {
897 /* LCOV_EXCL_START */
898 ++io_error;
899 goto bail;
900 /* LCOV_EXCL_STOP */
901 }
902 if (task->state == TASK_STATE_ERROR) {
903 /* LCOV_EXCL_START */
904 ++error;
905 goto bail;
906 /* LCOV_EXCL_STOP */
907 }
908 if (task->state == TASK_STATE_ERROR_CONTINUE) {
909 ++error;
910 error_on_this_block = 1;
911 continue;
912 }
913 if (task->state == TASK_STATE_IOERROR_CONTINUE) {
914 ++io_error;
915 if (io_error >= state->opt.io_error_limit) {
916 /* LCOV_EXCL_START */
917 log_fatal("DANGER! Unexpected input/output read error in a data disk, it isn't possible to sync.\n");
918 log_fatal("Ensure that disk '%s' is sane and that file '%s' can be read.\n", disk->dir, task->path);
919 log_fatal("Stopping at block %u\n", blockcur);
920 goto bail;
921 /* LCOV_EXCL_STOP */
922 }
923
924 /* otherwise continue */
925 io_error_on_this_block = 1;
926 continue;
927 }
928 if (task->state != TASK_STATE_DONE) {
929 /* LCOV_EXCL_START */
930 log_fatal("Internal inconsistency in task state\n");
931 os_abort();
932 /* LCOV_EXCL_STOP */
933 }
934
935 countsize += read_size;
936
937 /* now compute the hash */
938 if (rehash) {
939 memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);
940
941 /* compute the new hash, and store it */
942 rehandle[diskcur].block = block;
943 memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
944 } else {
945 memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
946 }
947
948 /* until now is hash */
949 state_usage_hash(state);
950
951 if (block_has_updated_hash(block)) {
952 /* compare the hash */
953 if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
954 /* if the file has invalid parity, it's a REP changed during the sync */
955 if (block_has_invalid_parity(block)) {
956 log_tag("error:%u:%s:%s: Unexpected data change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
957 log_error("Data change at file '%s' at position '%u'\n", task->path, file_pos);
958 log_error("WARNING! Unexpected data modification of a file without parity!\n");
959
960 if (file_flag_has(file, FILE_IS_COPY)) {
961 log_error("This file was detected as a copy of another file with the same name, size,\n");
962 log_error("and timestamp, but the file data isn't matching the assumed copy.\n");
963 log_error("If this is a false positive, and the files are expected to be different,\n");
964 log_error("you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
965 } else {
966 log_error("Try removing the file from the array and rerun the 'sync' command!\n");
967 }
968
969 ++error;
970
971 /* if the file is changed, it means that it was modified during sync */
972 /* this isn't a serious error, so we skip this block, and continue with others */
973 error_on_this_block = 1;
974 continue;
975 } else { /* otherwise it's a BLK with silent error */
976 unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
977 log_tag("error:%u:%s:%s: Data error at position %u, diff bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
978 log_error("Data error in file '%s' at position '%u', diff bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
979
980 /* save the failed block for the fix */
981 failed[failed_count].index = diskcur;
982 failed[failed_count].size = read_size;
983 failed[failed_count].block = block;
984 ++failed_count;
985
986 /* silent errors are very rare, and are not a signal that a disk */
987 /* is going to fail. So, we just continue marking the block as bad */
988 /* just like in scrub */
989 ++silent_error;
990 silent_error_on_this_block = 1;
991 continue;
992 }
993 }
994 } else {
995 /* if until now the parity doesn't need to be updated */
996 if (!parity_needs_to_be_updated) {
997 /* for sure it's a CHG block, because EMPTY are processed before with "continue" */
998 /* and BLK and REP have "block_has_updated_hash()" as 1, and all the others */
999 /* have "parity_needs_to_be_updated" already at 1 */
1000 assert(block_state_get(block) == BLOCK_STATE_CHG);
1001
1002 /* if the hash represents the data unequivocally */
1003 if (hash_is_unique(block->hash)) {
1004 /* check if the hash is changed */
1005 if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
1006 /* the block is different, and we must update parity */
1007 parity_needs_to_be_updated = 1;
1008 }
1009 } else {
1010 /* if the hash is already invalid, we update parity */
1011 parity_needs_to_be_updated = 1;
1012 }
1013 }
1014
1015 /* copy the hash in the block, but doesn't mark the block as hashed */
1016 /* this allow in case of skipped block to do not save the failed computation */
1017 memcpy(block->hash, hash, BLOCK_HASH_SIZE);
1018
1019 /* note that in case of rehash, this is the wrong hash, */
1020 /* but it will be overwritten later */
1021 }
1022 }
1023
1024 /* if we have only silent errors we can try to fix them on-the-fly */
1025 /* note the fix is not written to disk, but used only to */
1026 /* compute the new parity */
1027 if (!error_on_this_block && !io_error_on_this_block && silent_error_on_this_block) {
1028 unsigned failed_mac;
1029 int something_to_recover = 0;
1030
1031 /* sort the failed vector */
1032 /* because with threads it may be in any order */
1033 /* but RAID requires the indexes to be sorted */
1034 qsort(failed, failed_count, sizeof(failed[0]), failed_compare_by_index);
1035
1036 /* setup the blocks to recover */
1037 failed_mac = 0;
1038 for (j = 0; j < failed_count; ++j) {
1039 unsigned char* block_buffer = buffer[failed[j].index];
1040 unsigned char* block_copy = copy[failed[j].index];
1041 unsigned block_state = block_state_get(failed[j].block);
1042
1043 /* we try to recover only if at least one BLK is present */
1044 if (block_state == BLOCK_STATE_BLK)
1045 something_to_recover = 1;
1046
1047 /* save a copy of the content just read */
1048 /* that it's going to be overwritten by the recovering function */
1049 memcpy(block_copy, block_buffer, state->block_size);
1050
1051 if (block_state == BLOCK_STATE_CHG
1052 && hash_is_zero(failed[j].block->hash)
1053 ) {
1054 /* if the block was filled with 0, restore this state */
1055 /* and avoid to recover it */
1056 memset(block_buffer, 0, state->block_size);
1057 } else {
1058 /* if we have too many failures, we cannot recover */
1059 if (failed_mac >= state->level)
1060 break;
1061
1062 /* otherwise it has to be recovered */
1063 failed_map[failed_mac++] = failed[j].index;
1064 }
1065 }
1066
1067 /* if we have something to recover and enough parity */
1068 if (something_to_recover && j == failed_count) {
1069 /* until now is misc */
1070 state_usage_misc(state);
1071
1072 /* read the parity */
1073 /* we are sure that parity exists because */
1074 /* we have at least one BLK block */
1075 for (l = 0; l < state->level; ++l) {
1076 ret = parity_read(&parity_handle[l], blockcur, buffer[diskmax + l], state->block_size, log_error);
1077 if (ret == -1) {
1078 /* LCOV_EXCL_START */
1079 if (errno == EIO) {
1080 log_tag("parity_error:%u:%s: Read EIO error. %s\n", blockcur, lev_config_name(l), strerror(errno));
1081 if (io_error >= state->opt.io_error_limit) {
1082 log_fatal("DANGER! Unexpected input/output read error in the %s disk, it isn't possible to sync.\n", lev_name(l));
1083 log_fatal("Ensure that disk '%s' is sane and can be read.\n", lev_config_name(l));
1084 log_fatal("Stopping at block %u\n", blockcur);
1085 ++io_error;
1086 goto bail;
1087 }
1088
1089 log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(l), blockcur);
1090 ++io_error;
1091 io_error_on_this_block = 1;
1092 continue;
1093 }
1094
1095 log_tag("parity_error:%u:%s: Read error. %s\n", blockcur, lev_config_name(l), strerror(errno));
1096 log_fatal("WARNING! Unexpected read error in the %s disk, it isn't possible to sync.\n", lev_name(l));
1097 log_fatal("Ensure that disk '%s' can be read.\n", lev_config_name(l));
1098 log_fatal("Stopping at block %u\n", blockcur);
1099 ++error;
1100 goto bail;
1101 /* LCOV_EXCL_STOP */
1102 }
1103
1104 /* until now is parity */
1105 state_usage_parity(state, &l, 1);
1106 }
1107
1108 /* if no error in parity read */
1109 if (!io_error_on_this_block) {
1110 /* try to fix the data */
1111 /* note that this is a simple fix algorithm, that doesn't take into */
1112 /* account the case of a wrong parity */
1113 /* only 'fix' supports the most advanced fixing */
1114 raid_rec(failed_mac, failed_map, diskmax, state->level, state->block_size, buffer);
1115
1116 /* until now is raid */
1117 state_usage_raid(state);
1118
1119 /* check the result and prepare the data */
1120 for (j = 0; j < failed_count; ++j) {
1121 unsigned char hash[HASH_MAX];
1122 unsigned char* block_buffer = buffer[failed[j].index];
1123 unsigned char* block_copy = copy[failed[j].index];
1124 unsigned block_state = block_state_get(failed[j].block);
1125
1126 if (block_state == BLOCK_STATE_BLK) {
1127 unsigned size = failed[j].size;
1128
1129 /* compute the hash of the recovered block */
1130 if (rehash) {
1131 memhash(state->prevhash, state->prevhashseed, hash, block_buffer, size);
1132 } else {
1133 memhash(state->hash, state->hashseed, hash, block_buffer, size);
1134 }
1135
1136 /* until now is hash */
1137 state_usage_hash(state);
1138
1139 /* if the hash doesn't match */
1140 if (memcmp(hash, failed[j].block->hash, BLOCK_HASH_SIZE) != 0) {
1141 /* we have not recovered */
1142 break;
1143 }
1144
1145 /* pad with 0 if needed */
1146 if (size < state->block_size)
1147 memset(block_buffer + size, 0, state->block_size - size);
1148 } else {
1149 /* otherwise restore the content */
1150 /* because we are not interested in the old state */
1151 /* that it's recovered for CHG, REP and DELETED blocks */
1152 memcpy(block_buffer, block_copy, state->block_size);
1153 }
1154 }
1155
1156 /* if all is processed, we have fixed it */
1157 if (j == failed_count)
1158 fixed_error_on_this_block = 1;
1159 }
1160 }
1161 }
1162
1163 /* if we have read all the data required and it's correct, proceed with the parity */
1164 if (!error_on_this_block && !io_error_on_this_block
1165 && (!silent_error_on_this_block || fixed_error_on_this_block)
1166 ) {
1167 /* update the parity only if really needed */
1168 if (parity_needs_to_be_updated) {
1169 /* compute the parity */
1170 raid_gen(diskmax, state->level, state->block_size, buffer);
1171
1172 /* until now is raid */
1173 state_usage_raid(state);
1174
1175 /* mark that the parity is going to be written */
1176 parity_going_to_be_updated = 1;
1177 }
1178
1179 /* for each disk, mark the blocks as processed */
1180 for (j = 0; j < diskmax; ++j) {
1181 struct snapraid_block* block;
1182
1183 if (!handle[j].disk)
1184 continue;
1185
1186 block = fs_par2block_find(handle[j].disk, blockcur);
1187
1188 if (block == BLOCK_NULL) {
1189 /* nothing to do */
1190 continue;
1191 }
1192
1193 /* if it's a deleted block */
1194 if (block_state_get(block) == BLOCK_STATE_DELETED) {
1195 /* the parity is now updated without this block, so it's now empty */
1196 fs_deallocate(handle[j].disk, blockcur);
1197 continue;
1198 }
1199
1200 /* now all the blocks have the hash and the parity computed */
1201 block_state_set(block, BLOCK_STATE_BLK);
1202 }
1203
1204 /* we update the info block only if we really have updated the parity */
1205 /* because otherwise the time/justsynced info would be misleading as we didn't */
1206 /* wrote the parity at this time */
1207 /* we also update the info block only if no silent error was found */
1208 /* because has no sense to refresh the time for data that we know bad */
1209 if (parity_needs_to_be_updated
1210 && !silent_error_on_this_block
1211 ) {
1212 /* if rehash is needed */
1213 if (rehash) {
1214 /* store all the new hash already computed */
1215 for (j = 0; j < diskmax; ++j) {
1216 if (rehandle[j].block)
1217 memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
1218 }
1219 }
1220
1221 /* update the time info of the block */
1222 /* we are also clearing any previous bad and rehash flag */
1223 info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 1));
1224 }
1225 }
1226
1227 /* if a silent (even if corrected) or input/output error was found */
1228 /* mark the block as bad to have check/fix to handle it */
1229 /* because our correction is in memory only and not yet written */
1230 if (silent_error_on_this_block || io_error_on_this_block) {
1231 /* set the error status keeping the other info */
1232 info_set(&state->infoarr, blockcur, info_set_bad(info));
1233 }
1234
1235 /* finally schedule parity write */
1236 /* Note that the calls to io_parity_write() are mandatory */
1237 /* even if the parity doesn't need to be updated */
1238 /* This because we want to keep track of the time usage */
1239 state_usage_misc(state);
1240
1241 /* write start */
1242 io_write_preset(&io, blockcur, !parity_going_to_be_updated);
1243
1244 /* write the parity */
1245 for (l = 0; l < state->level; ++l) {
1246 unsigned levcur;
1247
1248 io_parity_write(&io, &levcur, waiting_map, &waiting_mac);
1249
1250 /* until now is parity */
1251 state_usage_parity(state, waiting_map, waiting_mac);
1252 }
1253
1254 /* write finished */
1255 io_write_next(&io, blockcur, !parity_going_to_be_updated, writer_error);
1256
1257 /* handle errors reported */
1258 for (j = 0; j < IO_WRITER_ERROR_MAX; ++j) {
1259 if (writer_error[j]) {
1260 switch (j + IO_WRITER_ERROR_BASE) {
1261 case TASK_STATE_IOERROR_CONTINUE :
1262 ++io_error;
1263 if (io_error >= state->opt.io_error_limit) {
1264 /* LCOV_EXCL_START */
1265 log_fatal("DANGER! Unexpected input/output write error in a parity disk, it isn't possible to sync.\n");
1266 log_fatal("Stopping at block %u\n", blockcur);
1267 goto bail;
1268 /* LCOV_EXCL_STOP */
1269 }
1270 break;
1271 case TASK_STATE_ERROR_CONTINUE :
1272 ++error;
1273 break;
1274 case TASK_STATE_IOERROR :
1275 /* LCOV_EXCL_START */
1276 ++io_error;
1277 goto bail;
1278 /* LCOV_EXCL_STOP */
1279 case TASK_STATE_ERROR :
1280 /* LCOV_EXCL_START */
1281 ++error;
1282 goto bail;
1283 /* LCOV_EXCL_STOP */
1284 }
1285 }
1286 }
1287
1288 /* mark the state as needing write */
1289 state->need_write = 1;
1290
1291 /* count the number of processed block */
1292 ++countpos;
1293
1294 /* progress */
1295 if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
1296 /* LCOV_EXCL_START */
1297 break;
1298 /* LCOV_EXCL_STOP */
1299 }
1300
1301 /* autosave */
1302 if ((state->autosave != 0
1303 && autosavedone >= autosavelimit /* if we have reached the limit */
1304 && autosavemissing >= autosavelimit) /* if we have at least a full step to do */
1305 /* or if we have a forced autosave at the specified block */
1306 || (state->opt.force_autosave_at != 0 && state->opt.force_autosave_at == blockcur)
1307 ) {
1308 autosavedone = 0; /* restart the counter */
1309
1310 /* until now is misc */
1311 state_usage_misc(state);
1312
1313 state_progress_stop(state);
1314
1315 msg_progress("Autosaving...\n");
1316
1317 /* before writing the new content file we ensure that */
1318 /* the parity is really written flushing the disk cache */
1319 for (l = 0; l < state->level; ++l) {
1320 ret = parity_sync(&parity_handle[l]);
1321 if (ret == -1) {
1322 /* LCOV_EXCL_START */
1323 log_tag("parity_error:%u:%s: Sync error\n", blockcur, lev_config_name(l));
1324 log_fatal("DANGER! Unexpected sync error in %s disk.\n", lev_name(l));
1325 log_fatal("Ensure that disk '%s' is sane.\n", lev_config_name(l));
1326 log_fatal("Stopping at block %u\n", blockcur);
1327 ++error;
1328 goto bail;
1329 /* LCOV_EXCL_STOP */
1330 }
1331 }
1332
1333 /* now we can safely write the content file */
1334 state_write(state);
1335
1336 state_progress_restart(state);
1337
1338 /* drop until now */
1339 state_usage_waste(state);
1340 }
1341 }
1342
1343 end:
1344 state_progress_end(state, countpos, countmax, countsize);
1345
1346 state_usage_print(state);
1347
1348 /* before returning we ensure that */
1349 /* the parity is really written flushing the disk cache */
1350 for (l = 0; l < state->level; ++l) {
1351 ret = parity_sync(&parity_handle[l]);
1352 if (ret == -1) {
1353 /* LCOV_EXCL_START */
1354 log_tag("parity_error:%u:%s: Sync error\n", blockcur, lev_config_name(l));
1355 log_fatal("DANGER! Unexpected sync error in %s disk.\n", lev_name(l));
1356 log_fatal("Ensure that disk '%s' is sane.\n", lev_config_name(l));
1357 log_fatal("Stopping at block %u\n", blockcur);
1358 ++error;
1359 goto bail;
1360 /* LCOV_EXCL_STOP */
1361 }
1362 }
1363
1364 if (error || silent_error || io_error) {
1365 msg_status("\n");
1366 msg_status("%8u file errors\n", error);
1367 msg_status("%8u io errors\n", io_error);
1368 msg_status("%8u data errors\n", silent_error);
1369 } else {
1370 /* print the result only if processed something */
1371 if (countpos != 0)
1372 msg_status("Everything OK\n");
1373 }
1374
1375 if (error)
1376 log_fatal("WARNING! Unexpected file errors!\n");
1377 if (io_error)
1378 log_fatal("DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
1379 if (silent_error)
1380 log_fatal("DANGER! Unexpected data errors! The failing blocks are now marked as bad!\n");
1381 if (io_error || silent_error) {
1382 log_fatal("Use 'snapraid status' to list the bad blocks.\n");
1383 log_fatal("Use 'snapraid -e fix' to recover.\n");
1384 }
1385
1386 log_tag("summary:error_file:%u\n", error);
1387 log_tag("summary:error_io:%u\n", io_error);
1388 log_tag("summary:error_data:%u\n", silent_error);
1389 if (error + silent_error + io_error == 0)
1390 log_tag("summary:exit:ok\n");
1391 else
1392 log_tag("summary:exit:error\n");
1393 log_flush();
1394
1395 bail:
1396 /* stop all the worker threads */
1397 io_stop(&io);
1398
1399 for (j = 0; j < diskmax; ++j) {
1400 struct snapraid_file* file = handle[j].file;
1401 struct snapraid_disk* disk = handle[j].disk;
1402 ret = handle_close(&handle[j]);
1403 if (ret == -1) {
1404 /* LCOV_EXCL_START */
1405 log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
1406 log_fatal("DANGER! Unexpected close error in a data disk.\n");
1407 ++error;
1408 /* continue, as we are already exiting */
1409 /* LCOV_EXCL_STOP */
1410 }
1411 }
1412
1413 free(handle);
1414 free(zero_alloc);
1415 free(copy_alloc);
1416 free(copy);
1417 free(rehandle_alloc);
1418 free(failed);
1419 free(failed_map);
1420 free(waiting_map);
1421 io_done(&io);
1422 free(block_enabled);
1423
1424 if (state->opt.expect_recoverable) {
1425 if (error + silent_error + io_error == 0)
1426 return -1;
1427 } else {
1428 if (error + silent_error + io_error != 0)
1429 return -1;
1430 }
1431 return 0;
1432 }
1433
/**
 * Entry point of the sync operation for a range of blocks.
 *
 * The steps performed are:
 * - create/open all the parity files, and verify that they are not
 *   smaller than the parity size in use (unless a full realloc or a full
 *   recomputation is forced)
 * - if state->opt.prehash is set, run the hashing phase
 * - if the hashing phase didn't request to skip the sync, resize the
 *   parity files, save the content file if modified, and process the blocks
 * - close all the parity files
 *
 * ::blockstart is the first block to process. If it's bigger than the
 * allocated parity size, the program exits with a failure.
 * ::blockcount is the number of blocks to process starting at ::blockstart.
 * If 0, or if the range would go past the allocated parity size, all the
 * blocks up to the allocated parity size are processed.
 *
 * Returns 0 on success, or -1 if the hashing phase, the sync phase,
 * or the closing of a parity file reported an error.
 */
int state_sync(struct snapraid_state* state, block_off_t blockstart, block_off_t blockcount)
{
	block_off_t blockmax;
	block_off_t used_paritymax;
	block_off_t file_paritymax;
	data_off_t size;
	int ret;
	struct snapraid_parity_handle parity_handle[LEV_MAX];
	unsigned unrecoverable_error;
	unsigned l;
	int skip_sync = 0;

	msg_progress("Initializing...\n");

	blockmax = parity_allocated_size(state);

	/* size in bytes of the full parity, used later to resize the parity files */
	/* note that it's computed before ::blockmax is limited by ::blockcount */
	size = blockmax * (data_off_t)state->block_size;

	/* minimum size of the parity files we expect */
	used_paritymax = parity_used_size(state);

	/* effective size of the parity files */
	file_paritymax = 0;

	if (blockstart > blockmax) {
		/* LCOV_EXCL_START */
		log_fatal("Error in the starting block %u. It's bigger than the parity size %u.\n", blockstart, blockmax);
		exit(EXIT_FAILURE);
		/* LCOV_EXCL_STOP */
	}

	/* adjust the number of block to process */
	/* The check is done with a subtraction, because the sum */
	/* "blockstart + blockcount" may wrap around with a very big ::blockcount, */
	/* and the wrapped value would wrongly result in a smaller ::blockmax. */
	/* The subtraction cannot underflow because at this point */
	/* blockstart <= blockmax is always true. */
	if (blockcount != 0 && blockcount < blockmax - blockstart) {
		blockmax = blockstart + blockcount;
	}

	for (l = 0; l < state->level; ++l) {
		data_off_t out_size;
		block_off_t parityblocks;

		/* create the file and open for writing */
		ret = parity_create(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_fatal("WARNING! Without an accessible %s file, it isn't possible to sync.\n", lev_name(l));
			exit(EXIT_FAILURE);
			/* LCOV_EXCL_STOP */
		}

		/* number of block in the parity file */
		parity_size(&parity_handle[l], &out_size);
		parityblocks = out_size / state->block_size;

		/* if the file is too small */
		/* here we only report it, the decision to stop is taken after the loop */
		if (parityblocks < used_paritymax) {
			log_fatal("WARNING! The %s parity has data only %u blocks instead of %u.\n", lev_name(l), parityblocks, used_paritymax);
		}

		/* keep the smallest parity number of blocks */
		if (l == 0 || file_paritymax > parityblocks)
			file_paritymax = parityblocks;
	}

	/* if we do a full parity realloc or computation, having a wrong parity size is expected */
	if (!state->opt.force_realloc && !state->opt.force_full) {
		/* if the parities are too small */
		if (file_paritymax < used_paritymax) {
			/* LCOV_EXCL_START */
			log_fatal("DANGER! One or more the parity files are smaller than expected!\n");
			if (file_paritymax != 0) {
				log_fatal("If this happens because you are using an old content file,\n");
				log_fatal("you can 'sync' anyway using 'snapraid --force-full sync'\n");
				log_fatal("to force a full rebuild of the parity.\n");
			} else {
				log_fatal("It's possible that the parity disks are not mounted.\n");
				log_fatal("If instead you are adding a new parity level, you can 'sync' using\n");
				log_fatal("'snapraid --force-full sync' to force a full rebuild of the parity.\n");
			}
			exit(EXIT_FAILURE);
			/* LCOV_EXCL_STOP */
		}
	}

	/* counts the failures that make this function return -1 */
	unrecoverable_error = 0;

	if (state->opt.prehash) {
		msg_progress("Hashing...\n");

		/* the hashing phase may set ::skip_sync to prevent the following sync */
		ret = state_hash_process(state, blockstart, blockmax, &skip_sync);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			++unrecoverable_error;
			/* continue, in case also doing the sync if ::skip_sync is not set */
			/* LCOV_EXCL_STOP */
		}
	}

	if (!skip_sync) {
		msg_progress("Resizing...\n");

		/* now change the size of all parities */
		for (l = 0; l < state->level; ++l) {
			int is_modified;

			/* change the size of the parity file, truncating or extending it */
			/* from this point all the DELETED blocks after the end of the parity are invalid */
			/* and they are automatically removed when we save the new content file */
			ret = parity_chsize(&parity_handle[l], &state->parity[l], &is_modified, size, state->block_size, state->opt.skip_fallocate, state->opt.skip_space_holder);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				data_off_t out_size;
				parity_size(&parity_handle[l], &out_size);
				parity_overflow(state, out_size);
				log_fatal("WARNING! Without a usable %s file, it isn't possible to sync.\n", lev_name(l));
				exit(EXIT_FAILURE);
				/* LCOV_EXCL_STOP */
			}

			/* a changed parity has to be recorded in the content file */
			if (is_modified)
				state->need_write = 1;
		}

		/* after resizing parity files, refresh again the free info */
		state_refresh(state);

		/**
		 * Save the new state before the sync but after the hashing phase
		 *
		 * This allows to recover after an aborted sync, and at the same time
		 * it allows to recover broken copied/moved files identified in the
		 * hashing phase.
		 *
		 * For example, think at this case:
		 * - Add some files at the array
		 * - Run a sync command, it will recompute the parity adding the new files
		 * - Abort the sync command before it stores the new content file
		 * - Delete the not yet synced files from the array
		 * - Run a new sync command
		 *
		 * The sync command has no way to know that the parity file was modified
		 * because the files triggering these changes are now deleted and they aren't
		 * listed in the content file.
		 * Instead, saving the new content file in advance, keeps track of all the parity
		 * that may be modified.
		 */
		if (!state->opt.skip_content_write) {
			if (state->need_write)
				state_write(state);
		} else {
			log_fatal("WARNING! Skipped state write for --test-skip-content-write option.\n");
		}

		/* skip degenerated cases of empty parity, or skipping all */
		if (blockstart < blockmax) {
			ret = state_sync_process(state, parity_handle, blockstart, blockmax);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				++unrecoverable_error;
				/* continue, as we are already exiting */
				/* LCOV_EXCL_STOP */
			}
		} else {
			msg_status("Nothing to do\n");
		}
	}

	/* the parity files are closed in any case, even if the sync was skipped */
	for (l = 0; l < state->level; ++l) {
		ret = parity_close(&parity_handle[l]);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_fatal("DANGER! Unexpected close error in %s disk.\n", lev_name(l));
			++unrecoverable_error;
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

	/* abort if required */
	if (unrecoverable_error != 0)
		return -1;
	return 0;
}
1615
1616