1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2021 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Http\RequestHandlers;
21
22use Exception;
23use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
24use Fisharebest\Webtrees\Functions\FunctionsImport;
25use Fisharebest\Webtrees\Gedcom;
26use Fisharebest\Webtrees\Http\ViewResponseTrait;
27use Fisharebest\Webtrees\I18N;
28use Fisharebest\Webtrees\Services\TimeoutService;
29use Fisharebest\Webtrees\Services\TreeService;
30use Fisharebest\Webtrees\Tree;
31use Illuminate\Database\Capsule\Manager as DB;
32use Illuminate\Database\DetectsDeadlocks;
33use Illuminate\Database\Query\Expression;
34use Illuminate\Support\Str;
35use PDOException;
36use Psr\Http\Message\ResponseInterface;
37use Psr\Http\Message\ServerRequestInterface;
38use Psr\Http\Server\RequestHandlerInterface;
39
40use function assert;
41use function preg_match;
42use function preg_split;
43use function response;
44use function str_replace;
45use function str_starts_with;
46use function strlen;
47use function strtoupper;
48use function substr;
49use function trim;
50use function view;
51
/**
 * Load a chunk of GEDCOM data.
 *
 * Each call to handle() imports pending rows of the "gedcom_chunk" table for
 * one tree, one row at a time, until the TimeoutService reports that the time
 * limit is up. It then renders a progress view (or a completion/failure view).
 */
class GedcomLoad implements RequestHandlerInterface
{
    use ViewResponseTrait;
    use DetectsDeadlocks;

    /**
     * GEDCOM character-set names (upper-case) and the MySQL character set used to decode them.
     *
     * MySQL supports a wide range of collation conversions. These are ones that
     * have been encountered "in the wild". Any name not listed here (including
     * ANSEL) is reported as unsupported.
     *
     * The values are interpolated into a raw SQL expression, so they must only
     * ever be trusted literals - never user input.
     */
    private const MYSQL_CHARSETS = [
        'ASCII'       => 'ascii',
        // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
        // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
        'IBMPC'       => 'cp850',
        'IBM WINDOWS' => 'cp850',
        'MS-DOS'      => 'cp850',
        'CP437'       => 'cp850',
        'CP850'       => 'cp850',
        // ANSI could be anything. Most applications seem to treat it as latin1.
        'ANSI'        => 'latin1',
        'WINDOWS'     => 'latin1',
        'CP1252'      => 'latin1',
        'ISO8859-1'   => 'latin1',
        'ISO-8859-1'  => 'latin1',
        'LATIN1'      => 'latin1',
        'LATIN-1'     => 'latin1',
        // Windows-1250 is NOT the same as ISO-8859-2: letters such as S/Z with caron
        // live at different byte values, so it needs its own MySQL character set.
        'CP1250'      => 'cp1250',
        // ISO-8859-2 (eastern european).
        'ISO8859-2'   => 'latin2',
        'ISO-8859-2'  => 'latin2',
        'LATIN2'      => 'latin2',
        'LATIN-2'     => 'latin2',
        // MAC Roman.
        'MACINTOSH'   => 'macroman',
        // Already UTF-8, so no conversion is performed for these.
        'UTF8'        => 'utf8',
        'UTF-8'       => 'utf8',
    ];

    /** @var TimeoutService */
    private $timeout_service;

    /** @var TreeService */
    private $tree_service;

    /**
     * GedcomLoad constructor.
     *
     * @param TimeoutService $timeout_service Decides when this request has run for long enough.
     * @param TreeService    $tree_service    Used to delete the old genealogy data before the first chunk is imported.
     */
    public function __construct(TimeoutService $timeout_service, TreeService $tree_service)
    {
        $this->timeout_service = $timeout_service;
        $this->tree_service    = $tree_service;
    }

    /**
     * Import as many pending chunks as time allows, and report the result.
     *
     * @param ServerRequestInterface $request Must carry a Tree in its "tree" attribute.
     *
     * @return ResponseInterface The "import-complete", "import-progress" or "import-fail" view.
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $this->layout = 'layouts/ajax';

        $tree = $request->getAttribute('tree');
        assert($tree instanceof Tree);

        try {
            // What is the current import status?
            $import_offset = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->where('imported', '=', '1')
                ->count();

            $import_total = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->count();

            // Finished? (This also covers a tree with no chunks, so the division below is safe.)
            if ($import_offset === $import_total) {
                $tree->setPreference('imported', '1');

                $html = view('admin/import-complete', ['tree' => $tree]);

                return response($html);
            }

            // Calculate progress so far. This is the state at the start of this request.
            $progress = $import_offset / $import_total;

            $first_time = ($import_offset === 0);

            // Collect up any errors, and show them later.
            $errors = '';

            // Run for a short period of time. This keeps the resource requirements low.
            do {
                $data = DB::table('gedcom_chunk')
                    ->where('gedcom_id', '=', $tree->id())
                    ->where('imported', '=', '0')
                    ->orderBy('gedcom_chunk_id')
                    ->select(['gedcom_chunk_id', 'chunk_data'])
                    ->first();

                if ($data === null) {
                    break;
                }

                // Mark the chunk as imported.  This will create a row-lock, to prevent other
                // processes from reading it until we have finished.
                $n = DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->where('imported', '=', '0')
                    ->update(['imported' => 1]);

                // Another process has already imported this data?
                if ($n === 0) {
                    break;
                }

                // If we are loading the first (header) record, then delete old data and convert to UTF-8.
                if ($first_time) {
                    $this->tree_service->deleteGenealogyData($tree, (bool) $tree->getPreference('keep_media'));

                    // Remove any byte-order-mark
                    if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
                        $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
                        // Put it back in the database, so we can do character conversion
                        DB::table('gedcom_chunk')
                            ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                            ->update(['chunk_data' => $data->chunk_data]);
                    }

                    if (!str_starts_with($data->chunk_data, '0 HEAD')) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Invalid GEDCOM file - no header record found.'),
                            'tree'  => $tree,
                        ]);
                    }

                    // What character set is this? Need to convert it to UTF8
                    $charset = $this->detectCharset($data->chunk_data);

                    if (!$this->convertChunksToUtf8($tree, $charset)) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset),
                            'tree'  => $tree,
                        ]);
                    }

                    $first_time = false;

                    // Re-fetch the data, now that we have performed character set conversion.
                    $data = DB::table('gedcom_chunk')
                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                        ->select(['gedcom_chunk_id', 'chunk_data'])
                        ->first();
                }

                // Treat every carriage-return as a line-feed, so that records can be split on "\n".
                $data->chunk_data = str_replace("\r", "\n", $data->chunk_data);

                // Import all the records in this chunk of data.
                // A new record starts at each line that begins with "0".
                foreach (preg_split('/\n+(?=0)/', $data->chunk_data) as $rec) {
                    try {
                        FunctionsImport::importRecord($rec, $tree, false);
                    } catch (GedcomErrorException $exception) {
                        $errors .= $exception->getMessage();
                    }
                }

                // Do not need the data any more.
                DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->update(['chunk_data' => '']);
            } while (!$this->timeout_service->isTimeLimitUp());

            return $this->viewResponse('admin/import-progress', [
                'errors'   => $errors,
                'progress' => $progress,
                'tree'     => $tree,
            ]);
        } catch (Exception $ex) {
            // NOTE(review): no transaction is started in this class - presumably one is
            // opened by the caller (e.g. middleware) around the request. Confirm.
            DB::connection()->rollBack();

            // Deadlock? Try again.
            if ($this->causedByDeadlock($ex)) {
                return $this->viewResponse('admin/import-progress', [
                    'errors'   => '',
                    // $progress is unset if the exception happened before it was calculated.
                    'progress' => $progress ?? 0.0,
                    'tree'     => $tree,
                ]);
            }

            return $this->viewResponse('admin/import-fail', [
                'error' => $ex->getMessage(),
                'tree'  => $tree,
            ]);
        }
    }

    /**
     * Read the character set declared by a "1 CHAR" (or "1 CHARACTER") line.
     *
     * @param string $header The chunk of GEDCOM data containing the header record.
     *
     * @return string The declared name, trimmed and upper-cased, or "ASCII" when there is no such line.
     */
    private function detectCharset(string $header): string
    {
        if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $header, $match)) {
            return strtoupper(trim($match[1]));
        }

        return 'ASCII';
    }

    /**
     * Convert every chunk of a tree from the given GEDCOM character set to UTF-8.
     *
     * @param Tree   $tree    The tree whose chunks are converted.
     * @param string $charset An upper-case GEDCOM character-set name.
     *
     * @return bool False if the character set is not supported (nothing is changed), true otherwise.
     */
    private function convertChunksToUtf8(Tree $tree, string $charset): bool
    {
        $mysql_charset = self::MYSQL_CHARSETS[$charset] ?? null;

        if ($mysql_charset === null) {
            return false;
        }

        // Already UTF-8 so nothing to do!
        if ($mysql_charset !== 'utf8') {
            // $mysql_charset comes from the constant table above, so it is safe to embed in raw SQL.
            $conversion = new Expression('CONVERT(CONVERT(chunk_data USING ' . $mysql_charset . ') USING utf8)');

            DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->update(['chunk_data' => $conversion]);
        }

        return true;
    }
}
277