1<?php
2
3declare(strict_types=1);
4
5/*
6 * FINE granularity DIFF
7 *
8 * (c) 2011 Raymond Hill (http://raymondhill.net/blog/?p=441)
9 * (c) 2013 Robert Crowe (http://cogpowered.com)
10 * (c) 2021 Christian Kuhn
11 *
12 * For the full copyright and license information, please view
13 * the LICENSE file that was distributed with this source code.
14 */
15
16namespace cogpowered\FineDiff\Parser;
17
18use cogpowered\FineDiff\Exceptions\GranularityCountException;
19use cogpowered\FineDiff\Exceptions\OperationException;
20use cogpowered\FineDiff\Granularity\GranularityInterface;
21use cogpowered\FineDiff\Parser\Operations\Copy;
22use cogpowered\FineDiff\Parser\Operations\Delete;
23use cogpowered\FineDiff\Parser\Operations\Insert;
24use cogpowered\FineDiff\Parser\Operations\OperationInterface;
25use cogpowered\FineDiff\Parser\Operations\Replace;
26
27/**
28 * Generates a set of instructions to convert one string to another.
29 */
30class Parser implements ParserInterface
31{
32    /**
33     * @var GranularityInterface
34     */
35    protected $granularity;
36
37    /**
38     * @var OpcodesInterface
39     */
40    protected $opcodes;
41
42    /**
43     * @var string Text we are comparing against.
44     */
45    protected $from_text;
46
47    /**
48     * @var int Position of the $from_text we are at.
49     */
50    protected $from_offset = 0;
51
52    /**
53     * @var OperationInterface|null
54     */
55    protected $last_edit;
56
57    /**
58     * @var int Current position in the granularity array.
59     */
60    protected $stackpointer = 0;
61
62    /**
63     * @var array<int, OperationInterface> Holds the individual opcodes as the diff takes place.
64     */
65    protected $edits = [];
66
67    /**
68     * @inheritdoc
69     */
70    public function __construct(GranularityInterface $granularity)
71    {
72        $this->granularity = $granularity;
73
74        // Set default opcodes generator
75        $this->opcodes = new Opcodes();
76    }
77
78    /**
79     * @inheritdoc
80     */
81    public function getGranularity()
82    {
83        return $this->granularity;
84    }
85
86    /**
87     * @inheritdoc
88     */
89    public function setGranularity(GranularityInterface $granularity): void
90    {
91        $this->granularity = $granularity;
92    }
93
94    /**
95     * @inheritdoc
96     */
97    public function getOpcodes(): OpcodesInterface
98    {
99        return $this->opcodes;
100    }
101
102    /**
103     * @inheritdoc
104     */
105    public function setOpcodes(OpcodesInterface $opcodes): void
106    {
107        $this->opcodes = $opcodes;
108    }
109
110    /**
111     * @inheritdoc
112     * @throws GranularityCountException|OperationException
113     */
114    public function parse(string $from_text, string $to_text): OpcodesInterface
115    {
116        // Ensure the granularity contains some delimiters
117        if (count($this->granularity) === 0) {
118            throw new GranularityCountException('Granularity contains no delimiters');
119        }
120
121        // Reset internal parser properties
122        $this->from_text    = $from_text;
123        $this->from_offset  = 0;
124        $this->last_edit    = null;
125        $this->stackpointer = 0;
126        $this->edits        = [];
127
128        // Parse the two string
129        $this->process($from_text, $to_text);
130
131        // Return processed diff
132        $this->opcodes->setOpcodes($this->edits);
133        return $this->opcodes;
134    }
135
136    /**
137     * Actually kicks off the processing. Recursive function.
138     */
139    protected function process(string $from_text, string $to_text): void
140    {
141        // Lets get parsing
142        $delimiters     = $this->granularity[$this->stackpointer++] ?? '';
143        $has_next_stage = $this->stackpointer < count($this->granularity);
144
145        // Actually perform diff
146        $diff = $this->diff($from_text, $to_text, $delimiters);
147
148        foreach ($diff as $fragment) {
149            // increase granularity
150            if ($fragment instanceof Replace && $has_next_stage) {
151                $this->process(
152                    mb_substr($this->from_text, $this->from_offset, $fragment->getFromLen()),
153                    $fragment->getText()
154                );
155            } elseif ($fragment instanceof Copy && $this->last_edit instanceof Copy) {
156                // fuse copy ops whenever possible
157                /** @var Copy $copyOperation */
158                $copyOperation = $this->edits[count($this->edits)-1];
159                $copyOperation->increase($fragment->getFromLen());
160                $this->from_offset += $fragment->getFromLen();
161            } else {
162                /* $fragment instanceof Copy */
163                /* $fragment instanceof Delete */
164                /* $fragment instanceof Insert */
165                $this->edits[] = $this->last_edit = $fragment;
166                $this->from_offset += $fragment->getFromLen();
167            }
168        }
169
170        $this->stackpointer--;
171    }
172
173    /**
174     * Core parsing function.
175     *
176     * @return array<int, OperationInterface>
177     */
178    protected function diff(string $from_text, string $to_text, string $delimiters): array
179    {
180        // Empty delimiter means character-level diffing.
181        // In such case, use code path optimized for character-level diffing.
182        if (empty($delimiters)) {
183            return $this->charDiff($from_text, $to_text);
184        }
185
186        $result = [];
187
188        // fragment-level diffing
189        $from_text_len  = mb_strlen($from_text);
190        $to_text_len    = mb_strlen($to_text);
191        $from_fragments = $this->extractFragments($from_text, $delimiters);
192        $to_fragments   = $this->extractFragments($to_text, $delimiters);
193
194        $jobs              = [[0, $from_text_len, 0, $to_text_len]];
195        $cached_array_keys = [];
196        $best_from_start = 0;
197        $best_to_start = 0;
198
199        while ($job = array_pop($jobs)) {
200
201            // get the segments which must be diff'ed
202            list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job;
203
204            // catch easy cases first
205            $from_segment_length = $from_segment_end - $from_segment_start;
206            $to_segment_length   = $to_segment_end - $to_segment_start;
207
208            if (!$from_segment_length || !$to_segment_length) {
209                if ($from_segment_length) {
210                    $result[$from_segment_start * 4] = new Delete($from_segment_length);
211                } elseif ($to_segment_length) {
212                    $result[$from_segment_start * 4 + 1] = new Insert(mb_substr($to_text, $to_segment_start, $to_segment_length));
213                }
214
215                continue;
216            }
217
218            // find longest copy operation for the current segments
219            $best_copy_length = 0;
220
221            $from_base_fragment_index = $from_segment_start;
222            $cached_array_keys_for_current_segment = [];
223
224            while ($from_base_fragment_index < $from_segment_end) {
225                $from_base_fragment        = $from_fragments[$from_base_fragment_index];
226                $from_base_fragment_length = mb_strlen($from_base_fragment);
227
228                // performance boost: cache array keys
229                if (!isset($cached_array_keys_for_current_segment[$from_base_fragment])) {
230                    if (!isset($cached_array_keys[$from_base_fragment])) {
231                        $to_all_fragment_indices = $cached_array_keys[$from_base_fragment] = array_keys($to_fragments, $from_base_fragment, true);
232                    } else {
233                        $to_all_fragment_indices = $cached_array_keys[$from_base_fragment];
234                    }
235
236                    // get only indices which falls within current segment
237                    if ($to_segment_start > 0 || $to_segment_end < $to_text_len) {
238                        $to_fragment_indices = [];
239
240                        foreach ($to_all_fragment_indices as $to_fragment_index) {
241                            if ($to_fragment_index < $to_segment_start) {
242                                continue;
243                            }
244
245                            if ($to_fragment_index >= $to_segment_end) {
246                                break;
247                            }
248
249                            $to_fragment_indices[] = $to_fragment_index;
250                        }
251
252                        $cached_array_keys_for_current_segment[$from_base_fragment] = $to_fragment_indices;
253                    } else {
254                        $to_fragment_indices = $to_all_fragment_indices;
255                    }
256                } else {
257                    $to_fragment_indices = $cached_array_keys_for_current_segment[$from_base_fragment];
258                }
259
260                // iterate through collected indices
261                foreach ($to_fragment_indices as $to_base_fragment_index) {
262                    $fragment_index_offset = $from_base_fragment_length;
263
264                    // iterate until no more match
265                    for (;;) {
266                        $fragment_from_index = $from_base_fragment_index + $fragment_index_offset;
267
268                        if ($fragment_from_index >= $from_segment_end) {
269                            break;
270                        }
271
272                        $fragment_to_index = $to_base_fragment_index + $fragment_index_offset;
273
274                        if ($fragment_to_index >= $to_segment_end) {
275                            break;
276                        }
277
278                        if ($from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index]) {
279                            break;
280                        }
281
282                        $fragment_length = mb_strlen($from_fragments[$fragment_from_index]);
283                        $fragment_index_offset += $fragment_length;
284                    }
285
286                    if ($fragment_index_offset > $best_copy_length) {
287                        $best_copy_length = $fragment_index_offset;
288                        $best_from_start  = $from_base_fragment_index;
289                        $best_to_start    = $to_base_fragment_index;
290                    }
291                }
292
293                $from_base_fragment_index += mb_strlen($from_base_fragment);
294
295                // If match is larger than half segment size, no point trying to find better
296                // @todo: Really?
297                if ($best_copy_length >= $from_segment_length / 2) {
298                    break;
299                }
300
301                // no point to keep looking if what is left is less than
302                // current best match
303                if ($from_base_fragment_index + $best_copy_length >= $from_segment_end) {
304                    break;
305                }
306            }
307
308            if ($best_copy_length) {
309                $jobs[] = [$from_segment_start, $best_from_start, $to_segment_start, $best_to_start];
310                $result[$best_from_start * 4 + 2] = new Copy($best_copy_length);
311                $jobs[] = [$best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end];
312            } else {
313                $result[$from_segment_start * 4 ] = new Replace($from_segment_length, mb_substr($to_text, $to_segment_start, $to_segment_length));
314            }
315        }
316
317        ksort($result, SORT_NUMERIC);
318        return array_values($result);
319    }
320
321    /**
322     * Same as Parser::diff but tuned for character level granularity.
323     *
324     * @return array<int, OperationInterface>
325     */
326    protected function charDiff(string $from_text, string $to_text): array
327    {
328        $result = [];
329        $jobs   = [[0, mb_strlen($from_text), 0, mb_strlen($to_text)]];
330        $from_copy_start = 0;
331        $to_copy_start = 0;
332
333        while ($job = array_pop($jobs)) {
334
335            // get the segments which must be diff'ed
336            list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job;
337
338            $from_segment_len = $from_segment_end - $from_segment_start;
339            $to_segment_len   = $to_segment_end - $to_segment_start;
340
341            // catch easy cases first
342            if (!$from_segment_len || !$to_segment_len) {
343                if ($from_segment_len) {
344                    $result[$from_segment_start * 4 + 0] = new Delete($from_segment_len);
345                } elseif ($to_segment_len) {
346                    $result[$from_segment_start * 4 + 1] = new Insert(mb_substr($to_text, $to_segment_start, $to_segment_len));
347                }
348
349                continue;
350            }
351
352            if ($from_segment_len >= $to_segment_len) {
353                $copy_len = $to_segment_len;
354
355                while ($copy_len) {
356                    $to_copy_start     = $to_segment_start;
357                    $to_copy_start_max = $to_segment_end - $copy_len;
358
359                    while ($to_copy_start <= $to_copy_start_max) {
360                        $from_copy_start = mb_strpos(mb_substr($from_text, $from_segment_start, $from_segment_len), mb_substr($to_text, $to_copy_start, $copy_len));
361
362                        if ($from_copy_start !== false) {
363                            $from_copy_start += $from_segment_start;
364                            break 2;
365                        }
366
367                        $to_copy_start++;
368                    }
369
370                    $copy_len--;
371                }
372            } else {
373                $copy_len = $from_segment_len;
374
375                while ($copy_len) {
376                    $from_copy_start     = $from_segment_start;
377                    $from_copy_start_max = $from_segment_end - $copy_len;
378
379                    while ($from_copy_start <= $from_copy_start_max) {
380                        $to_copy_start = mb_strpos(mb_substr($to_text, $to_segment_start, $to_segment_len), mb_substr($from_text, $from_copy_start, $copy_len));
381
382                        if ($to_copy_start !== false) {
383                            $to_copy_start += $to_segment_start;
384                            break 2;
385                        }
386
387                        $from_copy_start++;
388                    }
389
390                    $copy_len--;
391                }
392            }
393
394            // match found
395            if ($copy_len) {
396                $jobs[] = [$from_segment_start, $from_copy_start, $to_segment_start, $to_copy_start];
397                $result[$from_copy_start * 4 + 2] = new Copy($copy_len);
398                $jobs[] = [$from_copy_start + $copy_len, $from_segment_end, $to_copy_start + $copy_len, $to_segment_end];
399            }
400            // no match,  so delete all, insert all
401            else {
402                $result[$from_segment_start * 4] = new Replace($from_segment_len, mb_substr($to_text, $to_segment_start, $to_segment_len));
403            }
404        }
405
406        ksort($result, SORT_NUMERIC);
407        return array_values($result);
408    }
409
410    /**
411    * Efficiently fragment the text into an array according to specified delimiters.
412    *
413    * The array indices are the offset of the fragments into
414    * the input string. A sentinel empty fragment is always added at the end.
415    * Careful: No check is performed as to the validity of the delimiters.
416     *
417     * @return array<int, string>
418    */
419    protected function extractFragments(string $text, string $delimiters)
420    {
421        $fragments = [];
422        $start     = 0;
423        $end       = 0;
424        for (;;) {
425            $end += strcspn($text, $delimiters, $end);
426            $end += strspn($text, $delimiters, $end);
427            if ($end === $start) {
428                break;
429            }
430            $fragments[$start] = mb_substr($text, $start, $end - $start);
431            $start             = $end;
432        }
433        $fragments[$start] = '';
434        return $fragments;
435    }
436}
437