1<?php
2
3/**
4 * FINE granularity DIFF
5 *
6 * Computes a set of instructions to convert the content of
7 * one string into another.
8 *
9 * Originally created by Raymond Hill (https://github.com/gorhill/PHP-FineDiff), brought up
10 * to date by Cog Powered (https://github.com/cogpowered/FineDiff).
11 *
12 * @copyright Copyright 2011 (c) Raymond Hill (http://raymondhill.net/blog/?p=441)
13 * @copyright Copyright 2013 (c) Robert Crowe (http://cogpowered.com)
14 * @link https://github.com/cogpowered/FineDiff
15 * @version 0.0.1
16 * @license MIT License (http://www.opensource.org/licenses/mit-license.php)
17 */
18
19namespace cogpowered\FineDiff\Parser;
20
21use cogpowered\FineDiff\Granularity\GranularityInterface;
22use cogpowered\FineDiff\Exceptions\GranularityCountException;
23use cogpowered\FineDiff\Parser\Operations\Copy;
24use cogpowered\FineDiff\Parser\Operations\Delete;
25use cogpowered\FineDiff\Parser\Operations\Insert;
26use cogpowered\FineDiff\Parser\Operations\Replace;
27
28/**
29 * Generates a set of instructions to convert one string to another.
30 */
31class Parser implements ParserInterface
32{
33    /**
34     * @var cogpowered\FineDiff\GranularityInterface
35     */
36    protected $granularity;
37
38    /**
39     * @var cogpowered\FineDiff\Parser\OpcodesInterface
40     */
41    protected $opcodes;
42
43    /**
44     * @var string Text we are comparing against.
45     */
46    protected $from_text;
47
48    /**
49     * @var int Position of the $from_text we are at.
50     */
51    protected $from_offset = 0;
52
53    /**
54     * @var cogpowered\FineDiff\Operations\OperationInterface
55     */
56    protected $last_edit;
57
58    /**
59     * @var int Current position in the granularity array.
60     */
61    protected $stackpointer = 0;
62
63    /**
64     * @var array Holds the individual opcodes as the diff takes place.
65     */
66    protected $edits = array();
67
68    /**
69     * @inheritdoc
70     */
71    public function __construct(GranularityInterface $granularity)
72    {
73        $this->granularity = $granularity;
74
75        // Set default opcodes generator
76        $this->opcodes = new Opcodes;
77    }
78
79    /**
80     * @inheritdoc
81     */
82    public function getGranularity()
83    {
84        return $this->granularity;
85    }
86
87    /**
88     * @inheritdoc
89     */
90    public function setGranularity(GranularityInterface $granularity)
91    {
92        $this->granularity = $granularity;
93    }
94
95    /**
96     * @inheritdoc
97     */
98    public function getOpcodes()
99    {
100        return $this->opcodes;
101    }
102
103    /**
104     * @inheritdoc
105     */
106    public function setOpcodes(OpcodesInterface $opcodes)
107    {
108        $this->opcodes = $opcodes;
109    }
110
111    /**
112     * @inheritdoc
113     */
114    public function parse($from_text, $to_text)
115    {
116        // Ensure the granularity contains some delimiters
117        if (count($this->granularity) === 0) {
118            throw new GranularityCountException('Granularity contains no delimiters');
119        }
120
121        // Reset internal parser properties
122        $this->from_text    = $from_text;
123        $this->from_offset  = 0;
124        $this->last_edit    = null;
125        $this->stackpointer = 0;
126        $this->edits        = array();
127
128        // Parse the two string
129        $this->process($from_text, $to_text);
130
131        // Return processed diff
132        $this->opcodes->setOpcodes($this->edits);
133        return $this->opcodes;
134    }
135
136    /**
137     * Actually kicks off the processing. Recursive function.
138     *
139     * @param string $from_text
140     * @param string $to_text
141     * @return void
142     */
143    protected function process($from_text, $to_text)
144    {
145        // Lets get parsing
146        $delimiters     = $this->granularity[$this->stackpointer++];
147        $has_next_stage = $this->stackpointer < count($this->granularity);
148
149        // Actually perform diff
150        $diff = $this->diff($from_text, $to_text, $delimiters);
151        $diff = (is_array($diff)) ? $diff : array();
152
153        foreach ($diff as $fragment) {
154
155            // increase granularity
156            if ($fragment instanceof Replace && $has_next_stage) {
157                $this->process(
158                    substr($this->from_text, $this->from_offset, $fragment->getFromLen()),
159                    $fragment->getText()
160                );
161            }
162            // fuse copy ops whenever possible
163            elseif ($fragment instanceof Copy && $this->last_edit instanceof Copy) {
164                $this->edits[count($this->edits)-1]->increase($fragment->getFromLen());
165                $this->from_offset += $fragment->getFromLen();
166            }
167            else {
168                /* $fragment instanceof Copy */
169                /* $fragment instanceof Delete */
170                /* $fragment instanceof Insert */
171                $this->edits[] = $this->last_edit = $fragment;
172                $this->from_offset += $fragment->getFromLen();
173            }
174        }
175
176        $this->stackpointer--;
177    }
178
179    /**
180     * Core parsing function.
181     *
182     * @param string $from_text
183     * @param string $to_text
184     * @param string $delimiters Delimiter to use for this parse.
185     * @return array
186     */
187    protected function diff($from_text, $to_text, $delimiters)
188    {
189        // Empty delimiter means character-level diffing.
190        // In such case, use code path optimized for character-level diffing.
191        if (empty($delimiters)) {
192            return $this->charDiff($from_text, $to_text);
193        }
194
195        $result = array();
196
197        // fragment-level diffing
198        $from_text_len  = strlen($from_text);
199        $to_text_len    = strlen($to_text);
200        $from_fragments = $this->extractFragments($from_text, $delimiters);
201        $to_fragments   = $this->extractFragments($to_text, $delimiters);
202
203        $jobs              = array(array(0, $from_text_len, 0, $to_text_len));
204        $cached_array_keys = array();
205
206
207        while ($job = array_pop($jobs)) {
208
209            // get the segments which must be diff'ed
210            list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job;
211
212            // catch easy cases first
213            $from_segment_length = $from_segment_end - $from_segment_start;
214            $to_segment_length   = $to_segment_end - $to_segment_start;
215
216            if (!$from_segment_length || !$to_segment_length ) {
217
218                if ( $from_segment_length ) {
219                    $result[$from_segment_start * 4] = new Delete($from_segment_length);
220                } else if ( $to_segment_length ) {
221                    $result[$from_segment_start * 4 + 1] = new Insert(substr($to_text, $to_segment_start, $to_segment_length));
222                }
223
224                continue;
225            }
226
227            // find longest copy operation for the current segments
228            $best_copy_length = 0;
229
230            $from_base_fragment_index = $from_segment_start;
231            $cached_array_keys_for_current_segment = array();
232
233            while ( $from_base_fragment_index < $from_segment_end ) {
234
235                $from_base_fragment        = $from_fragments[$from_base_fragment_index];
236                $from_base_fragment_length = strlen($from_base_fragment);
237
238                // performance boost: cache array keys
239                if (!isset($cached_array_keys_for_current_segment[$from_base_fragment])) {
240
241                    if ( !isset($cached_array_keys[$from_base_fragment]) ) {
242                        $to_all_fragment_indices = $cached_array_keys[$from_base_fragment] = array_keys($to_fragments, $from_base_fragment, true);
243                    }
244                    else {
245                        $to_all_fragment_indices = $cached_array_keys[$from_base_fragment];
246                    }
247
248                    // get only indices which falls within current segment
249                    if ($to_segment_start > 0 || $to_segment_end < $to_text_len) {
250
251                        $to_fragment_indices = array();
252
253                        foreach ($to_all_fragment_indices as $to_fragment_index) {
254
255                            if ($to_fragment_index < $to_segment_start) {
256                                continue;
257                            }
258
259                            if ($to_fragment_index >= $to_segment_end) {
260                                break;
261                            }
262
263                            $to_fragment_indices[] = $to_fragment_index;
264                        }
265
266                        $cached_array_keys_for_current_segment[$from_base_fragment] = $to_fragment_indices;
267
268                    } else {
269                        $to_fragment_indices = $to_all_fragment_indices;
270                    }
271
272                } else {
273                    $to_fragment_indices = $cached_array_keys_for_current_segment[$from_base_fragment];
274                }
275
276                // iterate through collected indices
277                foreach ($to_fragment_indices as $to_base_fragment_index) {
278
279                    $fragment_index_offset = $from_base_fragment_length;
280
281                    // iterate until no more match
282                    for (;;) {
283
284                        $fragment_from_index = $from_base_fragment_index + $fragment_index_offset;
285
286                        if ($fragment_from_index >= $from_segment_end) {
287                            break;
288                        }
289
290                        $fragment_to_index = $to_base_fragment_index + $fragment_index_offset;
291
292                        if ($fragment_to_index >= $to_segment_end) {
293                            break;
294                        }
295
296                        if ($from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index]) {
297                            break;
298                        }
299
300                        $fragment_length = strlen($from_fragments[$fragment_from_index]);
301                        $fragment_index_offset += $fragment_length;
302                    }
303
304                    if ($fragment_index_offset > $best_copy_length) {
305                        $best_copy_length = $fragment_index_offset;
306                        $best_from_start  = $from_base_fragment_index;
307                        $best_to_start    = $to_base_fragment_index;
308                    }
309                }
310
311                $from_base_fragment_index += strlen($from_base_fragment);
312
313                // If match is larger than half segment size, no point trying to find better
314                // TODO: Really?
315                if ($best_copy_length >= $from_segment_length / 2) {
316                    break;
317                }
318
319                // no point to keep looking if what is left is less than
320                // current best match
321                if ( $from_base_fragment_index + $best_copy_length >= $from_segment_end ) {
322                    break;
323                }
324            }
325
326            if ($best_copy_length) {
327                $jobs[] = array($from_segment_start, $best_from_start, $to_segment_start, $best_to_start);
328                $result[$best_from_start * 4 + 2] = new Copy($best_copy_length);
329                $jobs[] = array($best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end);
330            } else {
331                $result[$from_segment_start * 4 ] = new Replace($from_segment_length, substr($to_text, $to_segment_start, $to_segment_length));
332            }
333        }
334
335        ksort($result, SORT_NUMERIC);
336        return array_values($result);
337    }
338
339    /**
340     * Same as Parser::diff but tuned for character level granularity.
341     *
342     * @param string $from_text
343     * @param string $to_text
344     * @return array
345     */
346    protected function charDiff($from_text, $to_text)
347    {
348        $result = array();
349        $jobs   = array(array(0, strlen($from_text), 0, strlen($to_text)));
350
351        while ($job = array_pop($jobs)) {
352
353            // get the segments which must be diff'ed
354            list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job;
355
356            $from_segment_len = $from_segment_end - $from_segment_start;
357            $to_segment_len   = $to_segment_end - $to_segment_start;
358
359            // catch easy cases first
360            if (!$from_segment_len || !$to_segment_len) {
361
362                if ($from_segment_len) {
363                    $result[$from_segment_start * 4 + 0] = new Delete($from_segment_len);
364                } else if ( $to_segment_len ) {
365                    $result[$from_segment_start * 4 + 1] = new Insert(substr($to_text, $to_segment_start, $to_segment_len));
366                }
367
368                continue;
369            }
370
371            if ($from_segment_len >= $to_segment_len) {
372
373                $copy_len = $to_segment_len;
374
375                while ($copy_len) {
376
377                    $to_copy_start     = $to_segment_start;
378                    $to_copy_start_max = $to_segment_end - $copy_len;
379
380                    while ($to_copy_start <= $to_copy_start_max) {
381
382                        $from_copy_start = strpos(substr($from_text, $from_segment_start, $from_segment_len), substr($to_text, $to_copy_start, $copy_len));
383
384                        if ($from_copy_start !== false) {
385                            $from_copy_start += $from_segment_start;
386                            break 2;
387                        }
388
389                        $to_copy_start++;
390                    }
391
392                    $copy_len--;
393                }
394            } else {
395
396                $copy_len = $from_segment_len;
397
398                while ($copy_len) {
399
400                    $from_copy_start     = $from_segment_start;
401                    $from_copy_start_max = $from_segment_end - $copy_len;
402
403                    while ($from_copy_start <= $from_copy_start_max) {
404
405                        $to_copy_start = strpos(substr($to_text, $to_segment_start, $to_segment_len), substr($from_text, $from_copy_start, $copy_len));
406
407                        if ($to_copy_start !== false) {
408                            $to_copy_start += $to_segment_start;
409                            break 2;
410                        }
411
412                        $from_copy_start++;
413                    }
414
415                    $copy_len--;
416                }
417            }
418
419            // match found
420            if ( $copy_len ) {
421                $jobs[] = array($from_segment_start, $from_copy_start, $to_segment_start, $to_copy_start);
422                $result[$from_copy_start * 4 + 2] = new Copy($copy_len);
423                $jobs[] = array($from_copy_start + $copy_len, $from_segment_end, $to_copy_start + $copy_len, $to_segment_end);
424            }
425            // no match,  so delete all, insert all
426            else {
427                $result[$from_segment_start * 4] = new Replace($from_segment_len, substr($to_text, $to_segment_start, $to_segment_len));
428            }
429        }
430
431        ksort($result, SORT_NUMERIC);
432        return array_values($result);
433    }
434
435    /**
436    * Efficiently fragment the text into an array according to specified delimiters.
437    *
438    * No delimiters means fragment into single character. The array indices are the offset of the fragments into
439    * the input string. A sentinel empty fragment is always added at the end.
440    * Careful: No check is performed as to the validity of the delimiters.
441    *
442    * @param string $text
443    * @param string $delimiters
444    * @param array
445    */
446    protected function extractFragments($text, $delimiters)
447    {
448        // special case: split into characters
449        if (empty($delimiters)) {
450            $chars                = str_split($text, 1);
451            $chars[strlen($text)] = '';
452
453            return $chars;
454        }
455
456        $fragments = array();
457        $start     = 0;
458        $end       = 0;
459
460        for (;;) {
461
462            $end += strcspn($text, $delimiters, $end);
463            $end += strspn($text, $delimiters, $end);
464
465            if ($end === $start) {
466                break;
467            }
468
469            $fragments[$start] = substr($text, $start, $end - $start);
470            $start             = $end;
471        }
472
473        $fragments[$start] = '';
474
475        return $fragments;
476    }
477}