1 /*
2 * lcs.c : routines for creating an lcs
3 *
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
12 *
13 * http://www.apache.org/licenses/LICENSE-2.0
14 *
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
20 * under the License.
21 * ====================================================================
22 */
23
24
25 #include <apr.h>
26 #include <apr_pools.h>
27 #include <apr_general.h>
28
29 #include "diff.h"
30
31
32 /*
33 * Calculate the Longest Common Subsequence (LCS) between two datasources.
34 * This function is what makes the diff code tick.
35 *
36 * The LCS algorithm implemented here is based on the approach described
 * by Sun Wu, Udi Manber and Gene Myers in "An O(NP) Sequence Comparison
38 * Algorithm", but has been modified for better performance.
39 *
40 * Let M and N be the lengths (number of tokens) of the two sources
41 * ('files'). The goal is to reach the end of both sources (files) with the
42 * minimum number of insertions + deletions. Since there is a known length
43 * difference N-M between the files, that is equivalent to just the minimum
44 * number of deletions, or equivalently the minimum number of insertions.
45 * For symmetry, we use the lesser number - deletions if M<N, insertions if
46 * M>N.
47 *
48 * Let 'k' be the difference in remaining length between the files, i.e.
49 * if we're at the beginning of both files, k=N-M, whereas k=0 for the
50 * 'end state', at the end of both files. An insertion will increase k by
51 * one, while a deletion decreases k by one. If k<0, then insertions are
52 * 'free' - we need those to reach the end state k=0 anyway - but deletions
53 * are costly: Adding a deletion means that we will have to add an additional
54 * insertion later to reach the end state, so it doesn't matter if we count
55 * deletions or insertions. Similarly, deletions are free for k>0.
56 *
57 * Let a 'state' be a given position in each file {pos1, pos2}. An array
58 * 'fp' keeps track of the best possible state (largest values of
59 * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away
60 * from k=0), as well as a linked list of what matches were used to reach
61 * that state. For each new value of p, we find for each value of k the
62 * best achievable state for that k - either by doing a costly operation
63 * (deletion if k<0) from a state achieved at a lower p, or doing a free
64 * operation (insertion if k<0) from a state achieved at the same p -
65 * and in both cases advancing past any matching regions found. This is
66 * handled by running loops over k in order of descending absolute value.
67 *
68 * A recent improvement of the algorithm is to ignore tokens that are unique
69 * to one file or the other, as those are known from the start to be
70 * impossible to match.
71 */
72
73 typedef struct svn_diff__snake_t svn_diff__snake_t;
74
75 struct svn_diff__snake_t
76 {
77 apr_off_t y;
78 svn_diff__lcs_t *lcs;
79 svn_diff__position_t *position[2];
80 };
81
82 static APR_INLINE void
svn_diff__snake(svn_diff__snake_t * fp_k,svn_diff__token_index_t * token_counts[2],svn_diff__lcs_t ** freelist,apr_pool_t * pool)83 svn_diff__snake(svn_diff__snake_t *fp_k,
84 svn_diff__token_index_t *token_counts[2],
85 svn_diff__lcs_t **freelist,
86 apr_pool_t *pool)
87 {
88 svn_diff__position_t *start_position[2];
89 svn_diff__position_t *position[2];
90 svn_diff__lcs_t *lcs;
91 svn_diff__lcs_t *previous_lcs;
92
93 /* The previous entry at fp[k] is going to be replaced. See if we
94 * can mark that lcs node for reuse, because the sequence up to this
95 * point was a dead end.
96 */
97 lcs = fp_k[0].lcs;
98 while (lcs)
99 {
100 lcs->refcount--;
101 if (lcs->refcount)
102 break;
103
104 previous_lcs = lcs->next;
105 lcs->next = *freelist;
106 *freelist = lcs;
107 lcs = previous_lcs;
108 }
109
110 if (fp_k[-1].y >= fp_k[1].y)
111 {
112 start_position[0] = fp_k[-1].position[0];
113 start_position[1] = fp_k[-1].position[1]->next;
114
115 previous_lcs = fp_k[-1].lcs;
116 }
117 else
118 {
119 start_position[0] = fp_k[1].position[0]->next;
120 start_position[1] = fp_k[1].position[1];
121
122 previous_lcs = fp_k[1].lcs;
123 }
124
125
126 if (previous_lcs)
127 {
128 previous_lcs->refcount++;
129 }
130
131 /* ### Optimization, skip all positions that don't have matchpoints
132 * ### anyway. Beware of the sentinel, don't skip it!
133 */
134
135 position[0] = start_position[0];
136 position[1] = start_position[1];
137
138 while (1)
139 {
140 while (position[0]->token_index == position[1]->token_index)
141 {
142 position[0] = position[0]->next;
143 position[1] = position[1]->next;
144 }
145
146 if (position[1] != start_position[1])
147 {
148 lcs = *freelist;
149 if (lcs)
150 {
151 *freelist = lcs->next;
152 }
153 else
154 {
155 lcs = apr_palloc(pool, sizeof(*lcs));
156 }
157
158 lcs->position[0] = start_position[0];
159 lcs->position[1] = start_position[1];
160 lcs->length = position[1]->offset - start_position[1]->offset;
161 lcs->next = previous_lcs;
162 lcs->refcount = 1;
163 previous_lcs = lcs;
164 start_position[0] = position[0];
165 start_position[1] = position[1];
166 }
167
168 /* Skip any and all tokens that only occur in one of the files */
169 if (position[0]->token_index >= 0
170 && token_counts[1][position[0]->token_index] == 0)
171 start_position[0] = position[0] = position[0]->next;
172 else if (position[1]->token_index >= 0
173 && token_counts[0][position[1]->token_index] == 0)
174 start_position[1] = position[1] = position[1]->next;
175 else
176 break;
177 }
178
179 fp_k[0].lcs = previous_lcs;
180 fp_k[0].position[0] = position[0];
181 fp_k[0].position[1] = position[1];
182
183 fp_k[0].y = position[1]->offset;
184 }
185
186
187 static svn_diff__lcs_t *
svn_diff__lcs_reverse(svn_diff__lcs_t * lcs)188 svn_diff__lcs_reverse(svn_diff__lcs_t *lcs)
189 {
190 svn_diff__lcs_t *next;
191 svn_diff__lcs_t *prev;
192
193 next = NULL;
194 while (lcs != NULL)
195 {
196 prev = lcs->next;
197 lcs->next = next;
198 next = lcs;
199 lcs = prev;
200 }
201
202 return next;
203 }
204
205
206 /* Prepends a new lcs chunk for the amount of LINES at the given positions
207 * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it.
208 * This function assumes LINES > 0. */
209 static svn_diff__lcs_t *
prepend_lcs(svn_diff__lcs_t * lcs,apr_off_t lines,apr_off_t pos0_offset,apr_off_t pos1_offset,apr_pool_t * pool)210 prepend_lcs(svn_diff__lcs_t *lcs, apr_off_t lines,
211 apr_off_t pos0_offset, apr_off_t pos1_offset,
212 apr_pool_t *pool)
213 {
214 svn_diff__lcs_t *new_lcs;
215
216 SVN_ERR_ASSERT_NO_RETURN(lines > 0);
217
218 new_lcs = apr_palloc(pool, sizeof(*new_lcs));
219 new_lcs->position[0] = apr_pcalloc(pool, sizeof(*new_lcs->position[0]));
220 new_lcs->position[0]->offset = pos0_offset;
221 new_lcs->position[1] = apr_pcalloc(pool, sizeof(*new_lcs->position[1]));
222 new_lcs->position[1]->offset = pos1_offset;
223 new_lcs->length = lines;
224 new_lcs->refcount = 1;
225 new_lcs->next = lcs;
226
227 return new_lcs;
228 }
229
230
231 svn_diff__lcs_t *
svn_diff__lcs(svn_diff__position_t * position_list1,svn_diff__position_t * position_list2,svn_diff__token_index_t * token_counts_list1,svn_diff__token_index_t * token_counts_list2,svn_diff__token_index_t num_tokens,apr_off_t prefix_lines,apr_off_t suffix_lines,apr_pool_t * pool)232 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */
233 svn_diff__position_t *position_list2, /* pointer to tail (ring) */
234 svn_diff__token_index_t *token_counts_list1, /* array of counts */
235 svn_diff__token_index_t *token_counts_list2, /* array of counts */
236 svn_diff__token_index_t num_tokens,
237 apr_off_t prefix_lines,
238 apr_off_t suffix_lines,
239 apr_pool_t *pool)
240 {
241 apr_off_t length[2];
242 svn_diff__token_index_t *token_counts[2];
243 svn_diff__token_index_t unique_count[2];
244 svn_diff__token_index_t token_index;
245 svn_diff__snake_t *fp;
246 apr_off_t d;
247 apr_off_t k;
248 apr_off_t p = 0;
249 svn_diff__lcs_t *lcs, *lcs_freelist = NULL;
250
251 svn_diff__position_t sentinel_position[2];
252
253 /* Since EOF is always a sync point we tack on an EOF link
254 * with sentinel positions
255 */
256 lcs = apr_palloc(pool, sizeof(*lcs));
257 lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0]));
258 lcs->position[0]->offset = position_list1
259 ? position_list1->offset + suffix_lines + 1
260 : prefix_lines + suffix_lines + 1;
261 lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1]));
262 lcs->position[1]->offset = position_list2
263 ? position_list2->offset + suffix_lines + 1
264 : prefix_lines + suffix_lines + 1;
265 lcs->length = 0;
266 lcs->refcount = 1;
267 lcs->next = NULL;
268
269 if (position_list1 == NULL || position_list2 == NULL)
270 {
271 if (suffix_lines)
272 lcs = prepend_lcs(lcs, suffix_lines,
273 lcs->position[0]->offset - suffix_lines,
274 lcs->position[1]->offset - suffix_lines,
275 pool);
276 if (prefix_lines)
277 lcs = prepend_lcs(lcs, prefix_lines, 1, 1, pool);
278
279 return lcs;
280 }
281
282 unique_count[1] = unique_count[0] = 0;
283 for (token_index = 0; token_index < num_tokens; token_index++)
284 {
285 if (token_counts_list1[token_index] == 0)
286 unique_count[1] += token_counts_list2[token_index];
287 if (token_counts_list2[token_index] == 0)
288 unique_count[0] += token_counts_list1[token_index];
289 }
290
291 /* Calculate lengths M and N of the sequences to be compared. Do not
292 * count tokens unique to one file, as those are ignored in __snake.
293 */
294 length[0] = position_list1->offset - position_list1->next->offset + 1
295 - unique_count[0];
296 length[1] = position_list2->offset - position_list2->next->offset + 1
297 - unique_count[1];
298
299 /* strikerXXX: here we allocate the furthest point array, which is
300 * strikerXXX: sized M + N + 3 (!)
301 */
302 fp = apr_pcalloc(pool,
303 sizeof(*fp) * (apr_size_t)(length[0] + length[1] + 3));
304
305 /* The origo of fp corresponds to the end state, where we are
306 * at the end of both files. The valid states thus span from
307 * -N (at end of first file and at the beginning of the second
308 * file) to +M (the opposite :). Finally, svn_diff__snake needs
309 * 1 extra slot on each side to work.
310 */
311 fp += length[1] + 1;
312
313 sentinel_position[0].next = position_list1->next;
314 position_list1->next = &sentinel_position[0];
315 sentinel_position[0].offset = position_list1->offset + 1;
316 token_counts[0] = token_counts_list1;
317
318 sentinel_position[1].next = position_list2->next;
319 position_list2->next = &sentinel_position[1];
320 sentinel_position[1].offset = position_list2->offset + 1;
321 token_counts[1] = token_counts_list2;
322
323 /* Negative indices will not be used elsewhere
324 */
325 sentinel_position[0].token_index = -1;
326 sentinel_position[1].token_index = -2;
327
328 /* position d = M - N corresponds to the initial state, where
329 * we are at the beginning of both files.
330 */
331 d = length[0] - length[1];
332
333 /* k = d - 1 will be the first to be used to get previous
334 * position information from, make sure it holds sane
335 * data
336 */
337 fp[d - 1].position[0] = sentinel_position[0].next;
338 fp[d - 1].position[1] = &sentinel_position[1];
339
340 p = 0;
341 do
342 {
343 /* For k < 0, insertions are free */
344 for (k = (d < 0 ? d : 0) - p; k < 0; k++)
345 {
346 svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
347 }
348 /* for k > 0, deletions are free */
349 for (k = (d > 0 ? d : 0) + p; k >= 0; k--)
350 {
351 svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
352 }
353
354 p++;
355 }
356 while (fp[0].position[1] != &sentinel_position[1]);
357
358 if (suffix_lines)
359 lcs->next = prepend_lcs(fp[0].lcs, suffix_lines,
360 lcs->position[0]->offset - suffix_lines,
361 lcs->position[1]->offset - suffix_lines,
362 pool);
363 else
364 lcs->next = fp[0].lcs;
365
366 lcs = svn_diff__lcs_reverse(lcs);
367
368 position_list1->next = sentinel_position[0].next;
369 position_list2->next = sentinel_position[1].next;
370
371 if (prefix_lines)
372 return prepend_lcs(lcs, prefix_lines, 1, 1, pool);
373 else
374 return lcs;
375 }
376