1 /*
2 Author: Daniele Fognini, Andreas Wuerl
3 Copyright (C) 2013-2014, Siemens AG
4 
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 version 2 as published by the Free Software Foundation.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along
15 with this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
17 */
18 
19 #include "diff.h"
20 
21 #include "_squareVisitor.h"
22 #include <stdlib.h>
23 
matchNTokens(const GArray * textTokens,size_t textStart,size_t textLength,const GArray * searchTokens,size_t searchStart,size_t searchLength,unsigned int numberOfWantedMatches)24 int matchNTokens(const GArray* textTokens, size_t textStart, size_t textLength,
25                  const GArray* searchTokens, size_t searchStart, size_t searchLength,
26                  unsigned int numberOfWantedMatches) {
27 
28   if (
29     textStart < textLength &&
30     searchStart < searchLength &&
31     tokenEquals(
32       tokens_index(textTokens, textStart),
33       tokens_index(searchTokens, searchStart
34     )))
35   {
36     unsigned matched = 1;
37 
38     size_t canMatch = MIN(textLength - textStart, searchLength - searchStart);
39     size_t shouldMatch = MIN(numberOfWantedMatches, canMatch);
40     for (size_t i = 1; i < shouldMatch; i++) {
41       if (tokenEquals(
42             tokens_index(textTokens, i + textStart),
43             tokens_index(searchTokens, i + searchStart)))
44         matched++;
45       else
46         break;
47     }
48     return (matched == shouldMatch);
49   }
50   return 0;
51 }
52 
lookForDiff(const GArray * textTokens,const GArray * searchTokens,size_t iText,size_t iSearch,unsigned int maxAllowedDiff,unsigned int minAdjacentMatches,DiffMatchInfo * result)53 int lookForDiff(const GArray* textTokens, const GArray* searchTokens,
54                        size_t iText, size_t iSearch,
55                        unsigned int maxAllowedDiff, unsigned int minAdjacentMatches,
56                        DiffMatchInfo* result) {
57   size_t searchLength = searchTokens->len;
58   size_t textLength = textTokens->len;
59 
60   size_t searchStopAt = MIN(iSearch + maxAllowedDiff, searchLength);
61   size_t textStopAt = MIN(iText + maxAllowedDiff, textLength);
62 
63   size_t textPos;
64   size_t searchPos;
65   for (unsigned int i = 0; i < SQUARE_VISITOR_LENGTH; i++) {
66     textPos = iText + squareVisitorX[i];
67     searchPos = iSearch + squareVisitorY[i];
68 
69     if ((textPos < textStopAt) && (searchPos < searchStopAt))
70       if (matchNTokens(textTokens, textPos, textLength,
71                        searchTokens, searchPos, searchLength,
72                        minAdjacentMatches))
73       {
74         result->search.start = searchPos;
75         result->search.length = searchPos - iSearch;
76         result->text.start = textPos;
77         result->text.length = textPos - iText;
78         return 1;
79       }
80   }
81 
82   return 0;
83 }
84 
applyDiff(const DiffMatchInfo * diff,GArray * matchedInfo,size_t * additionsCounter,size_t * removedCounter,size_t * iText,size_t * iSearch)85 static int applyDiff(const DiffMatchInfo* diff,
86               GArray* matchedInfo,
87               size_t* additionsCounter, size_t* removedCounter,
88               size_t* iText, size_t* iSearch) {
89 
90   DiffMatchInfo diffCopy = *diff;
91   diffCopy.text.start = *iText;
92   diffCopy.search.start = *iSearch;
93 
94   if (diffCopy.search.length == 0)
95     diffCopy.diffType = DIFF_TYPE_ADDITION;
96   else if (diffCopy.text.length == 0)
97     diffCopy.diffType = DIFF_TYPE_REMOVAL;
98   else
99     diffCopy.diffType = DIFF_TYPE_REPLACE;
100 
101   *iText = diff->text.start;
102   *iSearch = diff->search.start;
103 
104   g_array_append_val(matchedInfo, diffCopy);
105 
106   *additionsCounter += diffCopy.text.length;
107   *removedCounter += diffCopy.search.length;
108 
109   return 1;
110 }
111 
applyTailDiff(GArray * matchedInfo,size_t searchLength,size_t * removedCounter,size_t * additionsCounter,size_t * iText,size_t * iSearch)112 static void applyTailDiff(GArray* matchedInfo,
113         size_t searchLength,
114         size_t* removedCounter, size_t* additionsCounter,
115         size_t* iText, size_t* iSearch) {
116 
117   DiffMatchInfo tailDiff = (DiffMatchInfo) {
118           .search = (DiffPoint) {
119                   .start = (*iSearch),
120                   .length = searchLength - (*iSearch)
121           },
122           .text = (DiffPoint) {
123                   .start = (*iText),
124                   .length = 0
125           },
126           .diffType = NULL
127   };
128 
129   applyDiff(&tailDiff, matchedInfo, additionsCounter, removedCounter, iText, iSearch);
130 }
131 
initSimpleMatch(DiffMatchInfo * simpleMatch,size_t iText,size_t iSearch)132 static void initSimpleMatch(DiffMatchInfo* simpleMatch, size_t iText, size_t iSearch) {
133   simpleMatch->text.start = iText;
134   simpleMatch->text.length = 0;
135   simpleMatch->search.start = iSearch;
136   simpleMatch->search.length = 0;
137 }
138 
139 /**
140  @brief perform a diff match search between two tokenized texts
141 
142  @param textTokens   array containing the Tokens of the text in which we are searching
143  @param searchTokens array containing the Tokens of the reference text to be searched
144  @param textStartPosition position in the text where the search starts,
145                           it will be updated to a value for a successive search
146  @param maxAllowedDiff maximum number of Tokens that can be avoided
147  @param minAdjacentMatches minimum number of adjacent matched Tokens that must be equal
148 
149  @return pointer to the result, or NULL on negative match. To be freed with diffResult_free
150  ****************************************************/
findMatchAsDiffs(const GArray * textTokens,const GArray * searchTokens,size_t textStartPosition,size_t searchStartPosition,unsigned int maxAllowedDiff,unsigned int minAdjacentMatches)151 DiffResult* findMatchAsDiffs(const GArray* textTokens, const GArray* searchTokens,
152                              size_t textStartPosition, size_t searchStartPosition,
153                              unsigned int maxAllowedDiff, unsigned int minAdjacentMatches) {
154   size_t textLength = textTokens->len;
155   size_t searchLength = searchTokens->len;
156 
157   if ((searchLength<=searchStartPosition) || (textLength<=textStartPosition)) {
158     return NULL;
159   }
160 
161   DiffResult* result = malloc(sizeof(DiffResult));
162   result->matchedInfo = g_array_new(FALSE, FALSE, sizeof(DiffMatchInfo));
163   GArray* matchedInfo = result->matchedInfo;
164 
165   size_t iText = textStartPosition;
166   size_t iSearch = 0;
167 
168   size_t removedCounter = 0;
169   size_t matchedCounter = 0;
170   size_t additionsCounter = 0;
171 
172   if (searchStartPosition > 0) {
173     DiffMatchInfo licenseHeadDiff = (DiffMatchInfo) {
174       .search = (DiffPoint) {
175         .start = 0,
176         .length = searchStartPosition
177       },
178       .text = (DiffPoint) {
179         .start = iText,
180         .length = 0
181       },
182       .diffType = NULL
183     };
184     applyDiff(&licenseHeadDiff, matchedInfo, &additionsCounter, &removedCounter, &iText, &iSearch);
185     iSearch = searchStartPosition;
186   }
187 
188   // match first token
189   while (iText < textLength) {
190       Token* textToken = tokens_index(textTokens, iText);
191       Token* searchToken = tokens_index(searchTokens, iSearch);
192       if (tokenEquals(textToken, searchToken))
193         break;
194     iText++;
195   }
196 
197   if (iText < textLength) {
198     DiffMatchInfo simpleMatch;
199     simpleMatch.diffType = DIFF_TYPE_MATCH;
200     initSimpleMatch(&simpleMatch, iText, iSearch);
201 
202     while ((iText < textLength) && (iSearch < searchLength)) {
203       Token* textToken = tokens_index(textTokens, iText);
204       Token* searchToken = tokens_index(searchTokens, iSearch);
205 
206       if (tokenEquals(textToken, searchToken)) {
207         simpleMatch.text.length++;
208         simpleMatch.search.length++;
209         matchedCounter++;
210         iSearch++;
211         iText++;
212       } else {
213         /* the previous tokens matched, here starts a difference */
214         g_array_append_val(matchedInfo, simpleMatch);
215         initSimpleMatch(&simpleMatch, iText, iSearch);
216 
217         DiffMatchInfo diff;
218         if (lookForDiff(textTokens, searchTokens,
219                         iText, iSearch,
220                         maxAllowedDiff, minAdjacentMatches,
221                         &diff)) {
222           applyDiff(&diff,
223                     matchedInfo,
224                     &additionsCounter, &removedCounter,
225                     &iText, &iSearch);
226 
227           simpleMatch.text.start = iText;
228           simpleMatch.search.start = iSearch;
229         } else {
230           break;
231         }
232       }
233     }
234     if (simpleMatch.text.length > 0) {
235       g_array_append_val(matchedInfo, simpleMatch);
236     }
237   }
238 
239   if ((iSearch < searchLength) && (searchLength < maxAllowedDiff + iSearch)) {
240     applyTailDiff(matchedInfo, searchLength, &removedCounter, &additionsCounter, &iText, &iSearch);
241   }
242 
243   if (matchedCounter + removedCounter != searchLength) {
244     g_array_free(matchedInfo, TRUE);
245     free(result);
246 
247     return NULL;
248   } else {
249 #ifdef DEBUG_DIFF
250     printf("diff: (=%zu +%zu -%zu)/%zu\n", matchedCounter, additionsCounter, removedCounter, searchLength);
251     for (size_t i=0; i<matchedInfo->len; i++) {
252       DiffMatchInfo current = g_array_index(matchedInfo, DiffMatchInfo, i);
253       printf("info[%zu]: t[%zu+%zu] %s s[%zu+%zu]}\n",
254              i,
255              current.text.start,
256              current.text.length,
257              current.diffType,
258              current.search.start,
259              current.search.length
260       );
261     }
262     printf("\n");
263 #endif
264     result->removed = removedCounter;
265     result->added = additionsCounter;
266     result->matched = matchedCounter;
267     return result;
268   }
269 }
270 
diffResult_free(DiffResult * diffResult)271 void diffResult_free(DiffResult* diffResult) {
272   g_array_free(diffResult->matchedInfo, TRUE);
273   free(diffResult);
274 }
275