1 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  *  Copyright (C) 2008-2010  Kouhei Sutou <kou@clear-code.com>
4  *
5  *  This library is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU Lesser General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This library is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU Lesser General Public License for more details.
14  *
15  *  You should have received a copy of the GNU Lesser General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 #  include <config.h>
22 #endif /* HAVE_CONFIG_H */
23 
24 #include <stdlib.h>
25 #include <string.h>
26 #include <glib.h>
27 
28 #include "cut-sequence-matcher.h"
29 #include "../gcutter/gcut-list.h"
30 
31 #define CUT_SEQUENCE_MATCHER_GET_PRIVATE(obj) (G_TYPE_INSTANCE_GET_PRIVATE ((obj), CUT_TYPE_SEQUENCE_MATCHER, CutSequenceMatcherPrivate))
32 
33 typedef struct _CutSequenceMatcherPrivate	CutSequenceMatcherPrivate;
34 struct _CutSequenceMatcherPrivate
35 {
36     GSequence *from;
37     GSequence *to;
38     GSequenceIterCompareFunc compare_func;
39     gpointer compare_func_user_data;
40     GHashTable *to_indices;
41     GHashTable *junks;
42     GList *matches;
43     GList *blocks;
44     GList *operations;
45     GList *grouped_operations;
46     gdouble ratio;
47     guint context_size;
48 };
49 
50 enum
51 {
52     PROP_0,
53     PROP_FROM_SEQUENCE,
54     PROP_TO_SEQUENCE,
55     PROP_COMPARE_FUNC,
56     PROP_COMPARE_FUNC_USER_DATA,
57     PROP_TO_INDICES,
58     PROP_JUNKS
59 };
60 
61 G_DEFINE_TYPE (CutSequenceMatcher, cut_sequence_matcher, G_TYPE_OBJECT)
62 
63 static void dispose        (GObject         *object);
64 static void set_property   (GObject         *object,
65                             guint            prop_id,
66                             const GValue    *value,
67                             GParamSpec      *pspec);
68 static void get_property   (GObject         *object,
69                             guint            prop_id,
70                             GValue          *value,
71                             GParamSpec      *pspec);
72 
73 CutSequenceMatchInfo *
cut_sequence_match_info_new(guint from_index,guint to_index,guint size)74 cut_sequence_match_info_new (guint from_index, guint to_index, guint size)
75 {
76     CutSequenceMatchInfo *info;
77 
78     info = g_slice_new(CutSequenceMatchInfo);
79     info->from_index = from_index;
80     info->to_index = to_index;
81     info->size = size;
82     return info;
83 }
84 
85 void
cut_sequence_match_info_free(CutSequenceMatchInfo * info)86 cut_sequence_match_info_free (CutSequenceMatchInfo *info)
87 {
88     g_slice_free(CutSequenceMatchInfo, info);
89 }
90 
91 CutSequenceMatchOperation *
cut_sequence_match_operation_new(CutSequenceMatchOperationType type,guint from_begin,guint from_end,guint to_begin,guint to_end)92 cut_sequence_match_operation_new (CutSequenceMatchOperationType type,
93                                   guint from_begin, guint from_end,
94                                   guint to_begin, guint to_end)
95 {
96     CutSequenceMatchOperation *operation;
97 
98     operation = g_slice_new(CutSequenceMatchOperation);
99     operation->type = type;
100     operation->from_begin = from_begin;
101     operation->from_end = from_end;
102     operation->to_begin = to_begin;
103     operation->to_end = to_end;
104     return operation;
105 }
106 
107 void
cut_sequence_match_operation_free(CutSequenceMatchOperation * operation)108 cut_sequence_match_operation_free (CutSequenceMatchOperation *operation)
109 {
110     g_slice_free(CutSequenceMatchOperation, operation);
111 }
112 
113 static void
cut_sequence_matcher_class_init(CutSequenceMatcherClass * klass)114 cut_sequence_matcher_class_init (CutSequenceMatcherClass *klass)
115 {
116     GObjectClass *gobject_class;
117     GParamSpec *spec;
118 
119     gobject_class = G_OBJECT_CLASS(klass);
120 
121     gobject_class->dispose = dispose;
122     gobject_class->set_property = set_property;
123     gobject_class->get_property = get_property;
124 
125     spec = g_param_spec_pointer("from-sequence",
126                                 "From Sequence",
127                                 "From Sequence",
128                                 G_PARAM_READWRITE | G_PARAM_CONSTRUCT);
129     g_object_class_install_property(gobject_class, PROP_FROM_SEQUENCE, spec);
130 
131     spec = g_param_spec_pointer("to-sequence",
132                                 "To Sequence",
133                                 "To Sequence",
134                                 G_PARAM_READWRITE | G_PARAM_CONSTRUCT);
135     g_object_class_install_property(gobject_class, PROP_TO_SEQUENCE, spec);
136 
137     spec = g_param_spec_pointer("compare-function",
138                                 "Compare Fcuntion",
139                                 "Compare Fcuntion",
140                                 G_PARAM_READWRITE | G_PARAM_CONSTRUCT);
141     g_object_class_install_property(gobject_class, PROP_COMPARE_FUNC, spec);
142 
143     spec = g_param_spec_pointer("compare-function-user-data",
144                                 "Compare Fcuntion User Data",
145                                 "Compare Fcuntion User Data",
146                                 G_PARAM_READWRITE | G_PARAM_CONSTRUCT);
147     g_object_class_install_property(gobject_class, PROP_COMPARE_FUNC_USER_DATA, spec);
148 
149     spec = g_param_spec_pointer("to-indices",
150                                 "To Indecies",
151                                 "To Indecies",
152                                 G_PARAM_READWRITE | G_PARAM_CONSTRUCT);
153     g_object_class_install_property(gobject_class, PROP_TO_INDICES, spec);
154 
155     spec = g_param_spec_pointer("junks",
156                                 "Junks",
157                                 "Junks",
158                                 G_PARAM_READWRITE | G_PARAM_CONSTRUCT);
159     g_object_class_install_property(gobject_class, PROP_JUNKS, spec);
160 
161     g_type_class_add_private(gobject_class, sizeof(CutSequenceMatcherPrivate));
162 }
163 
164 static void
cut_sequence_matcher_init(CutSequenceMatcher * sequence_matcher)165 cut_sequence_matcher_init (CutSequenceMatcher *sequence_matcher)
166 {
167     CutSequenceMatcherPrivate *priv;
168 
169     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(sequence_matcher);
170 
171     priv->from = NULL;
172     priv->to = NULL;
173     priv->to_indices = NULL;
174     priv->junks = NULL;
175     priv->matches = NULL;
176     priv->blocks = NULL;
177     priv->operations = NULL;
178     priv->grouped_operations = NULL;
179     priv->ratio = -1.0;
180     priv->context_size = 3;
181 }
182 
183 static void
dispose_goruped_operations(CutSequenceMatcherPrivate * priv)184 dispose_goruped_operations (CutSequenceMatcherPrivate *priv)
185 {
186     GList *node;
187 
188     if (!priv->grouped_operations)
189         return;
190 
191     for (node = priv->grouped_operations; node; node = g_list_next(node)) {
192         GList *operations = node->data;
193         g_list_foreach(operations,
194                        (GFunc)cut_sequence_match_operation_free, NULL);
195         g_list_free(operations);
196     }
197     g_list_free(priv->grouped_operations);
198     priv->grouped_operations = NULL;
199 }
200 
201 static void
dispose(GObject * object)202 dispose (GObject *object)
203 {
204     CutSequenceMatcherPrivate *priv;
205 
206     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(object);
207 
208     if (priv->from) {
209         g_sequence_free(priv->from);
210         priv->from = NULL;
211     }
212 
213     if (priv->to) {
214         g_sequence_free(priv->to);
215         priv->to = NULL;
216     }
217 
218     if (priv->to_indices) {
219         g_hash_table_unref(priv->to_indices);
220         priv->to_indices = NULL;
221     }
222 
223     if (priv->junks) {
224         g_hash_table_unref(priv->junks);
225         priv->junks = NULL;
226     }
227 
228     if (priv->matches) {
229         g_list_foreach(priv->matches, (GFunc)cut_sequence_match_info_free, NULL);
230         g_list_free(priv->matches);
231         priv->matches = NULL;
232     }
233 
234     if (priv->blocks) {
235         g_list_foreach(priv->blocks, (GFunc)cut_sequence_match_info_free, NULL);
236         g_list_free(priv->blocks);
237         priv->blocks = NULL;
238     }
239 
240 
241     if (priv->operations) {
242         g_list_foreach(priv->operations,
243                        (GFunc)cut_sequence_match_operation_free, NULL);
244         g_list_free(priv->operations);
245         priv->operations = NULL;
246     }
247 
248     dispose_goruped_operations(priv);
249 
250     G_OBJECT_CLASS(cut_sequence_matcher_parent_class)->dispose(object);
251 }
252 
253 static void
set_property(GObject * object,guint prop_id,const GValue * value,GParamSpec * pspec)254 set_property (GObject      *object,
255               guint         prop_id,
256               const GValue *value,
257               GParamSpec   *pspec)
258 {
259     CutSequenceMatcherPrivate *priv;
260     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(object);
261 
262     switch (prop_id) {
263       case PROP_FROM_SEQUENCE:
264         priv->from = g_value_get_pointer(value);
265         break;
266       case PROP_TO_SEQUENCE:
267         priv->to = g_value_get_pointer(value);
268         break;
269       case PROP_COMPARE_FUNC:
270         priv->compare_func = g_value_get_pointer(value);
271         break;
272       case PROP_COMPARE_FUNC_USER_DATA:
273         priv->compare_func_user_data = g_value_get_pointer(value);
274         break;
275       case PROP_TO_INDICES:
276         priv->to_indices = g_value_get_pointer(value);
277         break;
278       case PROP_JUNKS:
279         priv->junks = g_value_get_pointer(value);
280         break;
281       default:
282         G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
283         break;
284     }
285 }
286 
287 static void
get_property(GObject * object,guint prop_id,GValue * value,GParamSpec * pspec)288 get_property (GObject    *object,
289               guint       prop_id,
290               GValue     *value,
291               GParamSpec *pspec)
292 {
293     CutSequenceMatcherPrivate *priv;
294     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(object);
295 
296     switch (prop_id) {
297       case PROP_FROM_SEQUENCE:
298         g_value_set_pointer(value, priv->from);
299         break;
300       case PROP_TO_SEQUENCE:
301         g_value_set_pointer(value, priv->to);
302         break;
303       case PROP_COMPARE_FUNC:
304         g_value_set_pointer(value, priv->compare_func);
305         break;
306       case PROP_COMPARE_FUNC_USER_DATA:
307         g_value_set_pointer(value, priv->compare_func_user_data);
308         break;
309       case PROP_TO_INDICES:
310         g_value_set_pointer(value, priv->to_indices);
311         break;
312       case PROP_JUNKS:
313         g_value_set_pointer(value, priv->junks);
314         break;
315       default:
316         G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
317         break;
318     }
319 }
320 
321 typedef struct {
322     GHashTable *junks;
323     CutJunkFilterFunc junk_filter_func;
324     gpointer junk_filter_func_user_data;
325 } RemoveJunkData;
326 
327 static gboolean
remove_junk(gpointer key,gpointer value,gpointer user_data)328 remove_junk (gpointer key, gpointer value, gpointer user_data)
329 {
330     RemoveJunkData *data = user_data;
331     gboolean is_junk;
332 
333     is_junk = data->junk_filter_func(key, data->junk_filter_func_user_data);
334     if (is_junk) {
335         g_hash_table_insert(data->junks, key, GINT_TO_POINTER(TRUE));
336     }
337     return is_junk;
338 }
339 
340 static void
remove_junks_from_to_indices(CutSequenceMatcher * matcher,CutJunkFilterFunc junk_filter_func,gpointer junk_filter_func_user_data)341 remove_junks_from_to_indices (CutSequenceMatcher *matcher,
342                               CutJunkFilterFunc junk_filter_func,
343                               gpointer junk_filter_func_user_data)
344 {
345     CutSequenceMatcherPrivate *priv;
346     RemoveJunkData data;
347 
348     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
349     data.junk_filter_func = junk_filter_func;
350     data.junk_filter_func_user_data = junk_filter_func_user_data;
351     data.junks = priv->junks;
352     g_hash_table_foreach_steal(priv->to_indices, remove_junk, &data);
353 }
354 
355 static void
update_to_indices(CutSequenceMatcher * matcher,CutJunkFilterFunc junk_filter_func,gpointer junk_filter_func_user_data)356 update_to_indices (CutSequenceMatcher *matcher,
357                    CutJunkFilterFunc junk_filter_func,
358                    gpointer junk_filter_func_user_data)
359 {
360     CutSequenceMatcherPrivate *priv;
361     gint i;
362     GSequenceIter *iter, *begin, *end;
363 
364     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
365 
366     if (!priv->to)
367         return;
368 
369     begin = g_sequence_get_begin_iter(priv->to);
370     end = g_sequence_get_end_iter(priv->to);
371     for (i = 0, iter = begin;
372          iter != end;
373          i++, iter = g_sequence_iter_next(iter)) {
374         gpointer data;
375         GList *indices;
376 
377         data = g_sequence_get(iter);
378         indices = g_hash_table_lookup(priv->to_indices, data);
379         indices = g_list_append(indices, GINT_TO_POINTER(i));
380         g_hash_table_replace(priv->to_indices, data, g_list_copy(indices));
381     }
382 
383     if (junk_filter_func)
384         remove_junks_from_to_indices(matcher,
385                                      junk_filter_func,
386                                      junk_filter_func_user_data);
387 }
388 
389 CutSequenceMatcher *
cut_sequence_matcher_new(GSequence * from,GSequence * to,GSequenceIterCompareFunc compare_func,gpointer compare_func_user_data,GHashFunc content_hash_func,GEqualFunc content_equal_func,CutJunkFilterFunc junk_filter_func,gpointer junk_filter_func_user_data)390 cut_sequence_matcher_new (GSequence *from, GSequence *to,
391                           GSequenceIterCompareFunc compare_func,
392                           gpointer compare_func_user_data,
393                           GHashFunc content_hash_func,
394                           GEqualFunc content_equal_func,
395                           CutJunkFilterFunc junk_filter_func,
396                           gpointer junk_filter_func_user_data)
397 {
398     CutSequenceMatcher *matcher;
399 
400     matcher = g_object_new(CUT_TYPE_SEQUENCE_MATCHER,
401                            "from-sequence", from,
402                            "to-sequence", to,
403                            "compare-function", compare_func,
404                            "compare-function-user-data", compare_func_user_data,
405                            "to-indices",
406                            g_hash_table_new_full(content_hash_func,
407                                                  content_equal_func,
408                                                  NULL,
409                                                  (GDestroyNotify)g_list_free),
410                            "junks", g_hash_table_new(content_hash_func,
411                                                      content_equal_func),
412                            NULL);
413     update_to_indices(matcher, junk_filter_func, junk_filter_func_user_data);
414     return matcher;
415 }
416 
417 static GSequence *
char_sequence_new(const gchar * string)418 char_sequence_new (const gchar *string)
419 {
420     GSequence *sequence;
421 
422     sequence = g_sequence_new(NULL);
423 
424     for (; *string != '\0'; string = g_utf8_next_char(string)) {
425         g_sequence_append(sequence, GUINT_TO_POINTER(g_utf8_get_char(string)));
426     }
427 
428     return sequence;
429 }
430 
431 static gint
char_sequence_iter_compare(GSequenceIter * data1,GSequenceIter * data2,gpointer user_data)432 char_sequence_iter_compare (GSequenceIter *data1, GSequenceIter *data2,
433                             gpointer user_data)
434 {
435     gunichar character1, character2;
436 
437     character1 = GPOINTER_TO_UINT(g_sequence_get(data1));
438     character2 = GPOINTER_TO_UINT(g_sequence_get(data2));
439 
440     if (character1 < character2) {
441         return -1;
442     } else if (character1 == character2) {
443         return 0;
444     } else {
445         return 1;
446     }
447 }
448 
449 static guint
char_value_hash(gconstpointer v)450 char_value_hash (gconstpointer v)
451 {
452     return GPOINTER_TO_UINT(v);
453 }
454 
455 static gboolean
char_value_equal(gconstpointer v1,gconstpointer v2)456 char_value_equal (gconstpointer v1, gconstpointer v2)
457 {
458     gunichar character1, character2;
459     character1 = GPOINTER_TO_UINT(v1);
460     character2 = GPOINTER_TO_UINT(v2);
461     return character1 == character2;
462 }
463 
464 CutSequenceMatcher *
cut_sequence_matcher_char_new(const gchar * from,const gchar * to)465 cut_sequence_matcher_char_new (const gchar *from, const gchar *to)
466 {
467     return cut_sequence_matcher_char_new_full(from, to, NULL, NULL);
468 }
469 
470 CutSequenceMatcher *
cut_sequence_matcher_char_new_full(const gchar * from,const gchar * to,CutJunkFilterFunc junk_filter_func,gpointer junk_filter_func_user_data)471 cut_sequence_matcher_char_new_full (const gchar *from, const gchar *to,
472                                     CutJunkFilterFunc junk_filter_func,
473                                     gpointer junk_filter_func_user_data)
474 {
475     return cut_sequence_matcher_new(char_sequence_new(from),
476                                     char_sequence_new(to),
477                                     char_sequence_iter_compare, NULL,
478                                     char_value_hash, char_value_equal,
479                                     junk_filter_func,
480                                     junk_filter_func_user_data);
481 }
482 
483 static GSequence *
string_sequence_new(gchar ** strings)484 string_sequence_new (gchar **strings)
485 {
486     GSequence *sequence;
487 
488     sequence = g_sequence_new(g_free);
489 
490     for (; *strings; strings++) {
491         g_sequence_append(sequence, g_strdup(*strings));
492     }
493 
494     return sequence;
495 }
496 
497 static gint
string_sequence_iter_compare(GSequenceIter * data1,GSequenceIter * data2,gpointer user_data)498 string_sequence_iter_compare (GSequenceIter *data1, GSequenceIter *data2,
499                               gpointer user_data)
500 {
501     gchar *string1, *string2;
502 
503     string1 = g_sequence_get(data1);
504     string2 = g_sequence_get(data2);
505 
506     return strcmp(string1, string2);
507 }
508 
509 CutSequenceMatcher *
cut_sequence_matcher_string_new(gchar ** from,gchar ** to)510 cut_sequence_matcher_string_new (gchar **from, gchar **to)
511 {
512     return cut_sequence_matcher_string_new_full(from, to, NULL, NULL);
513 }
514 
515 CutSequenceMatcher *
cut_sequence_matcher_string_new_full(gchar ** from,gchar ** to,CutJunkFilterFunc junk_filter_func,gpointer junk_filter_func_user_data)516 cut_sequence_matcher_string_new_full (gchar **from, gchar **to,
517                                       CutJunkFilterFunc junk_filter_func,
518                                       gpointer junk_filter_func_user_data)
519 {
520     return cut_sequence_matcher_new(string_sequence_new(from),
521                                     string_sequence_new(to),
522                                     string_sequence_iter_compare, NULL,
523                                     g_str_hash, g_str_equal,
524                                     junk_filter_func,
525                                     junk_filter_func_user_data);
526 }
527 
528 const GList *
cut_sequence_matcher_get_to_index(CutSequenceMatcher * matcher,gpointer to_content)529 cut_sequence_matcher_get_to_index (CutSequenceMatcher *matcher,
530                                    gpointer to_content)
531 {
532     CutSequenceMatcherPrivate *priv;
533 
534     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
535     if (!priv->to)
536         return NULL;
537 
538     return g_hash_table_lookup(priv->to_indices, to_content);
539 }
540 
541 static CutSequenceMatchInfo *
find_best_match_position(CutSequenceMatcher * matcher,guint from_begin,guint from_end,guint to_begin,guint to_end)542 find_best_match_position (CutSequenceMatcher *matcher,
543                           guint from_begin, guint from_end,
544                           guint to_begin, guint to_end)
545 {
546     CutSequenceMatcherPrivate *priv;
547     CutSequenceMatchInfo *info;
548     GSequenceIter *iter, *begin, *end;
549     GHashTable *sizes, *current_sizes;
550 
551     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
552 
553     info = cut_sequence_match_info_new(from_begin, to_begin, 0);
554 
555     sizes = g_hash_table_new(g_direct_hash, g_direct_equal);
556 
557     begin = g_sequence_get_iter_at_pos(priv->from, from_begin);
558     end = g_sequence_get_iter_at_pos(priv->from, from_end);
559     for (iter = begin; iter != end; iter = g_sequence_iter_next(iter)) {
560         guint from_index;
561         const GList *node;
562 
563         from_index = g_sequence_iter_get_position(iter);
564         current_sizes = g_hash_table_new(g_direct_hash, g_direct_equal);
565         for (node = cut_sequence_matcher_get_to_index(matcher,
566                                                       g_sequence_get(iter));
567              node;
568              node = g_list_next(node)) {
569             guint size, to_index;
570             gpointer size_as_pointer;
571 
572             to_index = GPOINTER_TO_UINT(node->data);
573             if (to_index < to_begin)
574                 continue;
575             if (to_index + 1 > to_end)
576                 break;
577 
578             size_as_pointer =
579                 g_hash_table_lookup(sizes, GUINT_TO_POINTER(to_index - 1));
580             size = GPOINTER_TO_INT(size_as_pointer);
581             size++;
582             size_as_pointer = GUINT_TO_POINTER(size);
583             g_hash_table_insert(current_sizes,
584                                 GUINT_TO_POINTER(to_index),
585                                 size_as_pointer);
586             if (size > info->size) {
587                 info->from_index = from_index - size + 1;
588                 info->to_index = to_index - size + 1;
589                 info->size = size;
590             }
591         }
592         g_hash_table_unref(sizes);
593         sizes = current_sizes;
594     }
595     g_hash_table_unref(sizes);
596 
597     return info;
598 }
599 
600 static gboolean
check_junk(CutSequenceMatcherPrivate * priv,gboolean should_junk,guint index)601 check_junk (CutSequenceMatcherPrivate *priv, gboolean should_junk, guint index)
602 {
603     gpointer key;
604     gboolean is_junk;
605 
606     key = g_sequence_get(g_sequence_get_iter_at_pos(priv->to, index));
607     is_junk = GPOINTER_TO_INT(g_hash_table_lookup(priv->junks, key));
608     return should_junk ? is_junk : !is_junk;
609 }
610 
611 static gboolean
equal_content(CutSequenceMatcherPrivate * priv,guint from_index,guint to_index)612 equal_content (CutSequenceMatcherPrivate *priv, guint from_index, guint to_index)
613 {
614     GSequenceIter *from_iter, *to_iter;
615 
616     from_iter = g_sequence_get_iter_at_pos(priv->from, from_index);
617     to_iter = g_sequence_get_iter_at_pos(priv->to, to_index);
618     return priv->compare_func(from_iter, to_iter,
619                               priv->compare_func_user_data) == 0;
620 }
621 
622 
623 static void
adjust_best_info_with_junk_predicate(CutSequenceMatcher * matcher,gboolean should_junk,CutSequenceMatchInfo * best_info,guint from_begin,guint from_end,guint to_begin,guint to_end)624 adjust_best_info_with_junk_predicate (CutSequenceMatcher *matcher,
625                                       gboolean should_junk,
626                                       CutSequenceMatchInfo *best_info,
627                                       guint from_begin, guint from_end,
628                                       guint to_begin, guint to_end)
629 {
630     CutSequenceMatcherPrivate *priv;
631 
632     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
633 
634     while (best_info->from_index > from_begin &&
635            best_info->to_index > to_begin &&
636            check_junk(priv, should_junk, best_info->to_index - 1) &&
637            equal_content(priv,
638                          best_info->from_index - 1,
639                          best_info->to_index - 1)) {
640         best_info->from_index--;
641         best_info->to_index--;
642         best_info->size++;
643     }
644 
645     while (best_info->from_index + best_info->size + 1 < from_end &&
646            best_info->to_index + best_info->size + 1 < to_end &&
647            check_junk(priv,
648                       should_junk,
649                       best_info->to_index + best_info->size) &&
650            equal_content(priv,
651                          best_info->from_index + best_info->size,
652                          best_info->to_index + best_info->size)) {
653         best_info->size++;
654     }
655 }
656 
657 CutSequenceMatchInfo *
cut_sequence_matcher_get_longest_match(CutSequenceMatcher * matcher,guint from_begin,guint from_end,guint to_begin,guint to_end)658 cut_sequence_matcher_get_longest_match (CutSequenceMatcher *matcher,
659                                         guint from_begin, guint from_end,
660                                         guint to_begin, guint to_end)
661 {
662     CutSequenceMatcherPrivate *priv;
663     CutSequenceMatchInfo *info;
664 
665     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
666 
667     info = find_best_match_position(matcher,
668                                     from_begin, from_end,
669                                     to_begin, to_end);
670     if (g_hash_table_size(priv->junks) > 0) {
671         adjust_best_info_with_junk_predicate(matcher, FALSE, info,
672                                              from_begin, from_end,
673                                              to_begin, to_end);
674         adjust_best_info_with_junk_predicate(matcher, TRUE, info,
675                                              from_begin, from_end,
676                                              to_begin, to_end);
677     }
678 
679     return info;
680 }
681 
682 typedef struct _MatchingInfo MatchingInfo;
683 struct _MatchingInfo {
684     guint from_begin;
685     guint from_end;
686     guint to_begin;
687     guint to_end;
688 };
689 
690 static void
push_matching_info(GQueue * queue,guint from_begin,guint from_end,guint to_begin,guint to_end)691 push_matching_info (GQueue *queue,
692                     guint from_begin, guint from_end,
693                     guint to_begin, guint to_end)
694 {
695     MatchingInfo *info;
696 
697     info = g_slice_new(MatchingInfo);
698     info->from_begin = from_begin;
699     info->from_end = from_end;
700     info->to_begin = to_begin;
701     info->to_end = to_end;
702 
703     g_queue_push_tail(queue, info);
704 }
705 
706 static void
pop_matching_info(GQueue * queue,MatchingInfo * info)707 pop_matching_info (GQueue *queue, MatchingInfo *info)
708 {
709     MatchingInfo *popped_info;
710 
711     popped_info = g_queue_pop_head(queue);
712     *info = *popped_info;
713     g_slice_free(MatchingInfo, popped_info);
714 }
715 
716 static gint
compare_match_info(gconstpointer data1,gconstpointer data2)717 compare_match_info (gconstpointer data1, gconstpointer data2)
718 {
719     const CutSequenceMatchInfo *info1, *info2;
720 
721     info1 = data1;
722     info2 = data2;
723 
724     if (info1->from_index < info2->from_index) {
725         return -1;
726     } else if (info1->from_index == info2->from_index) {
727         return 0;
728     } else {
729         return 1;
730     }
731 }
732 
733 const GList *
cut_sequence_matcher_get_matches(CutSequenceMatcher * matcher)734 cut_sequence_matcher_get_matches (CutSequenceMatcher *matcher)
735 {
736     CutSequenceMatcherPrivate *priv;
737     GList *matches = NULL;
738     GQueue *queue;
739 
740     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
741     if (priv->matches)
742         return priv->matches;
743 
744     queue = g_queue_new();
745     push_matching_info(queue,
746                        0, g_sequence_get_length(priv->from),
747                        0, g_sequence_get_length(priv->to));
748 
749     while (!g_queue_is_empty(queue)) {
750         MatchingInfo info;
751         CutSequenceMatchInfo *match_info;
752 
753         pop_matching_info(queue, &info);
754         match_info = cut_sequence_matcher_get_longest_match(matcher,
755                                                             info.from_begin,
756                                                             info.from_end,
757                                                             info.to_begin,
758                                                             info.to_end);
759         if (match_info->size == 0) {
760             cut_sequence_match_info_free(match_info);
761             continue;
762         }
763 
764         if (info.from_begin < match_info->from_index &&
765             info.to_begin < match_info->to_index)
766             push_matching_info(queue,
767                                info.from_begin, match_info->from_index,
768                                info.to_begin, match_info->to_index);
769         matches = g_list_prepend(matches, match_info);
770         if (match_info->from_index + match_info->size < info.from_end &&
771             match_info->to_index + match_info->size < info.to_end)
772             push_matching_info(queue,
773                                match_info->from_index + match_info->size,
774                                info.from_end,
775                                match_info->to_index + match_info->size,
776                                info.to_end);
777     }
778 
779     g_queue_free(queue);
780     priv->matches = g_list_sort(matches, compare_match_info);
781 
782     return priv->matches;
783 }
784 
785 static GList *
prepend_match_info(GList * list,guint begin,guint end,guint size)786 prepend_match_info (GList *list, guint begin, guint end, guint size)
787 {
788     return g_list_prepend(list, cut_sequence_match_info_new(begin, end, size));
789 }
790 
791 const GList *
cut_sequence_matcher_get_blocks(CutSequenceMatcher * matcher)792 cut_sequence_matcher_get_blocks (CutSequenceMatcher *matcher)
793 {
794     CutSequenceMatcherPrivate *priv;
795     guint from_index, to_index, size;
796     const GList *node;
797     GList *blocks = NULL;
798 
799     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
800     if (priv->blocks)
801         return priv->blocks;
802 
803     from_index = to_index = size = 0;
804     for (node = cut_sequence_matcher_get_matches(matcher);
805          node;
806          node = g_list_next(node)) {
807         CutSequenceMatchInfo *info = node->data;
808 
809         if (from_index + size == info->from_index &&
810             to_index + size == info->to_index) {
811             size += info->size;
812         } else {
813             if (size > 0)
814                 blocks = prepend_match_info(blocks, from_index, to_index, size);
815             from_index = info->from_index;
816             to_index = info->to_index;
817             size = info->size;
818         }
819     }
820 
821     if (size > 0)
822         blocks = prepend_match_info(blocks, from_index, to_index, size);
823 
824     blocks = prepend_match_info(blocks,
825                                 g_sequence_get_length(priv->from),
826                                 g_sequence_get_length(priv->to),
827                                 0);
828     priv->blocks = g_list_reverse(blocks);
829 
830     return priv->blocks;
831 }
832 
833 static CutSequenceMatchOperationType
determine_operation_type(guint from_index,guint to_index,CutSequenceMatchInfo * info)834 determine_operation_type (guint from_index, guint to_index,
835                           CutSequenceMatchInfo *info)
836 {
837     if (from_index < info->from_index && to_index < info->to_index) {
838         return CUT_SEQUENCE_MATCH_OPERATION_REPLACE;
839     } else if (from_index < info->from_index) {
840         return CUT_SEQUENCE_MATCH_OPERATION_DELETE;
841     } else if (to_index < info->to_index) {
842         return CUT_SEQUENCE_MATCH_OPERATION_INSERT;
843     } else {
844         return CUT_SEQUENCE_MATCH_OPERATION_EQUAL;
845     }
846 }
847 
848 static GList *
prepend_operation(GList * list,CutSequenceMatchOperationType type,guint from_begin,guint from_end,guint to_begin,guint to_end)849 prepend_operation (GList *list, CutSequenceMatchOperationType type,
850                    guint from_begin, guint from_end,
851                    guint to_begin, guint to_end)
852 {
853     return g_list_prepend(list,
854                           cut_sequence_match_operation_new(type,
855                                                            from_begin,
856                                                            from_end,
857                                                            to_begin,
858                                                            to_end));
859 }
860 
861 const GList *
cut_sequence_matcher_get_operations(CutSequenceMatcher * matcher)862 cut_sequence_matcher_get_operations (CutSequenceMatcher *matcher)
863 {
864     CutSequenceMatcherPrivate *priv;
865     const GList *node;
866     GList *operations = NULL;
867     guint from_index = 0;
868     guint to_index = 0;
869 
870     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
871     if (priv->operations)
872         return priv->operations;
873 
874     for (node = cut_sequence_matcher_get_blocks(matcher);
875          node;
876          node = g_list_next(node)) {
877         CutSequenceMatchInfo *info = node->data;
878         CutSequenceMatchOperationType type;
879 
880         type = determine_operation_type(from_index, to_index, info);
881         if (type != CUT_SEQUENCE_MATCH_OPERATION_EQUAL)
882             operations = prepend_operation(operations, type,
883                                            from_index, info->from_index,
884                                            to_index, info->to_index);
885 
886         from_index = info->from_index + info->size;
887         to_index = info->to_index + info->size;
888         if (info->size > 0)
889             operations = prepend_operation(operations,
890                                            CUT_SEQUENCE_MATCH_OPERATION_EQUAL,
891                                            info->from_index, from_index,
892                                            info->to_index, to_index);
893     }
894 
895     priv->operations = g_list_reverse(operations);
896     return priv->operations;
897 }
898 
899 static GList *
get_edge_expanded_copied_operations(CutSequenceMatcher * matcher,guint context_size)900 get_edge_expanded_copied_operations (CutSequenceMatcher *matcher,
901                                      guint context_size)
902 {
903     const GList *original_operations = NULL;
904     GList *operations = NULL;
905 
906     original_operations = cut_sequence_matcher_get_operations(matcher);
907     if (original_operations) {
908         CutSequenceMatchOperation *operation = original_operations->data;
909         const GList *node;
910         guint from_begin, from_end, to_begin, to_end;
911 
912         from_begin = operation->from_begin;
913         from_end = operation->from_end;
914         to_begin = operation->to_begin;
915         to_end = operation->to_end;
916         node = original_operations;
917         if (operation->type == CUT_SEQUENCE_MATCH_OPERATION_EQUAL) {
918             guint from_context_begin = 0, to_context_begin = 0;
919 
920             if (from_end > context_size)
921                 from_context_begin = from_end - context_size;
922             if (to_end > context_size)
923                 to_context_begin = to_end - context_size;
924             operations = prepend_operation(operations,
925                                            CUT_SEQUENCE_MATCH_OPERATION_EQUAL,
926                                            MAX(from_begin, from_context_begin),
927                                            from_end,
928                                            MAX(to_begin, to_context_begin),
929                                            to_end);
930             node = g_list_next(node);
931         }
932         for (; node; node = g_list_next(node)) {
933             operation = node->data;
934             from_begin = operation->from_begin;
935             from_end = operation->from_end;
936             to_begin = operation->to_begin;
937             to_end = operation->to_end;
938             if (!g_list_next(node) &&
939                 operation->type == CUT_SEQUENCE_MATCH_OPERATION_EQUAL) {
940                 operations =
941                     prepend_operation(operations,
942                                       CUT_SEQUENCE_MATCH_OPERATION_EQUAL,
943                                       from_begin,
944                                       MIN(from_end, from_begin + context_size),
945                                       to_begin,
946                                       MIN(to_end, to_begin + context_size));
947             } else {
948                 operations = prepend_operation(operations,
949                                                operation->type,
950                                                from_begin, from_end,
951                                                to_begin, to_end);
952             }
953         }
954         operations = g_list_reverse(operations);
955     } else {
956         operations = prepend_operation(operations,
957                                        CUT_SEQUENCE_MATCH_OPERATION_EQUAL,
958                                        0, 0, 0, 0);
959     }
960 
961     return operations;
962 }
963 
964 const GList *
cut_sequence_matcher_get_grouped_operations(CutSequenceMatcher * matcher)965 cut_sequence_matcher_get_grouped_operations (CutSequenceMatcher *matcher)
966 {
967     CutSequenceMatcherPrivate *priv;
968     GList *node;
969     GList *operations = NULL;
970     GList *groups = NULL;
971     GList *group = NULL;
972     guint context_size;
973     guint group_window;
974 
975     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
976     if (priv->grouped_operations)
977         return priv->grouped_operations;
978 
979     context_size = priv->context_size;
980     operations = get_edge_expanded_copied_operations(matcher, context_size);
981     group_window = context_size * 2;
982 
983     for (node = operations; node; node = g_list_next(node)) {
984         CutSequenceMatchOperation *operation = node->data;
985         guint from_begin, from_end, to_begin, to_end;
986 
987         from_begin = operation->from_begin;
988         from_end = operation->from_end;
989         to_begin = operation->to_begin;
990         to_end = operation->to_end;
991         if (operation->type == CUT_SEQUENCE_MATCH_OPERATION_EQUAL &&
992             ((from_end - from_begin) > group_window)) {
993             operation->from_end = MIN(from_end, from_begin + context_size);
994             operation->to_end = MIN(to_end, to_begin + context_size);
995             group = g_list_prepend(group, operation);
996             groups = g_list_prepend(groups, g_list_reverse(group));
997             group = NULL;
998 
999             if (from_end > context_size)
1000                 from_begin = MAX(from_begin, from_end - context_size);
1001             if (to_end > context_size)
1002                 to_begin = MAX(to_begin, to_end - context_size);
1003             group = prepend_operation(group, operation->type,
1004                                       from_begin, from_end,
1005                                       to_begin, to_end);
1006         } else {
1007             group = g_list_prepend(group, operation);
1008         }
1009     }
1010 
1011     if (group)
1012         groups = g_list_prepend(groups, g_list_reverse(group));
1013 
1014     g_list_free(operations);
1015 
1016     priv->grouped_operations = g_list_reverse(groups);
1017     return priv->grouped_operations;
1018 }
1019 
1020 gdouble
cut_sequence_matcher_get_ratio(CutSequenceMatcher * matcher)1021 cut_sequence_matcher_get_ratio (CutSequenceMatcher *matcher)
1022 {
1023     CutSequenceMatcherPrivate *priv;
1024     const GList *node;
1025     guint length;
1026 
1027     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
1028     if (priv->ratio >= 0.0)
1029         return priv->ratio;
1030 
1031     length = g_sequence_get_length(priv->from) + g_sequence_get_length(priv->to);
1032     if (length == 0) {
1033         priv->ratio = 1.0;
1034     } else {
1035         guint matches = 0;
1036 
1037         for (node = cut_sequence_matcher_get_blocks(matcher);
1038              node;
1039              node = g_list_next(node)) {
1040             CutSequenceMatchInfo *info = node->data;
1041 
1042             matches += info->size;
1043         }
1044         priv->ratio = 2.0 * matches / length;
1045     }
1046 
1047     return priv->ratio;
1048 }
1049 
1050 guint
cut_sequence_matcher_get_context_size(CutSequenceMatcher * matcher)1051 cut_sequence_matcher_get_context_size (CutSequenceMatcher *matcher)
1052 {
1053     return CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher)->context_size;
1054 }
1055 
1056 void
cut_sequence_matcher_set_context_size(CutSequenceMatcher * matcher,guint context_size)1057 cut_sequence_matcher_set_context_size (CutSequenceMatcher *matcher,
1058                                        guint               context_size)
1059 {
1060     CutSequenceMatcherPrivate *priv;
1061 
1062     priv = CUT_SEQUENCE_MATCHER_GET_PRIVATE(matcher);
1063     if (priv->context_size != context_size) {
1064         priv->context_size = context_size;
1065         dispose_goruped_operations(priv);
1066     }
1067 }
1068 
1069 /*
1070 vi:ts=4:nowrap:ai:expandtab:sw=4
1071 */
1072