1 /* stringmatcher_flexiblematch.cc
2  * This file belongs to Worker, a file manager for UN*X/X11.
3  * Copyright (C) 2012-2015 Ralf Hoffmann.
4  * You can contact me at: ralf@boomerangsworld.de
5  *   or http://www.boomerangsworld.de/worker
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20  */
21 
22 #include "stringmatcher_flexiblematch.hh"
23 #include "aguix/lowlevelfunc.h"
24 #include "aguix/utf8.hh"
25 #include <cctype>
26 #include <algorithm>
27 #include <functional>
28 #include <vector>
29 #include <list>
30 #include <map>
31 
StringMatcherFlexibleMatch()32 StringMatcherFlexibleMatch::StringMatcherFlexibleMatch() : m_match_case_sensitive( false )
33 {
34 }
35 
~StringMatcherFlexibleMatch()36 StringMatcherFlexibleMatch::~StringMatcherFlexibleMatch()
37 {
38 }
39 
StringMatcherFlexibleMatch(const StringMatcherFlexibleMatch & other)40 StringMatcherFlexibleMatch::StringMatcherFlexibleMatch( const StringMatcherFlexibleMatch &other )
41     : m_match_string( other.m_match_string ),
42       m_match_case_sensitive( other.m_match_case_sensitive ),
43       m_match_string_lowercase( other.m_match_string_lowercase )
44 {
45     buildStateDescription();
46 }
47 
operator =(const StringMatcherFlexibleMatch & rhs)48 StringMatcherFlexibleMatch &StringMatcherFlexibleMatch::operator=( const StringMatcherFlexibleMatch &rhs )
49 {
50     if ( &rhs != this ) {
51         m_match_string = rhs.m_match_string;
52         m_match_case_sensitive = rhs.m_match_case_sensitive;
53         m_match_string_lowercase = rhs.m_match_string_lowercase;
54 
55         m_state_list.clear();
56 
57         buildStateDescription();
58     }
59 
60     return *this;
61 }
62 
match(const std::string & str)63 bool StringMatcherFlexibleMatch::match( const std::string &str )
64 {
65     if ( countNonMatchingBlocks<false>( str, NULL ) >= 0 ) return true;
66 
67     return false;
68 }
69 
70 #if 0
71 // this variant works and is much simpler and faster but is not accurate, the returned
72 // block counter is only one value of multiple solutions.
73 int StringMatcherFlexibleMatch::countNonMatchingBlocks( const std::string &str )
74 {
75     int res = -1;
76     bool failed = false;
77 
78     const char *search_str = str.c_str();
79     const char *needle_str = m_match_string.c_str();
80     bool skipped_chars = true;  /* start with true since the res counter starts with -1
81                                    so for the first hit this counter will be raised to 0 */
82 
83     if ( ! m_match_case_sensitive ) {
84         needle_str = m_match_string_lowercase.c_str();
85     }
86 
87     while ( *needle_str != '\0' && ! failed ) {
88         size_t l = UTF8::getLenOfCharacter( needle_str );
89 
90         if ( l < 1 ) break;
91 
92         /* silently skip wildcard * */
93         if ( *needle_str != '*' ) {
94             bool found = false;
95 
96             /* now search in search_str for current char */
97             while ( *search_str != '\0' && ! found ) {
98                 size_t sl = UTF8::getLenOfCharacter( search_str );
99 
100                 if ( sl < 1 ) {
101                     failed = true;
102                     break;
103                 }
104 
105                 if ( sl == l &&
106                      memcmp( search_str, needle_str, l ) == 0 ) {
107                     found = true;
108                 } else if ( sl == 1 && l == 1 && ! m_match_case_sensitive &&
109                             std::tolower( *search_str ) == *needle_str ) {
110                     // TODO this is not really utf8 ready but should at least work
111                     // for utf8 input
112                     found = true;
113                 }
114 
115                 if ( ! found ) {
116                     skipped_chars = true;
117                 }
118 
119                 search_str += sl;
120             }
121 
122             if ( ! found ) {
123                 failed = true;
124                 break;
125             } else {
126                 if ( skipped_chars ) {
127                     res++;
128                 }
129                 skipped_chars = false;
130             }
131         }
132 
133         needle_str += l;
134     }
135 
136     if ( failed ) return -1;
137 
138     return res;
139 }
140 #endif
141 
setMatchString(const std::string & str)142 void StringMatcherFlexibleMatch::setMatchString( const std::string &str )
143 {
144     m_match_string = str;
145     createLoweredCase();
146 
147     buildStateDescription();
148 }
149 
getMatchString() const150 std::string StringMatcherFlexibleMatch::getMatchString() const
151 {
152     return m_match_string;
153 }
154 
setMatchCaseSensitive(bool nv)155 void StringMatcherFlexibleMatch::setMatchCaseSensitive( bool nv )
156 {
157     m_match_case_sensitive = nv;
158 
159     buildStateDescription();
160 }
161 
getMatchCaseSensitive() const162 bool StringMatcherFlexibleMatch::getMatchCaseSensitive() const
163 {
164     return m_match_case_sensitive;
165 }
166 
167 template <class T>
168 struct my_tolower : public std::unary_function<T,T>
169 {
operator ()my_tolower170     T operator() (const T& c) const
171     {
172         return std::tolower( c );
173     }
174 };
175 
createLoweredCase()176 void StringMatcherFlexibleMatch::createLoweredCase()
177 {
178     m_match_string_lowercase = m_match_string;
179 
180     std::transform( m_match_string_lowercase.begin(),
181                     m_match_string_lowercase.end(),
182                     m_match_string_lowercase.begin(),
183                     my_tolower<char>() );
184 }
185 
186 template <bool with_offsets> struct state_type {
187     int state_number;
188     int block_counter;
189     bool prev_was_hit;
190     int offsets; //dummy variable
191 
state_typestate_type192     state_type( int _state_number,
193                 int _block_counter,
194                 bool _prev_was_hit,
195                 int _offsets,
196                 size_t _offset,
197                 size_t _length ) : state_number( _state_number ),
198                                    block_counter( _block_counter ),
199                                    prev_was_hit( _prev_was_hit ),
200                                    offsets( _offsets )
201     {
202     }
203 };
204 
205 template <> struct state_type<true> {
206     int state_number;
207     int block_counter;
208     bool prev_was_hit;
209     std::vector< std::pair< size_t, size_t > > offsets;
210 
state_typestate_type211     state_type( int _state_number,
212                 int _block_counter,
213                 bool _prev_was_hit,
214                 const std::vector< std::pair< size_t, size_t > > &current_offsets,
215                 size_t _offset,
216                 size_t _length ) : state_number( _state_number ),
217                                    block_counter( _block_counter ),
218                                    prev_was_hit( _prev_was_hit )
219     {
220         offsets = current_offsets;
221         offsets.push_back( std::make_pair( _offset, _length ) );
222     }
223 };
224 
check_string(const char * s1,int l1,const char * s2,int l2,bool case_sensitive)225 static bool check_string( const char *s1, int l1,
226                           const char *s2, int l2,
227                           bool case_sensitive )
228 {
229     if ( l1 == l2 &&
230          memcmp( s1, s2, l1 ) == 0 ) {
231         return true;
232     } else if ( l1 == 1 && l2 == 1 && ! case_sensitive &&
233                 (char)std::tolower( *s1 ) == *s2 ) {
234         // TODO this is not really utf8 ready but should at least work
235         // for utf8 input
236         return true;
237     }
238 
239     return false;
240 }
241 
getSegments(const std::string & str,const std::list<state_type<with_segments>> & current_states,const int state_list_size,const int min_block_couunt,std::vector<size_t> * return_segments)242 template <bool with_segments> void getSegments( const std::string &str,
243                                                 const std::list< state_type<with_segments> > &current_states,
244                                                 const int state_list_size,
245                                                 const int min_block_couunt,
246                                                 std::vector< size_t > *return_segments )
247 {
248 }
249 
getSegments(const std::string & str,const std::list<state_type<true>> & current_states,const int state_list_size,const int min_block_count,std::vector<size_t> * return_segments)250 template <> void getSegments<true>( const std::string &str,
251                                     const std::list< state_type<true> > &current_states,
252                                     const int state_list_size,
253                                     const int min_block_count,
254                                     std::vector< size_t > *return_segments )
255 {
256     // now generate list of alternating segments similar to flexibleregex
257     for ( auto it1 = current_states.begin();
258           it1 != current_states.end();
259           it1++ ) {
260         if ( it1->state_number >= state_list_size ) {
261             if ( min_block_count == it1->block_counter &&
262                  it1->offsets.size() > 1 ) {
263                 std::vector< size_t > res;
264                 size_t last_o = 0;
265                 size_t last_segment_start = it1->offsets[1].first;
266                 size_t last_segment_end = it1->offsets[1].first + it1->offsets[1].second;
267 
268                 for ( int i = 2; i < (int)it1->offsets.size(); i++ ) {
269                     const size_t o = it1->offsets[i].first;
270 
271                     if ( o == last_segment_end ) {
272                         // merge matches
273                         last_segment_end = o + it1->offsets[i].second;
274                     } else {
275                         res.push_back( last_segment_start - last_o );
276                         res.push_back( last_segment_end - last_segment_start );
277                         last_o = last_segment_end;
278 
279                         last_segment_start = o;
280                         last_segment_end = o + it1->offsets[i].second;
281                     }
282                 }
283 
284                 res.push_back( last_segment_start - last_o );
285                 res.push_back( last_segment_end - last_segment_start );
286                 last_o = last_segment_end;
287 
288                 if ( last_o <= str.length() ) {
289                     res.push_back( str.length() - last_o );
290                 }
291 
292                 *return_segments = res;
293 
294                 break;
295             }
296         }
297     }
298 }
299 
countNonMatchingBlocks(const std::string & str,std::vector<size_t> * return_segments)300 template <bool with_segments> int StringMatcherFlexibleMatch::countNonMatchingBlocks( const std::string &str,
301                                                                                       std::vector< size_t > *return_segments )
302 {
303     // int pushes=0;
304     // int c= 0;
305     // int e=0;
306 
307     std::list< state_type<with_segments> > current_states;
308     std::vector< int > smallest_negative_block_count;
309     const int state_list_size = (int)m_state_list.size();
310 
311     if ( state_list_size < 1 ) return 0; // always matches for empty match strings
312 
313     smallest_negative_block_count.resize( state_list_size );
314     for ( unsigned int j = 0; j < smallest_negative_block_count.size(); j++ ) {
315         smallest_negative_block_count[j] = -2;
316     }
317 
318     current_states.push_back( state_type<with_segments>( 0, -1, false, {}, 0, 0 ) );
319 
320     const char *search_str = str.c_str();
321     const char *search_str_start = search_str;
322     bool exact_hit_found = false;
323     int min_accepted_block_count = -1;
324 
325     while ( *search_str != '\0' && ! exact_hit_found ) {
326         size_t sl = UTF8::getLenOfCharacter( search_str );
327 
328         if ( sl < 1 ) {
329             break;
330         }
331 
332         for ( auto it1 = current_states.begin();
333               it1 != current_states.end(); ) {
334             if ( it1->state_number < state_list_size ) {
335                 if ( min_accepted_block_count >= 0 &&
336                      it1->block_counter >= min_accepted_block_count ) {
337                     // there is already an accepted state for the same or smaller block count
338                     // so remove this state completely
339                     auto next = it1;
340                     next++;
341                     current_states.erase( it1 );
342                     it1=next;
343                     continue;
344                 } else {
345                     bool hit = false;
346                     if ( check_string( search_str, sl,
347                                        m_state_list[it1->state_number].sub_string,
348                                        m_state_list[it1->state_number].string_len,
349                                        m_match_case_sensitive ) ) {
350                         //pushes++;
351                         current_states.push_front( state_type<with_segments>( it1->state_number + 1,
352                                                                               it1->prev_was_hit ? it1->block_counter : it1->block_counter + 1,
353                                                                               true,
354                                                                               it1->offsets,
355                                                                               search_str - search_str_start,
356                                                                               sl) );
357 
358                         hit = true;
359                     }
360                     bool was_true = it1->prev_was_hit;
361 
362                     it1->prev_was_hit = false;
363 
364                     // the following is to optimize remove redundant states
365                     // it is not necessary
366                     if ( smallest_negative_block_count[it1->state_number] >= -1 &&
367                          it1->block_counter >= smallest_negative_block_count[it1->state_number] ) {
368                         // block count smaller than already seen
369                         if ( was_true ) {
370                             // state changed from true to false so remove the state
371                             auto next = it1;
372                             next++;
373                             current_states.erase( it1 );
374                             it1=next;
375                             //c++;
376                             continue;
377                         }
378                     } else {
379                         smallest_negative_block_count[it1->state_number] = it1->block_counter;
380 
381                         if ( was_true && hit ) {
382                             // current iterator was a hit, but there is a new match directly after so remove the old one
383                             auto next = it1;
384                             next++;
385                             current_states.erase( it1 );
386                             it1=next;
387                             //c++;
388                             continue;
389                         }
390                     }
391                 }
392             } else if ( it1->block_counter == 0 ) {
393                 // found exact hit
394                 exact_hit_found = true;
395                 break;
396             } else {
397                 if ( min_accepted_block_count == -1 ||
398                      it1->block_counter < min_accepted_block_count ) {
399                     min_accepted_block_count = it1->block_counter;
400                 }
401             }
402             it1++;
403         }
404 
405 #if 0
406         // the following is purely optional, it's just there to keep the number
407         // of states low. It will remove all states with the same state_number and prev_was_hit == false
408         // and only keep the one with the smallest block counter
409 
410         if ( current_states.size() > 1 ) {
411             for ( int i = 0; i < state_list_size; i++ ) {
412                 int smallest_block_counter = -2;
413                 for ( auto it1 = current_states.begin();
414                       it1 != current_states.end();
415                       it1++ ) {
416                     if ( it1->state_number == i &&
417                          it1->prev_was_hit == false ) {
418                         if ( smallest_block_counter == -2 ||
419                              it1->block_counter < smallest_block_counter ) {
420                             smallest_block_counter = it1->block_counter;
421                         }
422                     }
423                 }
424 
425                 if ( smallest_block_counter >= -1 ) {
426                     bool kept_one = false;
427 
428                     for ( auto it1 = current_states.begin();
429                           it1 != current_states.end(); ) {
430                         if ( it1->state_number == i &&
431                              it1->prev_was_hit == false ) {
432                             if ( it1->block_counter > smallest_block_counter ||
433                                  ( kept_one == true && it1->block_counter == smallest_block_counter ) ) {
434                                 auto next_it = it1;
435                                 next_it++;
436 
437                                 //e++;
438                                 current_states.erase( it1 );
439                                 it1 = next_it;
440                                 continue;
441                             } else {
442                                 kept_one = true;
443                             }
444                         }
445                         it1++;
446                     }
447                 }
448             }
449         }
450 #endif
451 
452         search_str += sl;
453     }
454 
455     // printf("pushes:%d\n",pushes);
456     // printf("c:%d\n",c);
457     // printf("e:%d\n",e);
458 
459     int min_block_count = -1;
460 
461     for ( auto it1 = current_states.begin();
462           it1 != current_states.end();
463           it1++ ) {
464         if ( it1->state_number >= state_list_size ) {
465             if ( min_block_count < 0 || min_block_count > it1->block_counter ) {
466                 min_block_count = it1->block_counter;
467             }
468         }
469     }
470 
471     if ( return_segments ) {
472         getSegments<with_segments>( str,
473                                     current_states,
474                                     state_list_size,
475                                     min_block_count,
476                                     return_segments );
477     }
478 
479     return min_block_count;
480 }
481 
482 template int StringMatcherFlexibleMatch::countNonMatchingBlocks<true>( const std::string &str,
483                                                                        std::vector< size_t > *return_segments );
484 template int StringMatcherFlexibleMatch::countNonMatchingBlocks<false>( const std::string &str,
485                                                                         std::vector< size_t > *return_segments );
486 
487 
buildStateDescription()488 void StringMatcherFlexibleMatch::buildStateDescription()
489 {
490     const char *needle_str = m_match_string.c_str();
491 
492     if ( ! m_match_case_sensitive ) {
493         needle_str = m_match_string_lowercase.c_str();
494     }
495 
496     m_state_list.clear();
497 
498     while ( *needle_str != '\0' ) {
499         size_t l = UTF8::getLenOfCharacter( needle_str );
500 
501         if ( l < 1 ) break;
502 
503         /* silently skip wildcard * */
504         if ( *needle_str != '*' ) {
505             m_state_list.push_back( state_description( needle_str, l ) );
506         }
507 
508         needle_str += l;
509     }
510 }
511