1 /* stringmatcher_flexiblematch.cc
2 * This file belongs to Worker, a file manager for UN*X/X11.
3 * Copyright (C) 2012-2015 Ralf Hoffmann.
4 * You can contact me at: ralf@boomerangsworld.de
5 * or http://www.boomerangsworld.de/worker
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #include "stringmatcher_flexiblematch.hh"
23 #include "aguix/lowlevelfunc.h"
24 #include "aguix/utf8.hh"
25 #include <cctype>
26 #include <algorithm>
27 #include <functional>
28 #include <vector>
29 #include <list>
30 #include <map>
31
StringMatcherFlexibleMatch()32 StringMatcherFlexibleMatch::StringMatcherFlexibleMatch() : m_match_case_sensitive( false )
33 {
34 }
35
~StringMatcherFlexibleMatch()36 StringMatcherFlexibleMatch::~StringMatcherFlexibleMatch()
37 {
38 }
39
StringMatcherFlexibleMatch(const StringMatcherFlexibleMatch & other)40 StringMatcherFlexibleMatch::StringMatcherFlexibleMatch( const StringMatcherFlexibleMatch &other )
41 : m_match_string( other.m_match_string ),
42 m_match_case_sensitive( other.m_match_case_sensitive ),
43 m_match_string_lowercase( other.m_match_string_lowercase )
44 {
45 buildStateDescription();
46 }
47
operator =(const StringMatcherFlexibleMatch & rhs)48 StringMatcherFlexibleMatch &StringMatcherFlexibleMatch::operator=( const StringMatcherFlexibleMatch &rhs )
49 {
50 if ( &rhs != this ) {
51 m_match_string = rhs.m_match_string;
52 m_match_case_sensitive = rhs.m_match_case_sensitive;
53 m_match_string_lowercase = rhs.m_match_string_lowercase;
54
55 m_state_list.clear();
56
57 buildStateDescription();
58 }
59
60 return *this;
61 }
62
match(const std::string & str)63 bool StringMatcherFlexibleMatch::match( const std::string &str )
64 {
65 if ( countNonMatchingBlocks<false>( str, NULL ) >= 0 ) return true;
66
67 return false;
68 }
69
70 #if 0
71 // this variant works and is much simpler and faster but is not accurate, the returned
72 // block counter is only one value of multiple solutions.
73 int StringMatcherFlexibleMatch::countNonMatchingBlocks( const std::string &str )
74 {
75 int res = -1;
76 bool failed = false;
77
78 const char *search_str = str.c_str();
79 const char *needle_str = m_match_string.c_str();
80 bool skipped_chars = true; /* start with true since the res counter starts with -1
81 so for the first hit this counter will be raised to 0 */
82
83 if ( ! m_match_case_sensitive ) {
84 needle_str = m_match_string_lowercase.c_str();
85 }
86
87 while ( *needle_str != '\0' && ! failed ) {
88 size_t l = UTF8::getLenOfCharacter( needle_str );
89
90 if ( l < 1 ) break;
91
92 /* silently skip wildcard * */
93 if ( *needle_str != '*' ) {
94 bool found = false;
95
96 /* now search in search_str for current char */
97 while ( *search_str != '\0' && ! found ) {
98 size_t sl = UTF8::getLenOfCharacter( search_str );
99
100 if ( sl < 1 ) {
101 failed = true;
102 break;
103 }
104
105 if ( sl == l &&
106 memcmp( search_str, needle_str, l ) == 0 ) {
107 found = true;
108 } else if ( sl == 1 && l == 1 && ! m_match_case_sensitive &&
109 std::tolower( *search_str ) == *needle_str ) {
110 // TODO this is not really utf8 ready but should at least work
111 // for utf8 input
112 found = true;
113 }
114
115 if ( ! found ) {
116 skipped_chars = true;
117 }
118
119 search_str += sl;
120 }
121
122 if ( ! found ) {
123 failed = true;
124 break;
125 } else {
126 if ( skipped_chars ) {
127 res++;
128 }
129 skipped_chars = false;
130 }
131 }
132
133 needle_str += l;
134 }
135
136 if ( failed ) return -1;
137
138 return res;
139 }
140 #endif
141
setMatchString(const std::string & str)142 void StringMatcherFlexibleMatch::setMatchString( const std::string &str )
143 {
144 m_match_string = str;
145 createLoweredCase();
146
147 buildStateDescription();
148 }
149
getMatchString() const150 std::string StringMatcherFlexibleMatch::getMatchString() const
151 {
152 return m_match_string;
153 }
154
setMatchCaseSensitive(bool nv)155 void StringMatcherFlexibleMatch::setMatchCaseSensitive( bool nv )
156 {
157 m_match_case_sensitive = nv;
158
159 buildStateDescription();
160 }
161
getMatchCaseSensitive() const162 bool StringMatcherFlexibleMatch::getMatchCaseSensitive() const
163 {
164 return m_match_case_sensitive;
165 }
166
167 template <class T>
168 struct my_tolower : public std::unary_function<T,T>
169 {
operator ()my_tolower170 T operator() (const T& c) const
171 {
172 return std::tolower( c );
173 }
174 };
175
createLoweredCase()176 void StringMatcherFlexibleMatch::createLoweredCase()
177 {
178 m_match_string_lowercase = m_match_string;
179
180 std::transform( m_match_string_lowercase.begin(),
181 m_match_string_lowercase.end(),
182 m_match_string_lowercase.begin(),
183 my_tolower<char>() );
184 }
185
186 template <bool with_offsets> struct state_type {
187 int state_number;
188 int block_counter;
189 bool prev_was_hit;
190 int offsets; //dummy variable
191
state_typestate_type192 state_type( int _state_number,
193 int _block_counter,
194 bool _prev_was_hit,
195 int _offsets,
196 size_t _offset,
197 size_t _length ) : state_number( _state_number ),
198 block_counter( _block_counter ),
199 prev_was_hit( _prev_was_hit ),
200 offsets( _offsets )
201 {
202 }
203 };
204
205 template <> struct state_type<true> {
206 int state_number;
207 int block_counter;
208 bool prev_was_hit;
209 std::vector< std::pair< size_t, size_t > > offsets;
210
state_typestate_type211 state_type( int _state_number,
212 int _block_counter,
213 bool _prev_was_hit,
214 const std::vector< std::pair< size_t, size_t > > ¤t_offsets,
215 size_t _offset,
216 size_t _length ) : state_number( _state_number ),
217 block_counter( _block_counter ),
218 prev_was_hit( _prev_was_hit )
219 {
220 offsets = current_offsets;
221 offsets.push_back( std::make_pair( _offset, _length ) );
222 }
223 };
224
check_string(const char * s1,int l1,const char * s2,int l2,bool case_sensitive)225 static bool check_string( const char *s1, int l1,
226 const char *s2, int l2,
227 bool case_sensitive )
228 {
229 if ( l1 == l2 &&
230 memcmp( s1, s2, l1 ) == 0 ) {
231 return true;
232 } else if ( l1 == 1 && l2 == 1 && ! case_sensitive &&
233 (char)std::tolower( *s1 ) == *s2 ) {
234 // TODO this is not really utf8 ready but should at least work
235 // for utf8 input
236 return true;
237 }
238
239 return false;
240 }
241
getSegments(const std::string & str,const std::list<state_type<with_segments>> & current_states,const int state_list_size,const int min_block_couunt,std::vector<size_t> * return_segments)242 template <bool with_segments> void getSegments( const std::string &str,
243 const std::list< state_type<with_segments> > ¤t_states,
244 const int state_list_size,
245 const int min_block_couunt,
246 std::vector< size_t > *return_segments )
247 {
248 }
249
getSegments(const std::string & str,const std::list<state_type<true>> & current_states,const int state_list_size,const int min_block_count,std::vector<size_t> * return_segments)250 template <> void getSegments<true>( const std::string &str,
251 const std::list< state_type<true> > ¤t_states,
252 const int state_list_size,
253 const int min_block_count,
254 std::vector< size_t > *return_segments )
255 {
256 // now generate list of alternating segments similar to flexibleregex
257 for ( auto it1 = current_states.begin();
258 it1 != current_states.end();
259 it1++ ) {
260 if ( it1->state_number >= state_list_size ) {
261 if ( min_block_count == it1->block_counter &&
262 it1->offsets.size() > 1 ) {
263 std::vector< size_t > res;
264 size_t last_o = 0;
265 size_t last_segment_start = it1->offsets[1].first;
266 size_t last_segment_end = it1->offsets[1].first + it1->offsets[1].second;
267
268 for ( int i = 2; i < (int)it1->offsets.size(); i++ ) {
269 const size_t o = it1->offsets[i].first;
270
271 if ( o == last_segment_end ) {
272 // merge matches
273 last_segment_end = o + it1->offsets[i].second;
274 } else {
275 res.push_back( last_segment_start - last_o );
276 res.push_back( last_segment_end - last_segment_start );
277 last_o = last_segment_end;
278
279 last_segment_start = o;
280 last_segment_end = o + it1->offsets[i].second;
281 }
282 }
283
284 res.push_back( last_segment_start - last_o );
285 res.push_back( last_segment_end - last_segment_start );
286 last_o = last_segment_end;
287
288 if ( last_o <= str.length() ) {
289 res.push_back( str.length() - last_o );
290 }
291
292 *return_segments = res;
293
294 break;
295 }
296 }
297 }
298 }
299
countNonMatchingBlocks(const std::string & str,std::vector<size_t> * return_segments)300 template <bool with_segments> int StringMatcherFlexibleMatch::countNonMatchingBlocks( const std::string &str,
301 std::vector< size_t > *return_segments )
302 {
303 // int pushes=0;
304 // int c= 0;
305 // int e=0;
306
307 std::list< state_type<with_segments> > current_states;
308 std::vector< int > smallest_negative_block_count;
309 const int state_list_size = (int)m_state_list.size();
310
311 if ( state_list_size < 1 ) return 0; // always matches for empty match strings
312
313 smallest_negative_block_count.resize( state_list_size );
314 for ( unsigned int j = 0; j < smallest_negative_block_count.size(); j++ ) {
315 smallest_negative_block_count[j] = -2;
316 }
317
318 current_states.push_back( state_type<with_segments>( 0, -1, false, {}, 0, 0 ) );
319
320 const char *search_str = str.c_str();
321 const char *search_str_start = search_str;
322 bool exact_hit_found = false;
323 int min_accepted_block_count = -1;
324
325 while ( *search_str != '\0' && ! exact_hit_found ) {
326 size_t sl = UTF8::getLenOfCharacter( search_str );
327
328 if ( sl < 1 ) {
329 break;
330 }
331
332 for ( auto it1 = current_states.begin();
333 it1 != current_states.end(); ) {
334 if ( it1->state_number < state_list_size ) {
335 if ( min_accepted_block_count >= 0 &&
336 it1->block_counter >= min_accepted_block_count ) {
337 // there is already an accepted state for the same or smaller block count
338 // so remove this state completely
339 auto next = it1;
340 next++;
341 current_states.erase( it1 );
342 it1=next;
343 continue;
344 } else {
345 bool hit = false;
346 if ( check_string( search_str, sl,
347 m_state_list[it1->state_number].sub_string,
348 m_state_list[it1->state_number].string_len,
349 m_match_case_sensitive ) ) {
350 //pushes++;
351 current_states.push_front( state_type<with_segments>( it1->state_number + 1,
352 it1->prev_was_hit ? it1->block_counter : it1->block_counter + 1,
353 true,
354 it1->offsets,
355 search_str - search_str_start,
356 sl) );
357
358 hit = true;
359 }
360 bool was_true = it1->prev_was_hit;
361
362 it1->prev_was_hit = false;
363
364 // the following is to optimize remove redundant states
365 // it is not necessary
366 if ( smallest_negative_block_count[it1->state_number] >= -1 &&
367 it1->block_counter >= smallest_negative_block_count[it1->state_number] ) {
368 // block count smaller than already seen
369 if ( was_true ) {
370 // state changed from true to false so remove the state
371 auto next = it1;
372 next++;
373 current_states.erase( it1 );
374 it1=next;
375 //c++;
376 continue;
377 }
378 } else {
379 smallest_negative_block_count[it1->state_number] = it1->block_counter;
380
381 if ( was_true && hit ) {
382 // current iterator was a hit, but there is a new match directly after so remove the old one
383 auto next = it1;
384 next++;
385 current_states.erase( it1 );
386 it1=next;
387 //c++;
388 continue;
389 }
390 }
391 }
392 } else if ( it1->block_counter == 0 ) {
393 // found exact hit
394 exact_hit_found = true;
395 break;
396 } else {
397 if ( min_accepted_block_count == -1 ||
398 it1->block_counter < min_accepted_block_count ) {
399 min_accepted_block_count = it1->block_counter;
400 }
401 }
402 it1++;
403 }
404
405 #if 0
406 // the following is purely optional, it's just there to keep the number
407 // of states low. It will remove all states with the same state_number and prev_was_hit == false
408 // and only keep the one with the smallest block counter
409
410 if ( current_states.size() > 1 ) {
411 for ( int i = 0; i < state_list_size; i++ ) {
412 int smallest_block_counter = -2;
413 for ( auto it1 = current_states.begin();
414 it1 != current_states.end();
415 it1++ ) {
416 if ( it1->state_number == i &&
417 it1->prev_was_hit == false ) {
418 if ( smallest_block_counter == -2 ||
419 it1->block_counter < smallest_block_counter ) {
420 smallest_block_counter = it1->block_counter;
421 }
422 }
423 }
424
425 if ( smallest_block_counter >= -1 ) {
426 bool kept_one = false;
427
428 for ( auto it1 = current_states.begin();
429 it1 != current_states.end(); ) {
430 if ( it1->state_number == i &&
431 it1->prev_was_hit == false ) {
432 if ( it1->block_counter > smallest_block_counter ||
433 ( kept_one == true && it1->block_counter == smallest_block_counter ) ) {
434 auto next_it = it1;
435 next_it++;
436
437 //e++;
438 current_states.erase( it1 );
439 it1 = next_it;
440 continue;
441 } else {
442 kept_one = true;
443 }
444 }
445 it1++;
446 }
447 }
448 }
449 }
450 #endif
451
452 search_str += sl;
453 }
454
455 // printf("pushes:%d\n",pushes);
456 // printf("c:%d\n",c);
457 // printf("e:%d\n",e);
458
459 int min_block_count = -1;
460
461 for ( auto it1 = current_states.begin();
462 it1 != current_states.end();
463 it1++ ) {
464 if ( it1->state_number >= state_list_size ) {
465 if ( min_block_count < 0 || min_block_count > it1->block_counter ) {
466 min_block_count = it1->block_counter;
467 }
468 }
469 }
470
471 if ( return_segments ) {
472 getSegments<with_segments>( str,
473 current_states,
474 state_list_size,
475 min_block_count,
476 return_segments );
477 }
478
479 return min_block_count;
480 }
481
482 template int StringMatcherFlexibleMatch::countNonMatchingBlocks<true>( const std::string &str,
483 std::vector< size_t > *return_segments );
484 template int StringMatcherFlexibleMatch::countNonMatchingBlocks<false>( const std::string &str,
485 std::vector< size_t > *return_segments );
486
487
buildStateDescription()488 void StringMatcherFlexibleMatch::buildStateDescription()
489 {
490 const char *needle_str = m_match_string.c_str();
491
492 if ( ! m_match_case_sensitive ) {
493 needle_str = m_match_string_lowercase.c_str();
494 }
495
496 m_state_list.clear();
497
498 while ( *needle_str != '\0' ) {
499 size_t l = UTF8::getLenOfCharacter( needle_str );
500
501 if ( l < 1 ) break;
502
503 /* silently skip wildcard * */
504 if ( *needle_str != '*' ) {
505 m_state_list.push_back( state_description( needle_str, l ) );
506 }
507
508 needle_str += l;
509 }
510 }
511