worker-4.10.0/src/stringmatcher_flexiblematch.cc

/* stringmatcher_flexiblematch.cc
 * This file belongs to Worker, a file manager for UN*X/X11.
 * Copyright (C) 2012-2015 Ralf Hoffmann.
 * You can contact me at: ralf@boomerangsworld.de
 *   or http://www.boomerangsworld.de/worker
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "stringmatcher_flexiblematch.hh"
#include "aguix/lowlevelfunc.h"
#include "aguix/utf8.hh"
#include <cctype>
#include <cstring>
#include <algorithm>
#include <functional>
#include <list>
#include <map>
#include <vector>

/* Default constructor: starts with case-insensitive matching; no state
 * description is built here. */
StringMatcherFlexibleMatch::StringMatcherFlexibleMatch()
    : m_match_case_sensitive( false )
{
}

/* Nothing to release explicitly; the members clean up after themselves. */
StringMatcherFlexibleMatch::~StringMatcherFlexibleMatch() = default;

/* Copy constructor: takes over the match string, its lower-cased copy and
 * the case-sensitivity flag from other. The state list is not copied; it is
 * rebuilt from this object's own strings. */
StringMatcherFlexibleMatch::StringMatcherFlexibleMatch( const StringMatcherFlexibleMatch &other ) :
    m_match_string( other.m_match_string ),
    m_match_case_sensitive( other.m_match_case_sensitive ),
    m_match_string_lowercase( other.m_match_string_lowercase )
{
    // regenerate the per-character states for the freshly copied strings
    buildStateDescription();
}

/* Copy assignment: takes over the settings of rhs and rebuilds the state
 * list from this object's own strings. Self-assignment is a no-op.
 * Returns a reference to this object. */
StringMatcherFlexibleMatch &StringMatcherFlexibleMatch::operator=( const StringMatcherFlexibleMatch &rhs )
{
    if ( this == &rhs ) {
        // assigning to ourselves, nothing to do
        return *this;
    }

    m_match_string = rhs.m_match_string;
    m_match_case_sensitive = rhs.m_match_case_sensitive;
    m_match_string_lowercase = rhs.m_match_string_lowercase;

    // drop the old states before generating the ones for the new strings
    m_state_list.clear();
    buildStateDescription();

    return *this;
}

/* Returns true if str matches the current match string, i.e. if
 * countNonMatchingBlocks() reports a non-negative block count for it. */
bool StringMatcherFlexibleMatch::match( const std::string &str )
{
    // no segment information is needed here, only the block count
    const int block_count = countNonMatchingBlocks<false>( str, nullptr );

    return block_count >= 0;
}

#if 0
// Disabled greedy variant of countNonMatchingBlocks(), kept for reference.
// It works and is much simpler and faster, but it is not accurate: for every
// needle character it takes the first occurrence in the remaining input, so
// the returned block counter is only one value out of multiple possible
// solutions.
// Returns -1 if str does not match, otherwise the counted number of blocks
// (starting at 0 for the first hit).
int StringMatcherFlexibleMatch::countNonMatchingBlocks( const std::string &str )
{
    int res = -1;
    bool failed = false;

    const char *search_str = str.c_str();
    const char *needle_str = m_match_string.c_str();
    bool skipped_chars = true;  /* start with true since the res counter starts with -1
                                   so for the first hit this counter will be raised to 0 */

    // case-insensitive matching compares against the lower-cased needle
    if ( ! m_match_case_sensitive ) {
        needle_str = m_match_string_lowercase.c_str();
    }

    // outer loop: one needle character per iteration
    while ( *needle_str != '\0' && ! failed ) {
        size_t l = UTF8::getLenOfCharacter( needle_str );

        if ( l < 1 ) break;

        /* silently skip wildcard * */
        if ( *needle_str != '*' ) {
            bool found = false;

            /* now search in search_str for current char */
            while ( *search_str != '\0' && ! found ) {
                size_t sl = UTF8::getLenOfCharacter( search_str );

                if ( sl < 1 ) {
                    failed = true;
                    break;
                }

                if ( sl == l &&
                     memcmp( search_str, needle_str, l ) == 0 ) {
                    found = true;
                } else if ( sl == 1 && l == 1 && ! m_match_case_sensitive &&
                            std::tolower( *search_str ) == *needle_str ) {
                    // TODO this is not really utf8 ready but should at least work
                    // for utf8 input
                    found = true;
                }

                // a character that is passed over ends the current block of hits
                if ( ! found ) {
                    skipped_chars = true;
                }

                search_str += sl;
            }

            if ( ! found ) {
                // input exhausted before the needle character was found
                failed = true;
                break;
            } else {
                // a hit after skipped characters starts a new block
                if ( skipped_chars ) {
                    res++;
                }
                skipped_chars = false;
            }
        }

        needle_str += l;
    }

    if ( failed ) return -1;

    return res;
}
#endif

/* Sets a new match string and refreshes everything derived from it. */
void StringMatcherFlexibleMatch::setMatchString( const std::string &str )
{
    m_match_string = str;

    // derived data, in this order: first the lower-cased copy, then the
    // state list that is generated from one of the two strings
    createLoweredCase();
    buildStateDescription();
}

/* Returns a copy of the match string exactly as it was set (not lower-cased). */
std::string StringMatcherFlexibleMatch::getMatchString() const
{
    return m_match_string;
}

/* Switches case-sensitive matching on (nv == true) or off (nv == false). */
void StringMatcherFlexibleMatch::setMatchCaseSensitive( bool nv )
{
    m_match_case_sensitive = nv;

    // the flag decides which string the states are generated from,
    // so the state list has to be regenerated
    buildStateDescription();
}

/* Returns true if matching is currently case sensitive. */
bool StringMatcherFlexibleMatch::getMatchCaseSensitive() const
{
    return m_match_case_sensitive;
}

/*
 * Function object that lower-cases a single character with std::tolower().
 *
 * It no longer derives from std::unary_function (deprecated in C++11,
 * removed in C++17); the two typedefs that base class provided are
 * declared directly so that code relying on them keeps working.
 */
template <class T>
struct my_tolower
{
    typedef T argument_type;
    typedef T result_type;

    /* Returns the lower-case counterpart of c, or c itself if there is none. */
    T operator() (const T& c) const
    {
        // std::tolower() is only defined for EOF and for values representable
        // as unsigned char. A plain char holding a byte >= 0x80 (e.g. part of
        // a UTF-8 sequence) may be negative, so byte-wide values are routed
        // through unsigned char first. Wider types are passed on unchanged.
        const int code = ( sizeof( T ) == 1 )
            ? static_cast<int>( static_cast<unsigned char>( c ) )
            : static_cast<int>( c );

        return static_cast<T>( std::tolower( code ) );
    }
};

void StringMatcherFlexibleMatch::createLoweredCase()
{
    m_match_string_lowercase = m_match_string;

    std::transform( m_match_string_lowercase.begin(),
                    m_match_string_lowercase.end(),
                    m_match_string_lowercase.begin(),
                    my_tolower<char>() );
}

/*
 * One partial match tracked by countNonMatchingBlocks().
 *
 * Generic variant (used for with_offsets == false): it does not record
 * where the hits were found. The offsets member and the last three
 * constructor arguments exist only so that both variants can be
 * constructed and accessed with the same code.
 */
template <bool with_offsets> struct state_type {
    int state_number;
    int block_counter;
    bool prev_was_hit;
    int offsets; // placeholder, carries no information

    state_type( int number,
                int blocks,
                bool was_hit,
                int placeholder,
                size_t /* offset, unused */,
                size_t /* length, unused */ )
        : state_number( number ),
          block_counter( blocks ),
          prev_was_hit( was_hit ),
          offsets( placeholder )
    {
    }
};

/*
 * Variant that additionally records the position of every hit.
 *
 * offsets holds (byte offset, byte length) pairs: a copy of the list of the
 * state this one was derived from, followed by the pair passed to the
 * constructor.
 */
template <> struct state_type<true> {
    int state_number;
    int block_counter;
    bool prev_was_hit;
    std::vector< std::pair< size_t, size_t > > offsets;

    state_type( int number,
                int blocks,
                bool was_hit,
                const std::vector< std::pair< size_t, size_t > > &inherited_offsets,
                size_t hit_offset,
                size_t hit_length )
        : state_number( number ),
          block_counter( blocks ),
          prev_was_hit( was_hit ),
          offsets( inherited_offsets )
    {
        // extend the inherited history by the hit that created this state
        offsets.push_back( std::pair< size_t, size_t >( hit_offset, hit_length ) );
    }
};

/*
 * Compares one character of the searched string with one character of the
 * match string.
 *
 * s1, l1: character from the searched string and its length in bytes
 * s2, l2: character from the match string and its length in bytes; for a
 *         case-insensitive comparison it is expected to be lower-cased
 *         already, since only s1 is converted here
 * case_sensitive: if false, single-byte characters are also considered
 *         equal when the lower-cased s1 equals s2
 *
 * Returns true if the characters are considered equal. Characters of
 * different length never match, and negative lengths are rejected.
 */
static bool check_string( const char *s1, int l1,
                          const char *s2, int l2,
                          bool case_sensitive )
{
    // a negative length would be converted to a huge size for memcmp()
    if ( l1 != l2 || l1 < 0 ) {
        return false;
    }

    if ( memcmp( s1, s2, static_cast<size_t>( l1 ) ) == 0 ) {
        return true;
    }

    if ( l1 == 1 && ! case_sensitive ) {
        // TODO this is not really utf8 ready but should at least work
        // for utf8 input

        // std::tolower() must not be called with a negative value other than
        // EOF, so convert the (possibly signed) char to unsigned char first
        const unsigned char c = static_cast<unsigned char>( *s1 );

        return static_cast<char>( std::tolower( c ) ) == *s2;
    }

    return false;
}

/*
 * Generic variant (used for with_segments == false): the states carry no
 * offset information, so there is nothing to compute and return_segments is
 * left untouched.
 */
template <bool with_segments> void getSegments( const std::string &str,
                                                const std::list< state_type<with_segments> > &current_states,
                                                const int state_list_size,
                                                const int min_block_count,
                                                std::vector< size_t > *return_segments )
{
}

/*
 * Converts the recorded hit positions of the best accepted state into a list
 * of alternating segment lengths, similar to flexibleregex: the number of
 * bytes before the first matched run, the length of that run, the gap to the
 * next run, the length of the next run, and so on, followed by the number of
 * bytes after the last run.
 *
 * str:              the string that was searched
 * current_states:   all states left after the search
 * state_list_size:  number of characters in the pattern; a state whose
 *                   state_number has reached this value is an accepted one
 * min_block_count:  block counter of the best match; the first accepted
 *                   state with exactly this counter is used
 * return_segments:  receives the result; it is left untouched if no
 *                   suitable state exists. Must not be NULL.
 */
template <> void getSegments<true>( const std::string &str,
                                    const std::list< state_type<true> > &current_states,
                                    const int state_list_size,
                                    const int min_block_count,
                                    std::vector< size_t > *return_segments )
{
    for ( const auto &state : current_states ) {
        // only complete matches with the requested block count qualify
        if ( state.state_number < state_list_size ) continue;
        if ( state.block_counter != min_block_count ) continue;

        // entry 0 is not evaluated, hits are read from index 1 on
        const auto &hits = state.offsets;
        if ( hits.size() <= 1 ) continue;

        std::vector< size_t > segments;
        size_t consumed = 0;                        // end of the last emitted run
        size_t run_begin = hits[1].first;           // current run of adjacent hits
        size_t run_end = run_begin + hits[1].second;

        for ( size_t idx = 2; idx < hits.size(); ++idx ) {
            const size_t hit_begin = hits[idx].first;

            if ( hit_begin != run_end ) {
                // gap before this hit: emit the finished run and open a new one
                segments.push_back( run_begin - consumed );
                segments.push_back( run_end - run_begin );
                consumed = run_end;

                run_begin = hit_begin;
            }

            // either way the current run now ends behind this hit
            run_end = hit_begin + hits[idx].second;
        }

        // emit the run that is still open
        segments.push_back( run_begin - consumed );
        segments.push_back( run_end - run_begin );
        consumed = run_end;

        // finally the unmatched remainder of the string
        if ( consumed <= str.length() ) {
            segments.push_back( str.length() - consumed );
        }

        *return_segments = segments;

        return;
    }
}

/*
 * Matches str against the state list built from the match string. The
 * characters of the match string have to occur in str in the same order,
 * but other characters may lie between them.
 *
 * All possible ways to match are followed in parallel as a list of states.
 * Each state stores
 *   - state_number:  index of the next entry of m_state_list to match; a
 *                    value equal to m_state_list.size() means the whole
 *                    match string has been matched (accepted state)
 *   - block_counter: starts at -1 and is raised by one whenever a hit does
 *                    not directly follow another hit, so a match consisting
 *                    of one contiguous run has the value 0
 *   - prev_was_hit:  whether the state was created by a hit on the
 *                    character processed just before
 *
 * with_segments:   selects the state type; only the <true> variant records
 *                  the positions of the hits
 * str:             the string to search in
 * return_segments: if not NULL it is handed to getSegments<with_segments>(),
 *                  which fills it only in the <true> variant. It is not
 *                  touched if the state list is empty.
 *
 * Returns 0 if the state list is empty, -1 if str does not match, otherwise
 * the smallest block_counter of all accepted states.
 */
template <bool with_segments> int StringMatcherFlexibleMatch::countNonMatchingBlocks( const std::string &str,
                                                                                      std::vector< size_t > *return_segments )
{
    // int pushes=0;
    // int c= 0;
    // int e=0;

    // all partial and complete matches that are still being followed
    std::list< state_type<with_segments> > current_states;
    // per state_number: smallest block_counter seen so far for a state that
    // was processed in the loop below; -2 means no such state yet
    std::vector< int > smallest_negative_block_count;
    const int state_list_size = (int)m_state_list.size();

    if ( state_list_size < 1 ) return 0; // always matches for empty match strings

    smallest_negative_block_count.resize( state_list_size );
    for ( unsigned int j = 0; j < smallest_negative_block_count.size(); j++ ) {
        smallest_negative_block_count[j] = -2;
    }

    // initial state: nothing matched yet. The block counter starts at -1 so
    // that the first hit raises it to 0.
    current_states.push_back( state_type<with_segments>( 0, -1, false, {}, 0, 0 ) );

    const char *search_str = str.c_str();
    const char *search_str_start = search_str;
    bool exact_hit_found = false;
    // smallest block counter of the accepted states seen so far, -1 if none
    int min_accepted_block_count = -1;

    // process str one character (of sl bytes) at a time
    while ( *search_str != '\0' && ! exact_hit_found ) {
        size_t sl = UTF8::getLenOfCharacter( search_str );

        if ( sl < 1 ) {
            // no usable character length, stop scanning; the states found so
            // far are still evaluated below
            break;
        }

        // States created in this pass are pushed to the front of the list,
        // i.e. before the current iterator position, so they are first
        // visited when the next character is processed.
        for ( auto it1 = current_states.begin();
              it1 != current_states.end(); ) {
            if ( it1->state_number < state_list_size ) {
                // partial match, there are still characters left to match
                if ( min_accepted_block_count >= 0 &&
                     it1->block_counter >= min_accepted_block_count ) {
                    // there is already an accepted state for the same or smaller block count
                    // so remove this state completely
                    auto next = it1;
                    next++;
                    current_states.erase( it1 );
                    it1=next;
                    continue;
                } else {
                    bool hit = false;
                    if ( check_string( search_str, sl,
                                       m_state_list[it1->state_number].sub_string,
                                       m_state_list[it1->state_number].string_len,
                                       m_match_case_sensitive ) ) {
                        // The current character matches: add a successor
                        // state that waits for the next pattern character.
                        // A hit directly after a hit continues the current
                        // block, otherwise a new block starts.
                        //pushes++;
                        current_states.push_front( state_type<with_segments>( it1->state_number + 1,
                                                                              it1->prev_was_hit ? it1->block_counter : it1->block_counter + 1,
                                                                              true,
                                                                              it1->offsets,
                                                                              search_str - search_str_start,
                                                                              sl) );

                        hit = true;
                    }
                    bool was_true = it1->prev_was_hit;

                    // the state itself (unless removed below) keeps waiting
                    // for the same pattern character, but any later hit will
                    // no longer directly follow its previous hit
                    it1->prev_was_hit = false;

                    // the following is to optimize remove redundant states
                    // it is not necessary
                    if ( smallest_negative_block_count[it1->state_number] >= -1 &&
                         it1->block_counter >= smallest_negative_block_count[it1->state_number] ) {
                        // block count is not smaller than one already seen for this state number
                        if ( was_true ) {
                            // state changed from true to false so remove the state
                            auto next = it1;
                            next++;
                            current_states.erase( it1 );
                            it1=next;
                            //c++;
                            continue;
                        }
                    } else {
                        // first state seen for this state number, or one with
                        // a smaller block count than before: remember it
                        smallest_negative_block_count[it1->state_number] = it1->block_counter;

                        if ( was_true && hit ) {
                            // current iterator was a hit, but there is a new match directly after so remove the old one
                            auto next = it1;
                            next++;
                            current_states.erase( it1 );
                            it1=next;
                            //c++;
                            continue;
                        }
                    }
                }
            } else if ( it1->block_counter == 0 ) {
                // found exact hit
                // (accepted state with a single contiguous block; nothing can
                // be better, so the search stops)
                exact_hit_found = true;
                break;
            } else {
                // accepted state with more than one block: remember the
                // smallest block count so that worse states can be dropped
                if ( min_accepted_block_count == -1 ||
                     it1->block_counter < min_accepted_block_count ) {
                    min_accepted_block_count = it1->block_counter;
                }
            }
            it1++;
        }

#if 0
        // the following is purely optional, it's just there to keep the number
        // of states low. It will remove all states with the same state_number and prev_was_hit == false
        // and only keep the one with the smallest block counter

        if ( current_states.size() > 1 ) {
            for ( int i = 0; i < state_list_size; i++ ) {
                int smallest_block_counter = -2;
                for ( auto it1 = current_states.begin();
                      it1 != current_states.end();
                      it1++ ) {
                    if ( it1->state_number == i &&
                         it1->prev_was_hit == false ) {
                        if ( smallest_block_counter == -2 ||
                             it1->block_counter < smallest_block_counter ) {
                            smallest_block_counter = it1->block_counter;
                        }
                    }
                }

                if ( smallest_block_counter >= -1 ) {
                    bool kept_one = false;

                    for ( auto it1 = current_states.begin();
                          it1 != current_states.end(); ) {
                        if ( it1->state_number == i &&
                             it1->prev_was_hit == false ) {
                            if ( it1->block_counter > smallest_block_counter ||
                                 ( kept_one == true && it1->block_counter == smallest_block_counter ) ) {
                                auto next_it = it1;
                                next_it++;

                                //e++;
                                current_states.erase( it1 );
                                it1 = next_it;
                                continue;
                            } else {
                                kept_one = true;
                            }
                        }
                        it1++;
                    }
                }
            }
        }
#endif

        search_str += sl;
    }

    // printf("pushes:%d\n",pushes);
    // printf("c:%d\n",c);
    // printf("e:%d\n",e);

    // Determine the smallest block counter of all accepted states. This also
    // covers the states created while the last character was processed. The
    // value stays -1 if no state has matched the whole match string.
    int min_block_count = -1;

    for ( auto it1 = current_states.begin();
          it1 != current_states.end();
          it1++ ) {
        if ( it1->state_number >= state_list_size ) {
            if ( min_block_count < 0 || min_block_count > it1->block_counter ) {
                min_block_count = it1->block_counter;
            }
        }
    }

    // optionally report the segments of a best match
    if ( return_segments ) {
        getSegments<with_segments>( str,
                                    current_states,
                                    state_list_size,
                                    min_block_count,
                                    return_segments );
    }

    return min_block_count;
}

// The member template is defined in this source file, so both variants are
// instantiated explicitly here: <true> records the hit positions (segments),
// <false> only counts blocks.
template int StringMatcherFlexibleMatch::countNonMatchingBlocks<true>( const std::string &str,
                                                                       std::vector< size_t > *return_segments );
template int StringMatcherFlexibleMatch::countNonMatchingBlocks<false>( const std::string &str,
                                                                        std::vector< size_t > *return_segments );


/*
 * Regenerates m_state_list from the current match string: one entry per
 * character, in order, with every '*' left out. For case-insensitive
 * matching the lower-cased copy of the match string is used.
 *
 * Each entry is created from a pointer into the member string and the byte
 * length of the character, so this has to be called again whenever the
 * strings or the case-sensitivity flag change.
 */
void StringMatcherFlexibleMatch::buildStateDescription()
{
    m_state_list.clear();

    const std::string &needle = m_match_case_sensitive ? m_match_string
                                                       : m_match_string_lowercase;

    for ( const char *pos = needle.c_str(); *pos != '\0'; ) {
        const size_t char_len = UTF8::getLenOfCharacter( pos );

        // no usable character length: stop, keeping the entries built so far
        if ( char_len < 1 ) break;

        /* silently skip wildcard * */
        if ( *pos != '*' ) {
            m_state_list.push_back( state_description( pos, char_len ) );
        }

        pos += char_len;
    }
}