Logo Search packages:      
Sourcecode: icu version File versions

strsrch.h

/*
**********************************************************************
*   Copyright (C) 1999-2000 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  03/22/2000   helena      Creation.
**********************************************************************
*/
#ifndef STRSRCH_H
#define STRSRCH_H

#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/tblcoll.h"
#include "unicode/brkiter.h"
#include "srchiter.h"


class SearchIterator;
/**
 * <code>StringSearch</code> is a <code>SearchIterator</code> that provides
 * language-sensitive text searching based on the comparison rules defined
 * in a {@link RuleBasedCollator} object.
 * Instances of <code>StringSearch</code> function as iterators
 * maintain a current position and scan over text returning the index of
 * characters where the pattern occurs and the length of each match.
 * <p>
 * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
 * algorithm that has been adapted to work with the large character set of
 * Unicode.  See "Efficient Text Searching in Java", to be published in
 * <i>Java Report</i> in February, 1999, for further information on the algorithm.
 * <p>
 * Consult the <code>SearchIterator</code> documentation for information on
 * and examples of how to use instances of this class to implement text
 * searching.  <code>SearchIterator</code> provides all of the necessary
 * API; this class only provides constructors and internal implementation
 * methods.
 *
 * @see SearchIterator
 * @see RuleBasedCollator
 *
 * @author Laura Werner
 * @version 1.0
 */

class StringSearch : public SearchIterator
{
public:
    /**
     * Construct a <code>StringSearch</code> object using a specific collator and set
     * of boundary-detection rules.
     * <p>
     * @param pat       The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param coll      A <code>RuleBasedCollator</code> object which defines the
     *                  language-sensitive comparison rules used to determine 
     *                  whether text in the pattern and target matches.
     *
     * @param breaker   A <code>BreakIterator</code> object used to constrain the matches
     *                  that are found.  Matches whose start and end indices
     *                  in the target text are not boundaries as determined
     *                  by the <code>BreakIterator</code> are ignored.  If this behavior
     *                  is not desired, <code>null</code> can be passed in instead.
     */
    StringSearch(const UnicodeString& pat, 
                        CharacterIterator* target,
                        RuleBasedCollator* coll, 
                        BreakIterator* breaker,
                        UErrorCode& status);

    /**
     * Construct a <code>StringSearch</code> object using a specific collator.
     * <p>
     * @param pattern   The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param collator  A <code>RuleBasedCollator</code> object which defines the
     *                  language-sensitive comparison rules used to determine 
     *                  whether text in the pattern and target matches.
     */
    StringSearch(const UnicodeString& pattern,
                 CharacterIterator* target,
                 RuleBasedCollator* collator,
                 UErrorCode& status);

    /**
     * copy constructor
     */
    StringSearch(const StringSearch& that);

    /**
     * Construct a <code>StringSearch</code> object using the collator and
     * character boundary detection rules for a given locale
     * <p>
     * @param pattern   The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param loc       The locale whose collation and break-detection rules
     *                  should be used.
     *
     * @exception       ClassCastException thrown if the collator for the specified
     *                  locale is not a RuleBasedCollator.
     */
    StringSearch(const UnicodeString& pattern, 
                 CharacterIterator* target, 
                 const Locale& loc,
                 UErrorCode& status);
    /**
     * Construct a <code>StringSearch</code> object using the collator for the default
     * locale
     * <p>
     * @param pattern   The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param collator  A <code>RuleBasedCollator</code> object which defines the
     *                  language-sensitive comparison rules used to determine 
     *                  whether text in the pattern and target matches.
     */
    StringSearch(const UnicodeString& pattern, 
                 const UnicodeString& target,
                 UErrorCode& status);

    virtual ~StringSearch(void);
    /**
     * Assignment operator.  Sets this iterator to have the same behavior,
     * and iterate over the same text, as the one passed in.
     */
    StringSearch& operator=(const StringSearch& that);

    /**
     * Equality operator.  Returns TRUE if both BreakIterators are of the
     * same class, have the same behavior, and iterate over the same text.
     */
    virtual UBool operator==(const SearchIterator& that) const;

    /**
     * Not-equal operator.  If operator== returns TRUE, this returns FALSE,
     * and vice versa.
     */
    UBool operator!=(const SearchIterator& that) const;

    /**
     * Returns a newly-constructed RuleBasedBreakIterator with the same
     * behavior, and iterating over the same text, as this one.
     */
    virtual SearchIterator* clone(void) const;

    //-------------------------------------------------------------------
    // Getters and Setters
    //-------------------------------------------------------------------
    
    /**
     * Sets this object's strength property. The strength determines the
     * minimum level of difference considered significant during a
     * search.  Generally, {@link Collator#TERTIARY} and 
     * {@link Collator#IDENTICAL} indicate that all differences are
     * considered significant, {@link Collator#SECONDARY} indicates
     * that upper/lower case distinctions should be ignored, and
     * {@link Collator#PRIMARY} indicates that both case and accents
     * should be ignored.  However, the exact meanings of these constants
     * are determined by individual Collator objects.
     * <p>
     * @see Collator#PRIMARY
     * @see Collator#SECONDARY
     * @see Collator#TERTIARY
     * @see Collator#IDENTICAL
     */
     void setStrength(Collator::ECollationStrength newStrength, UErrorCode& status);
    
    
    /**
     * Returns this object's strength property, which indicates what level
     * of differences are considered significant during a search.
     * <p>
     * @see #setStrength
     */
00183      Collator::ECollationStrength getStrength(void) const{ return strength; }
    
    /**
     * Set the collator to be used for this string search.  Also changes
     * the search strength to match that of the new collator.
     * <p>
     * This method causes internal data such as Boyer-Moore shift tables
     * to be recalculated, but the iterator's position is unchanged.
     * <p>
     * @see #getCollator
     */
     void setCollator(const RuleBasedCollator* coll, UErrorCode& status);
    
    /**
     * Return the RuleBasedCollator being used for this string search.
     */
    const RuleBasedCollator&     getCollator() const;
    
    /**
     * Set the pattern for which to search.  
     * This method causes internal data such as Boyer-Moore shift tables
     * to be recalculated, but the iterator's position is unchanged.
     */
    void setPattern(const UnicodeString& pat, UErrorCode& status);
    
    /**
     * Returns the pattern for which this object is searching.
     */
    const UnicodeString& getPattern() const;
    
    /**
     * Set the target text which should be searched and resets the
     * iterator's position to point before the start of the new text.
     * This method is useful if you want to re-use an iterator to
     * search for the same pattern within a different body of text.
     */
    virtual void setTarget(const UnicodeString& newText);    

    /**
     * Set the target text which should be searched and resets the
     * iterator's position to point before the start of the target text.
     * This method is useful if you want to re-use an iterator to
     * search for the same pattern within a different body of text.
     *
     * @see #getTarget
     */
    virtual void adoptTarget(CharacterIterator* iterator);

    /** Reset iterator
     */
    virtual void reset(void);
    /**
     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
     * This method is to implement a simple version of RTTI, since not all
     * C++ compilers support genuine RTTI.  Polymorphic operator==() and
     * clone() methods call this method.
     *
     * @return          The class ID for this object. All objects of a
     *                  given class have the same class ID.  Objects of
     *                  other classes have different class IDs.
     */
    inline virtual UClassID getDynamicClassID(void) const;

    /**
     * Returns the class ID for this class.  This is useful only for
     * comparing to a return value from getDynamicClassID().  For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return          The class ID for all objects of this class.
     */
    inline static UClassID getStaticClassID(void);

protected:
    //-------------------------------------------------------------------
    // Privates
    //-------------------------------------------------------------------

    /**
     * Search forward for matching text, starting at a given location.
     * Clients should not call this method directly; instead they should call
     * {@link SearchIterator#next}.
     * <p>
     * If a match is found, this method returns the index at which the match
     * starts and calls {@link SearchIterator#setMatchLength}
     * with the number of characters in the target
     * text that make up the match.  If no match is found, the method returns
     * <code>DONE</code> and does not call <tt>setMatchLength</tt>.
     * <p>
     * @param start The index in the target text at which the search starts.
     *
     * @return      The index at which the matched text in the target starts, or DONE
     *              if no match was found.
     * <p>
     * @see SearchIterator#next
     * @see SearchIterator#DONE
     */
    virtual int32_t handleNext(int32_t start, UErrorCode& status);
    /**
     * Search backward for matching text ,starting at a given location.
     * Clients should not call this method directly; instead they should call
     * <code>SearchIterator.previous()</code>, which this method overrides.
     * <p>
     * If a match is found, this method returns the index at which the match
     * starts and calls {@link SearchIterator#setMatchLength}
     * with the number of characters in the target
     * text that make up the match.  If no match is found, the method returns
     * <code>DONE</code> and does not call <tt>setMatchLength</tt>.
     * <p>
     * @param start The index in the target text at which the search starts.
     *
     * @return      The index at which the matched text in the target starts, or DONE
     *              if no match was found.
     * <p>
     * @see SearchIterator#previous
     * @see SearchIterator#DONE
     */
    virtual int32_t handlePrev(int32_t start, UErrorCode& status);
private:
    /**
     * Return a bitmask that will select only the portions of a collation 
     * element that are significant at the given strength level.
     */
    static int32_t getMask(Collator::ECollationStrength strength);
    

    void initialize(UErrorCode& status);
    /**
     * Method used by StringSearch to determine how far to the right to
     * shift the pattern during a Boyer-Moore search.  
     *
     * @param curValue  The current value in the target text
     * @param curIndex  The index in the pattern at which we failed to match
     *                  curValue in the target text.
     */
    int32_t getShift( int32_t curValue, int32_t curIndex ) const;

    /**
     * Method used by StringSearch to determine how far to the left to
     * shift the pattern during a reverse Boyer-Moore search.  
     *
     * @param curValue  The current value in the target text
     * @param curIndex  The index in the pattern at which we failed to match
     *                  curValue in the target text.
     */
    int32_t getBackShift( int32_t curValue, int32_t curIndex ) const;

    /**
     * Hash a collation element from its full size (32 bits) down into a
     * value that can be used as an index into the shift tables.  Right
     * now we do a modulus by the size of the hash table.
     *
     * TODO: At some point I should experiment to see whether a slightly
     * more complicated hash function gives us a better distribution
     * on multilingual text.  I doubt it will have much effect on
     * performance, though.
     */
    static int32_t hash(int32_t order);

    //------------------------------------------------------------------------
    // Private Data
    //
    CollationElementIterator      *iter;
    RuleBasedCollator             *collator;
    /* HSYS ? Why?  Changes to this will not affect collator.  no changes to the comparsion result */
    Collator::ECollationStrength  strength;
    
    //------------------------------------------------------------------------
    // Everything from here on down is the data used to represent the
    // Boyer-Moore shift tables and the code that generates and manipulates
    // them.
    //    
    int32_t         *valueList;
    int32_t         valueListLen;
    int32_t         shiftTable[256];
    int32_t         backShiftTable[256];

    UnicodeString   pattern;            // The pattern string
    int32_t         normLen;        // num. of collation elements in pattern.
    int32_t         minLen;         // Min of composed, decomposed versions
    int32_t         maxLen;         // Max
    CollationElementIterator *it;   // to be removed

private:
    /* to be removed */
    void dumpTables();
    /**
     * Class ID
     */
00374     static char fgClassID;
};

inline UBool ::StringSearch::operator!=(const SearchIterator& that) const 
{
    return !operator==(that);
}

inline UClassID ::StringSearch::getDynamicClassID(void) const 
{
    return ::StringSearch::getStaticClassID();
}

inline UClassID ::StringSearch::getStaticClassID(void) 
{
    return (UClassID)(&fgClassID);
}

#endif


Generated by  Doxygen 1.6.0   Back to index