Logo Search packages:      
Sourcecode: icu version File versions

upropset.h

/*
**********************************************************************
*   Copyright (c) 2001-2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
#ifndef _UPROPSET_H_
#define _UPROPSET_H_

#include "unicode/utypes.h"
#include "unicode/uscript.h"

U_NAMESPACE_BEGIN

class UnicodeString;
class UnicodeSet;
class ParsePosition;
class Hashtable;

/**
 * INTERNAL CLASS implementing the UnicodeSet properties as outlined
 * at:
 *
 * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
 *
 * Recognized syntax:
 *
 * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
 * \p{foo} \P{foo}  - white space not allowed within "\p" or "\P"
 *
 * Other than the above restrictions, white space is ignored.  Case
 * is ignored except in "\p" and "\P".
 *
 * This class cannot be instantiated.  It has a public static method,
 * createPropertySet(), with takes a pattern to be parsed and returns
 * a new UnicodeSet.  Another public static method,
 * resemblesPattern(), returns true if a given pattern string appears
 * to be a property set pattern, and therefore should be passed in to
 * createPropertySet().
 *
 * NOTE: Current implementation is incomplete.  The following list
 * indicates which properties are supported.
 *
 *    + GeneralCategory
 *      CombiningClass
 *      BidiClass
 *      DecompositionType
 *    + NumericValue
 *      NumericType
 *      EastAsianWidth
 *      LineBreak
 *      JoiningType
 *    + Script
 *
 * '+' indicates a supported property.
 *
 * @author Alan Liu
 * @internal
 */
00060 class UnicodePropertySet {

 public:

    //----------------------------------------------------------------
    // Public API
    //----------------------------------------------------------------

    /**
     * Return true if the given position, in the given pattern, appears
     * to be the start of a property set pattern [:foo:], \p{foo}, or
     * \P{foo}.
     */
    static UBool resemblesPattern(const UnicodeString& pattern, int32_t pos);

    /**
     * Create a UnicodeSet by parsing the given pattern at the given
     * parse position.
     *
     * @param pattern the pattern string
     * @param ppos on entry, the position at which to begin parsing.
     * This shold be one of the locations marked '^':
     *
     *   [:blah:]     \p{blah}     \P{blah}
     *   ^       %    ^       %    ^       %
     *
     * On return, the position after the last character parsed, that is,
     * the locations marked '%'.  If the parse fails, ppos is returned
     * unchanged.
     * @return a newly-constructed UnicodeSet object, or null upon
     * failure.
     */
    static UnicodeSet* createFromPattern(const UnicodeString& pattern,
                                         ParsePosition& ppos);


    typedef UnicodeSet* (*SetFactory)(const UnicodeString& valueName);

 private:

    //----------------------------------------------------------------
    // Property set factory static methods
    // NOTE: This will change/go away when we implement UCharacter
    // based property retrieval.
    //----------------------------------------------------------------

    static UnicodeSet* createNumericValueSet(const UnicodeString& valueName);

    /**
     * Given a general category value name, create a corresponding
     * set and return it, or return null if the name is invalid.
     * @param valueName a pre-munged general category value name
     */
    static UnicodeSet* createCategorySet(const UnicodeString& valueName);

    /**
     * Given a script value name, create a corresponding set and
     * return it, or return null if the name is invalid.
     * @param valueName a pre-munged script value name
     */
    static UnicodeSet* createScriptSet(const UnicodeString& valueName);

    //----------------------------------------------------------------
    // Utility methods
    //----------------------------------------------------------------

    /**
     * Returns a UnicodeSet for the given category.  This set is
     * cached and returned again if this method is called again with
     * the same parameter.
     *
     * Callers MUST NOT MODIFY the returned set.
     */
    static const UnicodeSet& getCategorySet(int32_t cat);

    /**
     * Returns a UnicodeSet for the given script.  This set is
     * cached and returned again if this method is called again with
     * the same parameter.
     *
     * Callers MUST NOT MODIFY the returned set.
     */
    static const UnicodeSet& getScriptSet(UScriptCode script);

    /**
     * Given a string, munge it to upper case and lose the whitespace.
     * So "General Category " becomes "GENERALCATEGORY".  We munge all
     * type and value strings, and store all type and value keys
     * pre-munged.
     */
    static UnicodeString munge(const UnicodeString& str, int32_t start, int32_t limit);

    /**
     * Skip over a sequence of zero or more white space characters
     * at pos.  Return the index of the first non-white-space character
     * at or after pos, or str.length(), if there is none.
     */
    static int32_t skipWhitespace(const UnicodeString& str, int32_t pos);

    //----------------------------------------------------------------
    // Generic filter-based scanning code
    //
    // NOTE: In general, we don't want to do this!  This is a temporary
    // implementation until we have time for something that examines
    // the underlying UCharacter data structures in an intelligent
    // way.  Iterating over all code points is dumb.  What we want to
    // do, for instance, is iterate over internally-stored ranges
    // of characters that have a given property.
    //----------------------------------------------------------------

    /**
     * A filter that returns TRUE if the given code point should be
     * included in the UnicodeSet being constructed.
     */
    typedef UBool (*Filter)(UChar32 codePoint, void* context);

    /**
     * Set the given UnicodeSet to contain all code points for which
     * filter returns TRUE.  The context parameter is passed unchanged
     * to the filter function.
     */
    static void initSetFromFilter(UnicodeSet& set, Filter filter,
                                  void* context);

    //----------------------------------------------------------------
    // Type and value name maps
    //----------------------------------------------------------------

    /**
     * Add a type mapping to the name map.
     */
    static void addType(const UnicodeString& shortName,
                        const UnicodeString& longName,
                        SetFactory factory);

    /**
     * Add a value mapping to the name map.
     */
    static void addValue(Hashtable* map,
                         const UnicodeString& shortName,
                         const UnicodeString& longName,
                         int32_t value);

    static void init();

 private:
    //----------------------------------------------------------------
    // SetFactory <=> void*
    // I don't know why the compiler won't cast between these types.
    // They should be interconvertible.  Does C++ distinguish between
    // pointers into code and pointers into data?  In any case, we
    // convert between these types in a safe way here.
    //----------------------------------------------------------------
    
    union SetFactoryTok {
        void*       voidPointer;
        SetFactory  functionPointer;
    };

    inline static void* setFactoryToVoidPtr(SetFactory f) {
        SetFactoryTok tok;
        tok.functionPointer = f;
        return tok.voidPointer;
    }

    inline static SetFactory voidPtrToSetFactory(void* p) {
        SetFactoryTok tok;
        tok.voidPointer = p;
        return tok.functionPointer;
    }
};

U_NAMESPACE_END

#endif

Generated by  Doxygen 1.6.0   Back to index