Logo Search packages:      
Sourcecode: icu version File versions  Download package

UnicodeSet Class Reference

#include <uniset.h>

Inheritance diagram for UnicodeSet:
Collaboration diagram for UnicodeSet:

List of all members.

Public Types

enum  { MIN_VALUE = 0, MAX_VALUE = 0x10ffff }

Public Member Functions

virtual UnicodeSet & add (UChar32 start, UChar32 end)
UnicodeSet & add (UChar32 c)
UnicodeSet & add (const UnicodeString &s)
UnicodeSet & addAll (const UnicodeString &s)
virtual UnicodeSet & addAll (const UnicodeSet &c)
virtual void addMatchSetTo (UnicodeSet &toUnionTo) const
UnicodeSet & applyIntPropertyValue (UProperty prop, int32_t value, UErrorCode &ec)
UnicodeSet & applyPattern (const UnicodeString &pattern, UErrorCode &status)
UnicodeSet & applyPattern (const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
UnicodeSet & applyPattern (const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
UnicodeSet & applyPropertyAlias (const UnicodeString &prop, const UnicodeString &value, UErrorCode &ec)
UChar32 charAt (int32_t index) const
virtual UnicodeSet & clear (void)
virtual UnicodeFunctor * clone () const
UnicodeFunctor * cloneAsThawed () const
UnicodeSet & closeOver (int32_t attribute)
virtual UnicodeSet & compact ()
virtual UnicodeSet & complement (void)
virtual UnicodeSet & complement (UChar32 start, UChar32 end)
UnicodeSet & complement (UChar32 c)
UnicodeSet & complement (const UnicodeString &s)
UnicodeSet & complementAll (const UnicodeString &s)
virtual UnicodeSet & complementAll (const UnicodeSet &c)
virtual UBool contains (UChar32 c) const
virtual UBool contains (UChar32 start, UChar32 end) const
UBool contains (const UnicodeString &s) const
virtual UBool containsAll (const UnicodeSet &c) const
UBool containsAll (const UnicodeString &s) const
UBool containsNone (UChar32 start, UChar32 end) const
UBool containsNone (const UnicodeSet &c) const
UBool containsNone (const UnicodeString &s) const
UBool containsSome (UChar32 start, UChar32 end) const
UBool containsSome (const UnicodeSet &s) const
UBool containsSome (const UnicodeString &s) const
UnicodeFunctor * freeze ()
virtual UClassID getDynamicClassID (void) const
virtual int32_t getRangeCount (void) const
virtual UChar32 getRangeEnd (int32_t index) const
virtual UChar32 getRangeStart (int32_t index) const
virtual int32_t hashCode (void) const
int32_t indexOf (UChar32 c) const
UBool isBogus (void) const
virtual UBool isEmpty (void) const
UBool isFrozen () const
virtual UMatchDegree matches (const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
UBool operator!= (const UnicodeSet &o) const
UnicodeSet & operator= (const UnicodeSet &o)
virtual UBool operator== (const UnicodeSet &o) const
virtual UnicodeSet & remove (UChar32 start, UChar32 end)
UnicodeSet & remove (UChar32 c)
UnicodeSet & remove (const UnicodeString &s)
UnicodeSet & removeAll (const UnicodeString &s)
virtual UnicodeSet & removeAll (const UnicodeSet &c)
virtual UnicodeSet & removeAllStrings ()
virtual UnicodeSet & retain (UChar32 start, UChar32 end)
UnicodeSet & retain (UChar32 c)
UnicodeSet & retainAll (const UnicodeString &s)
virtual UnicodeSet & retainAll (const UnicodeSet &c)
int32_t serialize (uint16_t *dest, int32_t destCapacity, UErrorCode &ec) const
UnicodeSet & set (UChar32 start, UChar32 end)
virtual void setData (const TransliterationRuleData *)
void setToBogus ()
virtual int32_t size (void) const
int32_t span (const UChar *s, int32_t length, USetSpanCondition spanCondition) const
int32_t span (const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const
int32_t spanBack (const UChar *s, int32_t length, USetSpanCondition spanCondition) const
int32_t spanBack (const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const
int32_t spanBackUTF8 (const char *s, int32_t length, USetSpanCondition spanCondition) const
int32_t spanUTF8 (const char *s, int32_t length, USetSpanCondition spanCondition) const
virtual UnicodeMatcher * toMatcher () const
virtual UnicodeString & toPattern (UnicodeString &result, UBool escapeUnprintable=FALSE) const
virtual UnicodeReplacer * toReplacer () const
USet * toUSet ()
const USet * toUSet () const
 UnicodeSet ()
 UnicodeSet (UChar32 start, UChar32 end)
 UnicodeSet (const UnicodeString &pattern, UErrorCode &status)
 UnicodeSet (const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
 UnicodeSet (const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
 UnicodeSet (const UnicodeSet &o)
virtual ~UnicodeSet ()

Static Public Member Functions

static UnicodeSet *U_EXPORT2 createFrom (const UnicodeString &s)
static UnicodeSet *U_EXPORT2 createFromAll (const UnicodeString &s)
static UnicodeSet * fromUSet (USet *uset)
static const UnicodeSet * fromUSet (const USet *uset)
static UClassID U_EXPORT2 getStaticClassID (void)
static void U_EXPORT2 operator delete (void *p) U_NO_THROW
static void U_EXPORT2 operator delete (void *, void *) U_NO_THROW
static void U_EXPORT2 operator delete[] (void *p) U_NO_THROW
static void *U_EXPORT2 operator new (size_t size) U_NO_THROW
static void *U_EXPORT2 operator new (size_t, void *ptr) U_NO_THROW
static void *U_EXPORT2 operator new[] (size_t size) U_NO_THROW
static UBool resemblesPattern (const UnicodeString &pattern, int32_t pos)

Private Types

enum  { kIsBogus = 1 }
typedef UBool(* Filter )(UChar32 codePoint, void *context)

Private Member Functions

void _add (const UnicodeString &s)
UnicodeString & _generatePattern (UnicodeString &result, UBool escapeUnprintable) const
UnicodeString & _toPattern (UnicodeString &result, UBool escapeUnprintable) const
void add (const UChar32 *other, int32_t otherLen, int8_t polarity)
UBool allocateStrings (UErrorCode &status)
void applyFilter (Filter filter, void *context, int32_t src, UErrorCode &status)
void applyPattern (RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, UErrorCode &ec)
UnicodeSet & applyPropertyPattern (const UnicodeString &pattern, ParsePosition &ppos, UErrorCode &ec)
void applyPropertyPattern (RuleCharacterIterator &chars, UnicodeString &rebuiltPat, UErrorCode &ec)
void ensureBufferCapacity (int32_t newLen, UErrorCode &ec)
void ensureCapacity (int32_t newLen, UErrorCode &ec)
void exclusiveOr (const UChar32 *other, int32_t otherLen, int8_t polarity)
int32_t findCodePoint (UChar32 c) const
const UnicodeString * getString (int32_t index) const
int32_t getStringCount () const
virtual UBool matchesIndexValue (uint8_t v) const
void releasePattern ()
void retain (const UChar32 *other, int32_t otherLen, int8_t polarity)
void setPattern (const UnicodeString &newPat)
void swapBuffers (void)
 UnicodeSet (const UnicodeSet &o, UBool)

Static Private Member Functions

static void _appendToPat (UnicodeString &buf, const UnicodeString &s, UBool escapeUnprintable)
static void _appendToPat (UnicodeString &buf, UChar32 c, UBool escapeUnprintable)
static const UnicodeSet * getInclusions (int32_t src, UErrorCode &status)
static int32_t getSingleCP (const UnicodeString &s)
static int32_t matchRest (const Replaceable &text, int32_t start, int32_t limit, const UnicodeString &s)
static UBool resemblesPropertyPattern (const UnicodeString &pattern, int32_t pos)
static UBool resemblesPropertyPattern (RuleCharacterIterator &chars, int32_t iterOpts)

Private Attributes

int32_t bufferCapacity
int32_t capacity
uint8_t fFlags
int32_t len
int32_t patLen


class UnicodeSetIterator
class USetAccess

Detailed Description

A mutable set of Unicode characters and multicharacter strings. Objects of this class represent character classes used in regular expressions. A character class specifies a subset of Unicode code points. Legal code points are U+0000 to U+10FFFF, inclusive.

The UnicodeSet class is not designed to be subclassed.

UnicodeSet supports two APIs. The first is the operand API that allows the caller to modify the value of a UnicodeSet object. It conforms to Java 2's java.util.Set interface, although UnicodeSet does not actually implement that interface. All methods of Set are supported, with the modification that they take a character range or single character instead of an Object, and they take a UnicodeSet instead of a Collection. The operand API may be thought of in terms of boolean logic: a boolean OR is implemented by add, a boolean AND is implemented by retain, a boolean XOR is implemented by complement taking an argument, and a boolean NOT is implemented by complement with no argument. In terms of traditional set theory function names, add is a union, retain is an intersection, remove is an asymmetric difference, and complement with no argument is a set complement with respect to the superset range MIN_VALUE-MAX_VALUE.

The second API is the applyPattern()/toPattern() API from the java.text.Format-derived classes. Unlike the methods that add characters, add categories, and control the logic of the set, the method applyPattern() sets all attributes of a UnicodeSet at once, based on a string pattern.

Pattern syntax

Patterns are accepted by the constructors and the applyPattern() methods and returned by the toPattern() method. These patterns follow a syntax similar to that employed by version 8 regular expression character classes. Here are some simple examples:

[] No characters
[a] The character 'a'
[ae] The characters 'a' and 'e'
[a-e] The characters 'a' through 'e' inclusive, in Unicode code point order
[\u4E01] The character U+4E01
[a{ab}{ac}] The character 'a' and the multicharacter strings "ab" and "ac"
[\p{Lu}] All characters in the general category Uppercase Letter

Any character may be preceded by a backslash in order to remove any special meaning. White space characters, as defined by UCharacter.isWhitespace(), are ignored, unless they are escaped.

Property patterns specify a set of characters having a certain property as defined by the Unicode standard. Both the POSIX-like "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a complete list of supported property patterns, see the User's Guide for UnicodeSet at http://icu-project.org/userguide/unicodeSet.html. Actual determination of property data is defined by the underlying Unicode database as implemented by UCharacter.

Patterns specify individual characters, ranges of characters, and Unicode property sets. When elements are concatenated, they specify their union. To complement a set, place a '^' immediately after the opening '['. Property patterns are inverted by modifying their delimiters: "[:^foo:]" and "\\P{foo}". In any other location, '^' has no special meaning.

Ranges are indicated by placing a '-' between two characters, as in "a-z". This specifies the range of all characters from the left to the right, in Unicode order. If the left character is greater than or equal to the right character it is a syntax error. If a '-' occurs as the first character after the opening '[' or '[^', or if it occurs as the last character before the closing ']', then it is taken as a literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same set of three characters, 'a', 'b', and '-'.

Sets may be intersected using the '&' operator, or the asymmetric set difference may be taken using the '-' operator. For example, "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters with values less than 4096. Operators ('&' and '-') have equal precedence and bind left-to-right. Thus "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for difference; intersection is commutative.

[a] The set containing 'a'
[a-z] The set containing 'a' through 'z' and all letters in between, in Unicode order
[^a-z] The set containing all characters but 'a' through 'z', that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
[[pat1][pat2]] The union of sets specified by pat1 and pat2
[[pat1]&[pat2]] The intersection of sets specified by pat1 and pat2
[[pat1]-[pat2]] The asymmetric difference of sets specified by pat1 and pat2
[:Lu:] or \p{Lu} The set of characters having the specified Unicode property; in this case, Unicode uppercase letters
[:^Lu:] or \P{Lu} The set of characters not having the given Unicode property

Warning: you cannot add an empty string ("") to a UnicodeSet.

Formal syntax

pattern :=  ('[' '^'? item* ']') | property
item :=  char | (char '-' char) | pattern-expr
pattern-expr :=  pattern | pattern-expr pattern | pattern-expr op pattern
op :=  '&' | '-'
special :=  '[' | ']' | '-'
char :=  any character that is not special
| ('\' any character)
| ('\u' hex hex hex hex)
hex :=  any character for which Character.digit(c, 16) returns a non-negative result
property :=  a Unicode property set pattern

a := b   a may be replaced by b
a? zero or one instance of a
a* zero or more instances of a
a | b either a or b
'a' the literal string between the quotes


  • Most UnicodeSet methods do not take a UErrorCode parameter because there are usually very few opportunities for failure other than a shortage of memory, error codes in low-level C++ string methods would be inconvenient, and the error code as the last parameter (ICU convention) would prevent the use of default parameter values. Instead, such methods set the UnicodeSet into a "bogus" state (see isBogus()) if an error occurs.
Author: Alan Liu
Stable: ICU 2.0

Definition at line 272 of file uniset.h.

The documentation for this class was generated from the following files:

Generated by  Doxygen 1.6.0   Back to index