Logo Search packages:      
Sourcecode: icu version File versions  Download package

usearch.h File Reference

C API: StringSearch. More...

#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"
Include dependency graph for usearch.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Defines

#define USEARCH_DONE   -1

Typedefs

typedef struct UStringSearch UStringSearch

Enumerations

enum  USearchAttribute { USEARCH_OVERLAP, USEARCH_CANONICAL_MATCH, USEARCH_ELEMENT_COMPARISON, USEARCH_ATTRIBUTE_COUNT }
enum  USearchAttributeValue {
  USEARCH_DEFAULT = -1, USEARCH_OFF, USEARCH_ON, USEARCH_STANDARD_ELEMENT_COMPARISON,
  USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, USEARCH_ATTRIBUTE_VALUE_COUNT
}

Functions

U_STABLE void U_EXPORT2 usearch_close (UStringSearch *searchiter)
U_STABLE int32_t U_EXPORT2 usearch_first (UStringSearch *strsrch, UErrorCode *status)
U_STABLE int32_t U_EXPORT2 usearch_following (UStringSearch *strsrch, int32_t position, UErrorCode *status)
U_STABLE USearchAttributeValue
U_EXPORT2 
usearch_getAttribute (const UStringSearch *strsrch, USearchAttribute attribute)
U_STABLE const UBreakIterator
*U_EXPORT2 
usearch_getBreakIterator (const UStringSearch *strsrch)
U_STABLE UCollator *U_EXPORT2 usearch_getCollator (const UStringSearch *strsrch)
U_STABLE int32_t U_EXPORT2 usearch_getMatchedLength (const UStringSearch *strsrch)
U_STABLE int32_t U_EXPORT2 usearch_getMatchedStart (const UStringSearch *strsrch)
U_STABLE int32_t U_EXPORT2 usearch_getMatchedText (const UStringSearch *strsrch, UChar *result, int32_t resultCapacity, UErrorCode *status)
U_STABLE int32_t U_EXPORT2 usearch_getOffset (const UStringSearch *strsrch)
U_STABLE const UChar *U_EXPORT2 usearch_getPattern (const UStringSearch *strsrch, int32_t *length)
U_STABLE const UChar *U_EXPORT2 usearch_getText (const UStringSearch *strsrch, int32_t *length)
U_STABLE int32_t U_EXPORT2 usearch_last (UStringSearch *strsrch, UErrorCode *status)
U_STABLE int32_t U_EXPORT2 usearch_next (UStringSearch *strsrch, UErrorCode *status)
U_STABLE UStringSearch *U_EXPORT2 usearch_open (const UChar *pattern, int32_t patternlength, const UChar *text, int32_t textlength, const char *locale, UBreakIterator *breakiter, UErrorCode *status)
U_STABLE UStringSearch *U_EXPORT2 usearch_openFromCollator (const UChar *pattern, int32_t patternlength, const UChar *text, int32_t textlength, const UCollator *collator, UBreakIterator *breakiter, UErrorCode *status)
U_STABLE int32_t U_EXPORT2 usearch_preceding (UStringSearch *strsrch, int32_t position, UErrorCode *status)
U_STABLE int32_t U_EXPORT2 usearch_previous (UStringSearch *strsrch, UErrorCode *status)
U_STABLE void U_EXPORT2 usearch_reset (UStringSearch *strsrch)
U_INTERNAL UBool U_EXPORT2 usearch_search (UStringSearch *strsrch, int32_t startIdx, int32_t *matchStart, int32_t *matchLimit, UErrorCode *status)
U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards (UStringSearch *strsrch, int32_t startIdx, int32_t *matchStart, int32_t *matchLimit, UErrorCode *status)
U_STABLE void U_EXPORT2 usearch_setAttribute (UStringSearch *strsrch, USearchAttribute attribute, USearchAttributeValue value, UErrorCode *status)
U_STABLE void U_EXPORT2 usearch_setBreakIterator (UStringSearch *strsrch, UBreakIterator *breakiter, UErrorCode *status)
U_STABLE void U_EXPORT2 usearch_setCollator (UStringSearch *strsrch, const UCollator *collator, UErrorCode *status)
U_STABLE void U_EXPORT2 usearch_setOffset (UStringSearch *strsrch, int32_t position, UErrorCode *status)
U_STABLE void U_EXPORT2 usearch_setPattern (UStringSearch *strsrch, const UChar *pattern, int32_t patternlength, UErrorCode *status)
U_STABLE void U_EXPORT2 usearch_setText (UStringSearch *strsrch, const UChar *text, int32_t textlength, UErrorCode *status)

Detailed Description

C API: StringSearch.

C APIs for an engine that provides language-sensitive text searching based on the comparison rules defined in a UCollator data struct (see ucol.h). This ensures that language eccentricities can be handled, e.g. for the German collator, characters ß and SS will be matched if case is chosen to be ignored. See the "ICU Collation Design Document" for more information.

The algorithm implemented is a modified form of the Boyer-Moore search. For more information on the algorithm, see "Efficient Text Searching in Java", published in Java Report in February 1999.

There are 2 match options for selection:
Let S' be the sub-string of a text string S between the offsets start and end <start, end>.
A pattern string P matches a text string S at the offsets <start, end> if

 
 option 1. Some canonical equivalent of P matches some canonical equivalent 
           of S'
 option 2. P matches S' and if P starts or ends with a combining mark, 
           there exists no non-ignorable combining mark before or after S' 
           in S respectively. 
 

Option 2 is the default.

This search has APIs similar to those of other text iteration mechanisms such as the break iterators in ubrk.h. Using these APIs, it is easy to scan through text looking for all occurrences of a given pattern. This search iterator allows changing of direction by calling a reset followed by a next or previous. Though a direction change can occur without calling reset first, this operation comes with some speed penalty. Generally, the matches found in the forward direction will be the same as the matches found in the backward direction, in reverse order.

usearch.h provides APIs to specify the starting position within the text string to be searched, e.g. usearch_setOffset, usearch_preceding and usearch_following. Since the starting position will be set exactly as it is specified, please take note that there are some dangerous positions from which the search may return incorrect results:

  • The midst of a substring that requires normalization.
  • If the following match is to be found, the position should not be a second character that needs to be swapped with the preceding character. Conversely, if the preceding match is to be found, the position to search from should not be a first character that needs to be swapped with the next character. E.g., certain Thai and Lao characters require swapping.
  • If a following pattern match is to be found, any position within a contracting sequence except the first will fail. Conversely, if a preceding pattern match is to be found, an invalid starting point would be any character within a contracting sequence except the last.

A breakiterator can be used if only matches at logical breaks are desired. Using a breakiterator will only give you results that exactly match the boundaries given by the breakiterator. For instance, the pattern "e" will not be found in the string "\u00e9" if a character break iterator is used.

Options are provided to handle overlapping matches. E.g., in English, overlapping matches produce the results 0 and 2 for the pattern "abab" in the text "ababab", whereas mutually exclusive matches only produce the result 0.

Though collator attributes will be taken into consideration while performing matches, there are no APIs here for setting and getting the attributes. These attributes can be set by getting the collator from usearch_getCollator and using the APIs in ucol.h. Lastly, to update String Search to the new collator attributes, usearch_reset() has to be called.

Restriction:
Currently there are no composite characters that consist of a character with combining class > 0 before a character with combining class == 0. However, if such a character exists in the future, the search mechanism does not guarantee the results for option 1.

Example of use:


 char *tgtstr = "The quick brown fox jumped over the lazy fox";
 char *patstr = "fox";
 UChar target[64];
 UChar pattern[16];
 UErrorCode status = U_ZERO_ERROR;
 u_uastrcpy(target, tgtstr);
 u_uastrcpy(pattern, patstr);
 UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US", 
                                  NULL, &status);
 if (U_SUCCESS(status)) {
     for (int pos = usearch_first(search, &status); 
          pos != USEARCH_DONE; 
          pos = usearch_next(search, &status))
     {
         printf("Found match at %d pos, length is %d\n", pos, 
                                        usearch_getMatchedLength(search));
     }
 }
 usearch_close(search);
 

ICU 2.4

Definition in file usearch.h.


Generated by  Doxygen 1.6.0   Back to index