Logo Search packages:      
Sourcecode: icu version File versions  Download package

Defines | Typedefs | Enumerations | Functions

ubrk.h File Reference

C API: BreakIterator. More...

#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/utext.h"
#include "unicode/parseerr.h"
Include dependency graph for ubrk.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Defines

#define U_BRK_SAFECLONE_BUFFERSIZE   512
#define UBRK_DONE   ((int32_t) -1)
#define UBRK_TYPEDEF_UBREAK_ITERATOR

Typedefs

typedef void UBreakIterator
typedef enum UBreakIteratorType UBreakIteratorType
typedef enum ULineBreakTag ULineBreakTag
typedef enum USentenceBreakTag USentenceBreakTag
typedef enum UWordBreak UWordBreak

Enumerations

enum  UBreakIteratorType {
  UBRK_CHARACTER = 0, UBRK_WORD = 1, UBRK_LINE = 2, UBRK_SENTENCE = 3,
  UBRK_TITLE = 4, UBRK_COUNT = 5
}
enum  ULineBreakTag { UBRK_LINE_SOFT = 0, UBRK_LINE_SOFT_LIMIT = 100, UBRK_LINE_HARD = 100, UBRK_LINE_HARD_LIMIT = 200 }
enum  USentenceBreakTag { UBRK_SENTENCE_TERM = 0, UBRK_SENTENCE_TERM_LIMIT = 100, UBRK_SENTENCE_SEP = 100, UBRK_SENTENCE_SEP_LIMIT = 200 }
enum  UWordBreak {
  UBRK_WORD_NONE = 0, UBRK_WORD_NONE_LIMIT = 100, UBRK_WORD_NUMBER = 100, UBRK_WORD_NUMBER_LIMIT = 200,
  UBRK_WORD_LETTER = 200, UBRK_WORD_LETTER_LIMIT = 300, UBRK_WORD_KANA = 300, UBRK_WORD_KANA_LIMIT = 400,
  UBRK_WORD_IDEO = 400, UBRK_WORD_IDEO_LIMIT = 500
}

Functions

U_STABLE void U_EXPORT2 ubrk_close (UBreakIterator *bi)
U_STABLE int32_t U_EXPORT2 ubrk_countAvailable (void)
U_STABLE int32_t U_EXPORT2 ubrk_current (const UBreakIterator *bi)
U_STABLE int32_t U_EXPORT2 ubrk_first (UBreakIterator *bi)
U_STABLE int32_t U_EXPORT2 ubrk_following (UBreakIterator *bi, int32_t offset)
U_STABLE const char *U_EXPORT2 ubrk_getAvailable (int32_t index)
U_STABLE const char *U_EXPORT2 ubrk_getLocaleByType (const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode *status)
U_STABLE int32_t U_EXPORT2 ubrk_getRuleStatus (UBreakIterator *bi)
U_STABLE int32_t U_EXPORT2 ubrk_getRuleStatusVec (UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status)
U_STABLE UBool U_EXPORT2 ubrk_isBoundary (UBreakIterator *bi, int32_t offset)
U_STABLE int32_t U_EXPORT2 ubrk_last (UBreakIterator *bi)
U_STABLE int32_t U_EXPORT2 ubrk_next (UBreakIterator *bi)
U_STABLE UBreakIterator *U_EXPORT2 ubrk_open (UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status)
U_STABLE UBreakIterator *U_EXPORT2 ubrk_openRules (const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status)
U_STABLE int32_t U_EXPORT2 ubrk_preceding (UBreakIterator *bi, int32_t offset)
U_STABLE int32_t U_EXPORT2 ubrk_previous (UBreakIterator *bi)
U_STABLE UBreakIterator *U_EXPORT2 ubrk_safeClone (const UBreakIterator *bi, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status)
U_STABLE void U_EXPORT2 ubrk_setText (UBreakIterator *bi, const UChar *text, int32_t textLength, UErrorCode *status)
U_STABLE void U_EXPORT2 ubrk_setUText (UBreakIterator *bi, UText *text, UErrorCode *status)

Detailed Description

C API: BreakIterator.

BreakIterator C API

The BreakIterator C API defines methods for finding the location of boundaries in text. A pointer to a UBreakIterator maintains a current position and scans over text, returning the index of characters where boundaries occur.

Line boundary analysis determines where a text string can be broken when line-wrapping. The mechanism correctly handles punctuation and hyphenated words.

Sentence boundary analysis allows selection with correct interpretation of periods within numbers and abbreviations, and trailing punctuation marks such as quotation marks and parentheses.

Word boundary analysis is used by search and replace functions, as well as within text editing applications that allow the user to select words with a double click. Word selection provides correct interpretation of punctuation marks within and following words. Characters that are not part of a word, such as symbols or punctuation marks, have word-breaks on both sides.

Character boundary analysis identifies the boundaries of "Extended Grapheme Clusters", which are groupings of codepoints that should be treated as character-like units for many text operations. Please see Unicode Standard Annex #29, Unicode Text Segmentation, http://www.unicode.org/reports/tr29/ for additional information on grapheme clusters and guidelines on their use.

Title boundary analysis locates all positions, typically starts of words, that should be set to Title Case when title casing the text.

The text boundary positions are found according to the rules described in Unicode Standard Annex #29, Unicode Text Segmentation, and Unicode Standard Annex #14, Line Breaking Properties. These are available at http://www.unicode.org/reports/tr29/ and http://www.unicode.org/reports/tr14/, respectively.

In addition to the plain C API defined in this header file, an object oriented C++ API with equivalent functionality is defined in the file brkiter.h.

Code snippets illustrating the use of the Break Iterator APIs are available in the ICU User Guide, http://icu-project.org/userguide/boundaryAnalysis.html, and in the sample program icu/source/samples/break/break.cpp.

Definition in file ubrk.h.


Generated by  Doxygen 1.6.0   Back to index