Logo Search packages:      
Sourcecode: icu version File versions  Download package

U_STABLE UChar32 U_EXPORT2 utf8_nextCharSafeBody ( const uint8_t *  s,
int32_t *  pi,
int32_t  length,
UChar32  c,
UBool  strict 
)

Function for handling "next code point" with error-checking.

This is internal since it is not meant to be called directly by external clients; however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this file and thus must remain stable, and should not be hidden when other internal functions are hidden (otherwise public macros would fail to compile).

For internal use only.

Definition at line 108 of file utf_impl.c.

References U_SENTINEL, UTF8_COUNT_TRAIL_BYTES, UTF8_ERROR_VALUE_1, UTF8_IS_TRAIL, UTF8_MASK_LEAD_BYTE, UTF_IS_SURROGATE, and UTF_IS_UNICODE_NONCHAR.

                                                                                              {
    int32_t i=*pi;
    uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
    if((i)+count<=(length)) {
        uint8_t trail, illegal=0;

        UTF8_MASK_LEAD_BYTE((c), count);
        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
        switch(count) {
        /* each branch falls through to the next one */
        case 5:
        case 4:
            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
            illegal=1;
            break;
        case 3:
            trail=s[(i)++];
            (c)=((c)<<6)|(trail&0x3f);
            if(c<0x110) {
                illegal|=(trail&0xc0)^0x80;
            } else {
                /* code point>0x10ffff, outside Unicode */
                illegal=1;
                break;
            }
        case 2:
            trail=s[(i)++];
            (c)=((c)<<6)|(trail&0x3f);
            illegal|=(trail&0xc0)^0x80;
        case 1:
            trail=s[(i)++];
            (c)=((c)<<6)|(trail&0x3f);
            illegal|=(trail&0xc0)^0x80;
            break;
        case 0:
            if(strict>=0) {
                return UTF8_ERROR_VALUE_1;
            } else {
                return U_SENTINEL;
            }
        /* no default branch to optimize switch()  - all values are covered */
        }

        /*
         * All the error handling should return a value
         * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
         *
         * Starting with Unicode 3.0.1, non-shortest forms are illegal.
         * Starting with Unicode 3.2, surrogate code points must not be
         * encoded in UTF-8, and there are no irregular sequences any more.
         *
         * U8_ macros (new in ICU 2.4) return negative values for error conditions.
         */

        /* correct sequence - all trail bytes have (b7..b6)==(10)? */
        /* illegal is also set if count>=4 */
        if(illegal || (c)<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2)) {
            /* error handling */
            uint8_t errorCount=count;
            /* don't go beyond this sequence */
            i=*pi;
            while(count>0 && UTF8_IS_TRAIL(s[i])) {
                ++(i);
                --count;
            }
            if(strict>=0) {
                c=utf8_errorValue[errorCount-count];
            } else {
                c=U_SENTINEL;
            }
        } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {
            /* strict: forbid non-characters like U+fffe */
            c=utf8_errorValue[count];
        }
    } else /* too few bytes left */ {
        /* error handling */
        int32_t i0=i;
        /* don't just set (i)=(length) in case there is an illegal sequence */
        while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
            ++(i);
        }
        if(strict>=0) {
            c=utf8_errorValue[i-i0];
        } else {
            c=U_SENTINEL;
        }
    }
    *pi=i;
    return c;
}


Generated by  Doxygen 1.6.0   Back to index