Logo Search packages:      
Sourcecode: icu version File versions  Download package

U_STABLE UChar* U_EXPORT2 u_strFromUTF8WithSub ( UChar *  dest,
int32_t  destCapacity,
int32_t *  pDestLength,
const char *  src,
int32_t  srcLength,
UChar32  subchar,
int32_t *  pNumSubstitutions,
UErrorCode *  pErrorCode 
)

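Convert a UTF-8 string to UTF-16. If the input string is not well-formed and subchar is U_SENTINEL (or any other negative value), then the U_INVALID_CHAR_FOUND error code is set; otherwise each illegal sequence is replaced with subchar.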

Same as u_strFromUTF8() except for the additional subchar which is output for illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().

Parameters:
dest A buffer for the result string. The result will be zero-terminated if the buffer is large enough.
destCapacity The size of the buffer (number of UChars). If it is 0, then dest may be NULL and the function will only return the length of the result without writing any of the result string (pre-flighting).
pDestLength A pointer to receive the number of units written to the destination. If pDestLength!=NULL then *pDestLength is always set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow.
src The original source string
srcLength The length of the original string. If -1, then src must be zero-terminated.
subchar The substitution character to use in place of an illegal input sequence, or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. A substitution character can be any valid Unicode code point (up to U+10FFFF) except for surrogate code points (U+D800..U+DFFF). The recommended value is U+FFFD "REPLACEMENT CHARACTER".
pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. Set to 0 if no substitutions occur or subchar<0. pNumSubstitutions can be NULL.
pErrorCode Pointer to a standard ICU error code. Its input value must pass the U_SUCCESS() test, or else the function returns immediately. Check for U_FAILURE() on output or use with function chaining. (See User Guide for details.)
Returns:
The pointer to destination buffer.
See also:
u_strFromUTF8

u_strFromUTF8Lenient

u_strToUTF8WithSub

Stable: ICU 3.6

Definition at line 394 of file ustrtrns.c.

References NULL, U16_LENGTH, U_FAILURE, U_ILLEGAL_ARGUMENT_ERROR, U_INVALID_CHAR_FOUND, U_IS_SURROGATE, UTF16_LEAD, UTF16_TRAIL, and UTF_CHAR_LENGTH.

                                     {
    /*
     * Converts the UTF-8 string src to UTF-16 in dest.
     *
     * Overall structure:
     *   1. Validate the arguments.
     *   2. Convert while there is room in dest.
     *   3. Once dest is full, keep scanning src only to count the UChars
     *      still needed (pre-flighting), so the total length can be reported.
     *   4. Report length and substitution count, then terminate the buffer.
     *
     * Ill-formed sequences: whenever the helper decoder returns a negative
     * value, the substitution counter is incremented and subchar is used
     * instead. If subchar itself is negative, the function sets
     * U_INVALID_CHAR_FOUND and returns NULL without updating *pDestLength
     * or *pNumSubstitutions (the latter stays at the 0 written up front).
     *
     * Returns dest on success, NULL on an argument or conversion error.
     */
    UChar *pDest = dest;
    UChar *pDestLimit = dest+destCapacity; /* one past the last writable UChar */
    UChar32 ch;
    int32_t reqLength = 0; /* UChars needed beyond those actually written */
    const uint8_t* pSrc = (const uint8_t*) src;
    uint8_t t1, t2; /* trail bytes */
    int32_t numSubstitutions;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    /*
     * Reject: missing source with a non-zero length, lengths below -1,
     * a negative capacity, a missing buffer with a positive capacity,
     * and substitution code points above U+10FFFF or in the surrogate range.
     * Negative subchar values pass this check; they select "fail on error".
     */
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* Clear the output counter early so that it is 0 on every later error return. */
    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    /*
     * Inline processing of UTF-8 byte sequences:
     *
     * Byte sequences for the most common characters are handled inline in
     * the conversion loops. In order to reduce the path lengths for those
     * characters, the tests are arranged in a kind of binary search.
     * ASCII (<=0x7f) is checked first, followed by the dividing point
     * between 2- and 3-byte sequences (0xe0).
     * The 3-byte branch is tested first to speed up CJK text.
     * The compiler should combine the subtractions for the two tests for 0xe0.
     * Each branch then tests for the other end of its range.
     */

    if(srcLength < 0){
        /*
         * Transform a NUL-terminated string.
         * The code explicitly checks for NULs only in the lead byte position.
         * A NUL byte in the trail byte position fails the trail byte range check anyway.
         */
        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
            if(ch <= 0x7f){
                *pDest++=(UChar)ch;
                ++pSrc;
            } else {
                if(ch > 0xe0) {
                    if( /* handle U+1000..U+CFFF inline */
                        ch <= 0xec &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
                    ) {
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
                        pSrc += 3;
                        continue;
                    }
                } else if(ch < 0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        ch >= 0xc2 &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
                    ) {
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
                        pSrc += 2;
                        continue;
                    }
                }

                /* function call for "complicated" and error cases */
                ++pSrc; /* continue after the lead byte */
                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
                /* negative result: count a substitution and use subchar; fail if subchar is negative too */
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                } else if(ch<=0xFFFF) {
                    *(pDest++)=(UChar)ch;
                } else {
                    /* supplementary code point: the loop condition guarantees room for the lead surrogate only */
                    *(pDest++)=UTF16_LEAD(ch);
                    if(pDest<pDestLimit) {
                        *(pDest++)=UTF16_TRAIL(ch);
                    } else {
                        /* the trail surrogate did not fit: count it and switch to pre-flighting */
                        reqLength++;
                        break;
                    }
                }
            }
        }

        /* Pre-flight the rest of the string. */
        while((ch = *pSrc) != 0) {
            if(ch <= 0x7f){
                ++reqLength;
                ++pSrc;
            } else {
                if(ch > 0xe0) {
                    if( /* handle U+1000..U+CFFF inline */
                        ch <= 0xec &&
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
                    ) {
                        ++reqLength;
                        pSrc += 3;
                        continue;
                    }
                } else if(ch < 0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        ch >= 0xc2 &&
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
                    ) {
                        ++reqLength;
                        pSrc += 2;
                        continue;
                    }
                }

                /* function call for "complicated" and error cases */
                ++pSrc; /* continue after the lead byte */
                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }
                reqLength += U16_LENGTH(ch);
            }
        }
    } else /* srcLength >= 0 */ {
        const uint8_t *pSrcLimit = pSrc + srcLength;
        int32_t count;

        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            count = (int32_t)(pDestLimit - pDest);
            /* srcLength is no longer needed as the input length; reuse it as scratch */
            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
            if(count > srcLength) {
                count = srcLength; /* min(remaining dest, remaining src/3) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }

            /* Note: "continue" inside this do-while jumps to the --count test below. */
            do {
                ch = *pSrc;
                if(ch <= 0x7f){
                    *pDest++=(UChar)ch;
                    ++pSrc;
                } else {
                    if(ch > 0xe0) {
                        if( /* handle U+1000..U+CFFF inline */
                            ch <= 0xec &&
                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
                        ) {
                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
                            pSrc += 3;
                            continue;
                        }
                    } else if(ch < 0xe0) {
                        if( /* handle U+0080..U+07FF inline */
                            ch >= 0xc2 &&
                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
                        ) {
                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
                            pSrc += 2;
                            continue;
                        }
                    }

                    if(ch >= 0xf0 || subchar > 0xffff) {
                        /*
                         * We may read up to six bytes and write up to two UChars,
                         * which we didn't account for with computing count,
                         * so we adjust it here.
                         */
                        if(--count == 0) {
                            /* not enough guaranteed room left; leave this byte for the checked loop below */
                            break;
                        }
                    }

                    /* function call for "complicated" and error cases */
                    ++pSrc; /* continue after the lead byte */
                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
                        *pErrorCode = U_INVALID_CHAR_FOUND;
                        return NULL;
                    }else if(ch<=0xFFFF){
                        *(pDest++)=(UChar)ch;
                    }else{
                        /* room for both surrogates was reserved by the extra --count above */
                        *(pDest++)=UTF16_LEAD(ch);
                        *(pDest++)=UTF16_TRAIL(ch);
                    }
                }
            } while(--count > 0);
        }

        /* Careful loop: checks both the source and the destination limits on every step. */
        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
            ch = *pSrc;
            if(ch <= 0x7f){
                *pDest++=(UChar)ch;
                ++pSrc;
            } else {
                if(ch > 0xe0) {
                    if( /* handle U+1000..U+CFFF inline */
                        ch <= 0xec &&
                        ((pSrcLimit - pSrc) >= 3) &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
                    ) {
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
                        pSrc += 3;
                        continue;
                    }
                } else if(ch < 0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        ch >= 0xc2 &&
                        ((pSrcLimit - pSrc) >= 2) &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
                    ) {
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
                        pSrc += 2;
                        continue;
                    }
                }

                /* function call for "complicated" and error cases */
                ++pSrc; /* continue after the lead byte */
                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }else if(ch<=0xFFFF){
                    *(pDest++)=(UChar)ch;
                }else{
                    *(pDest++)=UTF16_LEAD(ch);
                    if(pDest<pDestLimit){
                        *(pDest++)=UTF16_TRAIL(ch);
                    }else{
                        /* the trail surrogate did not fit: count it and switch to pre-flighting */
                        reqLength++;
                        break;
                    }
                }
            }
        }
        /* do not fill the dest buffer just count the UChars needed */
        while(pSrc < pSrcLimit){
            ch = *pSrc;
            if(ch <= 0x7f){
                reqLength++;
                ++pSrc;
            } else {
                if(ch > 0xe0) {
                    if( /* handle U+1000..U+CFFF inline */
                        ch <= 0xec &&
                        ((pSrcLimit - pSrc) >= 3) &&
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
                    ) {
                        reqLength++;
                        pSrc += 3;
                        continue;
                    }
                } else if(ch < 0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        ch >= 0xc2 &&
                        ((pSrcLimit - pSrc) >= 2) &&
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
                    ) {
                        reqLength++;
                        pSrc += 2;
                        continue;
                    }
                }

                /* function call for "complicated" and error cases */
                ++pSrc; /* continue after the lead byte */
                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }
                /* same length macro as the NUL-terminated pre-flight loop above */
                reqLength+=U16_LENGTH(ch);
            }
        }
    }

    /* total = UChars counted while pre-flighting + UChars actually written */
    reqLength+=(int32_t)(pDest - dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}


Generated by  Doxygen 1.6.0   Back to index