Logo Search packages:      
Sourcecode: icu version File versions  Download package

U_STABLE char* U_EXPORT2 u_strToUTF8WithSub ( char *  dest,
int32_t  destCapacity,
int32_t *  pDestLength,
const UChar *  src,
int32_t  srcLength,
UChar32  subchar,
int32_t *  pNumSubstitutions,
UErrorCode pErrorCode 
)

Convert a UTF-16 string to UTF-8. If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.

Same as u_strToUTF8() except for the additional subchar which is output for illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().

Parameters:
dest A buffer for the result string. The result will be zero-terminated if the buffer is large enough.
destCapacity The size of the buffer (number of chars). If it is 0, then dest may be NULL and the function will only return the length of the result without writing any of the result string (pre-flighting).
pDestLength A pointer to receive the number of units written to the destination. If pDestLength!=NULL then *pDestLength is always set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow.
src The original source string
srcLength The length of the original string. If -1, then src must be zero-terminated.
subchar The substitution character to use in place of an illegal input sequence, or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. A substitution character can be any valid Unicode code point (up to U+10FFFF) except for surrogate code points (U+D800..U+DFFF). The recommended value is U+FFFD "REPLACEMENT CHARACTER".
pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. Set to 0 if no substitutions occur or subchar<0. pNumSubstitutions can be NULL.
pErrorCode Pointer to a standard ICU error code. Its input value must pass the U_SUCCESS() test, or else the function returns immediately. Check for U_FAILURE() on output or use with function chaining. (See User Guide for details.)
Returns:
The pointer to destination buffer.
See also:
u_strToUTF8

u_strFromUTF8WithSub ICU 3.6

Definition at line 971 of file ustrtrns.c.

References NULL, U8_LENGTH, U_FAILURE, U_ILLEGAL_ARGUMENT_ERROR, U_INVALID_CHAR_FOUND, U_IS_SURROGATE, UTF16_GET_PAIR_VALUE, UTF_IS_SURROGATE, UTF_IS_SURROGATE_FIRST, and UTF_IS_TRAIL.

                                   {
    int32_t reqLength=0;
    uint32_t ch=0,ch2=0;
    uint8_t *pDest = (uint8_t *)dest;
    uint8_t *pDestLimit = pDest + destCapacity;
    int32_t numSubstitutions;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }
        
    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength==-1) {
        while((ch=*pSrc)!=0) {
            ++pSrc;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
                if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 
                    ++pSrc;
                    ch=UTF16_GET_PAIR_VALUE(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        while((ch=*pSrc++)!=0) {
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!UTF_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    } else {
        const UChar *pSrcLimit = pSrc+srcLength;
        int32_t count;

        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            count = (int32_t)((pDestLimit - pDest) / 3);
            srcLength = (int32_t)(pSrcLimit - pSrc);
            if(count > srcLength) {
                count = srcLength; /* min(remaining dest/3, remaining src) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }
            do {
                ch=*pSrc++;
                if(ch <= 0x7f) {
                    *pDest++ = (uint8_t)ch;
                } else if(ch <= 0x7ff) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else if(ch <= 0xd7ff || ch >= 0xe000) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else /* ch is a surrogate */ {
                    /*
                     * We will read two UChars and probably output four bytes,
                     * which we didn't account for with computing count,
                     * so we adjust it here.
                     */
                    if(--count == 0) {
                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
                        break;  /* recompute count */
                    }

                    if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 
                        ++pSrc;
                        ch=UTF16_GET_PAIR_VALUE(ch, ch2);

                        /* writing 4 bytes per 2 UChars is ok */
                        *pDest++=(uint8_t)((ch>>18)|0xf0);
                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
                    } else  {
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                        if(subchar>=0) {
                            ch=subchar;
                            ++numSubstitutions;
                        } else {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return NULL;
                        }

                        /* convert and append*/
                        pDest=_appendUTF8(pDest, ch);
                    }
                }
            } while(--count > 0);
        }

        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { 
                    ++pSrc;
                    ch=UTF16_GET_PAIR_VALUE(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!UTF_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    }

    reqLength+=(int32_t)(pDest - (uint8_t *)dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}


Generated by  Doxygen 1.6.0   Back to index