Logo Search packages:      
Sourcecode: icu version File versions  Download package

U_DRAFT UChar* U_EXPORT2 u_strFromJavaModifiedUTF8WithSub ( UChar *  dest,
int32_t  destCapacity,
int32_t *  pDestLength,
const char *  src,
int32_t  srcLength,
UChar32  subchar,
int32_t *  pNumSubstitutions,
UErrorCode *  pErrorCode 
)

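Convert a Java Modified UTF-8 string to a 16-bit Unicode string. If the input string is not well-formed and no substitution character is specified (subchar is U_SENTINEL), then the U_INVALID_CHAR_FOUND error code is set; otherwise each illegal input sequence is replaced with subchar.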

This function behaves according to the documentation for Java DataInput.readUTF() except that it takes a length parameter rather than interpreting the first two input bytes as the length. See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()

The output string may not be well-formed UTF-16.

Parameters:
dest A buffer for the result string. The result will be zero-terminated if the buffer is large enough.
destCapacity The size of the buffer (number of UChars). If it is 0, then dest may be NULL and the function will only return the length of the result without writing any of the result string (pre-flighting).
pDestLength A pointer to receive the number of units written to the destination. If pDestLength!=NULL then *pDestLength is always set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow.
src The original source string
srcLength The length of the original string. If -1, then src must be zero-terminated.
subchar The substitution character to use in place of an illegal input sequence, or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. A substitution character can be any valid Unicode code point (up to U+10FFFF) except for surrogate code points (U+D800..U+DFFF). The recommended value is U+FFFD "REPLACEMENT CHARACTER".
pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. Set to 0 if no substitutions occur or subchar<0. pNumSubstitutions can be NULL.
pErrorCode Pointer to a standard ICU error code. Its input value must pass the U_SUCCESS() test, or else the function returns immediately. Check for U_FAILURE() on output or use with function chaining. (See User Guide for details.)
Returns:
The pointer to destination buffer.
See also:
u_strFromUTF8WithSub

u_strFromUTF8Lenient

u_strToJavaModifiedUTF8 ICU 4.4

Definition at line 1249 of file ustrtrns.c.

References NULL, U16_LEAD, U16_LENGTH, U16_TRAIL, U_FAILURE, U_ILLEGAL_ARGUMENT_ERROR, U_INVALID_CHAR_FOUND, and U_IS_SURROGATE.

                                {
    /*
     * Converts Java Modified UTF-8 (1-, 2- and 3-byte sequences only) from
     * src into 16-bit units in dest.
     *
     * Accepted forms, as implemented below:
     *   - 00..7F             -> one unit
     *   - C0..DF + 1 trail   -> one unit (this includes C0 80 for U+0000)
     *   - E0..EF + 2 trails  -> one unit (surrogate code units pass through,
     *                           so the output may not be well-formed UTF-16)
     * Everything else is an illegal sequence: if subchar < 0 the function
     * sets U_INVALID_CHAR_FOUND and returns NULL, otherwise the sequence is
     * skipped via utf8_nextCharSafeBodyPointer() and subchar is emitted
     * (one unit for a BMP subchar, a surrogate pair for a supplementary one).
     *
     * The work is split into four phases:
     *   1. srcLength < 0: copy a leading ASCII run of a NUL-terminated string,
     *      then measure the remainder.
     *   2. A fast loop that omits per-byte limit checks by budgeting
     *      min(remaining dest, remaining src / 3) iterations.
     *   3. A careful loop that checks both limits on every step.
     *   4. A counting-only loop (pre-flighting) once dest is full.
     *
     * On success *pDestLength (if non-NULL) receives the full required length
     * and *pNumSubstitutions (if non-NULL) the number of substitutions;
     * u_terminateUChars() finalizes termination / overflow status.
     */
    UChar *pDest = dest;
    UChar *pDestLimit = dest+destCapacity;
    UChar32 ch;
    int32_t reqLength = 0;      /* units counted but not written (overflow part) */
    const uint8_t *pSrc = (const uint8_t *) src;
    const uint8_t *pSrcLimit;
    int32_t count;
    uint8_t t1, t2; /* trail bytes */
    int32_t numSubstitutions;

    /* args check */
    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
        (dest==NULL && destCapacity!=0) || destCapacity<0 ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength < 0) {
        /*
         * Transform a NUL-terminated ASCII string.
         * Handle non-ASCII strings with slower code.
         */
        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
            *pDest++=(UChar)ch;
            ++pSrc;
        }
        if(ch == 0) {
            /* The whole string was ASCII and fit into dest. */
            reqLength=(int32_t)(pDest - dest);
            if(pDestLength) {
                *pDestLength = reqLength;
            }

            /* Terminate the buffer */
            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
            return dest;
        }
        /* Measure only what is left after the ASCII prefix already copied. */
        srcLength = (int32_t)uprv_strlen((const char *)pSrc);
    }

    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    pSrcLimit = pSrc + srcLength;
    for(;;) {
        count = (int32_t)(pDestLimit - pDest);
        srcLength = (int32_t)(pSrcLimit - pSrc);
        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
            /*
             * fast ASCII loop: dest has room for at least one unit per
             * remaining source byte, so only the source limit is checked.
             */
            const uint8_t *prevSrc = pSrc;
            int32_t delta;
            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
                *pDest++=(UChar)ch;
                ++pSrc;
            }
            delta = (int32_t)(pSrc - prevSrc);
            count -= delta;
            srcLength -= delta;
        }
        /*
         * Each iteration of the inner loop progresses by at most 3 UTF-8
         * bytes and one UChar. The substitution path can exceed both bounds,
         * which is why it leaves the inner loop (see below).
         */
        srcLength /= 3;
        if(count > srcLength) {
            count = srcLength; /* min(remaining dest, remaining src/3) */
        }
        if(count < 3) {
            /*
             * Too much overhead if we get near the end of the string,
             * continue with the next loop.
             */
            break;
        }
        do {
            ch = *pSrc;
            if(ch <= 0x7f){
                *pDest++=(UChar)ch;
                ++pSrc;
            } else {
                if(ch >= 0xe0) {
                    if( /* handle U+0000..U+FFFF inline */
                        ch <= 0xef &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
                    ) {
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
                        pSrc += 3;
                        continue;
                    }
                } else {
                    if( /* handle U+0000..U+07FF inline */
                        ch >= 0xc0 &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
                    ) {
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
                        pSrc += 2;
                        continue;
                    }
                }

                if(subchar < 0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                } else if(subchar > 0xffff && count < 2) {
                    /*
                     * We need to write two UChars but the remaining budget
                     * only guarantees one. Nothing has been consumed yet;
                     * recompute the budget in the outer loop.
                     */
                    break;
                } else {
                    /* function call for error cases */
                    ++pSrc; /* continue after the lead byte */
                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                    ++numSubstitutions;
                    if(subchar<=0xFFFF) {
                        *(pDest++)=(UChar)subchar;
                    } else {
                        *(pDest++)=U16_LEAD(subchar);
                        *(pDest++)=U16_TRAIL(subchar);
                    }
                    /*
                     * The helper may have skipped more than 3 source bytes
                     * (e.g. a standard 4-byte UTF-8 sequence), and a
                     * supplementary subchar used two units. Either breaks the
                     * "3 bytes / 1 UChar per iteration" budget that makes the
                     * unchecked pSrc[1]/pSrc[2] reads above safe, so leave the
                     * inner loop and let the outer loop recompute count from
                     * the actual pointers.
                     */
                    break;
                }
            }
        } while(--count > 0);
    }

    /* Careful loop: both the source and the destination limit are checked. */
    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
        ch = *pSrc;
        if(ch <= 0x7f){
            *pDest++=(UChar)ch;
            ++pSrc;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
                    ((pSrcLimit - pSrc) >= 3) &&
                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
                ) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
                    pSrc += 3;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
                    ((pSrcLimit - pSrc) >= 2) &&
                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
                ) {
                    *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
                    pSrc += 2;
                    continue;
                }
            }

            if(subchar < 0) {
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            } else {
                /* function call for error cases */
                ++pSrc; /* continue after the lead byte */
                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                ++numSubstitutions;
                if(subchar<=0xFFFF) {
                    *(pDest++)=(UChar)subchar;
                } else {
                    *(pDest++)=U16_LEAD(subchar);
                    if(pDest<pDestLimit) {
                        *(pDest++)=U16_TRAIL(subchar);
                    } else {
                        /* Only the lead surrogate fit; count the trail as overflow. */
                        reqLength++;
                        break;
                    }
                }
            }
        }
    }

    /* do not fill the dest buffer just count the UChars needed */
    while(pSrc < pSrcLimit){
        ch = *pSrc;
        if(ch <= 0x7f) {
            reqLength++;
            ++pSrc;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
                    ((pSrcLimit - pSrc) >= 3) &&
                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
                ) {
                    reqLength++;
                    pSrc += 3;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
                    ((pSrcLimit - pSrc) >= 2) &&
                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
                ) {
                    reqLength++;
                    pSrc += 2;
                    continue;
                }
            }

            if(subchar < 0) {
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            } else {
                /* function call for error cases */
                ++pSrc; /* continue after the lead byte */
                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                ++numSubstitutions;
                /*
                 * Count the units of the substitution character, not of ch:
                 * ch still holds the illegal lead byte here, which would
                 * always count as one unit and under-report the length for a
                 * supplementary subchar.
                 */
                reqLength+=U16_LENGTH(subchar);
            }
        }
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    /* Total = units written to dest + units that did not fit. */
    reqLength+=(int32_t)(pDest - dest);
    if(pDestLength) {
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}


Generated by  Doxygen 1.6.0   Back to index