Logo Search packages:      
Sourcecode: icu version File versions  Download package

U_STABLE UChar* U_EXPORT2 u_strFromUTF8Lenient ( UChar *  dest,
int32_t  destCapacity,
int32_t *  pDestLength,
const char *  src,
int32_t  srcLength,
UErrorCode *  pErrorCode 
)

Convert a UTF-8 string to UTF-16.

Same as u_strFromUTF8() except that this function is designed to be very fast, which it achieves by being lenient about malformed UTF-8 sequences. This function is intended for use in environments where UTF-8 text is expected to be well-formed.

Its semantics are:

  • Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
  • The function will not read beyond the input string, nor write beyond the destCapacity.
  • Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not be well-formed UTF-16. The function will resynchronize to valid code point boundaries within a small number of code points after an illegal sequence.
  • Non-shortest forms are not detected and will result in "spoofing" output.

For further performance improvement, if srcLength is given (>=0), then destCapacity must be at least srcLength (destCapacity>=srcLength).

There is no inverse u_strToUTF8Lenient() function because there is practically no performance gain from not checking that a UTF-16 string is well-formed.

Parameters:
dest A buffer for the result string. The result will be zero-terminated if the buffer is large enough.
destCapacity The size of the buffer (number of UChars). If it is 0, then dest may be NULL and the function will only return the length of the result without writing any of the result string (pre-flighting). Unlike for other ICU functions, if srcLength>=0 then destCapacity must be at least srcLength (destCapacity>=srcLength).
pDestLength A pointer to receive the number of units written to the destination. If pDestLength!=NULL then *pDestLength is always set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow. Unlike for other ICU functions, if srcLength>=0 but destCapacity<srcLength, then *pDestLength will be set to srcLength (and U_BUFFER_OVERFLOW_ERROR will be set) regardless of the actual result length.
src The original source string
srcLength The length of the original string. If -1, then src must be zero-terminated.
pErrorCode Pointer to a standard ICU error code. Its input value must pass the U_SUCCESS() test, or else the function returns immediately. Check for U_FAILURE() on output or use with function chaining. (See User Guide for details.)
Returns:
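The pointer to the destination buffer, or NULL if pErrorCode is NULL or already indicates a failure, if an argument is illegal, or if srcLength>=0 and destCapacity<srcLength.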
See also:
u_strFromUTF8

u_strFromUTF8WithSub

u_strToUTF8WithSub

Stable: ICU 3.6

Definition at line 732 of file ustrtrns.c.

References NULL, U16_LEAD, U16_TRAIL, U_BUFFER_OVERFLOW_ERROR, U_FAILURE, and U_ILLEGAL_ARGUMENT_ERROR.

                                             {
    /*
     * Lenient UTF-8 -> UTF-16 conversion.
     *
     * Lead bytes are classified by value range only (<0xc0: one byte,
     * <0xe0: two bytes, <0xf0: three bytes, otherwise four bytes) and trail
     * bytes are never validated, so malformed input yields garbage output
     * rather than an error.
     *
     * srcLength < 0:  src is read up to its terminating NUL. Output is written
     *                 while it fits into destCapacity; whatever does not fit
     *                 is only counted (pre-flighting).
     * srcLength >= 0: requires destCapacity >= srcLength so that the
     *                 conversion loops need no destination bounds checks.
     *
     * Returns dest, or NULL if the arguments are rejected (see the checks
     * below). If pDestLength is not NULL it receives the required length.
     */
    UChar *pDest = dest;
    UChar32 ch;
    int32_t reqLength = 0;  /* output units that are required but were not written */
    /* The input is only ever read, so keep the const qualification of src. */
    const uint8_t *pSrc = (const uint8_t *)src;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(srcLength < 0) {
        /* Transform a NUL-terminated string. */
        UChar *pDestLimit = dest+destCapacity;
        uint8_t t1, t2, t3; /* trail bytes */

        /* Convert while there is both input left and room in dest. */
        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                *pDest++=(UChar)ch;
                ++pSrc;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                /* Only check that the trail byte is not the terminating NUL. */
                if((t1 = pSrc[1]) != 0) {
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
                    pSrc += 2;
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
                    pSrc += 3;
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
                    pSrc += 4;
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    if(pDest < pDestLimit) {
                        *(pDest++) = U16_TRAIL(ch);
                    } else {
                        /*
                         * The trail surrogate does not fit: count it as
                         * required-but-unwritten. pSrc is already past this
                         * sequence, so the pre-flight loop continues after it.
                         */
                        reqLength = 1;
                        break;
                    }
                    continue;
                }
            }

            /*
             * truncated character at the end:
             * a NUL was found where a trail byte was expected. Emit U+FFFD and
             * skip ahead to the terminating NUL so that the pre-flight loop
             * below has nothing left to count.
             */
            *pDest++ = 0xfffd;
            while(*++pSrc != 0) {}
            break;
        }

        /*
         * Pre-flight the rest of the string: same classification as above,
         * but only count the output units instead of writing them.
         */
        while((ch = *pSrc) != 0) {
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                ++reqLength;
                ++pSrc;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if(pSrc[1] != 0) {
                    ++reqLength;
                    pSrc += 2;
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if(pSrc[1] != 0 && pSrc[2] != 0) {
                    ++reqLength;
                    pSrc += 3;
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
                    reqLength += 2; /* surrogate pair */
                    pSrc += 4;
                    continue;
                }
            }

            /* truncated character at the end: counts as one unit (the U+FFFD) */
            ++reqLength;
            break;
        }
    } else /* srcLength >= 0 */ {
        const uint8_t *pSrcLimit = pSrc + srcLength;

        /*
         * This function requires that if srcLength is given, then it must be
         * destCapacity >= srcLength so that we need not check for
         * destination buffer overflow in the loop.
         */
        if(destCapacity < srcLength) {
            if(pDestLength != NULL) {
                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
            }
            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
            return NULL;
        }

        if((pSrcLimit - pSrc) >= 4) {
            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */

            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
            do {
                ch = *pSrc++;
                if(ch < 0xc0) {
                    /*
                     * ASCII, or a trail byte in lead position which is treated like
                     * a single-byte sequence for better character boundary
                     * resynchronization after illegal sequences.
                     */
                    *pDest++=(UChar)ch;
                } else if(ch < 0xe0) { /* U+0080..U+07FF */
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    ch = (ch << 12) + (*pSrc++ << 6);
                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (*pSrc++ << 12);
                    ch += *pSrc++ << 6;
                    ch += *pSrc++ - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    *(pDest++) = U16_TRAIL(ch);
                }
            } while(pSrc < pSrcLimit);

            pSrcLimit += 3; /* restore original pSrcLimit */
        }

        /*
         * Tail: at most 3 input bytes remain here, so every multi-byte
         * sequence must be checked against pSrcLimit before its trail bytes
         * are read.
         */
        while(pSrc < pSrcLimit) {
            ch = *pSrc++;
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                *pDest++=(UChar)ch;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if(pSrc < pSrcLimit) {
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if((pSrcLimit - pSrc) >= 2) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    ch = (ch << 12) + (*pSrc++ << 6);
                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
                    /*
                     * pSrc has already been advanced past the sequence by the
                     * *pSrc++ reads; it must not be advanced again, or it
                     * would point beyond the end of the input.
                     */
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if((pSrcLimit - pSrc) >= 3) {
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (*pSrc++ << 12);
                    ch += *pSrc++ << 6;
                    ch += *pSrc++ - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    *(pDest++) = U16_TRAIL(ch);
                    /* pSrc is already past the sequence (see the 3-byte case). */
                    continue;
                }
            }

            /* truncated character at the end */
            *pDest++ = 0xfffd;
            break;
        }
    }

    /* Total required length = units actually written + units only counted. */
    reqLength+=(int32_t)(pDest - dest);

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}


Generated by  Doxygen 1.6.0   Back to index