Logo Search packages:      
Sourcecode: icu version File versions  Download package

U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards ( UStringSearch strsrch,
int32_t  startIdx,
int32_t *  matchStart,
int32_t *  matchLimit,
UErrorCode status 
)

Simple backwards search for the pattern, starting at a specified index, and using using a default set search options.

This is an experimental function, and is not an official part of the ICU API.

The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZTION, are honored.

The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and any Break Iterator are ignored.

Matches obey the following constraints:

Characters at the start or end positions of a match that are ignorable for collation are not included as part of the match, unless they are part of a combining sequence, as described below.

A match will not include a partial combining sequence. Combining character sequences are considered to be inseperable units, and either match the pattern completely, or are considered to not match at all. Thus, for example, an A followed a combining accent mark will not be found when searching for a plain (unaccented) A. (unless the collation strength has been set to ignore all accents).

When beginning a search, the initial starting position, startIdx, is assumed to be an acceptable match boundary with respect to combining characters. A combining sequence that spans across the starting point will not supress a match beginning at startIdx.

Characters that expand to multiple collation elements (German sharp-S becoming 'ss', or the composed forms of accented characters, for example) also must match completely. Searching for a single 's' in a string containing only a sharp-s will find no match.

Parameters:
strsrchthe UStringSearch struct, which references both the text to be searched and the pattern being sought.
startIdxThe index into the text to begin the search.
matchStartAn out parameter, the starting index of the matched text. This parameter may be NULL. A value of -1 will be returned if no match was found.
matchLimitOut parameter, the index of the first position following the matched text. The matchLimit will be at a suitable position for beginning a subsequent search in the input text. This parameter may be NULL. A value of -1 will be returned if no match was found.
statusReport any errors. Note that no match found is not an error.
Returns:
TRUE if a match was found, FALSE otherwise.

Definition at line 4070 of file usearch.cpp.

References FALSE, TRUE, U_FAILURE, U_ILLEGAL_ARGUMENT_ERROR, U_INTERNAL_PROGRAM_ERROR, and UCOL_PROCESSED_NULLORDER.

{
    if (U_FAILURE(*status)) {
        return FALSE;
    }

    // TODO:  reject search patterns beginning with a combining char.

#ifdef USEARCH_DEBUG
    if (getenv("USEARCH_DEBUG") != NULL) {
        printf("Pattern CEs\n");
        for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
            printf(" %8x", strsrch->pattern.CE[ii]);
        }
        printf("\n");
    }

#endif
    // Input parameter sanity check.
    //  TODO:  should input indicies clip to the text length
    //         in the same way that UText does.
    if(strsrch->pattern.CELength == 0         ||
       startIdx < 0                           ||
       startIdx > strsrch->search->textLength ||
       strsrch->pattern.CE == NULL) {
           *status = U_ILLEGAL_ARGUMENT_ERROR;
           return FALSE;
    }

    if (strsrch->pattern.PCE == NULL) {
        initializePatternPCETable(strsrch, status);
    }

    CEBuffer ceb(strsrch, status);
    int32_t    targetIx = 0;

    /*
     * Pre-load the buffer with the CE's for the grapheme
     * after our starting position so that we're sure that
     * we can look at the CE following the match when we
     * check the match boundaries.
     *
     * This will also pre-fetch the first CE that we'll
     * consider for the match.
     */
    if (startIdx < strsrch->search->textLength) {
        UBreakIterator *bi = strsrch->search->internalBreakIter;
        int32_t next = ubrk_following(bi, startIdx);

        ucol_setOffset(strsrch->textIter, next, status);

        for (targetIx = 0; ; targetIx += 1) {
            if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
                break;
            }
        }
    } else {
        ucol_setOffset(strsrch->textIter, startIdx, status);
    }


    const CEI *targetCEI = NULL;
    int32_t    patIx;
    UBool      found;

    int32_t  limitIx = targetIx;
    int32_t  mStart = -1;
    int32_t  mLimit = -1;
    int32_t  minLimit;
    int32_t  maxLimit;



    // Outer loop moves over match starting positions in the
    //      target CE space.
    // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
    // But  patIx is 0 at the beginning of the pattern and increases toward the end.
    // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
    // and the beginning of the base text.
    for(targetIx = limitIx; ; targetIx += 1)
    {
        found = TRUE;
        // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
        // (compared to the last CE fetched for the previous targetIx value) as we need to go
        // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
        const CEI *lastCEI  = ceb.getPrevious(targetIx);
        if (lastCEI == NULL) {
            *status = U_INTERNAL_PROGRAM_ERROR;
            found = FALSE;
             break;
        }
        //  Inner loop checks for a match beginning at each
        //  position from the outer loop.
        int32_t targetIxOffset = 0;
        for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
            int64_t patCE = strsrch->pattern.PCE[patIx];

            targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
            //  Compare CE from target string with CE from the pattern.
            //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
            //    which will fail the compare, below.
            UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
            if ( ceMatch == U_CE_NO_MATCH ) {
                found = FALSE;
                break;
            } else if ( ceMatch > U_CE_NO_MATCH ) {
                if ( ceMatch == U_CE_SKIP_TARG ) {
                    // redo with same patCE, next targCE
                    patIx++;
                    targetIxOffset++;
                } else { // ceMatch == U_CE_SKIP_PATN
                    // redo with same targCE, next patCE
                    targetIxOffset--;
                }
            }
        }

        if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
            // No match at this targetIx.  Try again at the next.
            continue;
        }

        if (!found) {
            // No match at all, we have run off the end of the target text.
            break;
        }


        // We have found a match in CE space.
        // Now determine the bounds in string index space.
        //  There still is a chance of match failure if the CE range not correspond to
        //     an acceptable character range.
        //
        const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
        mStart   = firstCEI->lowIndex;

        // Check for the start of the match being within a combining sequence.
        //   This can happen if the pattern itself begins with a combining char, and
        //   the match found combining marks in the target text that were attached
        //    to something else.
        //   This type of match should be rejected for not completely consuming a
        //   combining sequence.
        if (!isBreakBoundary(strsrch, mStart)) {
            found = FALSE;
        }

        // Look at the high index of the first CE in the match. If it's the same as the
        // low index, the first CE in the match is in the middle of an expansion.
        if (mStart == firstCEI->highIndex) {
            found = FALSE;
        }


        minLimit = lastCEI->lowIndex;

        if (targetIx > 0) {
            // Look at the CE following the match.  If it is UCOL_NULLORDER the match
            //   extended to the end of input, and the match is good.

            // Look at the high and low indices of the CE following the match. If
            // they are the same it means one of two things:
            //    1. The match extended to the last CE from the target text, which is OK, or
            //    2. The last CE that was part of the match is in an expansion that extends
            //       to the first CE after the match. In this case, we reject the match.
            const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);

            if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
                found = FALSE;
            }

            mLimit = maxLimit = nextCEI->lowIndex;

            //  Advance the match end position to the first acceptable match boundary.
            //    This advances the index over any combining charcters.
            if (minLimit < maxLimit) {
                int32_t nba = nextBoundaryAfter(strsrch, minLimit);

                if (nba >= lastCEI->highIndex) {
                    mLimit = nba;
                }
            }

            // If advancing to the end of a combining sequence in character indexing space
            //   advanced us beyond the end of the match in CE space, reject this match.
            if (mLimit > maxLimit) {
                found = FALSE;
            }

            // Make sure the end of the match is on a break boundary
            if (!isBreakBoundary(strsrch, mLimit)) {
                found = FALSE;
            }

        } else {
            // No non-ignorable CEs after this point.
            // The maximum position is detected by boundary after
            // the last non-ignorable CE. Combining sequence
            // across the start index will be truncated.
            int32_t nba = nextBoundaryAfter(strsrch, minLimit);
            mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
        }

    #ifdef USEARCH_DEBUG
        if (getenv("USEARCH_DEBUG") != NULL) {
            printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
        }
    #endif


        if (! checkIdentical(strsrch, mStart, mLimit)) {
            found = FALSE;
        }

        if (found) {
            break;
        }
    }

    #ifdef USEARCH_DEBUG
    if (getenv("USEARCH_DEBUG") != NULL) {
        printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
        int32_t  lastToPrint = ceb.limitIx+2;
        for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
            printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
        }
        printf("\n%s\n", found? "match found" : "no match");
    }
    #endif

    // All Done.  Store back the match bounds to the caller.
    //
    if (found==FALSE) {
        mLimit = -1;
        mStart = -1;
    }

    if (matchStart != NULL) {
        *matchStart= mStart;
    }

    if (matchLimit != NULL) {
        *matchLimit = mLimit;
    }

    return found;
}

Generated by  Doxygen 1.6.0   Back to index