Check this for masked rules and index it to optimize performance. The sequence of operations is: (1) add rules to a set using
addRule() ; (2) freeze the set using freeze() ; (3) use the rule set. If addRule() is called after calling this method, it invalidates this object, and this method must be called again. That is, freeze() may be called multiple times, although for optimal performance it shouldn't be.
Definition at line 272 of file rbt_set.cpp. References UVector::addElement(), UVector::elementAt(), TransliterationRule::getIndexValue(), index, TransliterationRule::masks(), TransliterationRule::matchesIndexValue(), NULL, rules, ruleVector, UVector::size(), U_FAILURE, U_MEMORY_ALLOCATION_ERROR, and U_RULE_MASK_ERROR. Referenced by TransliterationRuleSet(). { /* Construct the rule array and index table. We reorder the * rules by sorting them into 256 bins. Each bin contains all * rules matching the index value for that bin. A rule * matches an index value if string whose first key character * has a low byte equal to the index value can match the rule. * * Each bin contains zero or more rules, in the same order * they were found originally. However, the total rules in * the bins may exceed the number in the original vector, * since rules that have a variable as their first key * character will generally fall into more than one bin. * * That is, each bin contains all rules that either have that * first index value as their first key character, or have * a set containing the index value as their first character. */ int32_t n = ruleVector>size(); int32_t j; int16_t x; UVector v(2*n, status); // heuristic; adjust as needed if (U_FAILURE(status)) { return; } /* Precompute the index values. This saves a LOT of time. * Be careful not to call malloc(0). */ int16_t* indexValue = (int16_t*) uprv_malloc( sizeof(int16_t) * (n > 0 ? n : 1) ); /* test for NULL */ if (indexValue == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (j=0; j<n; ++j) { TransliterationRule* r = (TransliterationRule*) ruleVector>elementAt(j); indexValue[j] = r>getIndexValue(); } for (x=0; x<256; ++x) { index[x] = v.size(); for (j=0; j<n; ++j) { if (indexValue[j] >= 0) { if (indexValue[j] == x) { v.addElement(ruleVector>elementAt(j), status); } } else { // If the indexValue is < 0, then the first key character is // a set, and we must use the more timeconsuming // matchesIndexValue check. In practice this happens // rarely, so we seldom tread this code path. TransliterationRule* r = (TransliterationRule*) ruleVector>elementAt(j); if (r>matchesIndexValue((uint8_t)x)) { v.addElement(r, status); } } } } uprv_free(indexValue); index[256] = v.size(); /* Freeze things into an array. */ uprv_free(rules); // Contains alias pointers /* You can't do malloc(0)! */ if (v.size() == 0) { rules = NULL; return; } rules = (TransliterationRule **)uprv_malloc(v.size() * sizeof(TransliterationRule *)); /* test for NULL */ if (rules == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (j=0; j<v.size(); ++j) { rules[j] = (TransliterationRule*) v.elementAt(j); } // TODO Add error reporting that indicates the rules that // are being masked. //UnicodeString errors; /* Check for masking. This is MUCH faster than our old check, * which was each rule against each following rule, since we * only have to check for masking within each bin now. It's * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule * count, and n2 is the perbin rule count. But n2<<n1, so * it's a big win. */ for (x=0; x<256; ++x) { for (j=index[x]; j<index[x+1]1; ++j) { TransliterationRule* r1 = rules[j]; for (int32_t k=j+1; k<index[x+1]; ++k) { TransliterationRule* r2 = rules[k]; if (r1>masks(*r2)) { // if (errors == null) { // errors = new StringBuffer(); // } else { // errors.append("\n"); // } // errors.append("Rule " + r1 + " masks " + r2); status = U_RULE_MASK_ERROR; maskingError(*r1, *r2, parseError); return; } } } } //if (errors != null) { // throw new IllegalArgumentException(errors.toString()); //} }
