feat: godot-engine-source-4.3-stable
This commit is contained in:
parent
c59a7dcade
commit
7125d019b5
11149 changed files with 5070401 additions and 0 deletions
74
engine/thirdparty/icu4c/common/appendable.cpp
vendored
Normal file
74
engine/thirdparty/icu4c/common/appendable.cpp
vendored
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2011-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: appendable.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010dec07
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/appendable.h"
|
||||
#include "unicode/utf16.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Out-of-line definition of the destructor; its body is empty.
Appendable::~Appendable() {}
|
||||
|
||||
// Appends one code point via appendCodeUnit().
// c<=0xffff is written as a single code unit; anything larger is written as a
// lead/trail surrogate pair. Returns the result of the appendCodeUnit() call(s).
UBool
Appendable::appendCodePoint(UChar32 c) {
    if(c>0xffff) {
        // The trail unit is only attempted when the lead unit was accepted.
        return appendCodeUnit(U16_LEAD(c)) && appendCodeUnit(U16_TRAIL(c));
    }
    return appendCodeUnit((char16_t)c);
}
|
||||
|
||||
// Appends a string one code unit at a time via appendCodeUnit().
// length<0: s is read up to (not including) the first 0 unit.
// length>0: exactly length units are read. length==0: nothing is read.
// Returns false as soon as appendCodeUnit() fails, true otherwise.
UBool
Appendable::appendString(const char16_t *s, int32_t length) {
    if(length<0) {
        for(;;) {
            const char16_t unit=*s++;
            if(unit==0) {
                break;
            }
            if(!appendCodeUnit(unit)) {
                return false;
            }
        }
        return true;
    }
    if(length>0) {
        const char16_t *const end=s+length;
        do {
            if(!appendCodeUnit(*s++)) {
                return false;
            }
        } while(s<end);
    }
    return true;
}
|
||||
|
||||
// This implementation ignores the requested capacity and always returns true.
UBool
|
||||
Appendable::reserveAppendCapacity(int32_t /*appendCapacity*/) {
|
||||
return true;
|
||||
}
|
||||
|
||||
char16_t *
|
||||
Appendable::getAppendBuffer(int32_t minCapacity,
|
||||
int32_t /*desiredCapacityHint*/,
|
||||
char16_t *scratch, int32_t scratchCapacity,
|
||||
int32_t *resultCapacity) {
|
||||
if(minCapacity<1 || scratchCapacity<minCapacity) {
|
||||
*resultCapacity=0;
|
||||
return nullptr;
|
||||
}
|
||||
*resultCapacity=scratchCapacity;
|
||||
return scratch;
|
||||
}
|
||||
|
||||
// UnicodeStringAppendable is implemented in unistr.cpp.
|
||||
|
||||
U_NAMESPACE_END
|
||||
741
engine/thirdparty/icu4c/common/bmpset.cpp
vendored
Normal file
741
engine/thirdparty/icu4c/common/bmpset.cpp
vendored
Normal file
|
|
@ -0,0 +1,741 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2007-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: bmpset.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan29
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "cmemory.h"
|
||||
#include "bmpset.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Builds all lookup tables from the inversion list parentList[0..parentListLength).
// The list pointer is stored, not copied.
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
        list(parentList), listLength(parentListLength) {
    // Start with all-zero tables; initBits() and overrideIllegal() fill them in.
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
    uprv_memset(table7FF, 0, sizeof(table7FF));
    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));

    /*
     * Record the list indexes that bound the binary searches for
     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     * Code points below U+0800 (the first 3-byte-UTF-8 code point) are
     * looked up in the bit tables instead.
     * The final pair of indexes brackets the supplementary code points.
     */
    const int32_t lastIndex=listLength-1;
    list4kStarts[0]=findCodePoint(0x800, 0, lastIndex);
    for(int32_t block=1; block<=0x10; ++block) {
        list4kStarts[block]=findCodePoint(block<<12, list4kStarts[block-1], lastIndex);
    }
    list4kStarts[0x11]=lastIndex;
    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);

    initBits();
    overrideIllegal();
}
|
||||
|
||||
// Duplicates the precomputed tables of otherBMPSet, but points at
// newParentList/newParentListLength instead of the other object's list.
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
        containsFFFD(otherBMPSet.containsFFFD),
        list(newParentList), listLength(newParentListLength) {
    // The four arrays are copied byte-for-byte; nothing is recomputed.
    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
    uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
}
|
||||
|
||||
// Empty destructor body: no explicit cleanup is performed here.
BMPSet::~BMPSet() {
|
||||
}
|
||||
|
||||
/*
|
||||
* Set bits in a bit rectangle in "vertical" bit organization.
|
||||
* start<limit<=0x800
|
||||
*/
|
||||
// Sets the bits for the value range [start, limit) in a 64-word table with
// "vertical" bit organization: value v maps to bit (v>>6) of table[v&0x3f].
// Requires start<limit<=0x800 (asserted).
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    U_ASSERT(start<limit);
    U_ASSERT(limit<=0x800);

    int32_t column=start>>6;   // Bit position; upper 5 bits, like a UTF-8 2-byte lead byte.
    int32_t row=start&0x3f;    // Word index; lower 6 bits, like a UTF-8 2-byte trail byte.
    uint32_t mask=(uint32_t)1<<column;

    // Single-value shortcut.
    if(limit==(start+1)) {
        table[row]|=mask;
        return;
    }

    const int32_t endColumn=limit>>6;
    const int32_t endRow=limit&0x3f;

    if(column==endColumn) {
        // The whole range lies in one bit column.
        for(; row<endRow; ++row) {
            table[row]|=mask;
        }
        return;
    }

    // Leading partial column (only when the range does not start on row 0).
    if(row>0) {
        for(; row<64; ++row) {
            table[row]|=mask;
        }
        ++column;
    }

    // Full columns [column, endColumn): set them in every word at once.
    if(column<endColumn) {
        mask=~(((unsigned)1<<column)-1);
        if(endColumn<0x20) {
            mask&=((unsigned)1<<endColumn)-1;
        }
        for(row=0; row<64; ++row) {
            table[row]|=mask;
        }
    }

    // Trailing partial column.
    // When endColumn is 0x20 (limit==0x800), endRow is 0 and the loop below
    // does not run; the shift count is clamped to 31 so that the shift itself
    // stays within the width of uint32_t.
    const int32_t lastShift=(endColumn == 0x20) ? (endColumn - 1) : endColumn;
    mask=(uint32_t)1<<lastShift;
    for(row=0; row<endRow; ++row) {
        table[row]|=mask;
    }
}
|
||||
|
||||
void BMPSet::initBits() {
|
||||
UChar32 start, limit;
|
||||
int32_t listIndex=0;
|
||||
|
||||
// Set latin1Contains[].
|
||||
do {
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
if(start>=0x100) {
|
||||
break;
|
||||
}
|
||||
do {
|
||||
latin1Contains[start++]=1;
|
||||
} while(start<limit && start<0x100);
|
||||
} while(limit<=0x100);
|
||||
|
||||
// Find the first range overlapping with (or after) 80..FF again,
|
||||
// to include them in table7FF as well.
|
||||
for(listIndex=0;;) {
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
if(limit>0x80) {
|
||||
if(start<0x80) {
|
||||
start=0x80;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Set table7FF[].
|
||||
while(start<0x800) {
|
||||
set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
|
||||
if(limit>0x800) {
|
||||
start=0x800;
|
||||
break;
|
||||
}
|
||||
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
}
|
||||
|
||||
// Set bmpBlockBits[].
|
||||
int32_t minStart=0x800;
|
||||
while(start<0x10000) {
|
||||
if(limit>0x10000) {
|
||||
limit=0x10000;
|
||||
}
|
||||
|
||||
if(start<minStart) {
|
||||
start=minStart;
|
||||
}
|
||||
if(start<limit) { // Else: Another range entirely in a known mixed-value block.
|
||||
if(start&0x3f) {
|
||||
// Mixed-value block of 64 code points.
|
||||
start>>=6;
|
||||
bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
|
||||
start=(start+1)<<6; // Round up to the next block boundary.
|
||||
minStart=start; // Ignore further ranges in this block.
|
||||
}
|
||||
if(start<limit) {
|
||||
if(start<(limit&~0x3f)) {
|
||||
// Multiple all-ones blocks of 64 code points each.
|
||||
set32x64Bits(bmpBlockBits, start>>6, limit>>6);
|
||||
}
|
||||
|
||||
if(limit&0x3f) {
|
||||
// Mixed-value block of 64 code points.
|
||||
limit>>=6;
|
||||
bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
|
||||
limit=(limit+1)<<6; // Round up to the next block boundary.
|
||||
minStart=limit; // Ignore further ranges in this block.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(limit==0x10000) {
|
||||
break;
|
||||
}
|
||||
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Override some bits and bytes to the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
* No need to set 0 values where they were reset to 0 in the constructor
|
||||
* and not modified by initBits().
|
||||
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
|
||||
* Need to set 0 values for surrogates D800..DFFF.
|
||||
*/
|
||||
void BMPSet::overrideIllegal() {
|
||||
uint32_t bits, mask;
|
||||
int32_t i;
|
||||
|
||||
if(containsFFFD) {
|
||||
bits=3; // Lead bytes 0xC0 and 0xC1.
|
||||
for(i=0; i<64; ++i) {
|
||||
table7FF[i]|=bits;
|
||||
}
|
||||
|
||||
bits=1; // Lead byte 0xE0.
|
||||
for(i=0; i<32; ++i) { // First half of 4k block.
|
||||
bmpBlockBits[i]|=bits;
|
||||
}
|
||||
|
||||
mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
|
||||
bits=1<<0xd;
|
||||
for(i=32; i<64; ++i) { // Second half of 4k block.
|
||||
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
|
||||
}
|
||||
} else {
|
||||
mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
|
||||
for(i=32; i<64; ++i) { // Second half of 4k block.
|
||||
bmpBlockBits[i]&=mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Binary search in list[lo..hi] for the smallest index i with c < list[i].
 *
 * Examples for findCodePoint(c):
 *                                              c=0 1 3 4 7 8
 *   set               list[]
 *   ===               ==============           ===========
 *   []                [110000]                   0 0 0 0 0 0
 *   [\u0000-\u0003]   [0, 4, 110000]             1 1 1 2 2 2
 *   [\u0004-\u0007]   [4, 8, 110000]             0 0 0 1 1 2
 *   [:Any:]           [0, 110000]                1 1 1 1 1 1
 *
 * Assumes list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 */
int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    if (c < list[lo]) {
        return lo;
    }
    // c is often after the last range, so test for that before searching.
    if (lo >= hi || c >= list[hi-1]) {
        return hi;
    }
    // Loop invariant: list[lo] <= c < list[hi].
    int32_t mid;
    while ((mid = (lo + hi) >> 1) != lo) {
        if (c < list[mid]) {
            hi = mid;
        } else {
            lo = mid;
        }
    }
    return hi;
}
|
||||
|
||||
// Tests one code point, choosing the lookup structure by value range.
// Values outside 0..0x10ffff yield false.
UBool
BMPSet::contains(UChar32 c) const {
    const uint32_t cp=(uint32_t)c;
    if(cp<=0xff) {
        return (UBool)latin1Contains[c];
    }
    if(cp<=0x7ff) {
        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
    }
    if(cp<0xd800 || (c>=0xe000 && c<=0xffff)) {
        const int block=c>>12;
        const uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>block)&0x10001;
        // 0 or 1: all 64 code points sharing bits 15..6 have that same answer.
        // Otherwise the block is mixed: search within its 4k slice of the list.
        return twoBits<=1 ?
            (UBool)twoBits :
            containsSlow(c, list4kStarts[block], list4kStarts[block+1]);
    }
    if(cp<=0x10ffff) {
        // Surrogate or supplementary code point.
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    }
    // Out-of-range code points get false, consistent with long-standing
    // behavior of UnicodeSet::contains(c).
    return false;
}
|
||||
|
||||
/*
|
||||
* Check for sufficient length for trail unit for each surrogate pair.
|
||||
* Handle single surrogates as surrogate code points as usual in ICU.
|
||||
*/
|
||||
const char16_t *
|
||||
BMPSet::span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
|
||||
char16_t c, c2;
|
||||
|
||||
if(spanCondition) {
|
||||
// span
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0xff) {
|
||||
if(!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits==0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
|
||||
// surrogate code point
|
||||
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
} while(++s<limit);
|
||||
} else {
|
||||
// span not
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0xff) {
|
||||
if(latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits!=0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
|
||||
// surrogate code point
|
||||
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
} while(++s<limit);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Symmetrical with span(). */
|
||||
const char16_t *
|
||||
BMPSet::spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
|
||||
char16_t c, c2;
|
||||
|
||||
if(spanCondition) {
|
||||
// span
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0xff) {
|
||||
if(!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits==0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
|
||||
// surrogate code point
|
||||
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
--limit;
|
||||
}
|
||||
if(s==limit) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// span not
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0xff) {
|
||||
if(latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits!=0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
|
||||
// surrogate code point
|
||||
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
--limit;
|
||||
}
|
||||
if(s==limit) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
return limit+1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Precheck for sufficient trail bytes at end of string only once per span.
|
||||
* Check validity.
|
||||
*/
|
||||
// Forward span over UTF-8 bytes s[0..length).
// Returns a pointer to the first byte of the code point (or illegal byte)
// that ends the span, or limit0 when the loop runs to the adjusted limit.
// Ill-formed sequences are evaluated as containsFFFD.
// *s and *(limit-1) are read unconditionally, so length must be >0.
const uint8_t *
|
||||
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||||
const uint8_t *limit=s+length;
|
||||
uint8_t b=*s;
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// Initial all-ASCII span.
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!latin1Contains[b] || ++s==limit) {
|
||||
return s;
|
||||
|
|
||||
}
|
||||
b=*s;
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(latin1Contains[b] || ++s==limit) {
|
||||
return s;
|
||||
|
|
||||
}
|
||||
b=*s;
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
// Remaining length after the ASCII prefix; used by the tail checks below.
length=(int32_t)(limit-s);
|
||||
}
|
||||
|
||||
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
|
||||
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
|
||||
}
|
||||
|
||||
// limit0 is the value returned when the scan reaches the (possibly shortened) limit.
const uint8_t *limit0=limit;
|
||||
|
||||
/*
|
||||
* Make sure that the last 1/2/3/4-byte sequence before limit is complete
|
||||
* or runs into a lead byte.
|
||||
* In the span loop compare s with limit only once
|
||||
* per multi-byte character.
|
||||
*
|
||||
* Give a trailing illegal sequence the same value as the result of contains(FFFD),
|
||||
* including it if that is part of the span, otherwise set limit0 to before
|
||||
* the truncated sequence.
|
||||
*/
|
||||
b=*(limit-1);
|
||||
if((int8_t)b<0) {
|
||||
// b>=0x80: lead or trail byte
|
||||
if(b<0xc0) {
|
||||
// single trail byte, check for preceding 3- or 4-byte lead byte
|
||||
// Note: b is reassigned inside this condition (only when length>=2);
// the else-if below then tests that second-to-last byte.
if(length>=2 && (b=*(limit-2))>=0xe0) {
|
||||
limit-=2;
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
|
||||
// 4-byte lead byte with only two trail bytes
|
||||
limit-=3;
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// lead byte with no trail bytes
|
||||
--limit;
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Trail bytes minus 0x80; a value <=0x3f means the byte was in 0x80..0xBF.
uint8_t t1, t2, t3;
|
||||
|
||||
while(s<limit) {
|
||||
b=*s;
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// ASCII
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!latin1Contains[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(latin1Contains[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
}
|
||||
++s; // Advance past the lead byte.
|
||||
if(b>=0xe0) {
|
||||
if(b<0xf0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
|
||||
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f
|
||||
) {
|
||||
// Keep the low 4 bits of the lead byte: bit shift and list4kStarts index.
b&=0xf;
|
||||
uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with this lead byte and middle trail byte
|
||||
// are either in the set or not.
|
||||
if(twoBits!=(uint32_t)spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
UChar32 c=(b<<12)|(t1<<6)|t2;
|
||||
if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
}
|
||||
s+=2;
|
||||
continue;
|
||||
}
|
||||
} else if( /* handle U+10000..U+10FFFF inline */
|
||||
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
|
||||
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
|
||||
(t3=(uint8_t)(s[2]-0x80)) <= 0x3f
|
||||
) {
|
||||
// Give an illegal sequence the same value as the result of contains(FFFD).
|
||||
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
|
||||
if( ( (0x10000<=c && c<=0x10ffff) ?
|
||||
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
|
||||
containsFFFD
|
||||
) != spanCondition
|
||||
) {
|
||||
return s-1;
|
||||
}
|
||||
s+=3;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
b>=0xc0 &&
|
||||
(t1=(uint8_t)(*s-0x80)) <= 0x3f
|
||||
) {
|
||||
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
++s;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Give an illegal sequence the same value as the result of contains(FFFD).
|
||||
// Handle each byte of an illegal sequence separately to simplify the code;
|
||||
// no need to optimize error handling.
|
||||
if(containsFFFD!=spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
}
|
||||
|
||||
return limit0;
|
||||
}
|
||||
|
||||
/*
|
||||
* While going backwards through UTF-8 optimize only for ASCII.
|
||||
* Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
|
||||
* possible to tell from the last byte in a multi-byte sequence how many
|
||||
* preceding bytes there should be. Therefore, going backwards through UTF-8
|
||||
* is much harder than going forward.
|
||||
*/
|
||||
int32_t
|
||||
BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||||
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
|
||||
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
|
||||
}
|
||||
|
||||
uint8_t b;
|
||||
|
||||
do {
|
||||
b=s[--length];
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// ASCII sub-span
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!latin1Contains[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(latin1Contains[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
}
|
||||
|
||||
int32_t prev=length;
|
||||
UChar32 c;
|
||||
// trail byte: collect a multi-byte character
|
||||
// (or lead byte in last-trail position)
|
||||
c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
|
||||
// c is a valid code point, not ASCII, not a surrogate
|
||||
if(c<=0x7ff) {
|
||||
if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
} else if(c<=0xffff) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits!=(uint32_t)spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
}
|
||||
} while(length>0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
164
engine/thirdparty/icu4c/common/bmpset.h
vendored
Normal file
164
engine/thirdparty/icu4c/common/bmpset.h
vendored
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: bmpset.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan29
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __BMPSET_H__
|
||||
#define __BMPSET_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* Helper class for frozen UnicodeSets, implements contains() and span()
|
||||
* optimized for BMP code points. Structured to be UTF-8-friendly.
|
||||
*
|
||||
* Latin-1: Look up bytes.
|
||||
* 2-byte characters: Bits organized vertically.
|
||||
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
|
||||
* with mixed for illegal ranges.
|
||||
* Supplementary characters: Binary search over
|
||||
* the supplementary part of the parent set's inversion list.
|
||||
*/
|
||||
class BMPSet : public UMemory {
|
||||
public:
|
||||
BMPSet(const int32_t *parentList, int32_t parentListLength);
|
||||
BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);
|
||||
virtual ~BMPSet();
|
||||
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
|
||||
/*
|
||||
* Span the initial substring for which each character c has spanCondition==contains(c).
|
||||
* It must be s<limit and spanCondition==0 or 1.
|
||||
* @return The string pointer which limits the span.
|
||||
*/
|
||||
const char16_t *span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;
|
||||
|
||||
/*
|
||||
* Span the trailing substring for which each character c has spanCondition==contains(c).
|
||||
* It must be s<limit and spanCondition==0 or 1.
|
||||
* @return The string pointer which starts the span.
|
||||
*/
|
||||
const char16_t *spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;
|
||||
|
||||
/*
|
||||
* Span the initial substring for which each character c has spanCondition==contains(c).
|
||||
* It must be length>0 and spanCondition==0 or 1.
|
||||
* @return The string pointer which limits the span.
|
||||
*/
|
||||
const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/*
|
||||
* Span the trailing substring for which each character c has spanCondition==contains(c).
|
||||
* It must be length>0 and spanCondition==0 or 1.
|
||||
* @return The start of the span.
|
||||
*/
|
||||
int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
private:
|
||||
void initBits();
|
||||
void overrideIllegal();
|
||||
|
||||
/**
|
||||
* Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
|
||||
* binary search is restricted for finding code points in a certain range.
|
||||
*
|
||||
* For restricting the search for finding in the range start..end,
|
||||
* pass in
|
||||
* lo=findCodePoint(start) and
|
||||
* hi=findCodePoint(end)
|
||||
* with 0<=lo<=hi<len.
|
||||
* findCodePoint(c) defaults to lo=0 and hi=len-1.
|
||||
*
|
||||
* @param c a character in a subrange of MIN_VALUE..MAX_VALUE
|
||||
* @param lo The lowest index to be returned.
|
||||
* @param hi The highest index to be returned.
|
||||
* @return the smallest integer i in the range lo..hi,
|
||||
* inclusive, such that c < list[i]
|
||||
*/
|
||||
int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
|
||||
|
||||
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
|
||||
|
||||
/*
|
||||
* One byte 0 or 1 per Latin-1 character.
|
||||
*/
|
||||
UBool latin1Contains[0x100];
|
||||
|
||||
/* true if contains(U+FFFD). */
|
||||
UBool containsFFFD;
|
||||
|
||||
/*
|
||||
* One bit per code point from U+0000..U+07FF.
|
||||
* The bits are organized vertically; consecutive code points
|
||||
* correspond to the same bit positions in consecutive table words.
|
||||
* With code point parts
|
||||
* lead=c{10..6}
|
||||
* trail=c{5..0}
|
||||
* it is set.contains(c)==(table7FF[trail] bit lead)
|
||||
*
|
||||
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
*/
|
||||
uint32_t table7FF[64];
|
||||
|
||||
/*
|
||||
* One bit per 64 BMP code points.
|
||||
* The bits are organized vertically; consecutive 64-code point blocks
|
||||
* correspond to the same bit position in consecutive table words.
|
||||
* With code point parts
|
||||
* lead=c{15..12}
|
||||
* t1=c{11..6}
|
||||
* test bits (lead+16) and lead in bmpBlockBits[t1].
|
||||
* If the upper bit is 0, then the lower bit indicates if contains(c)
|
||||
* for all code points in the 64-block.
|
||||
* If the upper bit is 1, then the block is mixed and set.contains(c)
|
||||
* must be called.
|
||||
*
|
||||
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
|
||||
* the result of contains(FFFD) for faster validity checking at runtime.
|
||||
*/
|
||||
uint32_t bmpBlockBits[64];
|
||||
|
||||
/*
|
||||
* Inversion list indexes for restricted binary searches in
|
||||
* findCodePoint(), from
|
||||
* findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
|
||||
* U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
|
||||
* always looked up in the bit tables.
|
||||
* The last pair of indexes is for finding supplementary code points.
|
||||
*/
|
||||
int32_t list4kStarts[18];
|
||||
|
||||
/*
|
||||
* The inversion list of the parent set, for the slower contains() implementation
|
||||
* for mixed BMP blocks and for supplementary code points.
|
||||
* The list is terminated with list[listLength-1]=0x110000.
|
||||
*/
|
||||
const int32_t *list;
|
||||
int32_t listLength;
|
||||
};
|
||||
|
||||
// Membership test via the inversion list: c is contained exactly when the
// index returned by findCodePoint(c, lo, hi) is odd.
inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {
    const int32_t index=findCodePoint(c, lo, hi);
    return (UBool)(index & 1);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
367
engine/thirdparty/icu4c/common/brkeng.cpp
vendored
Normal file
367
engine/thirdparty/icu4c/common/brkeng.cpp
vendored
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
************************************************************************************
|
||||
* Copyright (C) 2006-2016, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
************************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/rbbi.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "cmemory.h"
|
||||
#include "dictbe.h"
|
||||
#include "lstmbe.h"
|
||||
#include "charstr.h"
|
||||
#include "dictionarydata.h"
|
||||
#include "mutex.h"
|
||||
#include "uvector.h"
|
||||
#include "umutex.h"
|
||||
#include "uresimp.h"
|
||||
#include "ubrkimpl.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Nothing to initialize; the constructor is defined out of line in this file.
LanguageBreakEngine::LanguageBreakEngine() = default;
|
||||
|
||||
// Nothing to release; the virtual destructor is defined out of line in this file.
LanguageBreakEngine::~LanguageBreakEngine() = default;
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Nothing to initialize; the constructor is defined out of line in this file.
LanguageBreakFactory::LanguageBreakFactory() = default;
|
||||
|
||||
// Nothing to release; the virtual destructor is defined out of line in this file.
LanguageBreakFactory::~LanguageBreakFactory() = default;
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Starts with no handled-character set; it is allocated lazily by
// handleCharacter(). The status argument is accepted but not used.
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) : fHandled(nullptr) {
}
|
||||
|
||||
// Releases the lazily allocated set of handled characters (may be null).
UnhandledEngine::~UnhandledEngine() {
    UnicodeSet *owned = fHandled;
    fHandled = nullptr;
    delete owned;
}
|
||||
|
||||
// Returns true when a handled set exists and contains c.
// The locale argument is ignored by this engine.
UBool
UnhandledEngine::handles(UChar32 c, const char* /*locale*/) const {
    const bool known = (fHandled != nullptr) && fHandled->contains(c);
    return known;
}
|
||||
|
||||
// Skips over the run of handled characters that starts at startPos.
//
// The text is positioned at startPos and then advanced while the current
// position is before endPos and the current character is in fHandled.
// No breaks are ever added to the (unused) foundBreaks vector.
//
// @param text      The text to scan; left positioned after the handled run.
// @param startPos  Native index at which scanning starts.
// @param endPos    Native index at which scanning stops at the latest.
// @param status    In/out error code; nothing is done if it already failed.
// @return Always 0.
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t startPos,
                             int32_t endPos,
                             UVector32 &/*foundBreaks*/,
                             UBool /* isPhraseBreaking */,
                             UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    utext_setNativeIndex(text, startPos);
    if (fHandled == nullptr) {
        // No character has been registered yet (or allocating the set failed
        // in handleCharacter()), so there is no run to skip. handles() treats
        // a null set the same way; dereferencing it here would crash.
        return 0;
    }
    UChar32 c = utext_current32(text);
    while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
|
||||
|
||||
void
|
||||
UnhandledEngine::handleCharacter(UChar32 c) {
|
||||
if (fHandled == nullptr) {
|
||||
fHandled = new UnicodeSet();
|
||||
if (fHandled == nullptr) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (!fHandled->contains(c)) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// Apply the entire script of the character.
|
||||
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
|
||||
fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// The engine stack is created lazily by ensureEngines(); the status argument
// is accepted but not used.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/)
        : fEngines(nullptr) {
}
|
||||
|
||||
// Destroys the engine stack (if one was ever created). The stack is built in
// ensureEngines() with uprv_deleteUObject as its element deleter.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    delete fEngines;
}
|
||||
|
||||
// Lazily creates fEngines under a mutex. On allocation or construction
// failure, status is set and fEngines stays null.
void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);
    if (fEngines != nullptr) {
        return;  // already created
    }
    LocalPointer<UStack> engines(new UStack(uprv_deleteUObject, nullptr, status), status);
    if (U_FAILURE(status)) {
        return;  // the LocalPointer disposes of a partially built stack
    }
    fEngines = engines.orphan();
}
|
||||
|
||||
// Returns an engine that handles (c, locale), creating and caching one with
// loadEngineFor() if none of the cached engines does.
//
// The cache is searched from the most recently pushed engine downwards.
// Returns nullptr if the engine stack cannot be created, if no engine can be
// loaded, or if pushing a newly loaded engine fails.
//
// NOTE(review): gBreakEngineMutex below is a function-local static, so it is a
// different mutex object from the identically named ones declared in
// ensureEngines() and addExternalEngine(); confirm that this is intended.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
    UErrorCode status = U_ZERO_ERROR;
    ensureEngines(status);
    if (U_FAILURE(status) ) {
        // Note: no way to return error code to caller.
        return nullptr;
    }

    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);
    for (int32_t i = fEngines->size() - 1; i >= 0; --i) {
        const LanguageBreakEngine *cached =
            (const LanguageBreakEngine *)(fEngines->elementAt(i));
        if (cached == nullptr || !cached->handles(c, locale)) {
            continue;
        }
        return cached;
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c, locale);
    if (created != nullptr) {
        fEngines->push((void *)created, status);
    }
    return U_SUCCESS(status) ? created : nullptr;
}
|
||||
|
||||
// Creates a break engine for the script of c; the locale argument is unused.
//
// An LSTM engine is tried first. If no LSTM data exists for the script, or the
// LSTM engine cannot be created, a dictionary-based engine is built from the
// matcher returned by loadDictionaryMatcherFor(). Returns nullptr when the
// script cannot be determined, no dictionary is available, the script has no
// dictionary engine, or engine construction fails.
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
    UErrorCode status = U_ZERO_ERROR;
    const UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    const LanguageBreakEngine *engine = nullptr;
    // Try to use LSTM first
    const LSTMData *data = CreateLSTMDataForScript(code, status);
    if (U_SUCCESS(status) && data != nullptr) {
        engine = CreateLSTMBreakEngine(code, data, status);
        if (U_SUCCESS(status) && engine != nullptr) {
            return engine;
        }
        if (engine != nullptr) {
            // An engine object exists but its creation reported an error.
            delete engine;
            engine = nullptr;
        } else {
            // No engine was created, so the LSTM data is released here.
            DeleteLSTMData(data);
        }
    }

    status = U_ZERO_ERROR;  // fallback to dictionary based
    DictionaryMatcher *m = loadDictionaryMatcherFor(code);
    if (m == nullptr) {
        return nullptr;
    }

    switch(code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(m, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(m, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(m, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(m, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
            // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(m, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(m, kChineseJapanese, status);
        break;
#if 0
    // TODO: Have to get some characters with script=common handled
    // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    // them to CjkBreakEngine does not work. The engine has to
    // special-case them.
    case USCRIPT_COMMON:
    {
        UBlockCode block = ublock_getCode(code);
        if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
           engine = new CjkBreakEngine(dict, kChineseJapanese, status);
        break;
    }
#endif
#endif

    default:
        break;
    }

    if (engine == nullptr) {
        // No engine was constructed (unsupported script or allocation
        // failure), so the matcher is still ours to delete.
        delete m;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // Only the engine is deleted here; presumably it has adopted the
        // matcher -- confirm against the engine constructors.
        delete engine;
        return nullptr;
    }
    return engine;
}
|
||||
|
||||
// Loads the dictionary for a script and wraps it in a DictionaryMatcher.
//
// The dictionary file name is read from the "dictionaries" table of the root
// break-iterator resource bundle, keyed by the script's short name. The text
// after the last '.' is used as the data type and the text before it as the
// data name when opening the data file.
//
// @param script Script whose dictionary is wanted.
// @return A new matcher that was given the opened data file, or nullptr if the
//         resource or the data cannot be found, the trie type is not
//         recognized, or allocation fails.
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
    UErrorCode status = U_ZERO_ERROR;
    // open root from brkitr tree.
    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
    int32_t dictnlength = 0;
    const char16_t *dictfname =
        ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
    if (U_FAILURE(status)) {
        ures_close(b);
        return nullptr;
    }

    // Split "name.ext" at the last dot into data name and data type.
    CharString dictnbuf;
    CharString ext;
    const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
    if (extStart != nullptr) {
        const int32_t nameLength = (int32_t)(extStart - dictfname);
        ext.appendInvariantChars(
            UnicodeString(false, extStart + 1, dictnlength - nameLength - 1), status);
        dictnlength = nameLength;
    }
    dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status);
    ures_close(b);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning nullptr here will cause us to fail to find a dictionary break engine, as expected
        return nullptr;
    }

    // build trie
    const uint8_t *data = (const uint8_t *)udata_getMemory(file);
    const int32_t *indexes = (const int32_t *)data;
    const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = nullptr;
    if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
        const char *characters = (const char *)(data + offset);
        matcher = new BytesDictionaryMatcher(characters, transform, file);
    } else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        const char16_t *characters = (const char16_t *)(data + offset);
        matcher = new UCharsDictionaryMatcher(characters, file);
    }
    if (matcher == nullptr) {
        // no matcher exists to take ownership - either we are an invalid
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}
|
||||
|
||||
|
||||
// Adopts an external engine, wraps it in a BreakEngineWrapper and pushes the
// wrapper onto the engine stack.
//
// @param external Engine to adopt; the caller gives up ownership in all cases.
//                 A null pointer with a successful status is reported as an
//                 error through status by LocalPointer.
// @param status   In/out error code. On any failure the engine is deleted and
//                 nothing is registered.
void ICULanguageBreakFactory::addExternalEngine(
        ExternalBreakEngine* external, UErrorCode& status) {
    LocalPointer<ExternalBreakEngine> engine(external, status);
    ensureEngines(status);
    if (U_FAILURE(status)) {
        // fEngines may still be null here; bail out before it is dereferenced.
        // The LocalPointer above deletes the adopted engine.
        return;
    }
    LocalPointer<BreakEngineWrapper> wrapper(
        new BreakEngineWrapper(engine.orphan(), status), status);
    if (U_FAILURE(status)) {
        return;
    }
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);
    // Unchanged from the original: the stack was created with a deleter and is
    // presumed to take charge of the wrapper even when push() fails (confirm
    // against UStack::push), so the LocalPointer lets go unconditionally.
    fEngines->push(wrapper.getAlias(), status);
    wrapper.orphan();
}
|
||||
|
||||
// Stores the external engine in the owning `delegate` LocalPointer, forwarding
// status to the LocalPointer constructor.
BreakEngineWrapper::BreakEngineWrapper(
        ExternalBreakEngine* engine, UErrorCode &status)
        : delegate(engine, status) {
}
|
||||
|
||||
// The `delegate` LocalPointer member releases the wrapped engine.
BreakEngineWrapper::~BreakEngineWrapper() = default;
|
||||
|
||||
// Forwards the question to the wrapped engine's isFor(c, locale).
UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
    const UBool applies = delegate->isFor(c, locale);
    return applies;
}
|
||||
|
||||
// Finds breaks in the run of text that the wrapped external engine handles.
//
// The run starts at startPos and extends while the position is before endPos
// and delegate->handles() accepts the current character. Room for
// (rangeEnd - rangeStart + 1) entries is appended to foundBreaks, the delegate
// fills it through fillBreaks(), and the vector is then trimmed to the number
// of breaks actually reported.
//
// @param text        The text to scan; left positioned at the end of the run.
// @param startPos    Native index at which the run starts.
// @param endPos      Native index limiting the run.
// @param foundBreaks Receives the breaks, appended after its existing content.
// @param status      In/out error code. On failure foundBreaks keeps its
//                    original size and 0 is returned.
// @return The number of breaks appended.
int32_t BreakEngineWrapper::findBreaks(
    UText *text,
    int32_t startPos,
    int32_t endPos,
    UVector32 &foundBreaks,
    UBool /* isPhraseBreaking */,
    UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;

    // Find the span of characters handled by the delegate. The span begins at
    // startPos and extends towards endPos.
    utext_setNativeIndex(text, startPos);
    const int32_t rangeStart = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    UChar32 c = utext_current32(text);
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) {
        utext_next32(text);         // TODO:  recast loop for postincrement
        c = utext_current32(text);
    }
    const int32_t rangeEnd = current;

    const int32_t beforeSize = foundBreaks.size();
    const int32_t additionalCapacity = rangeEnd - rangeStart + 1;
    // enlarge to contain (rangeEnd-rangeStart+1) more items
    foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
    if (U_FAILURE(status)) return 0;
    // Expose exactly the scratch area handed to the delegate. The previous
    // code added beforeSize twice and grew the vector more than needed.
    foundBreaks.setSize(beforeSize + additionalCapacity);
    const int32_t result = delegate->fillBreaks(text, rangeStart, rangeEnd,
                                                foundBreaks.getBuffer()+beforeSize,
                                                additionalCapacity, status);
    if (U_FAILURE(status)) {
        // Drop the scratch entries so no bogus breaks are left behind.
        foundBreaks.setSize(beforeSize);
        return 0;
    }
    foundBreaks.setSize(beforeSize + result);
    utext_setNativeIndex(text, current);
    return result;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
324
engine/thirdparty/icu4c/common/brkeng.h
vendored
Normal file
324
engine/thirdparty/icu4c/common/brkeng.h
vendored
Normal file
|
|
@ -0,0 +1,324 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/**
|
||||
************************************************************************************
|
||||
* Copyright (C) 2006-2012, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
************************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef BRKENG_H
|
||||
#define BRKENG_H
|
||||
|
||||
#include "unicode/umisc.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UnicodeSet;
|
||||
class UStack;
|
||||
class UVector32;
|
||||
class DictionaryMatcher;
|
||||
class ExternalBreakEngine;
|
||||
|
||||
/*******************************************************************
|
||||
* LanguageBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>LanguageBreakEngines implement language-specific knowledge for
|
||||
* finding text boundaries within a run of characters belonging to a
|
||||
* specific set. The boundaries will be of a specific kind, e.g. word,
|
||||
* line, etc.</p>
|
||||
*
|
||||
* <p>LanguageBreakEngines should normally be implemented so as to
|
||||
* be shared between threads without locking.</p>
|
||||
*/
|
||||
class LanguageBreakEngine : public UObject {
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
*/
|
||||
LanguageBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~LanguageBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character for
|
||||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param locale The locale.
|
||||
* @return true if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c, const char* locale) const = 0;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The
|
||||
* iterator is left at the end of the run of characters which the engine
|
||||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param foundBreaks A Vector of int32_t to receive the breaks.
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode &status) const = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* BreakEngineWrapper
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>BreakEngineWrapper implement LanguageBreakEngine by
|
||||
* a thin wrapper that delegate the task to ExternalBreakEngine
|
||||
* </p>
|
||||
*/
|
||||
class BreakEngineWrapper : public LanguageBreakEngine {
|
||||
public:
|
||||
|
||||
BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
|
||||
|
||||
virtual ~BreakEngineWrapper();
|
||||
|
||||
virtual UBool handles(UChar32 c, const char* locale) const override;
|
||||
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode &status) const override;
|
||||
|
||||
private:
|
||||
LocalPointer<ExternalBreakEngine> delegate;
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* LanguageBreakFactory
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>LanguageBreakFactorys find and return a LanguageBreakEngine
|
||||
* that can determine breaks for characters in a specific set, if
|
||||
* such an object can be found.</p>
|
||||
*
|
||||
* <p>If a LanguageBreakFactory is to be shared between threads,
|
||||
* appropriate synchronization must be used; there is none internal
|
||||
* to the factory.</p>
|
||||
*
|
||||
* <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
|
||||
* normally be shared between threads without synchronization, unless
|
||||
* the specific subclass of LanguageBreakFactory indicates otherwise.</p>
|
||||
*
|
||||
* <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
|
||||
* it returns when it itself is deleted, unless the specific subclass of
|
||||
* LanguageBreakFactory indicates otherwise. Naturally, the factory should
|
||||
* not be deleted until the LanguageBreakEngines it has returned are no
|
||||
* longer needed.</p>
|
||||
*/
|
||||
class LanguageBreakFactory : public UMemory {
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
*/
|
||||
LanguageBreakFactory();
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~LanguageBreakFactory();
|
||||
|
||||
/**
|
||||
* <p>Find and return a LanguageBreakEngine that can find the desired
|
||||
* kind of break for the set of characters to which the supplied
|
||||
* character belongs. It is up to the set of available engines to
|
||||
* determine what the sets of characters are.</p>
|
||||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param locale The locale.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* UnhandledEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
|
||||
* handles characters that no other LanguageBreakEngine is available to
|
||||
* handle. It is told the character and the type of break; at its
|
||||
* discretion it may handle more than the specified character (e.g.,
|
||||
* the entire script to which that character belongs.</p>
|
||||
*
|
||||
* <p>UnhandledEngines may not be shared between threads without
|
||||
* external synchronization.</p>
|
||||
*/
|
||||
|
||||
class UnhandledEngine : public LanguageBreakEngine {
|
||||
private:
|
||||
|
||||
/**
|
||||
* The sets of characters handled.
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet *fHandled;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
*/
|
||||
UnhandledEngine(UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~UnhandledEngine();
|
||||
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character for
|
||||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param locale The locale.
|
||||
* @return true if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c, const char* locale) const override;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
*
|
||||
* @param text A UText representing the text (TODO: UText). The
|
||||
* iterator is left at the end of the run of characters which the engine
|
||||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param foundBreaks An allocated C array of the breaks found, if any
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode &status) const override;
|
||||
|
||||
/**
|
||||
* <p>Tell the engine to handle a particular character and break type.</p>
|
||||
*
|
||||
* @param c A character which the engine should handle
|
||||
*/
|
||||
virtual void handleCharacter(UChar32 c);
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* ICULanguageBreakFactory
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
|
||||
* ICU. It creates dictionary-based LanguageBreakEngines from dictionary
|
||||
* data in the ICU data file.</p>
|
||||
*/
|
||||
class ICULanguageBreakFactory : public LanguageBreakFactory {
|
||||
private:
|
||||
|
||||
/**
|
||||
* The stack of break engines created by this factory
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UStack *fEngines;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Standard constructor.</p>
|
||||
*
|
||||
*/
|
||||
ICULanguageBreakFactory(UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~ICULanguageBreakFactory();
|
||||
|
||||
/**
|
||||
* <p>Find and return a LanguageBreakEngine that can find the desired
|
||||
* kind of break for the set of characters to which the supplied
|
||||
* character belongs. It is up to the set of available engines to
|
||||
* determine what the sets of characters are.</p>
|
||||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param locale The locale.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
|
||||
|
||||
/**
|
||||
* Add and adopt the engine and return an URegistryKey.
|
||||
* @param engine The ExternalBreakEngine to be added and adopt. The caller
|
||||
* pass the ownership and should not release the memory after this.
|
||||
* @param status the error code.
|
||||
*/
|
||||
virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Create a LanguageBreakEngine for the set of characters to which
|
||||
* the supplied character belongs, for the specified break type.</p>
|
||||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param locale The locale.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
|
||||
|
||||
/**
|
||||
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
|
||||
* @param script An ISO 15924 script code that identifies the dictionary to be
|
||||
* created.
|
||||
* @return A DictionaryMatcher with the desired characteristics, or nullptr.
|
||||
*/
|
||||
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
|
||||
|
||||
private:
|
||||
void ensureEngines(UErrorCode& status);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* BRKENG_H */
|
||||
#endif
|
||||
547
engine/thirdparty/icu4c/common/brkiter.cpp
vendored
Normal file
547
engine/thirdparty/icu4c/common/brkiter.cpp
vendored
Normal file
|
|
@ -0,0 +1,547 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1997-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*
|
||||
* File brkiter.cpp
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
* Date Name Description
|
||||
* 02/18/97 aliu Converted from OpenClass. Added DONE.
|
||||
* 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
||||
// *****************************************************************************
|
||||
// This file was generated from the java source file BreakIterator.java
|
||||
// *****************************************************************************
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/filteredbrk.h"
|
||||
#include "bytesinkutil.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "cstring.h"
|
||||
#include "umutex.h"
|
||||
#include "servloc.h"
|
||||
#include "locbased.h"
|
||||
#include "uresimp.h"
|
||||
#include "uassert.h"
|
||||
#include "ubrkimpl.h"
|
||||
#include "utracimp.h"
|
||||
#include "charstr.h"
|
||||
|
||||
// *****************************************************************************
|
||||
// class BreakIterator
|
||||
// This class implements methods for finding the location of boundaries in text.
|
||||
// Instances of BreakIterator maintain a current position and scan over text
|
||||
// returning the index of characters where boundaries occur.
|
||||
// *****************************************************************************
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Builds a RuleBasedBreakIterator for a locale and a break type.
//
// The rules file name is read from boundaries/<type> in the locale's
// break-iterator resource bundle. The name is split at its first '.' into a
// data name and a data type, and the corresponding data file is opened and
// handed to the RuleBasedBreakIterator constructor.
//
// @param loc    Locale for which the iterator is requested.
// @param type   Key inside the "boundaries" table; a type containing "phrase"
//               turns on the iterator's phrase-breaking flag.
// @param status In/out error code. Set to U_BUFFER_OVERFLOW_ERROR when the
//               file name or its extension does not fit the local buffers, and
//               to U_MEMORY_ALLOCATION_ERROR when the iterator cannot be
//               allocated.
// @return The new iterator, or nullptr on failure.
//
// NOTE(review): when the resource string contains no '.', nothing is copied
// into fnbuff and an empty data name is used -- confirm that this is intended.
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
{
    char fnbuff[256];
    char ext[4]={'\0'};
    CharString actualLocale;
    // Initialized because ures_getString() may return without writing it when
    // status has already failed, and the value is inspected right afterwards.
    int32_t size = 0;
    const char16_t* brkfname = nullptr;
    UResourceBundle brkRulesStack;
    UResourceBundle brkNameStack;
    UResourceBundle *brkRules   = &brkRulesStack;
    UResourceBundle *brkName    = &brkNameStack;
    RuleBasedBreakIterator *result = nullptr;

    if (U_FAILURE(status))
        return nullptr;

    ures_initStackObject(brkRules);
    ures_initStackObject(brkName);

    // Get the locale
    UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);

    // Get the "boundaries" array.
    if (U_SUCCESS(status)) {
        brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
        // Get the string object naming the rules file
        brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
        // Get the actual string
        brkfname = ures_getString(brkName, &size, &status);
        U_ASSERT((size_t)size<sizeof(fnbuff));
        if ((size_t)size>=sizeof(fnbuff)) {
            size=0;
            if (U_SUCCESS(status)) {
                status = U_BUFFER_OVERFLOW_ERROR;
            }
        }

        // Use the string if we found it
        if (U_SUCCESS(status) && brkfname) {
            actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);

            char16_t* extStart=u_strchr(brkfname, 0x002e);
            int len = 0;
            if (extStart != nullptr){
                len = (int)(extStart-brkfname);
                // Copy only the characters that are really there and always
                // NUL-terminate. An extension too long for ext is reported
                // instead of leaving the buffer unterminated.
                const int32_t extLength = u_strlen(extStart+1);
                if (extLength >= (int32_t)sizeof(ext)) {
                    status = U_BUFFER_OVERFLOW_ERROR;
                } else {
                    u_UCharsToChars(extStart+1, ext, extLength);
                    ext[extLength] = 0;
                }
                u_UCharsToChars(brkfname, fnbuff, len);
            }
            fnbuff[len]=0; // nul terminate
        }
    }

    ures_close(brkRules);
    ures_close(brkName);

    UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
    if (U_FAILURE(status)) {
        ures_close(b);
        return nullptr;
    }

    // Create a RuleBasedBreakIterator
    result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);

    // If there is a result, set the valid locale and actual locale, and the kind
    if (U_SUCCESS(status) && result != nullptr) {
        U_LOCALE_BASED(locBased, *(BreakIterator*)result);

        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
                              actualLocale.data());
        uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
        result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
    }

    ures_close(b);

    if (U_FAILURE(status) && result != nullptr) {  // Sometimes redundant check, but simple
        delete result;
        return nullptr;
    }

    if (result == nullptr) {
        // No iterator took charge of the data file, so close it here.
        udata_close(file);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }

    return result;
}
|
||||
|
||||
// Creates a break iterator for word breaks.
|
||||
BreakIterator* U_EXPORT2
|
||||
BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
|
||||
{
|
||||
return createInstance(key, UBRK_WORD, status);
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for line breaks.
|
||||
BreakIterator* U_EXPORT2
|
||||
BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
|
||||
{
|
||||
return createInstance(key, UBRK_LINE, status);
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for character breaks.
|
||||
BreakIterator* U_EXPORT2
|
||||
BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
|
||||
{
|
||||
return createInstance(key, UBRK_CHARACTER, status);
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for sentence breaks.
|
||||
BreakIterator* U_EXPORT2
|
||||
BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
|
||||
{
|
||||
return createInstance(key, UBRK_SENTENCE, status);
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for title casing breaks.
|
||||
BreakIterator* U_EXPORT2
|
||||
BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
|
||||
{
|
||||
return createInstance(key, UBRK_TITLE, status);
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Gets all the available locales that has localized text boundary data.
|
||||
const Locale* U_EXPORT2
|
||||
BreakIterator::getAvailableLocales(int32_t& count)
|
||||
{
|
||||
return Locale::getAvailableLocales(count);
|
||||
}
|
||||
|
||||
// ------------------------------------------
|
||||
//
|
||||
// Constructors, destructor and assignment operator
|
||||
//
|
||||
//-------------------------------------------
|
||||
|
||||
// Default constructor: all three locale ID buffers start out as empty strings.
BreakIterator::BreakIterator()
{
    requestLocale[0] = 0;
    actualLocale[0] = 0;
    validLocale[0] = 0;
}
|
||||
|
||||
// Copy constructor: copies the three locale ID buffers from other.
BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
}
|
||||
|
||||
// Assignment: copies the three locale ID buffers from other; self-assignment
// is a no-op.
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    return *this;
}
|
||||
|
||||
// Nothing to release in the base class.
BreakIterator::~BreakIterator() = default;
|
||||
|
||||
// ------------------------------------------
|
||||
//
|
||||
// Registration
|
||||
//
|
||||
//-------------------------------------------
|
||||
#if !UCONFIG_NO_SERVICE
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
|
||||
public:
|
||||
virtual ~ICUBreakIteratorFactory();
|
||||
protected:
|
||||
virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override {
|
||||
return BreakIterator::makeInstance(loc, kind, status);
|
||||
}
|
||||
};
|
||||
|
||||
// Out-of-line destructor definition; nothing to clean up.
ICUBreakIteratorFactory::~ICUBreakIteratorFactory() = default;
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
class ICUBreakIteratorService : public ICULocaleService {
|
||||
public:
|
||||
ICUBreakIteratorService()
|
||||
: ICULocaleService(UNICODE_STRING("Break Iterator", 14))
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
registerFactory(new ICUBreakIteratorFactory(), status);
|
||||
}
|
||||
|
||||
virtual ~ICUBreakIteratorService();
|
||||
|
||||
virtual UObject* cloneInstance(UObject* instance) const override {
|
||||
return ((BreakIterator*)instance)->clone();
|
||||
}
|
||||
|
||||
virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override {
|
||||
LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key));
|
||||
int32_t kind = lkey.kind();
|
||||
Locale loc;
|
||||
lkey.currentLocale(loc);
|
||||
return BreakIterator::makeInstance(loc, kind, status);
|
||||
}
|
||||
|
||||
virtual UBool isDefault() const override {
|
||||
return countFactories() == 1;
|
||||
}
|
||||
};
|
||||
|
||||
// Out-of-line destructor definition; nothing to clean up.
ICUBreakIteratorService::~ICUBreakIteratorService() = default;
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// defined in ucln_cmn.h
|
||||
U_NAMESPACE_END
|
||||
|
||||
// One-time initialization guard: passed to umtx_initOnce() in getService() and reset in breakiterator_cleanup().
static icu::UInitOnce gInitOnceBrkiter {};
|
||||
// The break iterator service: allocated in initService(), deleted and nulled in breakiterator_cleanup().
static icu::ICULocaleService* gService = nullptr;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Release all static memory held by breakiterator.
|
||||
*/
|
||||
U_CDECL_BEGIN
|
||||
// Cleanup callback (registered by initService()): destroys the service object
// and resets the init-once guard so the service can be created again later.
// Always returns true.
static UBool U_CALLCONV breakiterator_cleanup() {
#if !UCONFIG_NO_SERVICE
    if (gService != nullptr) {
        delete gService;
        gService = nullptr;
    }
    gInitOnceBrkiter.reset();
#endif
    return true;
}
|
||||
U_CDECL_END
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
static void U_CALLCONV
|
||||
initService() {
|
||||
gService = new ICUBreakIteratorService();
|
||||
ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
|
||||
}
|
||||
|
||||
// Returns the break iterator service, running initService() through the
// init-once guard first. Callers check the result for nullptr.
static ICULocaleService*
getService()
{
    umtx_initOnce(gInitOnceBrkiter, &initService);
    ICULocaleService* const service = gService;
    return service;
}
|
||||
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Reports whether the service is in use: the init-once guard must have left
// its reset state and getService() must yield a non-null service.
// If the guard is still reset, getService() is not called at all.
static inline UBool
hasService()
{
    if (gInitOnceBrkiter.isReset()) {
        return false;
    }
    return getService() != nullptr;
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
/**
 * Registers a break iterator with the service for a locale and kind.
 *
 * @param toAdopt iterator handed to the service
 * @param locale  locale to register for
 * @param kind    break iterator type
 * @param status  set to U_MEMORY_ALLOCATION_ERROR if no service is available;
 *                otherwise passed to the service
 * @return the key returned by the service, or nullptr without a service
 */
URegistryKey U_EXPORT2
BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
{
    ICULocaleService* const svc = getService();
    if (svc != nullptr) {
        return svc->registerInstance(toAdopt, locale, kind, status);
    }
    status = U_MEMORY_ALLOCATION_ERROR;
    return nullptr;
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
/**
 * Removes a previous registration from the service.
 *
 * @param key    registry key to remove
 * @param status left untouched if already a failure; set to
 *               U_MEMORY_ALLOCATION_ERROR when there is no service
 * @return the service's result, or false if nothing was attempted
 */
UBool U_EXPORT2
BreakIterator::unregister(URegistryKey key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return false;
    }
    if (!hasService()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    return gService->unregister(key, status);
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
/**
 * Returns the service's enumeration of available locales,
 * or nullptr if the service could not be obtained.
 */
StringEnumeration* U_EXPORT2
BreakIterator::getAvailableLocales()
{
    ICULocaleService* const svc = getService();
    return (svc != nullptr) ? svc->getAvailableLocales() : nullptr;
}
|
||||
#endif /* UCONFIG_NO_SERVICE */
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
/**
 * Creates a break iterator of the given kind for a locale.
 * If the service is in use (see hasService()), the object comes from the
 * service; otherwise it is built directly with makeInstance().
 *
 * @param loc    requested locale
 * @param kind   break iterator type (UBRK_* value)
 * @param status ICU error code; on entry failure nothing is done
 * @return the new iterator, or nullptr if status was already a failure
 */
BreakIterator*
BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return nullptr;
    }

#if !UCONFIG_NO_SERVICE
    if (hasService()) {
        Locale resolvedLoc("");
        BreakIterator *fromService =
            static_cast<BreakIterator*>(gService->get(loc, kind, &resolvedLoc, status));
        // TODO: The way the service code works in ICU 2.8 is that if
        // there is a real registered break iterator, the actual locale
        // will be populated, but if the handleDefault path is taken
        // (because nothing is registered that can handle the
        // requested locale) then the actual locale comes back empty. In
        // that case, the returned object already has its actual/valid
        // locale data populated (by makeInstance, which is what
        // handleDefault calls), so we don't touch it. YES, A COMMENT
        // THIS LONG is a sign of bad code -- so the action item is to
        // revisit this in ICU 3.0 and clean it up/fix it/remove it.
        if (U_SUCCESS(status) && (fromService != nullptr) && *resolvedLoc.getName() != 0) {
            U_LOCALE_BASED(locBased, *fromService);
            locBased.setLocaleIDs(resolvedLoc.getName(), resolvedLoc.getName());
        }
        return fromService;
    }
#endif

    return makeInstance(loc, kind, status);
}
|
||||
|
||||
// -------------------------------------
|
||||
// Capacity (in chars) of the local buffers makeInstance() uses for the line rule name and the "ss" keyword value.
enum { kKeyValueLenMax = 32 };
|
||||
|
||||
/**
 * Builds a break iterator of the requested kind directly (without the
 * service), by calling buildInstance() with the rule name for that kind.
 *
 * - UBRK_LINE: the rule name starts as "line"; an "lb" keyword value of
 *   strict/normal/loose and, for languages "ja" and "ko" only, an "lw"
 *   keyword value of "phrase" are appended with '_' separators.
 * - UBRK_SENTENCE: if filtered break iteration is compiled in and the "ss"
 *   keyword is "standard", the iterator is passed through a
 *   FilteredBreakIteratorBuilder.
 * - Any other unknown kind sets U_ILLEGAL_ARGUMENT_ERROR.
 *
 * @return the iterator, or nullptr whenever status is a failure
 */
BreakIterator*
BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return nullptr;
    }

    BreakIterator *iter = nullptr;
    switch (kind) {
    case UBRK_CHARACTER:
        {
            UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
            iter = BreakIterator::buildInstance(loc, "grapheme", status);
            UTRACE_EXIT_STATUS(status);
        }
        break;
    case UBRK_WORD:
        {
            UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
            iter = BreakIterator::buildInstance(loc, "word", status);
            UTRACE_EXIT_STATUS(status);
        }
        break;
    case UBRK_LINE:
        {
            char ruleName[kKeyValueLenMax];
            UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
            uprv_strcpy(ruleName, "line");
            UErrorCode kvStatus = U_ZERO_ERROR;
            auto keyword = loc.getKeywordValue<CharString>("lb", kvStatus);
            if (U_SUCCESS(kvStatus)) {
                if (keyword == "strict" || keyword == "normal" || keyword == "loose") {
                    uprv_strcat(ruleName, "_");
                    uprv_strcat(ruleName, keyword.data());
                }
            }
            // lw=phrase is only supported in Japanese and Korean
            const char *language = loc.getLanguage();
            if (uprv_strcmp(language, "ja") == 0 || uprv_strcmp(language, "ko") == 0) {
                keyword = loc.getKeywordValue<CharString>("lw", kvStatus);
                if (U_SUCCESS(kvStatus) && keyword == "phrase") {
                    uprv_strcat(ruleName, "_");
                    uprv_strcat(ruleName, keyword.data());
                }
            }
            iter = BreakIterator::buildInstance(loc, ruleName, status);

            UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", ruleName);
            UTRACE_EXIT_STATUS(status);
        }
        break;
    case UBRK_SENTENCE:
        {
            UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
            iter = BreakIterator::buildInstance(loc, "sentence", status);
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
            char ssValue[kKeyValueLenMax] = {0};
            UErrorCode kvStatus = U_ZERO_ERROR;
            int32_t ssLength = loc.getKeywordValue("ss", ssValue, kKeyValueLenMax, kvStatus);
            if (U_SUCCESS(kvStatus) && ssLength > 0 && uprv_strcmp(ssValue, "standard") == 0) {
                FilteredBreakIteratorBuilder* builder =
                    FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
                if (U_SUCCESS(kvStatus)) {
                    // The builder receives the plain sentence iterator and
                    // its result replaces it.
                    iter = builder->build(iter, status);
                    delete builder;
                }
            }
#endif
            UTRACE_EXIT_STATUS(status);
        }
        break;
    case UBRK_TITLE:
        {
            UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
            iter = BreakIterator::buildInstance(loc, "title", status);
            UTRACE_EXIT_STATUS(status);
        }
        break;
    default:
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }

    return U_FAILURE(status) ? nullptr : iter;
}
|
||||
|
||||
/**
 * Returns one of this iterator's locales as a Locale object.
 * The requested locale is built from the requestLocale buffer; every other
 * type is answered by LocaleBased from the valid/actual buffers.
 */
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocale(type, status);
    }
    return Locale(requestLocale);
}
|
||||
|
||||
const char *
|
||||
BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
|
||||
if (type == ULOC_REQUESTED_LOCALE) {
|
||||
return requestLocale;
|
||||
}
|
||||
U_LOCALE_BASED(locBased, *this);
|
||||
return locBased.getLocaleID(type, status);
|
||||
}
|
||||
|
||||
|
||||
// This implementation of getRuleStatus is a do-nothing stub, here to
|
||||
// provide a default implementation for any derived BreakIterator classes that
|
||||
// do not implement it themselves.
|
||||
/** Default stub for subclasses that do not track rule status: always 0. */
int32_t BreakIterator::getRuleStatus() const {
    const int32_t defaultRuleStatus = 0;
    return defaultRuleStatus;
}
|
||||
|
||||
// This implementation of getRuleStatusVec is a do-nothing stub, here to
|
||||
// provide a default implementation for any derived BreakIterator classes that
|
||||
// do not implement it themselves.
|
||||
/**
 * Default stub: reports a single rule status value of 0.
 *
 * @param fillInVec receives the one value when capacity >= 1
 * @param capacity  number of slots available in fillInVec
 * @param status    set to U_BUFFER_OVERFLOW_ERROR when capacity < 1
 * @return 0 if status was already a failure on entry, otherwise 1
 */
int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return 0;
    }
    if (capacity >= 1) {
        *fillInVec = 0;
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return 1;
}
|
||||
|
||||
/**
 * Constructs an iterator with the given valid and actual locales.
 *
 * The requested-locale buffer is explicitly set to the empty string here:
 * setLocaleIDs() below is only given the valid and actual locales, while
 * getLocaleID(ULOC_REQUESTED_LOCALE) hands the requestLocale buffer straight
 * back to the caller, so it must never be left uninitialized.
 *
 * @param valid  the valid locale to record
 * @param actual the actual locale to record
 */
BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
    *requestLocale = 0;
    U_LOCALE_BASED(locBased, (*this));
    locBased.setLocaleIDs(valid, actual);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
//eof
|
||||
161
engine/thirdparty/icu4c/common/bytesinkutil.cpp
vendored
Normal file
161
engine/thirdparty/icu4c/common/bytesinkutil.cpp
vendored
Normal file
|
|
@ -0,0 +1,161 @@
|
|||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// bytesinkutil.cpp
|
||||
// created: 2017sep14 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "bytesinkutil.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Converts the UTF-16 text (s16, s16Length) to UTF-8, appends it to the sink
 * and, if edits is non-null, records that `length` source bytes were
 * replaced by the number of bytes written.
 *
 * The text is converted piecewise into buffers obtained from the sink
 * (falling back to a 200-byte local scratch buffer).
 *
 * @return false if errorCode was already a failure or the running byte
 *         count would exceed INT32_MAX (U_INDEX_OUTOFBOUNDS_ERROR);
 *         true otherwise
 */
UBool
ByteSinkUtil::appendChange(int32_t length, const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return false; }
    char scratch[200];
    int32_t totalBytes = 0;
    int32_t srcIndex = 0;
    while (srcIndex < s16Length) {
        // Capacity hint: remaining code units, scaled up where that cannot overflow.
        int32_t hint = s16Length - srcIndex;
        if (hint < (INT32_MAX / 3)) {
            hint *= 3;  // max 3 UTF-8 bytes per UTF-16 code unit
        } else if (hint < (INT32_MAX / 2)) {
            hint *= 2;
        } else {
            hint = INT32_MAX;
        }
        int32_t usable;
        char *out = sink.GetAppendBuffer(U8_MAX_LENGTH, hint,
                                         scratch, UPRV_LENGTHOF(scratch), &usable);
        // Keep headroom so one more full-length sequence always fits.
        usable -= U8_MAX_LENGTH - 1;
        int32_t written = 0;
        while (srcIndex < s16Length && written < usable) {
            UChar32 c;
            U16_NEXT_UNSAFE(s16, srcIndex, c);
            U8_APPEND_UNSAFE(out, written, c);
        }
        if (written > (INT32_MAX - totalBytes)) {
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return false;
        }
        sink.Append(out, written);
        totalBytes += written;
    }
    if (edits != nullptr) {
        edits->addReplace(length, totalBytes);
    }
    return true;
}
|
||||
|
||||
/**
 * Pointer-range overload: the source length is limit - s.
 * Fails with U_INDEX_OUTOFBOUNDS_ERROR if that distance exceeds INT32_MAX;
 * otherwise forwards to the length-based appendChange().
 */
UBool
ByteSinkUtil::appendChange(const uint8_t *s, const uint8_t *limit,
                           const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return false; }
    const auto srcLength = limit - s;
    if (srcLength > INT32_MAX) {
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }
    return appendChange(static_cast<int32_t>(srcLength), s16, s16Length, sink, edits, errorCode);
}
|
||||
|
||||
void
|
||||
ByteSinkUtil::appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits) {
|
||||
char s8[U8_MAX_LENGTH];
|
||||
int32_t s8Length = 0;
|
||||
U8_APPEND_UNSAFE(s8, s8Length, c);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(length, s8Length);
|
||||
}
|
||||
sink.Append(s8, s8Length);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// See unicode/utf8.h U8_APPEND_UNSAFE().
|
||||
inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
|
||||
inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
|
||||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) {
|
||||
U_ASSERT(0x80 <= c && c <= 0x7ff); // 2-byte UTF-8
|
||||
char s8[2] = { (char)getTwoByteLead(c), (char)getTwoByteTrail(c) };
|
||||
sink.Append(s8, 2);
|
||||
}
|
||||
|
||||
void
|
||||
ByteSinkUtil::appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
|
||||
ByteSink &sink, uint32_t options, Edits *edits) {
|
||||
U_ASSERT(length > 0);
|
||||
if (edits != nullptr) {
|
||||
edits->addUnchanged(length);
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
sink.Append(reinterpret_cast<const char *>(s), length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Pointer-range overload for unchanged text [s, limit[.
 * Fails with U_INDEX_OUTOFBOUNDS_ERROR if the range is longer than
 * INT32_MAX; an empty (or negative-length) range is accepted and ignored.
 */
UBool
ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit,
                              ByteSink &sink, uint32_t options, Edits *edits,
                              UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return false; }
    const auto distance = limit - s;
    if (distance > INT32_MAX) {
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }
    const int32_t length = static_cast<int32_t>(distance);
    if (length > 0) {
        appendNonEmptyUnchanged(s, length, sink, options, edits);
    }
    return true;
}
|
||||
|
||||
/** Binds the sink to *dest; dest is dereferenced immediately. */
CharStringByteSink::CharStringByteSink(CharString* dest)
        : dest_(*dest)
{
}
|
||||
|
||||
// The sink only refers to the destination string, so the default destructor suffices.
CharStringByteSink::~CharStringByteSink() = default;
|
||||
|
||||
void
|
||||
CharStringByteSink::Append(const char* bytes, int32_t n) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
dest_.append(bytes, n, status);
|
||||
// Any errors are silently ignored.
|
||||
}
|
||||
|
||||
char*
|
||||
CharStringByteSink::GetAppendBuffer(int32_t min_capacity,
|
||||
int32_t desired_capacity_hint,
|
||||
char* scratch,
|
||||
int32_t scratch_capacity,
|
||||
int32_t* result_capacity) {
|
||||
if (min_capacity < 1 || scratch_capacity < min_capacity) {
|
||||
*result_capacity = 0;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
char* result = dest_.getAppendBuffer(
|
||||
min_capacity,
|
||||
desired_capacity_hint,
|
||||
*result_capacity,
|
||||
status);
|
||||
if (U_SUCCESS(status)) {
|
||||
return result;
|
||||
}
|
||||
|
||||
*result_capacity = scratch_capacity;
|
||||
return scratch;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
156
engine/thirdparty/icu4c/common/bytesinkutil.h
vendored
Normal file
156
engine/thirdparty/icu4c/common/bytesinkutil.h
vendored
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// bytesinkutil.h
|
||||
// created: 2017sep14 Markus W. Scherer
|
||||
|
||||
#ifndef BYTESINKUTIL_H
|
||||
#define BYTESINKUTIL_H
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class ByteSink;
|
||||
class Edits;
|
||||
|
||||
/**
 * ByteSink implementation that writes into a CharString.
 * The sink holds a reference to the destination; it is neither
 * default-constructible nor copyable.
 */
class U_COMMON_API CharStringByteSink : public ByteSink {
public:
    CharStringByteSink() = delete;
    CharStringByteSink(const CharStringByteSink&) = delete;
    CharStringByteSink& operator=(const CharStringByteSink&) = delete;

    /** @param dest destination string; dereferenced by the constructor */
    CharStringByteSink(CharString* dest);
    ~CharStringByteSink() override;

    /** Appends n bytes to the destination string. */
    void Append(const char* bytes, int32_t n) override;

    /** Returns a buffer for the caller to write into before calling Append(). */
    char* GetAppendBuffer(int32_t min_capacity,
                          int32_t desired_capacity_hint,
                          char* scratch,
                          int32_t scratch_capacity,
                          int32_t* result_capacity) override;

private:
    CharString& dest_;
};
|
||||
|
||||
// CharString doesn't provide the public API that StringByteSink requires a
|
||||
// string class to have so this template specialization replaces the default
|
||||
// implementation of StringByteSink<CharString> with CharStringByteSink.
|
||||
template<>
|
||||
class StringByteSink<CharString> : public CharStringByteSink {
|
||||
public:
|
||||
StringByteSink(CharString* dest) : CharStringByteSink(dest) { }
|
||||
StringByteSink(CharString* dest, int32_t /*initialAppendCapacity*/) : CharStringByteSink(dest) { }
|
||||
};
|
||||
|
||||
class U_COMMON_API ByteSinkUtil {
|
||||
public:
|
||||
ByteSinkUtil() = delete; // all static
|
||||
|
||||
/** (length) bytes were mapped to valid (s16, s16Length). */
|
||||
static UBool appendChange(int32_t length,
|
||||
const char16_t *s16, int32_t s16Length,
|
||||
ByteSink &sink, Edits *edits, UErrorCode &errorCode);
|
||||
|
||||
/** The bytes at [s, limit[ were mapped to valid (s16, s16Length). */
|
||||
static UBool appendChange(const uint8_t *s, const uint8_t *limit,
|
||||
const char16_t *s16, int32_t s16Length,
|
||||
ByteSink &sink, Edits *edits, UErrorCode &errorCode);
|
||||
|
||||
/** (length) bytes were mapped/changed to valid code point c. */
|
||||
static void appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits = nullptr);
|
||||
|
||||
/** The few bytes at [src, nextSrc[ were mapped/changed to valid code point c. */
|
||||
static inline void appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c,
|
||||
ByteSink &sink, Edits *edits = nullptr) {
|
||||
appendCodePoint((int32_t)(nextSrc - src), c, sink, edits);
|
||||
}
|
||||
|
||||
/** Append the two-byte character (U+0080..U+07FF). */
|
||||
static void appendTwoBytes(UChar32 c, ByteSink &sink);
|
||||
|
||||
static UBool appendUnchanged(const uint8_t *s, int32_t length,
|
||||
ByteSink &sink, uint32_t options, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
if (length > 0) { appendNonEmptyUnchanged(s, length, sink, options, edits); }
|
||||
return true;
|
||||
}
|
||||
|
||||
static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit,
|
||||
ByteSink &sink, uint32_t options, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Calls a lambda that writes to a ByteSink with a CheckedArrayByteSink
|
||||
* and then returns through u_terminateChars(), in order to implement
|
||||
* the classic ICU4C C API writing to a fix sized buffer on top of a
|
||||
* contemporary C++ API.
|
||||
*
|
||||
* @param buffer receiving buffer
|
||||
* @param capacity capacity of receiving buffer
|
||||
* @param lambda that gets called with the sink as an argument
|
||||
* @param status set to U_BUFFER_OVERFLOW_ERROR on overflow
|
||||
* @return number of bytes written, or needed (in case of overflow)
|
||||
* @internal
|
||||
*/
|
||||
template <typename F,
|
||||
typename = std::enable_if_t<
|
||||
std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
|
||||
static int32_t viaByteSinkToTerminatedChars(char* buffer, int32_t capacity,
|
||||
F&& lambda,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) { return 0; }
|
||||
CheckedArrayByteSink sink(buffer, capacity);
|
||||
lambda(sink, status);
|
||||
if (U_FAILURE(status)) { return 0; }
|
||||
|
||||
int32_t reslen = sink.NumberOfBytesAppended();
|
||||
|
||||
if (sink.Overflowed()) {
|
||||
status = U_BUFFER_OVERFLOW_ERROR;
|
||||
return reslen;
|
||||
}
|
||||
|
||||
return u_terminateChars(buffer, capacity, reslen, &status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls a lambda that writes to a ByteSink with a CharStringByteSink and
|
||||
* then returns a CharString, in order to implement a contemporary C++ API
|
||||
* on top of a C/C++ compatibility ByteSink API.
|
||||
*
|
||||
* @param lambda that gets called with the sink as an argument
|
||||
* @param status to check and report
|
||||
* @return the resulting string, or an empty string (in case of error)
|
||||
* @internal
|
||||
*/
|
||||
template <typename F,
|
||||
typename = std::enable_if_t<
|
||||
std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
|
||||
static CharString viaByteSinkToCharString(F&& lambda, UErrorCode& status) {
|
||||
if (U_FAILURE(status)) { return {}; }
|
||||
CharString result;
|
||||
CharStringByteSink sink(&result);
|
||||
lambda(sink, status);
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
|
||||
ByteSink &sink, uint32_t options, Edits *edits);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif //BYTESINKUTIL_H
|
||||
85
engine/thirdparty/icu4c/common/bytestream.cpp
vendored
Normal file
85
engine/thirdparty/icu4c/common/bytestream.cpp
vendored
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
// Copyright (C) 2009-2011, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//
|
||||
// Copyright 2007 Google Inc. All Rights Reserved.
|
||||
// Author: sanjay@google.com (Sanjay Ghemawat)
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Out-of-line destructor definition; the base class has nothing to release.
ByteSink::~ByteSink() = default;
|
||||
|
||||
/**
 * Default implementation: always hands back the caller's scratch buffer.
 *
 * @return scratch with *result_capacity = scratch_capacity, or nullptr with
 *         *result_capacity = 0 when min_capacity < 1 or the scratch buffer
 *         is smaller than min_capacity
 */
char* ByteSink::GetAppendBuffer(int32_t min_capacity,
                                int32_t /*desired_capacity_hint*/,
                                char* scratch, int32_t scratch_capacity,
                                int32_t* result_capacity) {
    const bool usable = (min_capacity >= 1) && (scratch_capacity >= min_capacity);
    if (usable) {
        *result_capacity = scratch_capacity;
        return scratch;
    }
    *result_capacity = 0;
    return nullptr;
}
|
||||
|
||||
/** Default implementation: intentionally does nothing. */
void ByteSink::Flush() {
}
|
||||
|
||||
/**
 * Wraps a caller-provided output array. A negative capacity is treated as 0;
 * all counters start at zero and the overflow flag is clear.
 */
CheckedArrayByteSink::CheckedArrayByteSink(char* outbuf, int32_t capacity)
        : outbuf_(outbuf),
          capacity_(capacity > 0 ? capacity : 0),
          size_(0),
          appended_(0),
          overflowed_(false)
{
}
|
||||
|
||||
// The output array belongs to the caller, so there is nothing to release.
CheckedArrayByteSink::~CheckedArrayByteSink() = default;
|
||||
|
||||
/**
 * Returns the sink to its freshly constructed state (same buffer and
 * capacity, counters zeroed, overflow flag cleared).
 *
 * @return *this
 */
CheckedArrayByteSink& CheckedArrayByteSink::Reset() {
    appended_ = 0;
    size_ = 0;
    overflowed_ = false;
    return *this;
}
|
||||
|
||||
void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
|
||||
if (n <= 0) {
|
||||
return;
|
||||
}
|
||||
if (n > (INT32_MAX - appended_)) {
|
||||
// TODO: Report as integer overflow, not merely buffer overflow.
|
||||
appended_ = INT32_MAX;
|
||||
overflowed_ = true;
|
||||
return;
|
||||
}
|
||||
appended_ += n;
|
||||
int32_t available = capacity_ - size_;
|
||||
if (n > available) {
|
||||
n = available;
|
||||
overflowed_ = true;
|
||||
}
|
||||
if (n > 0 && bytes != (outbuf_ + size_)) {
|
||||
uprv_memcpy(outbuf_ + size_, bytes, n);
|
||||
}
|
||||
size_ += n;
|
||||
}
|
||||
|
||||
/**
 * Returns the unused tail of the output array when it can hold at least
 * min_capacity bytes; otherwise returns the caller's scratch buffer.
 *
 * @return nullptr (and *result_capacity = 0) when min_capacity < 1 or the
 *         scratch buffer is smaller than min_capacity
 */
char* CheckedArrayByteSink::GetAppendBuffer(int32_t min_capacity,
                                            int32_t /*desired_capacity_hint*/,
                                            char* scratch,
                                            int32_t scratch_capacity,
                                            int32_t* result_capacity) {
    if (min_capacity < 1 || scratch_capacity < min_capacity) {
        *result_capacity = 0;
        return nullptr;
    }
    const int32_t room = capacity_ - size_;
    if (room < min_capacity) {
        *result_capacity = scratch_capacity;
        return scratch;
    }
    *result_capacity = room;
    return outbuf_ + size_;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
441
engine/thirdparty/icu4c/common/bytestrie.cpp
vendored
Normal file
441
engine/thirdparty/icu4c/common/bytestrie.cpp
vendored
Normal file
|
|
@ -0,0 +1,441 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytestrie.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/** Releases the array held in ownedArray_ via uprv_free(). */
BytesTrie::~BytesTrie()
{
    uprv_free(ownedArray_);
}
|
||||
|
||||
// lead byte already shifted right by 1.
|
||||
/**
 * Decodes a value whose lead byte (already shifted right by 1) is leadByte
 * and whose remaining bytes, if any, start at pos. Depending on the lead
 * byte range, zero to four following bytes are combined big-endian.
 */
int32_t
BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
    if(leadByte<kMinTwoByteValueLead) {
        return leadByte-kMinOneByteValueLead;
    }
    if(leadByte<kMinThreeByteValueLead) {
        return ((leadByte-kMinTwoByteValueLead)<<8)|*pos;
    }
    if(leadByte<kFourByteValueLead) {
        return ((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
    }
    if(leadByte==kFourByteValueLead) {
        return (pos[0]<<16)|(pos[1]<<8)|pos[2];
    }
    return (pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
}
|
||||
|
||||
/**
 * Reads the variable-length delta stored at pos and returns the position
 * that lies delta bytes past the end of the encoded delta.
 */
const uint8_t *
BytesTrie::jumpByDelta(const uint8_t *pos) {
    const int32_t lead=*pos++;
    int32_t delta;
    if(lead<kMinTwoByteDeltaLead) {
        // Single-byte delta: the lead byte is the value.
        delta=lead;
    } else if(lead<kMinThreeByteDeltaLead) {
        delta=((lead-kMinTwoByteDeltaLead)<<8)|pos[0];
        pos+=1;
    } else if(lead<kFourByteDeltaLead) {
        delta=((lead-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
        pos+=2;
    } else if(lead==kFourByteDeltaLead) {
        delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
        pos+=3;
    } else {
        delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
        pos+=4;
    }
    return pos+delta;
}
|
||||
|
||||
/**
 * Reports the result for the current trie position without consuming input:
 * NO_MATCH when the trie is stopped (pos_ is null), a value result when not
 * inside a linear-match node and the current node is a value node, and
 * NO_VALUE otherwise.
 */
UStringTrieResult
BytesTrie::current() const {
    const uint8_t *here=pos_;
    if(here==nullptr) {
        return USTRINGTRIE_NO_MATCH;
    }
    if(remainingMatchLength_>=0) {
        // Still in the middle of a linear-match node.
        return USTRINGTRIE_NO_VALUE;
    }
    const int32_t node=*here;
    return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
}
|
||||
|
||||
/**
 * Selects the branch of a branch node that matches inByte.
 *
 * @param pos    position just after the branch node's lead byte
 * @param length branch length from the lead byte; 0 means the real length
 *               is stored in the next byte
 * @param inByte the input byte to match
 * @return the result at the selected branch target (pos_ is updated), or
 *         USTRINGTRIE_NO_MATCH after stop() if no branch matches
 */
UStringTrieResult
BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
    // Branch according to the current byte.
    if(length==0) {
        length=*pos++;
    }
    ++length;
    // The length of the branch is the number of bytes to select from.
    // The data structure encodes a binary search.
    while(length>kMaxBranchLinearSubNodeLength) {
        const int32_t splitByte=*pos++;
        if(inByte<splitByte) {
            length>>=1;
            pos=jumpByDelta(pos);
        } else {
            length=length-(length>>1);
            pos=skipDelta(pos);
        }
    }
    // Drop down to linear search for the last few bytes.
    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
    // and divides length by 2.
    do {
        const int32_t candidate=*pos++;
        if(inByte!=candidate) {
            --length;
            pos=skipValue(pos);
            continue;
        }
        int32_t node=*pos;
        U_ASSERT(node>=kMinValueLead);
        if(node&kValueIsFinal) {
            // Leave the final value for getValue() to read.
            pos_=pos;
            return USTRINGTRIE_FINAL_VALUE;
        }
        // Use the non-final value as the jump delta.
        ++pos;
        // Inlined equivalent of: int32_t delta=readValue(pos, node>>1);
        node>>=1;
        int32_t delta;
        if(node<kMinTwoByteValueLead) {
            delta=node-kMinOneByteValueLead;
        } else if(node<kMinThreeByteValueLead) {
            delta=((node-kMinTwoByteValueLead)<<8)|pos[0];
            pos+=1;
        } else if(node<kFourByteValueLead) {
            delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
            pos+=2;
        } else if(node==kFourByteValueLead) {
            delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
            pos+=3;
        } else {
            delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
            pos+=4;
        }
        // end readValue()
        pos+=delta;
        node=*pos;
        pos_=pos;
        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
    } while(length>1);
    // Last candidate: it has no value/delta of its own to skip.
    if(inByte!=*pos++) {
        stop();
        return USTRINGTRIE_NO_MATCH;
    }
    pos_=pos;
    const int32_t node=*pos;
    return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
}
|
||||
|
||||
/**
 * Matches inByte starting at the node at pos.
 * Branch nodes are delegated to branchNext(); for a linear-match node the
 * first byte is compared; intermediate (non-final) values are skipped.
 * On any mismatch, or on a final value, the trie is stopped and
 * USTRINGTRIE_NO_MATCH is returned.
 */
UStringTrieResult
BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
    for(;;) {
        int32_t node=*pos++;
        if(node<kMinLinearMatch) {
            return branchNext(pos, node, inByte);
        }
        if(node<kMinValueLead) {
            // Match the first of length+1 bytes.
            int32_t remaining=node-kMinLinearMatch;  // Actual match length minus 1.
            if(inByte!=*pos++) {
                // No match.
                stop();
                return USTRINGTRIE_NO_MATCH;
            }
            remainingMatchLength_=--remaining;
            pos_=pos;
            if(remaining<0 && (node=*pos)>=kMinValueLead) {
                return valueResult(node);
            }
            return USTRINGTRIE_NO_VALUE;
        }
        if(node&kValueIsFinal) {
            // No further matching bytes.
            stop();
            return USTRINGTRIE_NO_MATCH;
        }
        // Skip intermediate value.
        pos=skipValue(pos, node);
        // The next node must not also be a value node.
        U_ASSERT(*pos<kMinValueLead);
    }
}
|
||||
|
||||
/**
 * Advances the trie by one input byte.
 *
 * @param inByte the next byte; a negative value has 0x100 added to it
 * @return USTRINGTRIE_NO_MATCH if the trie is already stopped or the byte
 *         does not continue a match; otherwise the result at the new position
 */
UStringTrieResult
BytesTrie::next(int32_t inByte) {
    const uint8_t *pos=pos_;
    if(pos==nullptr) {
        return USTRINGTRIE_NO_MATCH;
    }
    if(inByte<0) {
        inByte+=0x100;
    }
    int32_t remaining=remainingMatchLength_;  // Actual remaining match length minus 1.
    if(remaining<0) {
        // Not inside a linear-match node: interpret the node at pos.
        return nextImpl(pos, inByte);
    }
    // Remaining part of a linear-match node.
    if(inByte!=*pos++) {
        stop();
        return USTRINGTRIE_NO_MATCH;
    }
    remainingMatchLength_=--remaining;
    pos_=pos;
    int32_t node;
    if(remaining<0 && (node=*pos)>=kMinValueLead) {
        return valueResult(node);
    }
    return USTRINGTRIE_NO_VALUE;
}
|
||||
|
||||
// Advances the trie state by a whole byte sequence.
//   s        input bytes
//   sLength  number of bytes in s, or a negative value if s is NUL-terminated
// Returns current() for empty input, USTRINGTRIE_NO_MATCH if the trie was
// already stopped or some byte does not match, and otherwise the result for
// the state reached after the last input byte.
//
// Each input byte is read as an unsigned value (0..0xff): the trie bytes it is
// compared against are uint8_t, and next(int32_t) likewise maps negative
// inputs into 0..0xff. Reading through plain char would turn bytes >=0x80 into
// negative ints on platforms where char is signed, so they could never match.
UStringTrieResult
BytesTrie::next(const char *s, int32_t sLength) {
    if(sLength<0 ? *s==0 : sLength==0) {
        // Empty input.
        return current();
    }
    const uint8_t *pos=pos_;
    if(pos==nullptr) {
        return USTRINGTRIE_NO_MATCH;
    }
    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    for(;;) {
        // Fetch the next input byte, if there is one.
        // Continue a linear-match node without rechecking sLength<0.
        int32_t inByte;
        if(sLength<0) {
            for(;;) {
                if((inByte=static_cast<uint8_t>(*s++))==0) {
                    // End of NUL-terminated input: save the state and report it.
                    remainingMatchLength_=length;
                    pos_=pos;
                    int32_t node;
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
                            valueResult(node) : USTRINGTRIE_NO_VALUE;
                }
                if(length<0) {
                    // Not (or no longer) inside a linear-match node.
                    remainingMatchLength_=length;
                    break;
                }
                if(inByte!=*pos) {
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                ++pos;
                --length;
            }
        } else {
            for(;;) {
                if(sLength==0) {
                    // End of counted input: save the state and report it.
                    remainingMatchLength_=length;
                    pos_=pos;
                    int32_t node;
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
                            valueResult(node) : USTRINGTRIE_NO_VALUE;
                }
                inByte=static_cast<uint8_t>(*s++);
                --sLength;
                if(length<0) {
                    // Not (or no longer) inside a linear-match node.
                    remainingMatchLength_=length;
                    break;
                }
                if(inByte!=*pos) {
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                ++pos;
                --length;
            }
        }
        // inByte now holds an input byte that must be matched against the node at pos.
        for(;;) {
            int32_t node=*pos++;
            if(node<kMinLinearMatch) {
                UStringTrieResult result=branchNext(pos, node, inByte);
                if(result==USTRINGTRIE_NO_MATCH) {
                    return USTRINGTRIE_NO_MATCH;
                }
                // Fetch the next input byte, if there is one.
                if(sLength<0) {
                    if((inByte=static_cast<uint8_t>(*s++))==0) {
                        return result;
                    }
                } else {
                    if(sLength==0) {
                        return result;
                    }
                    inByte=static_cast<uint8_t>(*s++);
                    --sLength;
                }
                if(result==USTRINGTRIE_FINAL_VALUE) {
                    // No further matching bytes.
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
            } else if(node<kMinValueLead) {
                // Match length+1 bytes.
                length=node-kMinLinearMatch;  // Actual match length minus 1.
                if(inByte!=*pos) {
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                ++pos;
                --length;
                break;  // Continue the linear match in the outer loop.
            } else if(node&kValueIsFinal) {
                // No further matching bytes.
                stop();
                return USTRINGTRIE_NO_MATCH;
            } else {
                // Skip intermediate value.
                pos=skipValue(pos, node);
                // The next node must not also be a value node.
                U_ASSERT(*pos<kMinValueLead);
            }
        }
    }
}
|
||||
|
||||
const uint8_t *
|
||||
BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
|
||||
UBool haveUniqueValue, int32_t &uniqueValue) {
|
||||
while(length>kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison byte
|
||||
if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
|
||||
return nullptr;
|
||||
}
|
||||
length=length-(length>>1);
|
||||
pos=skipDelta(pos);
|
||||
}
|
||||
do {
|
||||
++pos; // ignore a comparison byte
|
||||
// handle its value
|
||||
int32_t node=*pos++;
|
||||
UBool isFinal=(UBool)(node&kValueIsFinal);
|
||||
int32_t value=readValue(pos, node>>1);
|
||||
pos=skipValue(pos, node);
|
||||
if(isFinal) {
|
||||
if(haveUniqueValue) {
|
||||
if(value!=uniqueValue) {
|
||||
return nullptr;
|
||||
}
|
||||
} else {
|
||||
uniqueValue=value;
|
||||
haveUniqueValue=true;
|
||||
}
|
||||
} else {
|
||||
if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
|
||||
return nullptr;
|
||||
}
|
||||
haveUniqueValue=true;
|
||||
}
|
||||
} while(--length>1);
|
||||
return pos+1; // ignore the last comparison byte
|
||||
}
|
||||
|
||||
// Walks the trie from pos and checks that every value encountered is identical.
// uniqueValue is compared against when haveUniqueValue is true, and assigned
// from the first value found otherwise.
// Returns true when a final value is reached without any mismatch.
UBool
BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    for(;;) {
        int32_t lead=*pos++;
        if(lead>=kMinValueLead) {
            // Value node: compare with, or record, the unique value.
            const int32_t found=readValue(pos, lead>>1);
            if(!haveUniqueValue) {
                uniqueValue=found;
                haveUniqueValue=true;
            } else if(found!=uniqueValue) {
                return false;
            }
            if((lead&kValueIsFinal)!=0) {
                return true;
            }
            pos=skipValue(pos, lead);
            continue;
        }
        if(lead>=kMinLinearMatch) {
            // Linear-match node: the match bytes carry no values, step over them.
            pos+=lead-kMinLinearMatch+1;
            continue;
        }
        // Branch node; a lead byte of 0 means the length is in the next byte.
        if(lead==0) {
            lead=*pos++;
        }
        pos=findUniqueValueFromBranch(pos, lead+1, haveUniqueValue, uniqueValue);
        if(pos==nullptr) {
            return false;
        }
        haveUniqueValue=true;
    }
}
|
||||
|
||||
int32_t
|
||||
BytesTrie::getNextBytes(ByteSink &out) const {
|
||||
const uint8_t *pos=pos_;
|
||||
if(pos==nullptr) {
|
||||
return 0;
|
||||
}
|
||||
if(remainingMatchLength_>=0) {
|
||||
append(out, *pos); // Next byte of a pending linear-match node.
|
||||
return 1;
|
||||
}
|
||||
int32_t node=*pos++;
|
||||
if(node>=kMinValueLead) {
|
||||
if(node&kValueIsFinal) {
|
||||
return 0;
|
||||
} else {
|
||||
pos=skipValue(pos, node);
|
||||
node=*pos++;
|
||||
U_ASSERT(node<kMinValueLead);
|
||||
}
|
||||
}
|
||||
if(node<kMinLinearMatch) {
|
||||
if(node==0) {
|
||||
node=*pos++;
|
||||
}
|
||||
getNextBranchBytes(pos, ++node, out);
|
||||
return node;
|
||||
} else {
|
||||
// First byte of the linear-match node.
|
||||
append(out, *pos);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
|
||||
while(length>kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison byte
|
||||
getNextBranchBytes(jumpByDelta(pos), length>>1, out);
|
||||
length=length-(length>>1);
|
||||
pos=skipDelta(pos);
|
||||
}
|
||||
do {
|
||||
append(out, *pos++);
|
||||
pos=skipValue(pos);
|
||||
} while(--length>1);
|
||||
append(out, *pos);
|
||||
}
|
||||
|
||||
void
|
||||
BytesTrie::append(ByteSink &out, int c) {
|
||||
char ch=(char)c;
|
||||
out.Append(&ch, 1);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
512
engine/thirdparty/icu4c/common/bytestriebuilder.cpp
vendored
Normal file
512
engine/thirdparty/icu4c/common/bytestriebuilder.cpp
vendored
Normal file
|
|
@ -0,0 +1,512 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytestriebuilder.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/bytestriebuilder.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "uhash.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uassert.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* Note: This builder implementation stores (bytes, value) pairs with full copies
|
||||
* of the byte sequences, until the BytesTrie is built.
|
||||
* It might(!) take less memory if we collected the data in a temporary, dynamic trie.
|
||||
*/
|
||||
|
||||
class BytesTrieElement : public UMemory {
|
||||
public:
|
||||
// Use compiler's default constructor, initializes nothing.
|
||||
|
||||
void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
|
||||
|
||||
StringPiece getString(const CharString &strings) const {
|
||||
int32_t offset=stringOffset;
|
||||
int32_t length;
|
||||
if(offset>=0) {
|
||||
length=(uint8_t)strings[offset++];
|
||||
} else {
|
||||
offset=~offset;
|
||||
length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
|
||||
offset+=2;
|
||||
}
|
||||
return StringPiece(strings.data()+offset, length);
|
||||
}
|
||||
int32_t getStringLength(const CharString &strings) const {
|
||||
int32_t offset=stringOffset;
|
||||
if(offset>=0) {
|
||||
return (uint8_t)strings[offset];
|
||||
} else {
|
||||
offset=~offset;
|
||||
return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
|
||||
}
|
||||
}
|
||||
|
||||
char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
|
||||
|
||||
int32_t getValue() const { return value; }
|
||||
|
||||
int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
|
||||
|
||||
private:
|
||||
const char *data(const CharString &strings) const {
|
||||
int32_t offset=stringOffset;
|
||||
if(offset>=0) {
|
||||
++offset;
|
||||
} else {
|
||||
offset=~offset+2;
|
||||
}
|
||||
return strings.data()+offset;
|
||||
}
|
||||
|
||||
// If the stringOffset is non-negative, then the first strings byte contains
|
||||
// the string length.
|
||||
// If the stringOffset is negative, then the first two strings bytes contain
|
||||
// the string length (big-endian), and the offset needs to be bit-inverted.
|
||||
// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
|
||||
int32_t stringOffset;
|
||||
int32_t value;
|
||||
};
|
||||
|
||||
void
|
||||
BytesTrieElement::setTo(StringPiece s, int32_t val,
|
||||
CharString &strings, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
int32_t length=s.length();
|
||||
if(length>0xffff) {
|
||||
// Too long: We store the length in 1 or 2 bytes.
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
int32_t offset=strings.length();
|
||||
if(length>0xff) {
|
||||
offset=~offset;
|
||||
strings.append((char)(length>>8), errorCode);
|
||||
}
|
||||
strings.append((char)length, errorCode);
|
||||
stringOffset=offset;
|
||||
value=val;
|
||||
strings.append(s, errorCode);
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
|
||||
// TODO: add StringPiece::compare(), see ticket #8187
|
||||
StringPiece thisString=getString(strings);
|
||||
StringPiece otherString=other.getString(strings);
|
||||
int32_t lengthDiff=thisString.length()-otherString.length();
|
||||
int32_t commonLength;
|
||||
if(lengthDiff<=0) {
|
||||
commonLength=thisString.length();
|
||||
} else {
|
||||
commonLength=otherString.length();
|
||||
}
|
||||
int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
|
||||
return diff!=0 ? diff : lengthDiff;
|
||||
}
|
||||
|
||||
// Creates an empty builder. Allocates the shared string storage unless
// errorCode already indicates a failure; sets U_MEMORY_ALLOCATION_ERROR if
// that allocation returns nullptr.
BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
        : strings(nullptr), elements(nullptr), elementsCapacity(0), elementsLength(0),
          bytes(nullptr), bytesCapacity(0), bytesLength(0) {
    if(U_SUCCESS(errorCode)) {
        strings=new CharString();
        if(strings==nullptr) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
|
||||
|
||||
// Releases the string storage, the element array and the output byte buffer.
BytesTrieBuilder::~BytesTrieBuilder() {
    delete strings;     // allocated with new in the constructor
    delete[] elements;  // allocated with new[] in add()
    uprv_free(bytes);   // allocated with uprv_malloc()
}
|
||||
|
||||
// Adds a (byte sequence, value) pair.
// Sets U_NO_WRITE_PERMISSION if the trie bytes have already been built, and
// U_MEMORY_ALLOCATION_ERROR if the element array cannot grow.
// Returns *this.
BytesTrieBuilder &
BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return *this;
    }
    if(bytesLength>0) {
        // Cannot add elements after building.
        errorCode=U_NO_WRITE_PERMISSION;
        return *this;
    }
    if(elementsLength==elementsCapacity) {
        // Grow: start at 1024 elements, then quadruple.
        const int32_t grownCapacity=elementsCapacity==0 ? 1024 : 4*elementsCapacity;
        BytesTrieElement *grown=new BytesTrieElement[grownCapacity];
        if(grown==nullptr) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return *this;  // error instead of dereferencing null
        }
        if(elementsLength>0) {
            uprv_memcpy(grown, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
        }
        delete[] elements;
        elements=grown;
        elementsCapacity=grownCapacity;
    }
    BytesTrieElement &slot=elements[elementsLength];
    ++elementsLength;
    slot.setTo(s, value, *strings, errorCode);
    return *this;
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
compareElementStrings(const void *context, const void *left, const void *right) {
|
||||
const CharString *strings=static_cast<const CharString *>(context);
|
||||
const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
|
||||
const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
|
||||
return leftElement->compareStringTo(*rightElement, *strings);
|
||||
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
// Builds the trie bytes and wraps them in a new BytesTrie, which takes over
// the byte array. Returns nullptr (with errorCode set) on failure.
BytesTrie *
BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildBytes(buildOption, errorCode);
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    // The serialized trie occupies the last bytesLength bytes of the buffer.
    char *trieStart=bytes+(bytesCapacity-bytesLength);
    BytesTrie *result=new BytesTrie(bytes, trieStart);
    if(result==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    bytes=nullptr;  // The new trie now owns the array.
    bytesCapacity=0;
    return result;
}
|
||||
|
||||
// Builds the trie bytes and returns a view of them inside the builder's own
// buffer. Returns an empty StringPiece if errorCode indicates a failure.
StringPiece
BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildBytes(buildOption, errorCode);
    StringPiece view;
    if(U_FAILURE(errorCode)) {
        return view;
    }
    // The serialized trie occupies the last bytesLength bytes of the buffer.
    view.set(bytes+(bytesCapacity-bytesLength), bytesLength);
    return view;
}
|
||||
|
||||
void
|
||||
BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if(bytes!=nullptr && bytesLength>0) {
|
||||
// Already built.
|
||||
return;
|
||||
}
|
||||
if(bytesLength==0) {
|
||||
if(elementsLength==0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
|
||||
compareElementStrings, strings,
|
||||
false, // need not be a stable sort
|
||||
&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
// Duplicate strings are not allowed.
|
||||
StringPiece prev=elements[0].getString(*strings);
|
||||
for(int32_t i=1; i<elementsLength; ++i) {
|
||||
StringPiece current=elements[i].getString(*strings);
|
||||
if(prev==current) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
prev=current;
|
||||
}
|
||||
}
|
||||
// Create and byte-serialize the trie for the elements.
|
||||
bytesLength=0;
|
||||
int32_t capacity=strings->length();
|
||||
if(capacity<1024) {
|
||||
capacity=1024;
|
||||
}
|
||||
if(bytesCapacity<capacity) {
|
||||
uprv_free(bytes);
|
||||
bytes=static_cast<char *>(uprv_malloc(capacity));
|
||||
if(bytes==nullptr) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
bytesCapacity=0;
|
||||
return;
|
||||
}
|
||||
bytesCapacity=capacity;
|
||||
}
|
||||
StringTrieBuilder::build(buildOption, elementsLength, errorCode);
|
||||
if(bytes==nullptr) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
// Forgets all added elements and any built bytes so the builder can be reused.
// The element array and byte buffer allocations are kept.
BytesTrieBuilder &
BytesTrieBuilder::clear() {
    elementsLength=0;
    bytesLength=0;
    strings->clear();
    return *this;
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::getElementStringLength(int32_t i) const {
|
||||
return elements[i].getStringLength(*strings);
|
||||
}
|
||||
|
||||
char16_t
|
||||
BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
|
||||
return (uint8_t)elements[i].charAt(byteIndex, *strings);
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::getElementValue(int32_t i) const {
|
||||
return elements[i].getValue();
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
|
||||
const BytesTrieElement &firstElement=elements[first];
|
||||
const BytesTrieElement &lastElement=elements[last];
|
||||
int32_t minStringLength=firstElement.getStringLength(*strings);
|
||||
while(++byteIndex<minStringLength &&
|
||||
firstElement.charAt(byteIndex, *strings)==
|
||||
lastElement.charAt(byteIndex, *strings)) {}
|
||||
return byteIndex;
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
|
||||
int32_t length=0; // Number of different bytes at byteIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
char byte=elements[i++].charAt(byteIndex, *strings);
|
||||
while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
return length;
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
|
||||
do {
|
||||
char byte=elements[i++].charAt(byteIndex, *strings);
|
||||
while(byte==elements[i].charAt(byteIndex, *strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
return i;
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const {
|
||||
char b=(char)byte;
|
||||
while(b==elements[i].charAt(byteIndex, *strings)) {
|
||||
++i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
// Linear-match node over `len` bytes at `bytes`, followed by nextNode.
// Mixes a hash of the match bytes into the hash set up by the base class,
// using unsigned arithmetic.
BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
        : LinearMatchNode(len, nextNode), s(bytes) {
    const uint32_t baseHash=static_cast<uint32_t>(hash);
    const uint32_t bytesHash=static_cast<uint32_t>(ustr_hashCharsN(bytes, len));
    hash=static_cast<int32_t>(baseHash*37u+bytesHash);
}
|
||||
|
||||
bool
|
||||
BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return true;
|
||||
}
|
||||
if(!LinearMatchNode::operator==(other)) {
|
||||
return false;
|
||||
}
|
||||
const BTLinearMatchNode &o=static_cast<const BTLinearMatchNode &>(other);
|
||||
return 0==uprv_memcmp(s, o.s, length);
|
||||
}
|
||||
|
||||
void
|
||||
BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
|
||||
BytesTrieBuilder &b=static_cast<BytesTrieBuilder &>(builder);
|
||||
next->write(builder);
|
||||
b.write(s, length);
|
||||
offset=b.write(b.getMinLinearMatch()+length-1);
|
||||
}
|
||||
|
||||
// Creates a linear-match node for `length` bytes of element i starting at
// byteIndex, followed by nextNode. The node points into the builder's string
// storage rather than copying the bytes.
StringTrieBuilder::Node *
BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
                                        Node *nextNode) const {
    const char *matchBytes=elements[i].getString(*strings).data()+byteIndex;
    return new BTLinearMatchNode(matchBytes, length, nextNode);
}
|
||||
|
||||
// Makes sure the output buffer can hold `length` bytes, doubling its capacity
// as needed. The serialized data sits at the end of the buffer, so it is
// copied to the end of the new one.
// Returns false if the buffer is unavailable: an earlier allocation failed,
// the new allocation fails, or the doubled capacity would not fit in int32_t.
// In the latter two cases the buffer is released and bytes is set to nullptr,
// which is how buildBytes() detects the failure.
UBool
BytesTrieBuilder::ensureCapacity(int32_t length) {
    if(bytes==nullptr) {
        return false;  // previous memory allocation had failed
    }
    if(length>bytesCapacity) {
        int32_t newCapacity=bytesCapacity;
        char *newBytes=nullptr;
        UBool fits=true;
        do {
            if(newCapacity>0x7fffffff/2) {
                // Doubling would overflow int32_t (undefined behavior);
                // treat this like an allocation failure instead.
                fits=false;
                break;
            }
            newCapacity*=2;
        } while(newCapacity<=length);
        if(fits) {
            newBytes=static_cast<char *>(uprv_malloc(newCapacity));
        }
        if(newBytes==nullptr) {
            // unable to allocate memory
            uprv_free(bytes);
            bytes=nullptr;
            bytesCapacity=0;
            return false;
        }
        uprv_memcpy(newBytes+(newCapacity-bytesLength),
                    bytes+(bytesCapacity-bytesLength), bytesLength);
        uprv_free(bytes);
        bytes=newBytes;
        bytesCapacity=newCapacity;
    }
    return true;
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::write(int32_t byte) {
|
||||
int32_t newLength=bytesLength+1;
|
||||
if(ensureCapacity(newLength)) {
|
||||
bytesLength=newLength;
|
||||
bytes[bytesCapacity-bytesLength]=(char)byte;
|
||||
}
|
||||
return bytesLength;
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::write(const char *b, int32_t length) {
|
||||
int32_t newLength=bytesLength+length;
|
||||
if(ensureCapacity(newLength)) {
|
||||
bytesLength=newLength;
|
||||
uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
|
||||
}
|
||||
return bytesLength;
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
|
||||
return write(elements[i].getString(*strings).data()+byteIndex, length);
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
|
||||
if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
|
||||
return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
|
||||
}
|
||||
char intBytes[5];
|
||||
int32_t length=1;
|
||||
if(i<0 || i>0xffffff) {
|
||||
intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
|
||||
intBytes[1]=(char)((uint32_t)i>>24);
|
||||
intBytes[2]=(char)((uint32_t)i>>16);
|
||||
intBytes[3]=(char)((uint32_t)i>>8);
|
||||
intBytes[4]=(char)i;
|
||||
length=5;
|
||||
// } else if(i<=BytesTrie::kMaxOneByteValue) {
|
||||
// intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
|
||||
} else {
|
||||
if(i<=BytesTrie::kMaxTwoByteValue) {
|
||||
intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
|
||||
} else {
|
||||
if(i<=BytesTrie::kMaxThreeByteValue) {
|
||||
intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
|
||||
} else {
|
||||
intBytes[0]=(char)BytesTrie::kFourByteValueLead;
|
||||
intBytes[1]=(char)(i>>16);
|
||||
length=2;
|
||||
}
|
||||
intBytes[length++]=(char)(i>>8);
|
||||
}
|
||||
intBytes[length++]=(char)i;
|
||||
}
|
||||
intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
|
||||
return write(intBytes, length);
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
|
||||
int32_t offset=write(node);
|
||||
if(hasValue) {
|
||||
offset=writeValueAndFinal(value, false);
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
|
||||
int32_t i=bytesLength-jumpTarget;
|
||||
U_ASSERT(i>=0);
|
||||
if(i<=BytesTrie::kMaxOneByteDelta) {
|
||||
return write(i);
|
||||
} else {
|
||||
char intBytes[5];
|
||||
return write(intBytes, internalEncodeDelta(i, intBytes));
|
||||
}
|
||||
}
|
||||
|
||||
int32_t
|
||||
BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
|
||||
U_ASSERT(i>=0);
|
||||
if(i<=BytesTrie::kMaxOneByteDelta) {
|
||||
intBytes[0]=(char)i;
|
||||
return 1;
|
||||
}
|
||||
int32_t length=1;
|
||||
if(i<=BytesTrie::kMaxTwoByteDelta) {
|
||||
intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
|
||||
} else {
|
||||
if(i<=BytesTrie::kMaxThreeByteDelta) {
|
||||
intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
|
||||
} else {
|
||||
if(i<=0xffffff) {
|
||||
intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
|
||||
} else {
|
||||
intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
|
||||
intBytes[1]=(char)(i>>24);
|
||||
length=2;
|
||||
}
|
||||
intBytes[length++]=(char)(i>>16);
|
||||
}
|
||||
intBytes[length++]=(char)(i>>8);
|
||||
}
|
||||
intBytes[length++]=(char)i;
|
||||
return length;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
214
engine/thirdparty/icu4c/common/bytestrieiterator.cpp
vendored
Normal file
214
engine/thirdparty/icu4c/common/bytestrieiterator.cpp
vendored
Normal file
|
|
@ -0,0 +1,214 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytestrieiterator.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov03
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "charstr.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Iterates over all sequences of the trie whose serialized form starts at
// trieBytes. maxStringLength is stored as maxLength_; values >0 limit the
// length of the strings delivered.
// Sets U_MEMORY_ALLOCATION_ERROR if the helper objects cannot be allocated.
BytesTrie::Iterator::Iterator(const void *trieBytes, int32_t maxStringLength,
                              UErrorCode &errorCode)
        : bytes_(static_cast<const uint8_t *>(trieBytes)),
          pos_(bytes_), initialPos_(bytes_),
          remainingMatchLength_(-1), initialRemainingMatchLength_(-1),
          str_(nullptr), maxLength_(maxStringLength), value_(0), stack_(nullptr) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    // str_ and stack_ are pointers so that it's easy to turn bytestrie.h into
    // a public API header for which we would want it to depend only on
    // other public headers.
    // Unlike BytesTrie itself, its Iterator performs memory allocations anyway
    // via the CharString and UVector32 implementations, so this additional
    // cost is minimal.
    str_=new CharString();
    stack_=new UVector32(errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    if(str_==nullptr || stack_==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
}
|
||||
|
||||
// Iterates over all sequences reachable from the current state of `trie`.
// If the trie is in the middle of a linear-match node, the remaining match
// bytes (capped at maxLength_ when that is >0) are put into the string
// up front.
BytesTrie::Iterator::Iterator(const BytesTrie &trie, int32_t maxStringLength,
                              UErrorCode &errorCode)
        : bytes_(trie.bytes_), pos_(trie.pos_), initialPos_(trie.pos_),
          remainingMatchLength_(trie.remainingMatchLength_),
          initialRemainingMatchLength_(trie.remainingMatchLength_),
          str_(nullptr), maxLength_(maxStringLength), value_(0), stack_(nullptr) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    str_=new CharString();
    stack_=new UVector32(errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    if(str_==nullptr || stack_==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if(remainingMatchLength_<0) {
        return;  // Not inside a linear-match node.
    }
    // Pending linear-match node, append remaining bytes to str_.
    int32_t pendingBytes=remainingMatchLength_+1;  // The field holds the length minus 1.
    if(maxLength_>0 && pendingBytes>maxLength_) {
        pendingBytes=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
    }
    str_->append(reinterpret_cast<const char *>(pos_), pendingBytes, errorCode);
    pos_+=pendingBytes;
    remainingMatchLength_-=pendingBytes;
}
|
||||
|
||||
// Frees the string and stack helpers allocated by the constructors.
BytesTrie::Iterator::~Iterator() {
    delete str_;    // CharString holding the current byte sequence
    delete stack_;  // UVector32 holding saved branch states
}
|
||||
|
||||
// Returns the iterator to the state it had right after construction.
// The string is cut back to the bytes of an initially pending linear match
// (capped at maxLength_ when that is >0).
BytesTrie::Iterator &
BytesTrie::Iterator::reset() {
    int32_t keep=initialRemainingMatchLength_+1;  // Remaining match length.
    if(maxLength_>0 && keep>maxLength_) {
        keep=maxLength_;
    }
    str_->truncate(keep);
    pos_=initialPos_;
    pos_+=keep;
    remainingMatchLength_=initialRemainingMatchLength_-keep;
    stack_->setSize(0);
    return *this;
}
|
||||
|
||||
// True while there is a current position or saved branch state to resume.
UBool
BytesTrie::Iterator::hasNext() const {
    if(pos_!=nullptr) {
        return true;
    }
    return !stack_->isEmpty();
}
|
||||
|
||||
// Moves to the next (byte sequence, value) pair.
// Returns false if errorCode already indicates a failure or nothing is left
// to iterate. Otherwise returns true with the sequence in str_ and the value
// in value_; value_ is -1 when the sequence was cut off at maxLength_.
UBool
BytesTrie::Iterator::next(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return false;
    }
    const uint8_t *cursor=pos_;
    if(cursor==nullptr) {
        if(stack_->isEmpty()) {
            return false;
        }
        // Pop the state off the stack and continue with the next outbound edge of
        // the branch node.
        const int32_t top=stack_->size();
        const int32_t packed=stack_->elementAti(top-1);
        cursor=bytes_+stack_->elementAti(top-2);
        stack_->setSize(top-2);
        // Low 16 bits: string length to restore. High 16 bits: edges still to visit.
        str_->truncate(packed&0xffff);
        const int32_t edgesLeft=(int32_t)((uint32_t)packed>>16);
        if(edgesLeft>1) {
            cursor=branchNext(cursor, edgesLeft, errorCode);
            if(cursor==nullptr) {
                return true;  // Reached a final value.
            }
        } else {
            // Last edge: its byte is followed directly by the next node.
            const char lastEdgeByte=(char)*cursor;
            ++cursor;
            str_->append(lastEdgeByte, errorCode);
        }
    }
    if(remainingMatchLength_>=0) {
        // We only get here if we started in a pending linear-match node
        // with more than maxLength remaining bytes.
        return truncateAndStop();
    }
    for(;;) {
        int32_t node=*cursor++;
        const bool atMaxLength=maxLength_>0 && str_->length()==maxLength_;
        if(node>=kMinValueLead) {
            // Deliver value for the byte sequence so far.
            const bool isFinal=(node&kValueIsFinal)!=0;
            value_=readValue(cursor, node>>1);
            // Stop here if nothing follows or the string may not grow further.
            pos_=(isFinal || atMaxLength) ? nullptr : skipValue(cursor, node);
            return true;
        }
        if(atMaxLength) {
            return truncateAndStop();
        }
        if(node>=kMinLinearMatch) {
            // Linear-match node, append length bytes to str_.
            const int32_t matchLength=node-kMinLinearMatch+1;
            const char *matchBytes=reinterpret_cast<const char *>(cursor);
            if(maxLength_>0 && str_->length()+matchLength>maxLength_) {
                // Only part of the match fits: append that part and stop.
                str_->append(matchBytes, maxLength_-str_->length(), errorCode);
                return truncateAndStop();
            }
            str_->append(matchBytes, matchLength, errorCode);
            cursor+=matchLength;
            continue;
        }
        // Branch node; a lead byte of 0 means the length is in the next byte.
        if(node==0) {
            node=*cursor++;
        }
        cursor=branchNext(cursor, node+1, errorCode);
        if(cursor==nullptr) {
            return true;  // Reached a final value.
        }
    }
}
|
||||
|
||||
// Returns a view of the current byte sequence, or an empty StringPiece if the
// string helper was never allocated.
StringPiece
BytesTrie::Iterator::getString() const {
    if(str_==nullptr) {
        return StringPiece();
    }
    return str_->toStringPiece();
}
|
||||
|
||||
// Ends the current path: clears the position and marks the delivered string
// as having no value. Always returns true so callers can return it directly.
UBool
BytesTrie::Iterator::truncateAndStop() {
    value_=-1;  // no real value for str
    pos_=nullptr;
    return true;
}
|
||||
|
||||
// Branch node, needs to take the first outbound edge and push state for the rest.
|
||||
const uint8_t *
|
||||
BytesTrie::Iterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode) {
|
||||
while(length>kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison byte
|
||||
// Push state for the greater-or-equal edge.
|
||||
stack_->addElement((int32_t)(skipDelta(pos)-bytes_), errorCode);
|
||||
stack_->addElement(((length-(length>>1))<<16)|str_->length(), errorCode);
|
||||
// Follow the less-than edge.
|
||||
length>>=1;
|
||||
pos=jumpByDelta(pos);
|
||||
}
|
||||
// List of key-value pairs where values are either final values or jump deltas.
|
||||
// Read the first (key, value) pair.
|
||||
uint8_t trieByte=*pos++;
|
||||
int32_t node=*pos++;
|
||||
UBool isFinal=(UBool)(node&kValueIsFinal);
|
||||
int32_t value=readValue(pos, node>>1);
|
||||
pos=skipValue(pos, node);
|
||||
stack_->addElement((int32_t)(pos-bytes_), errorCode);
|
||||
stack_->addElement(((length-1)<<16)|str_->length(), errorCode);
|
||||
str_->append((char)trieByte, errorCode);
|
||||
if(isFinal) {
|
||||
pos_=nullptr;
|
||||
value_=value;
|
||||
return nullptr;
|
||||
} else {
|
||||
return pos+value;
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
594
engine/thirdparty/icu4c/common/caniter.cpp
vendored
Normal file
594
engine/thirdparty/icu4c/common/caniter.cpp
vendored
Normal file
|
|
@ -0,0 +1,594 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*****************************************************************************
|
||||
* Copyright (C) 1996-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/caniter.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "cmemory.h"
|
||||
#include "hash.h"
|
||||
#include "normalizer2impl.h"
|
||||
|
||||
/**
|
||||
* This class allows one to iterate through all the strings that are canonically equivalent to a given
|
||||
* string. For example, here are some sample results:
|
||||
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
||||
1: \u0041\u030A\u0064\u0307\u0327
|
||||
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
||||
2: \u0041\u030A\u0064\u0327\u0307
|
||||
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
|
||||
3: \u0041\u030A\u1E0B\u0327
|
||||
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
|
||||
4: \u0041\u030A\u1E11\u0307
|
||||
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
|
||||
5: \u00C5\u0064\u0307\u0327
|
||||
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
||||
6: \u00C5\u0064\u0327\u0307
|
||||
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
|
||||
7: \u00C5\u1E0B\u0327
|
||||
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
|
||||
8: \u00C5\u1E11\u0307
|
||||
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
|
||||
9: \u212B\u0064\u0307\u0327
|
||||
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
||||
10: \u212B\u0064\u0327\u0307
|
||||
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
|
||||
11: \u212B\u1E0B\u0327
|
||||
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
|
||||
12: \u212B\u1E11\u0307
|
||||
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
|
||||
*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
|
||||
* since it has not been optimized for that situation.
|
||||
*@author M. Davis
|
||||
*@draft
|
||||
*/
|
||||
|
||||
// public
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// TODO: add boilerplate methods.
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
|
||||
|
||||
|
||||
/**
|
||||
*@param source string to get results for
|
||||
*/
|
||||
CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
|
||||
pieces(nullptr),
|
||||
pieces_length(0),
|
||||
pieces_lengths(nullptr),
|
||||
current(nullptr),
|
||||
current_length(0),
|
||||
nfd(Normalizer2::getNFDInstance(status)),
|
||||
nfcImpl(Normalizer2Factory::getNFCImpl(status))
|
||||
{
|
||||
if(U_SUCCESS(status) && nfcImpl->ensureCanonIterData(status)) {
|
||||
setSource(sourceStr, status);
|
||||
}
|
||||
}
|
||||
|
||||
/** Destructor: hands all owned arrays to cleanPieces() for release. */
CanonicalIterator::~CanonicalIterator() {
    cleanPieces();
}
|
||||
|
||||
void CanonicalIterator::cleanPieces() {
|
||||
int32_t i = 0;
|
||||
if(pieces != nullptr) {
|
||||
for(i = 0; i < pieces_length; i++) {
|
||||
if(pieces[i] != nullptr) {
|
||||
delete[] pieces[i];
|
||||
}
|
||||
}
|
||||
uprv_free(pieces);
|
||||
pieces = nullptr;
|
||||
pieces_length = 0;
|
||||
}
|
||||
if(pieces_lengths != nullptr) {
|
||||
uprv_free(pieces_lengths);
|
||||
pieces_lengths = nullptr;
|
||||
}
|
||||
if(current != nullptr) {
|
||||
uprv_free(current);
|
||||
current = nullptr;
|
||||
current_length = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*@return gets the source: NOTE: it is the NFD form of source
|
||||
*/
|
||||
UnicodeString CanonicalIterator::getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the iterator so that one can start again from the beginning.
|
||||
*/
|
||||
void CanonicalIterator::reset() {
|
||||
done = false;
|
||||
for (int i = 0; i < current_length; ++i) {
|
||||
current[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*@return the next string that is canonically equivalent. The value null is returned when
|
||||
* the iteration is done.
|
||||
*/
|
||||
UnicodeString CanonicalIterator::next() {
|
||||
int32_t i = 0;
|
||||
|
||||
if (done) {
|
||||
buffer.setToBogus();
|
||||
return buffer;
|
||||
}
|
||||
|
||||
// delete old contents
|
||||
buffer.remove();
|
||||
|
||||
// construct return value
|
||||
|
||||
for (i = 0; i < pieces_length; ++i) {
|
||||
buffer.append(pieces[i][current[i]]);
|
||||
}
|
||||
//String result = buffer.toString(); // not needed
|
||||
|
||||
// find next value for next time
|
||||
|
||||
for (i = current_length - 1; ; --i) {
|
||||
if (i < 0) {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
current[i]++;
|
||||
if (current[i] < pieces_lengths[i]) break; // got sequence
|
||||
current[i] = 0;
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
*@param set the source string to iterate against. This allows the same iterator to be used
|
||||
* while changing the source string, saving object creation.
|
||||
*/
|
||||
void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
|
||||
int32_t list_length = 0;
|
||||
UChar32 cp = 0;
|
||||
int32_t start = 0;
|
||||
int32_t i = 0;
|
||||
UnicodeString *list = nullptr;
|
||||
|
||||
nfd->normalize(newSource, source, status);
|
||||
if(U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
done = false;
|
||||
|
||||
cleanPieces();
|
||||
|
||||
// catch degenerate case
|
||||
if (newSource.length() == 0) {
|
||||
pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
|
||||
pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
|
||||
pieces_length = 1;
|
||||
current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
|
||||
current_length = 1;
|
||||
if (pieces == nullptr || pieces_lengths == nullptr || current == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto CleanPartialInitialization;
|
||||
}
|
||||
current[0] = 0;
|
||||
pieces[0] = new UnicodeString[1];
|
||||
pieces_lengths[0] = 1;
|
||||
if (pieces[0] == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto CleanPartialInitialization;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
list = new UnicodeString[source.length()];
|
||||
if (list == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto CleanPartialInitialization;
|
||||
}
|
||||
|
||||
// i should initially be the number of code units at the
|
||||
// start of the string
|
||||
i = U16_LENGTH(source.char32At(0));
|
||||
// int32_t i = 1;
|
||||
// find the segments
|
||||
// This code iterates through the source string and
|
||||
// extracts segments that end up on a codepoint that
|
||||
// doesn't start any decompositions. (Analysis is done
|
||||
// on the NFD form - see above).
|
||||
for (; i < source.length(); i += U16_LENGTH(cp)) {
|
||||
cp = source.char32At(i);
|
||||
if (nfcImpl->isCanonSegmentStarter(cp)) {
|
||||
source.extract(start, i-start, list[list_length++]); // add up to i
|
||||
start = i;
|
||||
}
|
||||
}
|
||||
source.extract(start, i-start, list[list_length++]); // add last one
|
||||
|
||||
|
||||
// allocate the arrays, and find the strings that are CE to each segment
|
||||
pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
|
||||
pieces_length = list_length;
|
||||
pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
|
||||
current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
|
||||
current_length = list_length;
|
||||
if (pieces == nullptr || pieces_lengths == nullptr || current == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto CleanPartialInitialization;
|
||||
}
|
||||
|
||||
for (i = 0; i < current_length; i++) {
|
||||
current[i] = 0;
|
||||
}
|
||||
// for each segment, get all the combinations that can produce
|
||||
// it after NFD normalization
|
||||
for (i = 0; i < pieces_length; ++i) {
|
||||
//if (PROGRESS) printf("SEGMENT\n");
|
||||
pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
|
||||
}
|
||||
|
||||
delete[] list;
|
||||
return;
|
||||
// Common section to cleanup all local variables and reset object variables.
|
||||
CleanPartialInitialization:
|
||||
delete[] list;
|
||||
cleanPieces();
|
||||
}
|
||||
|
||||
/**
|
||||
* Dumb recursive implementation of permutation.
|
||||
* TODO: optimize
|
||||
* @param source the string to find permutations for
|
||||
* @return the results in a set.
|
||||
*/
|
||||
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status, int32_t depth) {
|
||||
if(U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
// To avoid infinity loop caused by permute, we limit the depth of recursive
|
||||
// call to permute and return U_UNSUPPORTED_ERROR.
|
||||
// We know in some unit test we need at least 4. Set to 8 just in case some
|
||||
// unforseen use cases.
|
||||
constexpr int32_t kPermuteDepthLimit = 8;
|
||||
if (depth > kPermuteDepthLimit) {
|
||||
status = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
//if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
|
||||
int32_t i = 0;
|
||||
|
||||
// optimization:
|
||||
// if zero or one character, just return a set with it
|
||||
// we check for length < 2 to keep from counting code points all the time
|
||||
if (source.length() <= 2 && source.countChar32() <= 1) {
|
||||
UnicodeString *toPut = new UnicodeString(source);
|
||||
/* test for nullptr */
|
||||
if (toPut == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
result->put(source, toPut, status);
|
||||
return;
|
||||
}
|
||||
|
||||
// otherwise iterate through the string, and recursively permute all the other characters
|
||||
UChar32 cp;
|
||||
Hashtable subpermute(status);
|
||||
if(U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
subpermute.setValueDeleter(uprv_deleteUObject);
|
||||
|
||||
for (i = 0; i < source.length(); i += U16_LENGTH(cp)) {
|
||||
cp = source.char32At(i);
|
||||
const UHashElement *ne = nullptr;
|
||||
int32_t el = UHASH_FIRST;
|
||||
UnicodeString subPermuteString = source;
|
||||
|
||||
// optimization:
|
||||
// if the character is canonical combining class zero,
|
||||
// don't permute it
|
||||
if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
|
||||
//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
|
||||
continue;
|
||||
}
|
||||
|
||||
subpermute.removeAll();
|
||||
|
||||
// see what the permutations of the characters before and after this one are
|
||||
//Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
|
||||
permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status, depth+1);
|
||||
/* Test for buffer overflows */
|
||||
if(U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
// The upper remove is destructive. The question is do we have to make a copy, or we don't care about the contents
|
||||
// of source at this point.
|
||||
|
||||
// prefix this character to all of them
|
||||
ne = subpermute.nextElement(el);
|
||||
while (ne != nullptr) {
|
||||
UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
|
||||
UnicodeString *chStr = new UnicodeString(cp);
|
||||
//test for nullptr
|
||||
if (chStr == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));
|
||||
//if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));
|
||||
result->put(*chStr, chStr, status);
|
||||
ne = subpermute.nextElement(el);
|
||||
}
|
||||
}
|
||||
//return result;
|
||||
}
|
||||
|
||||
// privates
|
||||
|
||||
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
|
||||
UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
|
||||
Hashtable result(status);
|
||||
Hashtable permutations(status);
|
||||
Hashtable basic(status);
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
result.setValueDeleter(uprv_deleteUObject);
|
||||
permutations.setValueDeleter(uprv_deleteUObject);
|
||||
basic.setValueDeleter(uprv_deleteUObject);
|
||||
|
||||
char16_t USeg[256];
|
||||
int32_t segLen = segment.extract(USeg, 256, status);
|
||||
getEquivalents2(&basic, USeg, segLen, status);
|
||||
|
||||
// now get all the permutations
|
||||
// add only the ones that are canonically equivalent
|
||||
// TODO: optimize by not permuting any class zero.
|
||||
|
||||
const UHashElement *ne = nullptr;
|
||||
int32_t el = UHASH_FIRST;
|
||||
//Iterator it = basic.iterator();
|
||||
ne = basic.nextElement(el);
|
||||
//while (it.hasNext())
|
||||
while (ne != nullptr) {
|
||||
//String item = (String) it.next();
|
||||
UnicodeString item = *((UnicodeString *)(ne->value.pointer));
|
||||
|
||||
permutations.removeAll();
|
||||
permute(item, CANITER_SKIP_ZEROES, &permutations, status);
|
||||
const UHashElement *ne2 = nullptr;
|
||||
int32_t el2 = UHASH_FIRST;
|
||||
//Iterator it2 = permutations.iterator();
|
||||
ne2 = permutations.nextElement(el2);
|
||||
//while (it2.hasNext())
|
||||
while (ne2 != nullptr) {
|
||||
//String possible = (String) it2.next();
|
||||
//UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
|
||||
UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
|
||||
UnicodeString attempt;
|
||||
nfd->normalize(possible, attempt, status);
|
||||
|
||||
// TODO: check if operator == is semanticaly the same as attempt.equals(segment)
|
||||
if (attempt==segment) {
|
||||
//if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));
|
||||
// TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
|
||||
result.put(possible, new UnicodeString(possible), status); //add(possible);
|
||||
} else {
|
||||
//if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));
|
||||
}
|
||||
|
||||
ne2 = permutations.nextElement(el2);
|
||||
}
|
||||
ne = basic.nextElement(el);
|
||||
}
|
||||
|
||||
/* Test for buffer overflows */
|
||||
if(U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
// convert into a String[] to clean up storage
|
||||
//String[] finalResult = new String[result.size()];
|
||||
UnicodeString *finalResult = nullptr;
|
||||
int32_t resultCount;
|
||||
if((resultCount = result.count()) != 0) {
|
||||
finalResult = new UnicodeString[resultCount];
|
||||
if (finalResult == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
else {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
//result.toArray(finalResult);
|
||||
result_len = 0;
|
||||
el = UHASH_FIRST;
|
||||
ne = result.nextElement(el);
|
||||
while(ne != nullptr) {
|
||||
finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer));
|
||||
ne = result.nextElement(el);
|
||||
}
|
||||
|
||||
|
||||
return finalResult;
|
||||
}
|
||||
|
||||
/**
 * Adds the segment itself to fillinResult, plus every string obtained by
 * replacing a matching run starting at some position with a code point
 * from that position's canonical start set (see extract()).
 *
 * @param fillinResult hashtable receiving the strings (key and value are the
 *                     same text; values are heap-allocated UnicodeStrings)
 * @param segment      UTF-16 text of the segment
 * @param segLen       number of code units in segment
 * @param status       ICU error code (in/out)
 * @return fillinResult on success, nullptr on failure
 */
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const char16_t *segment, int32_t segLen, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    // The segment is always one of its own equivalents.
    UnicodeString whole(segment, segLen);
    fillinResult->put(whole, new UnicodeString(whole), status);

    UnicodeSet starts;

    // cycle through all the characters
    UChar32 cp;
    for (int32_t pos = 0; pos < segLen; pos += U16_LENGTH(cp)) {
        // see if any character is at the start of some decomposition
        U16_GET(segment, 0, pos, segLen, cp);
        if (!nfcImpl->getCanonStartSet(cp, starts)) {
            continue;
        }
        // if so, see which decompositions match
        UnicodeSetIterator candidates(starts);
        while (candidates.next()) {
            UChar32 composite = candidates.getCodepoint();
            Hashtable remainder(status);
            remainder.setValueDeleter(uprv_deleteUObject);
            if (extract(&remainder, composite, segment, segLen, pos, status) == nullptr) {
                continue;
            }

            // there were some matches, so add all the possibilities to the set.
            UnicodeString prefix(segment, pos);
            prefix += composite;

            int32_t cursor = UHASH_FIRST;
            for (const UHashElement *entry = remainder.nextElement(cursor);
                    entry != nullptr;
                    entry = remainder.nextElement(cursor)) {
                const UnicodeString &tail = *((UnicodeString *)(entry->value.pointer));
                UnicodeString *toAdd = new UnicodeString(prefix);
                if (toAdd == nullptr) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return nullptr;
                }
                *toAdd += tail;
                fillinResult->put(*toAdd, toAdd, status);
            }
        }
    }

    /* Test for buffer overflows */
    return U_FAILURE(status) ? nullptr : fillinResult;
}
|
||||
|
||||
/**
|
||||
* See if the decomposition of cp2 is at segment starting at segmentPos
|
||||
* (with canonical rearrangement!)
|
||||
* If so, take the remainder, and return the equivalents
|
||||
*/
|
||||
Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const char16_t *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
|
||||
//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
|
||||
//if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
|
||||
//if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
UnicodeString temp(comp);
|
||||
int32_t inputLen=temp.length();
|
||||
UnicodeString decompString;
|
||||
nfd->normalize(temp, decompString, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (decompString.isBogus()) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
const char16_t *decomp=decompString.getBuffer();
|
||||
int32_t decompLen=decompString.length();
|
||||
|
||||
// See if it matches the start of segment (at segmentPos)
|
||||
UBool ok = false;
|
||||
UChar32 cp;
|
||||
int32_t decompPos = 0;
|
||||
UChar32 decompCp;
|
||||
U16_NEXT(decomp, decompPos, decompLen, decompCp);
|
||||
|
||||
int32_t i = segmentPos;
|
||||
while(i < segLen) {
|
||||
U16_NEXT(segment, i, segLen, cp);
|
||||
|
||||
if (cp == decompCp) { // if equal, eat another cp from decomp
|
||||
|
||||
//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
|
||||
|
||||
if (decompPos == decompLen) { // done, have all decomp characters!
|
||||
temp.append(segment+i, segLen-i);
|
||||
ok = true;
|
||||
break;
|
||||
}
|
||||
U16_NEXT(decomp, decompPos, decompLen, decompCp);
|
||||
} else {
|
||||
//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
|
||||
|
||||
// brute force approach
|
||||
temp.append(cp);
|
||||
|
||||
/* TODO: optimize
|
||||
// since we know that the classes are monotonically increasing, after zero
|
||||
// e.g. 0 5 7 9 0 3
|
||||
// we can do an optimization
|
||||
// there are only a few cases that work: zero, less, same, greater
|
||||
// if both classes are the same, we fail
|
||||
// if the decomp class < the segment class, we fail
|
||||
|
||||
segClass = getClass(cp);
|
||||
if (decompClass <= segClass) return null;
|
||||
*/
|
||||
}
|
||||
}
|
||||
if (!ok)
|
||||
return nullptr; // we failed, characters left over
|
||||
|
||||
//if (PROGRESS) printf("Matches\n");
|
||||
|
||||
if (inputLen == temp.length()) {
|
||||
fillinResult->put(UnicodeString(), new UnicodeString(), status);
|
||||
return fillinResult; // succeed, but no remainder
|
||||
}
|
||||
|
||||
// brute force approach
|
||||
// check to make sure result is canonically equivalent
|
||||
UnicodeString trial;
|
||||
nfd->normalize(temp, trial, status);
|
||||
if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
97
engine/thirdparty/icu4c/common/capi_helper.h
vendored
Normal file
97
engine/thirdparty/icu4c/common/capi_helper.h
vendored
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
// © 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#ifndef __CAPI_HELPER_H__
|
||||
#define __CAPI_HELPER_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* An internal helper class to help convert between C and C++ APIs.
|
||||
*/
|
||||
template<typename CType, typename CPPType, int32_t kMagic>
|
||||
class IcuCApiHelper {
|
||||
public:
|
||||
/**
|
||||
* Convert from the C type to the C++ type (const version).
|
||||
*/
|
||||
static const CPPType* validate(const CType* input, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Convert from the C type to the C++ type (non-const version).
|
||||
*/
|
||||
static CPPType* validate(CType* input, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Convert from the C++ type to the C type (const version).
|
||||
*/
|
||||
const CType* exportConstForC() const;
|
||||
|
||||
/**
|
||||
* Convert from the C++ type to the C type (non-const version).
|
||||
*/
|
||||
CType* exportForC();
|
||||
|
||||
/**
|
||||
* Invalidates the object.
|
||||
*/
|
||||
~IcuCApiHelper();
|
||||
|
||||
private:
|
||||
/**
|
||||
* While the object is valid, fMagic equals kMagic.
|
||||
*/
|
||||
int32_t fMagic = kMagic;
|
||||
};
|
||||
|
||||
|
||||
template<typename CType, typename CPPType, int32_t kMagic>
|
||||
const CPPType*
|
||||
IcuCApiHelper<CType, CPPType, kMagic>::validate(const CType* input, UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (input == nullptr) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
auto* impl = reinterpret_cast<const CPPType*>(input);
|
||||
if (static_cast<const IcuCApiHelper<CType, CPPType, kMagic>*>(impl)->fMagic != kMagic) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
return impl;
|
||||
}
|
||||
|
||||
template<typename CType, typename CPPType, int32_t kMagic>
|
||||
CPPType*
|
||||
IcuCApiHelper<CType, CPPType, kMagic>::validate(CType* input, UErrorCode& status) {
|
||||
auto* constInput = static_cast<const CType*>(input);
|
||||
auto* validated = validate(constInput, status);
|
||||
return const_cast<CPPType*>(validated);
|
||||
}
|
||||
|
||||
template<typename CType, typename CPPType, int32_t kMagic>
|
||||
const CType*
|
||||
IcuCApiHelper<CType, CPPType, kMagic>::exportConstForC() const {
|
||||
return reinterpret_cast<const CType*>(static_cast<const CPPType*>(this));
|
||||
}
|
||||
|
||||
template<typename CType, typename CPPType, int32_t kMagic>
|
||||
CType*
|
||||
IcuCApiHelper<CType, CPPType, kMagic>::exportForC() {
|
||||
return reinterpret_cast<CType*>(static_cast<CPPType*>(this));
|
||||
}
|
||||
|
||||
template<typename CType, typename CPPType, int32_t kMagic>
|
||||
IcuCApiHelper<CType, CPPType, kMagic>::~IcuCApiHelper() {
|
||||
// head off application errors by preventing use of of deleted objects.
|
||||
fMagic = 0;
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __CAPI_HELPER_H__
|
||||
427
engine/thirdparty/icu4c/common/characterproperties.cpp
vendored
Normal file
427
engine/thirdparty/icu4c/common/characterproperties.cpp
vendored
Normal file
|
|
@ -0,0 +1,427 @@
|
|||
// © 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// characterproperties.cpp
|
||||
// created: 2018sep03 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucpmap.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "cmemory.h"
|
||||
#include "emojiprops.h"
|
||||
#include "mutex.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "uassert.h"
|
||||
#include "ubidi_props.h"
|
||||
#include "ucase.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "umutex.h"
|
||||
#include "uprops.h"
|
||||
|
||||
using icu::LocalPointer;
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
using icu::Normalizer2Factory;
|
||||
using icu::Normalizer2Impl;
|
||||
#endif
|
||||
using icu::UInitOnce;
|
||||
using icu::UnicodeSet;
|
||||
|
||||
namespace {
|
||||
|
||||
UBool U_CALLCONV characterproperties_cleanup();
|
||||
|
||||
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
|
||||
|
||||
struct Inclusion {
|
||||
UnicodeSet *fSet = nullptr;
|
||||
UInitOnce fInitOnce {};
|
||||
};
|
||||
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
|
||||
|
||||
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
|
||||
|
||||
UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
|
||||
|
||||
icu::UMutex cpMutex;
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Inclusions list
|
||||
//----------------------------------------------------------------
|
||||
|
||||
// USetAdder implementation
|
||||
// Does not use uset.h to reduce code dependencies
|
||||
void U_CALLCONV
|
||||
_set_add(USet *set, UChar32 c) {
|
||||
((UnicodeSet *)set)->add(c);
|
||||
}
|
||||
|
||||
void U_CALLCONV
|
||||
_set_addRange(USet *set, UChar32 start, UChar32 end) {
|
||||
((UnicodeSet *)set)->add(start, end);
|
||||
}
|
||||
|
||||
void U_CALLCONV
|
||||
_set_addString(USet *set, const char16_t *str, int32_t length) {
|
||||
((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
|
||||
}
|
||||
|
||||
/**
 * Cleanup function registered with ucln_common_registerCleanup(): deletes
 * every cached inclusion set and resets its init-once guard, deletes the
 * cached binary-property sets and closes the cached code point tries.
 *
 * @return always true
 */
UBool U_CALLCONV characterproperties_cleanup() {
    for (Inclusion &slot: gInclusions) {
        delete slot.fSet;
        slot.fSet = nullptr;
        slot.fInitOnce.reset();
    }
    for (UnicodeSet *&cachedSet: sets) {
        delete cachedSet;
        cachedSet = nullptr;
    }
    for (UCPMap *&cachedMap: maps) {
        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
        cachedMap = nullptr;
    }
    return true;
}
|
||||
|
||||
/**
 * Builds the inclusion set for one property source and stores it in
 * gInclusions[src]. The set is filled through a USetAdder by the
 * *_addPropertyStarts function(s) that belong to the source.
 *
 * This function is invoked only via umtx_initOnce().
 *
 * @param src       the property source; UPROPS_SRC_NONE and unknown values
 *                  yield U_INTERNAL_PROGRAM_ERROR
 * @param errorCode ICU error code (in/out); nothing is cached on failure
 */
void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
    if (src == UPROPS_SRC_NONE) {
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        return;
    }
    U_ASSERT(gInclusions[src].fSet == nullptr);

    LocalPointer<UnicodeSet> starts(new UnicodeSet());
    if (starts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    USetAdder adder = {
        (USet *)starts.getAlias(),
        _set_add,
        _set_addRange,
        _set_addString,
        nullptr, // don't need remove()
        nullptr // don't need removeRange()
    };

    switch(src) {
    case UPROPS_SRC_CHAR:
        uchar_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_PROPSVEC:
        upropsvec_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_CHAR_AND_PROPSVEC:
        uchar_addPropertyStarts(&adder, &errorCode);
        upropsvec_addPropertyStarts(&adder, &errorCode);
        break;
#if !UCONFIG_NO_NORMALIZATION
    case UPROPS_SRC_CASE_AND_NORM: {
        const Normalizer2Impl *nfc=Normalizer2Factory::getNFCImpl(errorCode);
        if(U_SUCCESS(errorCode)) {
            nfc->addPropertyStarts(&adder, errorCode);
        }
        // Called whether or not the normalizer step succeeded.
        ucase_addPropertyStarts(&adder, &errorCode);
        break;
    }
    case UPROPS_SRC_NFC: {
        const Normalizer2Impl *nfc=Normalizer2Factory::getNFCImpl(errorCode);
        if(U_SUCCESS(errorCode)) {
            nfc->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    case UPROPS_SRC_NFKC: {
        const Normalizer2Impl *nfkc=Normalizer2Factory::getNFKCImpl(errorCode);
        if(U_SUCCESS(errorCode)) {
            nfkc->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    case UPROPS_SRC_NFKC_CF: {
        const Normalizer2Impl *nfkcCf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
        if(U_SUCCESS(errorCode)) {
            nfkcCf->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    case UPROPS_SRC_NFC_CANON_ITER: {
        const Normalizer2Impl *nfc=Normalizer2Factory::getNFCImpl(errorCode);
        if(U_SUCCESS(errorCode)) {
            nfc->addCanonIterPropertyStarts(&adder, errorCode);
        }
        break;
    }
#endif
    case UPROPS_SRC_CASE:
        ucase_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_BIDI:
        ubidi_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_INPC:
    case UPROPS_SRC_INSC:
    case UPROPS_SRC_VO:
    case UPROPS_SRC_ID_COMPAT_MATH:
        uprops_addPropertyStarts(src, &adder, &errorCode);
        break;
    case UPROPS_SRC_EMOJI: {
        const icu::EmojiProps *emoji = icu::EmojiProps::getSingleton(errorCode);
        if (U_SUCCESS(errorCode)) {
            emoji->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    case UPROPS_SRC_IDSU:
        // New in Unicode 15.1 for just two characters.
        adder.add(adder.set, 0x2FFE);
        adder.add(adder.set, 0x2FFF + 1);
        break;
    default:
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        break;
    }

    if (U_FAILURE(errorCode)) {
        return;
    }
    if (starts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Compact for caching.
    starts->compact();
    gInclusions[src].fSet = starts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
|
||||
|
||||
/**
 * Returns the cached inclusion set for a property source, building it on
 * first use through umtx_initOnce() and initInclusion().
 *
 * @param src       the property source; values outside
 *                  [0, UPROPS_SRC_COUNT) yield U_ILLEGAL_ARGUMENT_ERROR
 * @param errorCode ICU error code (in/out)
 * @return the set stored in the cache slot, or nullptr when errorCode was
 *         already a failure or src is out of range
 */
const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool inRange = 0 <= src && src < UPROPS_SRC_COUNT;
    if (!inRange) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Inclusion &slot = gInclusions[src];
    umtx_initOnce(slot.fInitOnce, &initInclusion, src, errorCode);
    return slot.fSet;
}
|
||||
|
||||
/**
 * Builds the inclusion set for one integer-valued property and stores it in
 * the gInclusions slot at UPROPS_SRC_COUNT + (prop - UCHAR_INT_START).
 *
 * Walks every code point of the property source's inclusion set and records
 * each one at which u_getIntPropertyValue() differs from the value seen at
 * the previously visited code point.
 *
 * This function is invoked only via umtx_initOnce().
 *
 * @param prop      the integer property
 * @param errorCode ICU error code (in/out); nothing is cached on failure
 */
void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    int32_t slotIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    U_ASSERT(gInclusions[slotIndex].fSet == nullptr);

    UPropertySource src = uprops_getSource(prop);
    const UnicodeSet *sourceStarts = getInclusionsForSource(src, errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }

    // The result set starts out containing code point 0.
    LocalPointer<UnicodeSet> valueStarts(new UnicodeSet(0, 0));
    if (valueStarts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    int32_t lastValue = 0;
    const int32_t rangeCount = sourceStarts->getRangeCount();
    for (int32_t range = 0; range < rangeCount; ++range) {
        UChar32 c = sourceStarts->getRangeStart(range);
        const UChar32 last = sourceStarts->getRangeEnd(range);
        while (c <= last) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            int32_t value = u_getIntPropertyValue(c, prop);
            if (value != lastValue) {
                valueStarts->add(c);
                lastValue = value;
            }
            ++c;
        }
    }

    if (valueStarts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Compact for caching.
    valueStarts->compact();
    gInclusions[slotIndex].fSet = valueStarts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Returns the inclusion set to iterate over when enumerating the values of prop.
 *
 * Integer properties (UCHAR_INT_START <= prop < UCHAR_INT_LIMIT) get their own
 * lazily built set; every other property uses the set of its data source.
 *
 * @param prop      the property
 * @param errorCode in/out error code
 * @return the cached set pointer, or nullptr if errorCode already indicated failure
 */
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
        UProperty prop, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        UPropertySource source = uprops_getSource(prop);
        return getInclusionsForSource(source, errorCode);
    }
    int32_t slot = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    Inclusion &entry = gInclusions[slot];
    umtx_initOnce(entry.fInitOnce, &initIntPropInclusion, prop, errorCode);
    return entry.fSet;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Builds a new frozen UnicodeSet holding everything that has the given binary property.
 *
 * For properties in [UCHAR_BASIC_EMOJI, UCHAR_RGI_EMOJI] the strings are added through
 * EmojiProps::addStrings(); all of those except UCHAR_BASIC_EMOJI and UCHAR_RGI_EMOJI
 * consist only of strings and return right away. For everything else the inclusion set
 * of the property is scanned with u_hasBinaryProperty() and maximal runs are added.
 *
 * @param property  the binary property
 * @param errorCode in/out error code
 * @return a new set owned by the caller, or nullptr on failure
 */
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    LocalPointer<UnicodeSet> set(new UnicodeSet());
    if (set.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
        // property of strings
        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
        if (U_FAILURE(errorCode)) { return nullptr; }
        USetAdder sa = {
            (USet *)set.getAlias(),
            _set_add,
            _set_addRange,
            _set_addString,
            nullptr, // don't need remove()
            nullptr // don't need removeRange()
        };
        ep->addStrings(&sa, property, errorCode);
        // Without this check a partially filled set would be frozen, returned,
        // and then cached by the caller even though errorCode reports a failure.
        if (U_FAILURE(errorCode)) { return nullptr; }
        if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
            // property of _only_ strings
            set->freeze();
            return set.orphan();
        }
    }

    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // Start of the current run of code points that have the property, or -1 if none is open.
    UChar32 startHasProperty = -1;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
            if (u_hasBinaryProperty(c, property)) {
                if (startHasProperty < 0) {
                    // Transition from false to true.
                    startHasProperty = c;
                }
            } else if (startHasProperty >= 0) {
                // Transition from true to false.
                set->add(startHasProperty, c - 1);
                startHasProperty = -1;
            }
        }
    }
    if (startHasProperty >= 0) {
        // The last run is still open: it extends to the end of the code point range.
        set->add(startHasProperty, 0x10FFFF);
    }
    set->freeze();
    return set.orphan();
}
|
||||
|
||||
/**
 * Builds an immutable code point map with the values of an integer property.
 *
 * The mutable trie is opened with nullValue for both value arguments
 * (USCRIPT_UNKNOWN for UCHAR_SCRIPT, otherwise 0). Runs of equal
 * u_getIntPropertyValue() results are written with umutablecptrie_setRange();
 * runs whose value equals nullValue are skipped because the trie already
 * starts out with that value.
 *
 * @param property  the integer property
 * @param errorCode in/out error code
 * @return the result of umutablecptrie_buildImmutable(), or nullptr on an earlier failure
 */
UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
    icu::LocalUMutableCPTriePointer mutableTrie(
        umutablecptrie_open(nullValue, nullValue, &errorCode));
    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    UChar32 start = 0;
    uint32_t value = nullValue;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            uint32_t nextValue = u_getIntPropertyValue(c, property);
            if (value != nextValue) {
                if (value != nullValue) {
                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
                }
                start = c;
                value = nextValue;
            }
        }
    }
    // Write the final run using the same test as inside the loop. Comparing against
    // a literal 0 here would drop a trailing run of value 0 whenever nullValue is
    // not 0 (UCHAR_SCRIPT), leaving those code points at nullValue.
    if (value != nullValue) {
        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
    }

    UCPTrieType type;
    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
        type = UCPTRIE_TYPE_FAST;
    } else {
        type = UCPTRIE_TYPE_SMALL;
    }
    // Pick the narrowest value width that can hold the property's maximum value.
    UCPTrieValueWidth valueWidth;
    // TODO: UCharacterProperty.IntProperty
    int32_t max = u_getIntPropertyMaxValue(property);
    if (max <= 0xff) {
        valueWidth = UCPTRIE_VALUE_BITS_8;
    } else if (max <= 0xffff) {
        valueWidth = UCPTRIE_VALUE_BITS_16;
    } else {
        valueWidth = UCPTRIE_VALUE_BITS_32;
    }
    return reinterpret_cast<UCPMap *>(
        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Returns the cached set for a binary property, creating it with makeSet()
 * on first use while holding cpMutex.
 *
 * @param property  binary property; must be in [0, UCHAR_BINARY_LIMIT)
 * @param errorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for a bad property
 * @return the cached set, or nullptr on failure
 */
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isBinaryProperty = 0 <= property && property < UCHAR_BINARY_LIMIT;
    if (!isBinaryProperty) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    if (sets[property] == nullptr) {
        sets[property] = makeSet(property, errorCode);
    }
    return sets[property];
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/**
 * C API wrapper around CharacterProperties::getBinaryPropertySet().
 *
 * @param property   the binary property
 * @param pErrorCode in/out error code
 * @return the set converted with toUSet(), or nullptr if *pErrorCode indicates failure
 */
U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
    const UnicodeSet *result = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    return result->toUSet();
}
|
||||
|
||||
/**
 * Returns the cached code point map for an integer property, creating it with
 * makeMap() on first use while holding cpMutex.
 *
 * @param property   integer property; must be in [UCHAR_INT_START, UCHAR_INT_LIMIT)
 * @param pErrorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for a bad property
 * @return the cached map, or nullptr on failure
 */
U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    const int32_t index = property - UCHAR_INT_START;
    if (maps[index] == nullptr) {
        maps[index] = makeMap(property, *pErrorCode);
    }
    return maps[index];
}
|
||||
100
engine/thirdparty/icu4c/common/chariter.cpp
vendored
Normal file
100
engine/thirdparty/icu4c/common/chariter.cpp
vendored
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/chariter.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Out-of-line destructor; there is nothing to release.
ForwardCharacterIterator::~ForwardCharacterIterator() {
}

// Default constructor: only initializes the UObject base.
ForwardCharacterIterator::ForwardCharacterIterator()
        : UObject() {
}

// Copy constructor: only copies the UObject base.
ForwardCharacterIterator::ForwardCharacterIterator(const ForwardCharacterIterator &other)
        : UObject(other) {
}
|
||||
|
||||
|
||||
// Default constructor: an iterator over empty text, with all indexes at 0.
CharacterIterator::CharacterIterator()
        : textLength(0),
          pos(0),
          begin(0),
          end(0) {
}
|
||||
|
||||
// Iterates over [0, length) starting at position 0.
// A negative length is treated as 0.
CharacterIterator::CharacterIterator(int32_t length)
        : textLength(length),
          pos(0),
          begin(0),
          end(length) {
    if (length < 0) {
        textLength = 0;
        end = 0;
    }
}
|
||||
|
||||
// Iterates over [0, length) starting at the given position.
// A negative length is treated as 0, and the position is pinned to [0, end].
CharacterIterator::CharacterIterator(int32_t length, int32_t position)
        : textLength(length),
          pos(position),
          begin(0),
          end(length) {
    if (length < 0) {
        textLength = 0;
        end = 0;
    }
    // Pin the starting position into the iteration range.
    if (position < 0) {
        pos = 0;
    } else if (position > end) {
        pos = end;
    }
}
|
||||
|
||||
// Iterates over [textBegin, textEnd) of a text of the given length, starting at position.
// Each value is pinned in turn: length to >= 0, begin to [0, textLength],
// end to [begin, textLength], and pos to [begin, end].
CharacterIterator::CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position)
        : textLength(length),
          pos(position),
          begin(textBegin),
          end(textEnd) {
    if (length < 0) {
        textLength = 0;
    }

    if (textBegin < 0) {
        begin = 0;
    } else if (textBegin > textLength) {
        begin = textLength;
    }

    if (textEnd < begin) {
        end = begin;
    } else if (textEnd > textLength) {
        end = textLength;
    }

    if (position < begin) {
        pos = begin;
    } else if (position > end) {
        pos = end;
    }
}
|
||||
|
||||
// Out-of-line destructor; there is nothing to release.
CharacterIterator::~CharacterIterator() {
}

// Copy constructor: copies the base class and the four index fields as they are.
CharacterIterator::CharacterIterator(const CharacterIterator &that)
        : ForwardCharacterIterator(that),
          textLength(that.textLength),
          pos(that.pos),
          begin(that.begin),
          end(that.end) {
}
|
||||
|
||||
// Copy assignment: assigns the base class part, then the four index fields.
CharacterIterator &
CharacterIterator::operator=(const CharacterIterator &that) {
    ForwardCharacterIterator::operator=(that);

    // Iteration range and text length.
    begin = that.begin;
    end = that.end;
    textLength = that.textLength;
    // Current position.
    pos = that.pos;

    return *this;
}
|
||||
|
||||
// implementing first[32]PostInc() directly in a subclass should be faster
|
||||
// but these implementations make subclassing a little easier
|
||||
char16_t
|
||||
CharacterIterator::firstPostInc() {
|
||||
setToStart();
|
||||
return nextPostInc();
|
||||
}
|
||||
|
||||
// Convenience default: rewinds with setToStart(), then returns next32PostInc().
UChar32
CharacterIterator::first32PostInc() {
    setToStart();
    const UChar32 first = next32PostInc();
    return first;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
273
engine/thirdparty/icu4c/common/charstr.cpp
vendored
Normal file
273
engine/thirdparty/icu4c/common/charstr.cpp
vendored
Normal file
|
|
@ -0,0 +1,273 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: charstr.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010may19
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uinvchar.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Move constructor: takes over src's buffer and length.
CharString::CharString(CharString&& src) noexcept
        : buffer(std::move(src.buffer)),
          len(src.len) {
    // not strictly necessary because we make no guarantees on the source string
    src.len = 0;
}
|
||||
|
||||
// Move assignment: takes over src's buffer and length.
CharString& CharString::operator=(CharString&& src) noexcept {
    buffer = std::move(src.buffer);
    len = src.len;
    // not strictly necessary because we make no guarantees on the source string
    src.len = 0;
    return *this;
}
|
||||
|
||||
// Returns a uprv_malloc()ed copy of the string including its terminating NUL,
// or nullptr (with U_MEMORY_ALLOCATION_ERROR) if the allocation fails.
char *CharString::cloneData(UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    char *copy = static_cast<char *>(uprv_malloc(len + 1));
    if (copy != nullptr) {
        // len + 1: the terminating NUL is copied as well.
        uprv_memcpy(copy, buffer.getAlias(), len + 1);
    } else {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
    return copy;
}
|
||||
|
||||
// Copies the string into dest if it fits, and lets u_terminateChars() handle
// NUL termination and the resulting warning or overflow code.
// Returns len when failing early on a prior error or an illegal argument.
int32_t CharString::extract(char *dest, int32_t capacity, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return len;
    }
    const bool missingDest = capacity > 0 && dest == nullptr;
    if (capacity < 0 || missingDest) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return len;
    }
    const char *chars = buffer.getAlias();
    const bool fits = 0 < len && len <= capacity;
    if (fits && chars != dest) {
        uprv_memcpy(dest, chars, len);
    }
    return u_terminateChars(dest, capacity, len, &errorCode);
}
|
||||
|
||||
// Replaces this string's contents with a copy of s.
// Does nothing on a prior error, on self-assignment, or if the buffer cannot grow.
CharString &CharString::copyFrom(const CharString &s, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || this == &s) {
        return *this;
    }
    if (!ensureCapacity(s.len + 1, 0, errorCode)) {
        return *this;
    }
    len = s.len;
    // len + 1: the terminating NUL is copied as well.
    uprv_memcpy(buffer.getAlias(), s.buffer.getAlias(), len + 1);
    return *this;
}
|
||||
|
||||
// Scans backwards from the end; returns the index of the last c, or -1 if there is none.
int32_t CharString::lastIndexOf(char c) const {
    for (int32_t index = len - 1; index >= 0; --index) {
        if (buffer[index] == c) {
            return index;
        }
    }
    return -1;
}
|
||||
|
||||
// Returns true if s occurs somewhere in this string.
// An empty s is never reported as contained.
bool CharString::contains(StringPiece s) const {
    if (s.empty()) {
        return false;
    }
    const char *haystack = buffer.getAlias();
    // Last offset at which s could still fit; negative if s is longer than this string.
    const int32_t lastOffset = len - s.length();
    int32_t offset = 0;
    while (offset <= lastOffset) {
        if (uprv_memcmp(haystack + offset, s.data(), s.length()) == 0) {
            return true;
        }
        ++offset;
    }
    return false;
}
|
||||
|
||||
// Shortens the string to newLength (negative values count as 0).
// Does nothing if the string is already that short.
CharString &CharString::truncate(int32_t newLength) {
    const int32_t target = newLength < 0 ? 0 : newLength;
    if (target < len) {
        len = target;
        buffer[len] = 0;
    }
    return *this;
}
|
||||
|
||||
// Appends one char and re-terminates the string.
// Does nothing if ensureCapacity() reports failure.
CharString &CharString::append(char c, UErrorCode &errorCode) {
    // len + 2: room for the new char plus the terminating NUL.
    if (!ensureCapacity(len + 2, 0, errorCode)) {
        return *this;
    }
    buffer[len] = c;
    ++len;
    buffer[len] = 0;
    return *this;
}
|
||||
|
||||
// Appends sLength chars from s; sLength == -1 means s is NUL-terminated.
// Three cases are distinguished:
//  - s is the pointer handed out by getAppendBuffer(): only the length is adjusted;
//  - s points into this string and appending would need a reallocation:
//    a temporary copy is appended instead;
//  - otherwise the chars are copied after growing the buffer as needed.
CharString &CharString::append(const char *s, int32_t sLength, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return *this;
    }
    if (sLength < -1 || (s == nullptr && sLength != 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    if (sLength < 0) {
        sLength = static_cast<int32_t>(uprv_strlen(s));
    }
    if (sLength <= 0) {
        // Nothing to append.
        return *this;
    }

    if (s == (buffer.getAlias() + len)) {
        // The caller wrote into the getAppendBuffer().
        if (sLength >= (buffer.getCapacity() - len)) {
            // The caller wrote too much.
            errorCode = U_INTERNAL_PROGRAM_ERROR;
        } else {
            len += sLength;
            buffer[len] = 0;
        }
        return *this;
    }

    if (buffer.getAlias() <= s && s < (buffer.getAlias() + len) &&
            sLength >= (buffer.getCapacity() - len)) {
        // (Part of) this string is appended to itself which requires reallocation,
        // so we have to make a copy of the substring and append that.
        return append(CharString(s, sLength, errorCode), errorCode);
    }

    if (ensureCapacity(len + sLength + 1, 0, errorCode)) {
        uprv_memcpy(buffer.getAlias() + len, s, sLength);
        len += sLength;
        buffer[len] = 0;
    }
    return *this;
}
|
||||
|
||||
// Appends the decimal representation of number.
// The digits are appended least significant first and then reversed in place.
CharString &CharString::appendNumber(int32_t number, UErrorCode &status) {
    if (number < 0) {
        this->append('-', status);
        if (U_FAILURE(status)) {
            return *this;
        }
    }

    if (number == 0) {
        this->append('0', status);
        return *this;
    }

    int32_t digitCount = 0;
    do {
        // For a negative number the remainder is negative too, hence the abs().
        const int32_t remainder = number % 10;
        number /= 10;
        this->append(std::abs(remainder) + '0', status);
        ++digitCount;
        if (U_FAILURE(status)) {
            return *this;
        }
    } while (number != 0);

    // Reverse the digits that were just appended.
    int32_t left = this->length() - digitCount;
    int32_t right = this->length() - 1;
    for (; left < right; ++left, --right) {
        const char tmp = this->data()[left];
        this->data()[left] = this->data()[right];
        this->data()[right] = tmp;
    }

    return *this;
}
|
||||
|
||||
// Returns a pointer to the unused space right after the current contents and
// reports its size (not counting the slot reserved for the NUL) in resultCapacity.
// Grows the buffer if fewer than minCapacity chars are free.
// On failure returns nullptr and sets resultCapacity to 0.
char *CharString::getAppendBuffer(int32_t minCapacity,
                                  int32_t desiredCapacityHint,
                                  int32_t &resultCapacity,
                                  UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        const int32_t available = buffer.getCapacity() - len - 1;  // -1 for NUL
        if (available >= minCapacity ||
                ensureCapacity(len + minCapacity + 1, len + desiredCapacityHint + 1, errorCode)) {
            resultCapacity = buffer.getCapacity() - len - 1;
            return buffer.getAlias() + len;
        }
    }
    resultCapacity = 0;
    return nullptr;
}
|
||||
|
||||
// Convenience overload: forwards the UnicodeString's buffer and length
// to the char16_t-pointer overload.
CharString &CharString::appendInvariantChars(const UnicodeString &s, UErrorCode &errorCode) {
    const char16_t *uchars = s.getBuffer();
    return appendInvariantChars(uchars, s.length(), errorCode);
}
|
||||
|
||||
// Appends ucharsLen UTF-16 code units converted with u_UCharsToChars().
// Sets U_INVARIANT_CONVERSION_ERROR if uprv_isInvariantUString() rejects the input.
CharString &CharString::appendInvariantChars(const char16_t* uchars, int32_t ucharsLen, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return *this;
    }
    if (!uprv_isInvariantUString(uchars, ucharsLen)) {
        errorCode = U_INVARIANT_CONVERSION_ERROR;
        return *this;
    }
    if (!ensureCapacity(len + ucharsLen + 1, 0, errorCode)) {
        return *this;
    }
    char *dest = buffer.getAlias() + len;
    u_UCharsToChars(uchars, dest, ucharsLen);
    len += ucharsLen;
    buffer[len] = 0;
    return *this;
}
|
||||
|
||||
// Makes sure the buffer holds at least capacity chars.
// When growing, first tries desiredCapacityHint (if larger than capacity; a hint
// of 0 means capacity plus the current capacity), then falls back to exactly capacity.
// Returns false and sets U_MEMORY_ALLOCATION_ERROR if the buffer cannot be resized.
UBool CharString::ensureCapacity(int32_t capacity,
                                 int32_t desiredCapacityHint,
                                 UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return false;
    }
    if (capacity <= buffer.getCapacity()) {
        // Already large enough.
        return true;
    }
    if (desiredCapacityHint == 0) {
        desiredCapacityHint = capacity + buffer.getCapacity();
    }
    // len + 1: keep the current contents including the terminating NUL.
    bool resized = desiredCapacityHint > capacity &&
                   buffer.resize(desiredCapacityHint, len + 1) != nullptr;
    if (!resized) {
        resized = buffer.resize(capacity, len + 1) != nullptr;
    }
    if (!resized) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    return true;
}
|
||||
|
||||
// Appends s as a path component. A separator from getDirSepChar() is inserted first
// unless this string is empty or already ends with U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR.
// Does nothing if s is empty.
CharString &CharString::appendPathPart(StringPiece s, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || s.length() == 0) {
        return *this;
    }
    if (len > 0) {
        const char last = buffer[len - 1];
        if (last != U_FILE_SEP_CHAR && last != U_FILE_ALT_SEP_CHAR) {
            append(getDirSepChar(), errorCode);
        }
    }
    append(s, errorCode);
    return *this;
}
|
||||
|
||||
// Appends the separator from getDirSepChar() unless this string is empty or
// already ends with U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR.
CharString &CharString::ensureEndsWithFileSeparator(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || len <= 0) {
        return *this;
    }
    const char last = buffer[len - 1];
    if (last != U_FILE_SEP_CHAR && last != U_FILE_ALT_SEP_CHAR) {
        append(getDirSepChar(), errorCode);
    }
    return *this;
}
|
||||
|
||||
char CharString::getDirSepChar() const {
|
||||
char dirSepChar = U_FILE_SEP_CHAR;
|
||||
#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
|
||||
// We may need to return a different directory separator when building for Cygwin or MSYS2.
|
||||
if(len>0 && !uprv_strchr(data(), U_FILE_SEP_CHAR) && uprv_strchr(data(), U_FILE_ALT_SEP_CHAR))
|
||||
dirSepChar = U_FILE_ALT_SEP_CHAR;
|
||||
#endif
|
||||
return dirSepChar;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
200
engine/thirdparty/icu4c/common/charstr.h
vendored
Normal file
200
engine/thirdparty/icu4c/common/charstr.h
vendored
Normal file
|
|
@ -0,0 +1,200 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/19/2001 aliu Creation.
|
||||
* 05/19/2010 markus Rewritten from scratch
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef CHARSTRING_H
|
||||
#define CHARSTRING_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Windows needs us to DLL-export the MaybeStackArray template specialization,
|
||||
// but MacOS X cannot handle it. Same as in digitlst.h.
|
||||
#if !U_PLATFORM_IS_DARWIN_BASED
|
||||
template class U_COMMON_API MaybeStackArray<char, 40>;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* ICU-internal char * string class.
|
||||
* This class does not assume or enforce any particular character encoding.
|
||||
* Raw bytes can be stored. The string object owns its characters.
|
||||
* A terminating NUL is stored, but the class does not prevent embedded NUL characters.
|
||||
*
|
||||
* This class wants to be convenient but is also deliberately minimalist.
|
||||
* Please do not add methods if they only add minor convenience.
|
||||
* For example:
|
||||
* cs.data()[5]='a'; // no need for setCharAt(5, 'a')
|
||||
*/
|
||||
class U_COMMON_API CharString : public UMemory {
public:
    /** Constructs an empty, NUL-terminated string. */
    CharString() : len(0) { buffer[0]=0; }
    /** Constructs a string with a copy of s. */
    CharString(StringPiece s, UErrorCode &errorCode) : len(0) {
        buffer[0]=0;
        append(s, errorCode);
    }
    /** Constructs a string with a copy of s; an explicit copy that takes a UErrorCode. */
    CharString(const CharString &s, UErrorCode &errorCode) : len(0) {
        buffer[0]=0;
        append(s, errorCode);
    }
    /** Constructs a string with a copy of the sLength chars at s. */
    CharString(const char *s, int32_t sLength, UErrorCode &errorCode) : len(0) {
        buffer[0]=0;
        append(s, sLength, errorCode);
    }
    ~CharString() {}

    /**
     * Move constructor; might leave src in an undefined state.
     * This string will have the same contents and state that the source string had.
     */
    CharString(CharString &&src) noexcept;
    /**
     * Move assignment operator; might leave src in an undefined state.
     * This string will have the same contents and state that the source string had.
     * The behavior is undefined if *this and src are the same object.
     */
    CharString &operator=(CharString &&src) noexcept;

    /**
     * Replaces this string's contents with the other string's contents.
     * CharString does not support the standard copy constructor nor
     * the assignment operator, to make copies explicit and to
     * use a UErrorCode where memory allocations might be needed.
     */
    CharString &copyFrom(const CharString &other, UErrorCode &errorCode);

    UBool isEmpty() const { return len==0; }
    int32_t length() const { return len; }
    char operator[](int32_t index) const { return buffer[index]; }
    StringPiece toStringPiece() const { return StringPiece(buffer.getAlias(), len); }

    const char *data() const { return buffer.getAlias(); }
    char *data() { return buffer.getAlias(); }
    /**
     * Allocates length()+1 chars and copies the NUL-terminated data().
     * The caller must uprv_free() the result.
     */
    char *cloneData(UErrorCode &errorCode) const;
    /**
     * Copies the contents of the string into dest.
     * Checks if there is enough space in dest, extracts the entire string if possible,
     * and NUL-terminates dest if possible.
     *
     * If the string fits into dest but cannot be NUL-terminated (length()==capacity),
     * then the error code is set to U_STRING_NOT_TERMINATED_WARNING.
     * If the string itself does not fit into dest (length()>capacity),
     * then the error code is set to U_BUFFER_OVERFLOW_ERROR.
     *
     * @param dest Destination string buffer.
     * @param capacity Size of the dest buffer (number of chars).
     * @param errorCode ICU error code.
     * @return length()
     */
    int32_t extract(char *dest, int32_t capacity, UErrorCode &errorCode) const;

    bool operator==(const CharString& other) const {
        return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0);
    }
    bool operator!=(const CharString& other) const {
        return !operator==(other);
    }

    bool operator==(StringPiece other) const {
        return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0);
    }
    bool operator!=(StringPiece other) const {
        return !operator==(other);
    }

    /** @return last index of c, or -1 if c is not in this string */
    int32_t lastIndexOf(char c) const;

    /** @return true if the non-empty s occurs in this string */
    bool contains(StringPiece s) const;

    CharString &clear() { len=0; buffer[0]=0; return *this; }
    CharString &truncate(int32_t newLength);

    CharString &append(char c, UErrorCode &errorCode);
    CharString &append(StringPiece s, UErrorCode &errorCode) {
        return append(s.data(), s.length(), errorCode);
    }
    CharString &append(const CharString &s, UErrorCode &errorCode) {
        return append(s.data(), s.length(), errorCode);
    }
    CharString &append(const char *s, int32_t sLength, UErrorCode &status);

    CharString &appendNumber(int32_t number, UErrorCode &status);

    /**
     * Returns a writable buffer for appending and writes the buffer's capacity to
     * resultCapacity. Guarantees resultCapacity>=minCapacity if U_SUCCESS().
     * There will additionally be space for a terminating NUL right at resultCapacity.
     * (This function is similar to ByteSink.GetAppendBuffer().)
     *
     * The returned buffer is only valid until the next write operation
     * on this string.
     *
     * After writing at most resultCapacity bytes, call append() with the
     * pointer returned from this function and the number of bytes written.
     *
     * @param minCapacity required minimum capacity of the returned buffer;
     *                    must be non-negative
     * @param desiredCapacityHint desired capacity of the returned buffer;
     *                            must be non-negative
     * @param resultCapacity will be set to the capacity of the returned buffer
     * @param errorCode in/out error code
     * @return a buffer with resultCapacity>=min_capacity
     */
    char *getAppendBuffer(int32_t minCapacity,
                          int32_t desiredCapacityHint,
                          int32_t &resultCapacity,
                          UErrorCode &errorCode);

    CharString &appendInvariantChars(const UnicodeString &s, UErrorCode &errorCode);
    CharString &appendInvariantChars(const char16_t* uchars, int32_t ucharsLen, UErrorCode& errorCode);

    /**
     * Appends a filename/path part, e.g., a directory name.
     * First appends a U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR if necessary.
     * Does nothing if s is empty.
     */
    CharString &appendPathPart(StringPiece s, UErrorCode &errorCode);

    /**
     * Appends a U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR if this string is not empty
     * and does not already end with a U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR.
     */
    CharString &ensureEndsWithFileSeparator(UErrorCode &errorCode);

private:
    MaybeStackArray<char, 40> buffer;
    int32_t len;

    UBool ensureCapacity(int32_t capacity, int32_t desiredCapacityHint, UErrorCode &errorCode);

    CharString(const CharString &other) = delete; // forbid copying of this class
    CharString &operator=(const CharString &other) = delete; // forbid copying of this class

    /**
     * Returns U_FILE_ALT_SEP_CHAR if found in string, and U_FILE_SEP_CHAR is not found.
     * Otherwise returns U_FILE_SEP_CHAR.
     */
    char getDirSepChar() const;
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
//eof
|
||||
55
engine/thirdparty/icu4c/common/charstrmap.h
vendored
Normal file
55
engine/thirdparty/icu4c/common/charstrmap.h
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
// © 2020 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// charstrmap.h
|
||||
// created: 2020sep01 Frank Yung-Fong Tang
|
||||
|
||||
#ifndef __CHARSTRMAP_H__
|
||||
#define __CHARSTRMAP_H__
|
||||
|
||||
#include <utility>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Map of const char * keys & values.
|
||||
* Stores pointers as is: Does not own/copy/adopt/release strings.
|
||||
*/
|
||||
class CharStringMap final : public UMemory {
|
||||
public:
|
||||
/** Constructs an unusable non-map. */
|
||||
CharStringMap() : map(nullptr) {}
|
||||
CharStringMap(int32_t size, UErrorCode &errorCode) {
|
||||
map = uhash_openSize(uhash_hashChars, uhash_compareChars, uhash_compareChars,
|
||||
size, &errorCode);
|
||||
}
|
||||
CharStringMap(CharStringMap &&other) noexcept : map(other.map) {
|
||||
other.map = nullptr;
|
||||
}
|
||||
CharStringMap(const CharStringMap &other) = delete;
|
||||
~CharStringMap() {
|
||||
uhash_close(map);
|
||||
}
|
||||
|
||||
CharStringMap &operator=(CharStringMap &&other) noexcept {
|
||||
map = other.map;
|
||||
other.map = nullptr;
|
||||
return *this;
|
||||
}
|
||||
CharStringMap &operator=(const CharStringMap &other) = delete;
|
||||
|
||||
const char *get(const char *key) const { return static_cast<const char *>(uhash_get(map, key)); }
|
||||
void put(const char *key, const char *value, UErrorCode &errorCode) {
|
||||
uhash_put(map, const_cast<char *>(key), const_cast<char *>(value), &errorCode);
|
||||
}
|
||||
|
||||
private:
|
||||
UHashtable *map;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __CHARSTRMAP_H__
|
||||
138
engine/thirdparty/icu4c/common/cmemory.cpp
vendored
Normal file
138
engine/thirdparty/icu4c/common/cmemory.cpp
vendored
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* File cmemory.c ICU Heap allocation.
|
||||
* All ICU heap allocation, both for C and C++ new of ICU
|
||||
* class types, comes through these functions.
|
||||
*
|
||||
* If you have a need to replace ICU allocation, this is the
|
||||
* place to do it.
|
||||
*
|
||||
* Note that uprv_malloc(0) returns a non-nullptr pointer,
|
||||
* and that a subsequent free of that pointer value is a NOP.
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
#include "unicode/uclean.h"
|
||||
#include "cmemory.h"
|
||||
#include "putilimp.h"
|
||||
#include "uassert.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
/* uprv_malloc(0) returns a pointer to this read-only data. */
|
||||
static const int32_t zeroMem[] = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
/* Function Pointers for user-supplied heap functions */
|
||||
static const void *pContext;
|
||||
static UMemAllocFn *pAlloc;
|
||||
static UMemReallocFn *pRealloc;
|
||||
static UMemFreeFn *pFree;
|
||||
|
||||
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
|
||||
#include <stdio.h>
|
||||
static int n=0;
|
||||
static long b=0;
|
||||
#endif
|
||||
|
||||
/*
 * Allocates s bytes through the user-supplied allocator if one is set,
 * otherwise through uprv_default_malloc().
 * A request for 0 bytes returns a pointer to the static zeroMem block.
 */
U_CAPI void * U_EXPORT2
uprv_malloc(size_t s) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
#if 1
    putchar('>');
    fflush(stdout);
#else
    /* s is a size_t and b a long: print them with matching conversion specifiers. */
    fprintf(stderr,"MALLOC\t#%d\t%lu bytes\t%ld total\n", ++n,(unsigned long)s,(b+=s)); fflush(stderr);
#endif
#endif
    if (s > 0) {
        if (pAlloc) {
            return (*pAlloc)(pContext, s);
        } else {
            return uprv_default_malloc(s);
        }
    } else {
        return (void *)zeroMem;
    }
}
|
||||
|
||||
/*
 * Resizes a block obtained from uprv_malloc()/uprv_realloc().
 *  - buffer is the zeroMem block: behaves like uprv_malloc(size).
 *  - size is 0: frees buffer and returns the zeroMem block.
 *  - otherwise: forwards to the user-supplied or default realloc.
 */
U_CAPI void * U_EXPORT2
uprv_realloc(void * buffer, size_t size) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
    putchar('~');
    fflush(stdout);
#endif
    if (buffer == zeroMem) {
        return uprv_malloc(size);
    }

    if (size == 0) {
        if (pFree) {
            (*pFree)(pContext, buffer);
        } else {
            uprv_default_free(buffer);
        }
        return (void *)zeroMem;
    }

    if (pRealloc) {
        return (*pRealloc)(pContext, buffer, size);
    }
    return uprv_default_realloc(buffer, size);
}
|
||||
|
||||
/**
 * Releases a block obtained from uprv_malloc()/uprv_realloc()/uprv_calloc().
 * Passing the zeroMem sentinel (the result of a zero-byte allocation) is a no-op.
 *
 * @param buffer block to release
 */
U_CAPI void U_EXPORT2
uprv_free(void *buffer) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
    putchar('<');
    fflush(stdout);
#endif
    // The sentinel is static read-only data and must never reach a heap function.
    if (buffer == zeroMem) {
        return;
    }
    if (pFree != nullptr) {
        (*pFree)(pContext, buffer);
    } else {
        uprv_default_free(buffer);
    }
}
|
||||
|
||||
/**
 * Allocates a zero-filled block large enough for num items of size bytes each.
 *
 * @param num  number of items
 * @param size size of one item in bytes
 * @return the zero-filled block, or nullptr if the allocation failed or if
 *         num * size does not fit in size_t
 */
U_CAPI void * U_EXPORT2
uprv_calloc(size_t num, size_t size) {
    // Refuse requests whose byte count would wrap around: a wrapped product
    // would yield a block much smaller than the caller expects.
    if (num != 0 && size > ((size_t)-1) / num) {
        return nullptr;
    }
    const size_t totalBytes = num * size;
    void *mem = uprv_malloc(totalBytes);
    if (mem != nullptr) {
        uprv_memset(mem, 0, totalBytes);
    }
    return mem;
}
|
||||
|
||||
/**
 * Installs user-supplied heap functions that uprv_malloc(), uprv_realloc()
 * and uprv_free() will call from now on.
 *
 * @param context opaque pointer passed back to every hook
 * @param a       allocation hook; must not be nullptr
 * @param r       reallocation hook; must not be nullptr
 * @param f       free hook; must not be nullptr
 * @param status  in/out error code; nothing happens if it already indicates a
 *                failure, and it is set to U_ILLEGAL_ARGUMENT_ERROR if any
 *                hook is nullptr
 */
U_CAPI void U_EXPORT2
u_setMemoryFunctions(const void *context, UMemAllocFn *a, UMemReallocFn *r, UMemFreeFn *f, UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }
    // The hooks are only accepted as a complete set.
    if (!(a!=nullptr && r!=nullptr && f!=nullptr)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    pAlloc   = a;
    pRealloc = r;
    pFree    = f;
    pContext = context;
}
|
||||
|
||||
|
||||
/**
 * Forgets any heap hooks installed via u_setMemoryFunctions(), so that later
 * allocations use the default heap functions again.
 * Does not release any memory.
 *
 * @return always true
 */
U_CFUNC UBool cmemory_cleanup() {
    pFree    = nullptr;
    pRealloc = nullptr;
    pAlloc   = nullptr;
    pContext = nullptr;
    return true;
}
|
||||
900
engine/thirdparty/icu4c/common/cmemory.h
vendored
Normal file
900
engine/thirdparty/icu4c/common/cmemory.h
vendored
Normal file
|
|
@ -0,0 +1,900 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* File CMEMORY.H
|
||||
*
|
||||
* Contains stdlib.h/string.h memory functions
|
||||
*
|
||||
* @author Bertrand A. Damiba
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
* Date Name Description
|
||||
* 6/20/98 Bertrand Created.
|
||||
* 05/03/99 stephen Changed from functions to macros.
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef CMEMORY_H
|
||||
#define CMEMORY_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include "unicode/localpointer.h"
|
||||
#include "uassert.h"
|
||||
|
||||
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
// uprv_memcpy and uprv_memmove:
// statement-like wrappers around memcpy/memmove that first assert (U_ASSERT)
// that neither pointer is NULL. For clang and GCC the assertions are bracketed
// by pragmas that silence -Waddress, which would otherwise fire when an
// argument is an address that can never be NULL.
#if defined(__clang__)
#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
    /* Suppress warnings about addresses that will never be NULL */ \
    _Pragma("clang diagnostic push") \
    _Pragma("clang diagnostic ignored \"-Waddress\"") \
    U_ASSERT(dst != NULL); \
    U_ASSERT(src != NULL); \
    _Pragma("clang diagnostic pop") \
    U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
    /* Suppress warnings about addresses that will never be NULL */ \
    _Pragma("clang diagnostic push") \
    _Pragma("clang diagnostic ignored \"-Waddress\"") \
    U_ASSERT(dst != NULL); \
    U_ASSERT(src != NULL); \
    _Pragma("clang diagnostic pop") \
    U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#elif defined(__GNUC__)
#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
    /* Suppress warnings about addresses that will never be NULL */ \
    _Pragma("GCC diagnostic push") \
    _Pragma("GCC diagnostic ignored \"-Waddress\"") \
    U_ASSERT(dst != NULL); \
    U_ASSERT(src != NULL); \
    _Pragma("GCC diagnostic pop") \
    U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
    /* Suppress warnings about addresses that will never be NULL */ \
    _Pragma("GCC diagnostic push") \
    _Pragma("GCC diagnostic ignored \"-Waddress\"") \
    U_ASSERT(dst != NULL); \
    U_ASSERT(src != NULL); \
    _Pragma("GCC diagnostic pop") \
    U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#else
// Other compilers: same assertions, no diagnostic pragmas.
#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
    U_ASSERT(dst != NULL); \
    U_ASSERT(src != NULL); \
    U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
    U_ASSERT(dst != NULL); \
    U_ASSERT(src != NULL); \
    U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#endif
|
||||
|
||||
/**
|
||||
* \def UPRV_LENGTHOF
|
||||
* Convenience macro to determine the length of a fixed array at compile-time.
|
||||
* @param array A fixed length array
|
||||
* @return The length of the array, in elements
|
||||
* @internal
|
||||
*/
|
||||
#define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
#define uprv_memset(buffer, mark, size) U_STANDARD_CPP_NAMESPACE memset(buffer, mark, size)
|
||||
#define uprv_memcmp(buffer1, buffer2, size) U_STANDARD_CPP_NAMESPACE memcmp(buffer1, buffer2,size)
|
||||
#define uprv_memchr(ptr, value, num) U_STANDARD_CPP_NAMESPACE memchr(ptr, value, num)
|
||||
|
||||
U_CAPI void * U_EXPORT2
|
||||
uprv_malloc(size_t s) U_MALLOC_ATTR U_ALLOC_SIZE_ATTR(1);
|
||||
|
||||
U_CAPI void * U_EXPORT2
|
||||
uprv_realloc(void *mem, size_t size) U_ALLOC_SIZE_ATTR(2);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_free(void *mem);
|
||||
|
||||
U_CAPI void * U_EXPORT2
|
||||
uprv_calloc(size_t num, size_t size) U_MALLOC_ATTR U_ALLOC_SIZE_ATTR2(1,2);
|
||||
|
||||
/**
|
||||
* Get the least significant bits of a pointer (a memory address).
|
||||
* For example, with a mask of 3, the macro gets the 2 least significant bits,
|
||||
* which will be 0 if the pointer is 32-bit (4-byte) aligned.
|
||||
*
|
||||
* uintptr_t is the most appropriate integer type to cast to.
|
||||
*/
|
||||
#define U_POINTER_MASK_LSB(ptr, mask) ((uintptr_t)(ptr) & (mask))
|
||||
|
||||
/**
|
||||
* Create & return an instance of "type" in statically allocated storage.
|
||||
* e.g.
|
||||
* static std::mutex *myMutex = STATIC_NEW(std::mutex);
|
||||
* To destroy an object created in this way, invoke the destructor explicitly, e.g.
|
||||
* myMutex->~mutex();
|
||||
* DO NOT use delete.
|
||||
* DO NOT use with class UMutex, which has specific support for static instances.
|
||||
*
|
||||
* STATIC_NEW is intended for use when
|
||||
* - We want a static (or global) object.
|
||||
* - We don't want it to ever be destructed, or to explicitly control destruction,
|
||||
* to avoid use-after-destruction problems.
|
||||
* - We want to avoid an ordinary heap allocated object,
|
||||
* to avoid the possibility of memory allocation failures, and
|
||||
* to avoid memory leak reports, from valgrind, for example.
|
||||
* This is defined as a macro rather than a template function because each invocation
|
||||
* must define distinct static storage for the object being returned.
|
||||
*/
|
||||
#define STATIC_NEW(type) [] () { \
|
||||
alignas(type) static char storage[sizeof(type)]; \
|
||||
return new(storage) type();} ()
|
||||
|
||||
/**
|
||||
* Heap clean up function, called from u_cleanup()
|
||||
* Clears any user heap functions from u_setMemoryFunctions()
|
||||
* Does NOT deallocate any remaining allocated memory.
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
cmemory_cleanup(void);
|
||||
|
||||
/**
|
||||
* A function called by <TT>uhash_remove</TT>,
|
||||
* <TT>uhash_close</TT>, or <TT>uhash_put</TT> to delete
|
||||
* an existing key or value.
|
||||
* @param obj A key or value stored in a hashtable
|
||||
* @see uprv_deleteUObject
|
||||
*/
|
||||
typedef void U_CALLCONV UObjectDeleter(void* obj);
|
||||
|
||||
/**
|
||||
* Deleter for UObject instances.
|
||||
* Works for all subclasses of UObject because it has a virtual destructor.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_deleteUObject(void *obj);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include <utility>
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* "Smart pointer" class, deletes memory via uprv_free().
|
||||
* For most methods see the LocalPointerBase base class.
|
||||
* Adds operator[] for array item access.
|
||||
*
|
||||
* @see LocalPointerBase
|
||||
*/
|
||||
template<typename T>
|
||||
class LocalMemory : public LocalPointerBase<T> {
|
||||
public:
|
||||
using LocalPointerBase<T>::operator*;
|
||||
using LocalPointerBase<T>::operator->;
|
||||
/**
|
||||
* Constructor takes ownership.
|
||||
* @param p simple pointer to an array of T items that is adopted
|
||||
*/
|
||||
explicit LocalMemory(T *p=nullptr) : LocalPointerBase<T>(p) {}
|
||||
/**
|
||||
* Move constructor, leaves src with isNull().
|
||||
* @param src source smart pointer
|
||||
*/
|
||||
LocalMemory(LocalMemory<T> &&src) noexcept : LocalPointerBase<T>(src.ptr) {
|
||||
src.ptr=nullptr;
|
||||
}
|
||||
/**
|
||||
* Destructor deletes the memory it owns.
|
||||
*/
|
||||
~LocalMemory() {
|
||||
uprv_free(LocalPointerBase<T>::ptr);
|
||||
}
|
||||
/**
|
||||
* Move assignment operator, leaves src with isNull().
|
||||
* The behavior is undefined if *this and src are the same object.
|
||||
* @param src source smart pointer
|
||||
* @return *this
|
||||
*/
|
||||
LocalMemory<T> &operator=(LocalMemory<T> &&src) noexcept {
|
||||
uprv_free(LocalPointerBase<T>::ptr);
|
||||
LocalPointerBase<T>::ptr=src.ptr;
|
||||
src.ptr=nullptr;
|
||||
return *this;
|
||||
}
|
||||
/**
|
||||
* Swap pointers.
|
||||
* @param other other smart pointer
|
||||
*/
|
||||
void swap(LocalMemory<T> &other) noexcept {
|
||||
T *temp=LocalPointerBase<T>::ptr;
|
||||
LocalPointerBase<T>::ptr=other.ptr;
|
||||
other.ptr=temp;
|
||||
}
|
||||
/**
|
||||
* Non-member LocalMemory swap function.
|
||||
* @param p1 will get p2's pointer
|
||||
* @param p2 will get p1's pointer
|
||||
*/
|
||||
friend inline void swap(LocalMemory<T> &p1, LocalMemory<T> &p2) noexcept {
|
||||
p1.swap(p2);
|
||||
}
|
||||
/**
|
||||
* Deletes the array it owns,
|
||||
* and adopts (takes ownership of) the one passed in.
|
||||
* @param p simple pointer to an array of T items that is adopted
|
||||
*/
|
||||
void adoptInstead(T *p) {
|
||||
uprv_free(LocalPointerBase<T>::ptr);
|
||||
LocalPointerBase<T>::ptr=p;
|
||||
}
|
||||
/**
|
||||
* Deletes the array it owns, allocates a new one and reset its bytes to 0.
|
||||
* Returns the new array pointer.
|
||||
* If the allocation fails, then the current array is unchanged and
|
||||
* this method returns nullptr.
|
||||
* @param newCapacity must be >0
|
||||
* @return the allocated array pointer, or nullptr if the allocation failed
|
||||
*/
|
||||
inline T *allocateInsteadAndReset(int32_t newCapacity=1);
|
||||
/**
|
||||
* Deletes the array it owns and allocates a new one, copying length T items.
|
||||
* Returns the new array pointer.
|
||||
* If the allocation fails, then the current array is unchanged and
|
||||
* this method returns nullptr.
|
||||
* @param newCapacity must be >0
|
||||
* @param length number of T items to be copied from the old array to the new one;
|
||||
* must be no more than the capacity of the old array,
|
||||
* which the caller must track because the LocalMemory does not track it
|
||||
* @return the allocated array pointer, or nullptr if the allocation failed
|
||||
*/
|
||||
inline T *allocateInsteadAndCopy(int32_t newCapacity=1, int32_t length=0);
|
||||
/**
|
||||
* Array item access (writable).
|
||||
* No index bounds check.
|
||||
* @param i array index
|
||||
* @return reference to the array item
|
||||
*/
|
||||
T &operator[](ptrdiff_t i) const { return LocalPointerBase<T>::ptr[i]; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
inline T *LocalMemory<T>::allocateInsteadAndReset(int32_t newCapacity) {
|
||||
if(newCapacity>0) {
|
||||
T *p=(T *)uprv_malloc(newCapacity*sizeof(T));
|
||||
if(p!=nullptr) {
|
||||
uprv_memset(p, 0, newCapacity*sizeof(T));
|
||||
uprv_free(LocalPointerBase<T>::ptr);
|
||||
LocalPointerBase<T>::ptr=p;
|
||||
}
|
||||
return p;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
inline T *LocalMemory<T>::allocateInsteadAndCopy(int32_t newCapacity, int32_t length) {
|
||||
if(newCapacity>0) {
|
||||
T *p=(T *)uprv_malloc(newCapacity*sizeof(T));
|
||||
if(p!=nullptr) {
|
||||
if(length>0) {
|
||||
if(length>newCapacity) {
|
||||
length=newCapacity;
|
||||
}
|
||||
uprv_memcpy(p, LocalPointerBase<T>::ptr, (size_t)length*sizeof(T));
|
||||
}
|
||||
uprv_free(LocalPointerBase<T>::ptr);
|
||||
LocalPointerBase<T>::ptr=p;
|
||||
}
|
||||
return p;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple array/buffer management class using uprv_malloc() and uprv_free().
|
||||
* Provides an internal array with fixed capacity. Can alias another array
|
||||
* or allocate one.
|
||||
*
|
||||
* The array address is properly aligned for type T. It might not be properly
|
||||
* aligned for types larger than T (or larger than the largest subtype of T).
|
||||
*
|
||||
* Unlike LocalMemory and LocalArray, this class never adopts
|
||||
* (takes ownership of) another array.
|
||||
*
|
||||
* WARNING: MaybeStackArray only works with primitive (plain-old data) types.
|
||||
* It does NOT know how to call a destructor! If you work with classes with
|
||||
* destructors, consider:
|
||||
*
|
||||
* - LocalArray in localpointer.h if you know the length ahead of time
|
||||
* - MaybeStackVector if you know the length at runtime
|
||||
*/
|
||||
template<typename T, int32_t stackCapacity>
|
||||
class MaybeStackArray {
|
||||
public:
|
||||
// No heap allocation. Use only on the stack.
|
||||
static void* U_EXPORT2 operator new(size_t) noexcept = delete;
|
||||
static void* U_EXPORT2 operator new[](size_t) noexcept = delete;
|
||||
#if U_HAVE_PLACEMENT_NEW
|
||||
static void* U_EXPORT2 operator new(size_t, void*) noexcept = delete;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Default constructor initializes with internal T[stackCapacity] buffer.
|
||||
*/
|
||||
MaybeStackArray() : ptr(stackArray), capacity(stackCapacity), needToRelease(false) {}
|
||||
/**
|
||||
* Automatically allocates the heap array if the argument is larger than the stack capacity.
|
||||
* Intended for use when an approximate capacity is known at compile time but the true
|
||||
* capacity is not known until runtime.
|
||||
*/
|
||||
MaybeStackArray(int32_t newCapacity, UErrorCode status) : MaybeStackArray() {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (capacity < newCapacity) {
|
||||
if (resize(newCapacity) == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Destructor deletes the array (if owned).
|
||||
*/
|
||||
~MaybeStackArray() { releaseArray(); }
|
||||
/**
|
||||
* Move constructor: transfers ownership or copies the stack array.
|
||||
*/
|
||||
MaybeStackArray(MaybeStackArray<T, stackCapacity> &&src) noexcept;
|
||||
/**
|
||||
* Move assignment: transfers ownership or copies the stack array.
|
||||
*/
|
||||
MaybeStackArray<T, stackCapacity> &operator=(MaybeStackArray<T, stackCapacity> &&src) noexcept;
|
||||
/**
|
||||
* Returns the array capacity (number of T items).
|
||||
* @return array capacity
|
||||
*/
|
||||
int32_t getCapacity() const { return capacity; }
|
||||
/**
|
||||
* Access without ownership change.
|
||||
* @return the array pointer
|
||||
*/
|
||||
T *getAlias() const { return ptr; }
|
||||
/**
|
||||
* Returns the array limit. Simple convenience method.
|
||||
* @return getAlias()+getCapacity()
|
||||
*/
|
||||
T *getArrayLimit() const { return getAlias()+capacity; }
|
||||
// No "operator T *() const" because that can make
|
||||
// expressions like mbs[index] ambiguous for some compilers.
|
||||
/**
|
||||
* Array item access (const).
|
||||
* No index bounds check.
|
||||
* @param i array index
|
||||
* @return reference to the array item
|
||||
*/
|
||||
const T &operator[](ptrdiff_t i) const { return ptr[i]; }
|
||||
/**
|
||||
* Array item access (writable).
|
||||
* No index bounds check.
|
||||
* @param i array index
|
||||
* @return reference to the array item
|
||||
*/
|
||||
T &operator[](ptrdiff_t i) { return ptr[i]; }
|
||||
/**
|
||||
* Deletes the array (if owned) and aliases another one, no transfer of ownership.
|
||||
* If the arguments are illegal, then the current array is unchanged.
|
||||
* @param otherArray must not be nullptr
|
||||
* @param otherCapacity must be >0
|
||||
*/
|
||||
void aliasInstead(T *otherArray, int32_t otherCapacity) {
|
||||
if(otherArray!=nullptr && otherCapacity>0) {
|
||||
releaseArray();
|
||||
ptr=otherArray;
|
||||
capacity=otherCapacity;
|
||||
needToRelease=false;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Deletes the array (if owned) and allocates a new one, copying length T items.
|
||||
* Returns the new array pointer.
|
||||
* If the allocation fails, then the current array is unchanged and
|
||||
* this method returns nullptr.
|
||||
* @param newCapacity can be less than or greater than the current capacity;
|
||||
* must be >0
|
||||
* @param length number of T items to be copied from the old array to the new one
|
||||
* @return the allocated array pointer, or nullptr if the allocation failed
|
||||
*/
|
||||
inline T *resize(int32_t newCapacity, int32_t length=0);
|
||||
/**
|
||||
* Gives up ownership of the array if owned, or else clones it,
|
||||
* copying length T items; resets itself to the internal stack array.
|
||||
* Returns nullptr if the allocation failed.
|
||||
* @param length number of T items to copy when cloning,
|
||||
* and capacity of the clone when cloning
|
||||
* @param resultCapacity will be set to the returned array's capacity (output-only)
|
||||
* @return the array pointer;
|
||||
* caller becomes responsible for deleting the array
|
||||
*/
|
||||
inline T *orphanOrClone(int32_t length, int32_t &resultCapacity);
|
||||
|
||||
protected:
|
||||
// Resizes the array to the size of src, then copies the contents of src.
|
||||
void copyFrom(const MaybeStackArray &src, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (this->resize(src.capacity, 0) == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
uprv_memcpy(this->ptr, src.ptr, (size_t)capacity * sizeof(T));
|
||||
}
|
||||
|
||||
private:
|
||||
T *ptr;
|
||||
int32_t capacity;
|
||||
UBool needToRelease;
|
||||
T stackArray[stackCapacity];
|
||||
void releaseArray() {
|
||||
if(needToRelease) {
|
||||
uprv_free(ptr);
|
||||
}
|
||||
}
|
||||
void resetToStackArray() {
|
||||
ptr=stackArray;
|
||||
capacity=stackCapacity;
|
||||
needToRelease=false;
|
||||
}
|
||||
/* No comparison operators with other MaybeStackArray's. */
|
||||
bool operator==(const MaybeStackArray & /*other*/) = delete;
|
||||
bool operator!=(const MaybeStackArray & /*other*/) = delete;
|
||||
/* No ownership transfer: No copy constructor, no assignment operator. */
|
||||
MaybeStackArray(const MaybeStackArray & /*other*/) = delete;
|
||||
void operator=(const MaybeStackArray & /*other*/) = delete;
|
||||
};
|
||||
|
||||
template<typename T, int32_t stackCapacity>
|
||||
icu::MaybeStackArray<T, stackCapacity>::MaybeStackArray(
|
||||
MaybeStackArray <T, stackCapacity>&& src) noexcept
|
||||
: ptr(src.ptr), capacity(src.capacity), needToRelease(src.needToRelease) {
|
||||
if (src.ptr == src.stackArray) {
|
||||
ptr = stackArray;
|
||||
uprv_memcpy(stackArray, src.stackArray, sizeof(T) * src.capacity);
|
||||
} else {
|
||||
src.resetToStackArray(); // take ownership away from src
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, int32_t stackCapacity>
|
||||
inline MaybeStackArray <T, stackCapacity>&
|
||||
MaybeStackArray<T, stackCapacity>::operator=(MaybeStackArray <T, stackCapacity>&& src) noexcept {
|
||||
releaseArray(); // in case this instance had its own memory allocated
|
||||
capacity = src.capacity;
|
||||
needToRelease = src.needToRelease;
|
||||
if (src.ptr == src.stackArray) {
|
||||
ptr = stackArray;
|
||||
uprv_memcpy(stackArray, src.stackArray, sizeof(T) * src.capacity);
|
||||
} else {
|
||||
ptr = src.ptr;
|
||||
src.resetToStackArray(); // take ownership away from src
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename T, int32_t stackCapacity>
|
||||
inline T *MaybeStackArray<T, stackCapacity>::resize(int32_t newCapacity, int32_t length) {
|
||||
if(newCapacity>0) {
|
||||
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
|
||||
::fprintf(::stderr, "MaybeStackArray (resize) alloc %d * %lu\n", newCapacity, sizeof(T));
|
||||
#endif
|
||||
T *p=(T *)uprv_malloc(newCapacity*sizeof(T));
|
||||
if(p!=nullptr) {
|
||||
if(length>0) {
|
||||
if(length>capacity) {
|
||||
length=capacity;
|
||||
}
|
||||
if(length>newCapacity) {
|
||||
length=newCapacity;
|
||||
}
|
||||
uprv_memcpy(p, ptr, (size_t)length*sizeof(T));
|
||||
}
|
||||
releaseArray();
|
||||
ptr=p;
|
||||
capacity=newCapacity;
|
||||
needToRelease=true;
|
||||
}
|
||||
return p;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, int32_t stackCapacity>
|
||||
inline T *MaybeStackArray<T, stackCapacity>::orphanOrClone(int32_t length, int32_t &resultCapacity) {
|
||||
T *p;
|
||||
if(needToRelease) {
|
||||
p=ptr;
|
||||
} else if(length<=0) {
|
||||
return nullptr;
|
||||
} else {
|
||||
if(length>capacity) {
|
||||
length=capacity;
|
||||
}
|
||||
p=(T *)uprv_malloc(length*sizeof(T));
|
||||
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
|
||||
::fprintf(::stderr,"MaybeStacArray (orphan) alloc %d * %lu\n", length,sizeof(T));
|
||||
#endif
|
||||
if(p==nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
uprv_memcpy(p, ptr, (size_t)length*sizeof(T));
|
||||
}
|
||||
resultCapacity=length;
|
||||
resetToStackArray();
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Variant of MaybeStackArray that allocates a header struct and an array
|
||||
* in one contiguous memory block, using uprv_malloc() and uprv_free().
|
||||
* Provides internal memory with fixed array capacity. Can alias another memory
|
||||
* block or allocate one.
|
||||
* The stackCapacity is the number of T items in the internal memory,
|
||||
* not counting the H header.
|
||||
* Unlike LocalMemory and LocalArray, this class never adopts
|
||||
* (takes ownership of) another memory block.
|
||||
*/
|
||||
template<typename H, typename T, int32_t stackCapacity>
|
||||
class MaybeStackHeaderAndArray {
|
||||
public:
|
||||
// No heap allocation. Use only on the stack.
|
||||
static void* U_EXPORT2 operator new(size_t) noexcept = delete;
|
||||
static void* U_EXPORT2 operator new[](size_t) noexcept = delete;
|
||||
#if U_HAVE_PLACEMENT_NEW
|
||||
static void* U_EXPORT2 operator new(size_t, void*) noexcept = delete;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Default constructor initializes with internal H+T[stackCapacity] buffer.
|
||||
*/
|
||||
MaybeStackHeaderAndArray() : ptr(&stackHeader), capacity(stackCapacity), needToRelease(false) {}
|
||||
/**
|
||||
* Destructor deletes the memory (if owned).
|
||||
*/
|
||||
~MaybeStackHeaderAndArray() { releaseMemory(); }
|
||||
/**
|
||||
* Returns the array capacity (number of T items).
|
||||
* @return array capacity
|
||||
*/
|
||||
int32_t getCapacity() const { return capacity; }
|
||||
/**
|
||||
* Access without ownership change.
|
||||
* @return the header pointer
|
||||
*/
|
||||
H *getAlias() const { return ptr; }
|
||||
/**
|
||||
* Returns the array start.
|
||||
* @return array start, same address as getAlias()+1
|
||||
*/
|
||||
T *getArrayStart() const { return reinterpret_cast<T *>(getAlias()+1); }
|
||||
/**
|
||||
* Returns the array limit.
|
||||
* @return array limit
|
||||
*/
|
||||
T *getArrayLimit() const { return getArrayStart()+capacity; }
|
||||
/**
|
||||
* Access without ownership change. Same as getAlias().
|
||||
* A class instance can be used directly in expressions that take a T *.
|
||||
* @return the header pointer
|
||||
*/
|
||||
operator H *() const { return ptr; }
|
||||
/**
|
||||
* Array item access (writable).
|
||||
* No index bounds check.
|
||||
* @param i array index
|
||||
* @return reference to the array item
|
||||
*/
|
||||
T &operator[](ptrdiff_t i) { return getArrayStart()[i]; }
|
||||
/**
|
||||
* Deletes the memory block (if owned) and aliases another one, no transfer of ownership.
|
||||
* If the arguments are illegal, then the current memory is unchanged.
|
||||
* @param otherArray must not be nullptr
|
||||
* @param otherCapacity must be >0
|
||||
*/
|
||||
void aliasInstead(H *otherMemory, int32_t otherCapacity) {
|
||||
if(otherMemory!=nullptr && otherCapacity>0) {
|
||||
releaseMemory();
|
||||
ptr=otherMemory;
|
||||
capacity=otherCapacity;
|
||||
needToRelease=false;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Deletes the memory block (if owned) and allocates a new one,
|
||||
* copying the header and length T array items.
|
||||
* Returns the new header pointer.
|
||||
* If the allocation fails, then the current memory is unchanged and
|
||||
* this method returns nullptr.
|
||||
* @param newCapacity can be less than or greater than the current capacity;
|
||||
* must be >0
|
||||
* @param length number of T items to be copied from the old array to the new one
|
||||
* @return the allocated pointer, or nullptr if the allocation failed
|
||||
*/
|
||||
inline H *resize(int32_t newCapacity, int32_t length=0);
|
||||
/**
|
||||
* Gives up ownership of the memory if owned, or else clones it,
|
||||
* copying the header and length T array items; resets itself to the internal memory.
|
||||
* Returns nullptr if the allocation failed.
|
||||
* @param length number of T items to copy when cloning,
|
||||
* and array capacity of the clone when cloning
|
||||
* @param resultCapacity will be set to the returned array's capacity (output-only)
|
||||
* @return the header pointer;
|
||||
* caller becomes responsible for deleting the array
|
||||
*/
|
||||
inline H *orphanOrClone(int32_t length, int32_t &resultCapacity);
|
||||
private:
|
||||
H *ptr;
|
||||
int32_t capacity;
|
||||
UBool needToRelease;
|
||||
// stackHeader must precede stackArray immediately.
|
||||
H stackHeader;
|
||||
T stackArray[stackCapacity];
|
||||
void releaseMemory() {
|
||||
if(needToRelease) {
|
||||
uprv_free(ptr);
|
||||
}
|
||||
}
|
||||
/* No comparison operators with other MaybeStackHeaderAndArray's. */
|
||||
bool operator==(const MaybeStackHeaderAndArray & /*other*/) {return false;}
|
||||
bool operator!=(const MaybeStackHeaderAndArray & /*other*/) {return true;}
|
||||
/* No ownership transfer: No copy constructor, no assignment operator. */
|
||||
MaybeStackHeaderAndArray(const MaybeStackHeaderAndArray & /*other*/) {}
|
||||
void operator=(const MaybeStackHeaderAndArray & /*other*/) {}
|
||||
};
|
||||
|
||||
template<typename H, typename T, int32_t stackCapacity>
|
||||
inline H *MaybeStackHeaderAndArray<H, T, stackCapacity>::resize(int32_t newCapacity,
|
||||
int32_t length) {
|
||||
if(newCapacity>=0) {
|
||||
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
|
||||
::fprintf(::stderr,"MaybeStackHeaderAndArray alloc %d + %d * %ul\n", sizeof(H),newCapacity,sizeof(T));
|
||||
#endif
|
||||
H *p=(H *)uprv_malloc(sizeof(H)+newCapacity*sizeof(T));
|
||||
if(p!=nullptr) {
|
||||
if(length<0) {
|
||||
length=0;
|
||||
} else if(length>0) {
|
||||
if(length>capacity) {
|
||||
length=capacity;
|
||||
}
|
||||
if(length>newCapacity) {
|
||||
length=newCapacity;
|
||||
}
|
||||
}
|
||||
uprv_memcpy(p, ptr, sizeof(H)+(size_t)length*sizeof(T));
|
||||
releaseMemory();
|
||||
ptr=p;
|
||||
capacity=newCapacity;
|
||||
needToRelease=true;
|
||||
}
|
||||
return p;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename H, typename T, int32_t stackCapacity>
|
||||
inline H *MaybeStackHeaderAndArray<H, T, stackCapacity>::orphanOrClone(int32_t length,
|
||||
int32_t &resultCapacity) {
|
||||
H *p;
|
||||
if(needToRelease) {
|
||||
p=ptr;
|
||||
} else {
|
||||
if(length<0) {
|
||||
length=0;
|
||||
} else if(length>capacity) {
|
||||
length=capacity;
|
||||
}
|
||||
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
|
||||
::fprintf(::stderr,"MaybeStackHeaderAndArray (orphan) alloc %ul + %d * %lu\n", sizeof(H),length,sizeof(T));
|
||||
#endif
|
||||
p=(H *)uprv_malloc(sizeof(H)+length*sizeof(T));
|
||||
if(p==nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
uprv_memcpy(p, ptr, sizeof(H)+(size_t)length*sizeof(T));
|
||||
}
|
||||
resultCapacity=length;
|
||||
ptr=&stackHeader;
|
||||
capacity=stackCapacity;
|
||||
needToRelease=false;
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple memory management class that creates new heap allocated objects (of
|
||||
* any class that has a public constructor), keeps track of them and eventually
|
||||
* deletes them all in its own destructor.
|
||||
*
|
||||
* A typical use-case would be code like this:
|
||||
*
|
||||
* MemoryPool<MyType> pool;
|
||||
*
|
||||
* MyType* o1 = pool.create();
|
||||
* if (o1 != nullptr) {
|
||||
* foo(o1);
|
||||
* }
|
||||
*
|
||||
* MyType* o2 = pool.create(1, 2, 3);
|
||||
* if (o2 != nullptr) {
|
||||
* bar(o2);
|
||||
* }
|
||||
*
|
||||
* // MemoryPool will take care of deleting the MyType objects.
|
||||
*
|
||||
* It doesn't do anything more than that, and is intentionally kept minimalist.
|
||||
*/
|
||||
template<typename T, int32_t stackCapacity = 8>
|
||||
class MemoryPool : public UMemory {
|
||||
public:
|
||||
MemoryPool() : fCount(0), fPool() {}
|
||||
|
||||
~MemoryPool() {
|
||||
for (int32_t i = 0; i < fCount; ++i) {
|
||||
delete fPool[i];
|
||||
}
|
||||
}
|
||||
|
||||
MemoryPool(const MemoryPool&) = delete;
|
||||
MemoryPool& operator=(const MemoryPool&) = delete;
|
||||
|
||||
MemoryPool(MemoryPool&& other) noexcept : fCount(other.fCount),
|
||||
fPool(std::move(other.fPool)) {
|
||||
other.fCount = 0;
|
||||
}
|
||||
|
||||
MemoryPool& operator=(MemoryPool&& other) noexcept {
|
||||
// Since `this` may contain instances that need to be deleted, we can't
|
||||
// just throw them away and replace them with `other`. The normal way of
|
||||
// dealing with this in C++ is to swap `this` and `other`, rather than
|
||||
// simply overwrite: the destruction of `other` can then take care of
|
||||
// running MemoryPool::~MemoryPool() over the still-to-be-deallocated
|
||||
// instances.
|
||||
std::swap(fCount, other.fCount);
|
||||
std::swap(fPool, other.fPool);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new object of typename T, by forwarding any and all arguments
|
||||
* to the typename T constructor.
|
||||
*
|
||||
* @param args Arguments to be forwarded to the typename T constructor.
|
||||
* @return A pointer to the newly created object, or nullptr on error.
|
||||
*/
|
||||
template<typename... Args>
|
||||
T* create(Args&&... args) {
|
||||
int32_t capacity = fPool.getCapacity();
|
||||
if (fCount == capacity &&
|
||||
fPool.resize(capacity == stackCapacity ? 4 * capacity : 2 * capacity,
|
||||
capacity) == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
return fPool[fCount++] = new T(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
T* createAndCheckErrorCode(UErrorCode &status, Args &&... args) {
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
T *pointer = this->create(args...);
|
||||
if (U_SUCCESS(status) && pointer == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
return pointer;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Number of elements that have been allocated.
|
||||
*/
|
||||
int32_t count() const {
|
||||
return fCount;
|
||||
}
|
||||
|
||||
protected:
|
||||
int32_t fCount;
|
||||
MaybeStackArray<T*, stackCapacity> fPool;
|
||||
};
|
||||
|
||||
/**
|
||||
* An internal Vector-like implementation based on MemoryPool.
|
||||
*
|
||||
* Heap-allocates each element and stores pointers.
|
||||
*
|
||||
* To append an item to the vector, use emplaceBack.
|
||||
*
|
||||
* MaybeStackVector<MyType> vector;
|
||||
* MyType* element = vector.emplaceBack();
|
||||
* if (!element) {
|
||||
* status = U_MEMORY_ALLOCATION_ERROR;
|
||||
* }
|
||||
* // do stuff with element
|
||||
*
|
||||
* To loop over the vector, use a for loop with indices:
|
||||
*
|
||||
* for (int32_t i = 0; i < vector.length(); i++) {
|
||||
* MyType* element = vector[i];
|
||||
* }
|
||||
*/
|
||||
template<typename T, int32_t stackCapacity = 8>
|
||||
class MaybeStackVector : protected MemoryPool<T, stackCapacity> {
|
||||
public:
|
||||
template<typename... Args>
|
||||
T* emplaceBack(Args&&... args) {
|
||||
return this->create(args...);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
T *emplaceBackAndCheckErrorCode(UErrorCode &status, Args &&... args) {
|
||||
return this->createAndCheckErrorCode(status, args...);
|
||||
}
|
||||
|
||||
int32_t length() const {
|
||||
return this->fCount;
|
||||
}
|
||||
|
||||
T** getAlias() {
|
||||
return this->fPool.getAlias();
|
||||
}
|
||||
|
||||
const T *const *getAlias() const {
|
||||
return this->fPool.getAlias();
|
||||
}
|
||||
|
||||
/**
|
||||
* Array item access (read-only).
|
||||
* No index bounds check.
|
||||
* @param i array index
|
||||
* @return reference to the array item
|
||||
*/
|
||||
const T* operator[](ptrdiff_t i) const {
|
||||
return this->fPool[i];
|
||||
}
|
||||
|
||||
/**
|
||||
* Array item access (writable).
|
||||
* No index bounds check.
|
||||
* @param i array index
|
||||
* @return reference to the array item
|
||||
*/
|
||||
T* operator[](ptrdiff_t i) {
|
||||
return this->fPool[i];
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* __cplusplus */
|
||||
#endif /* CMEMORY_H */
|
||||
97
engine/thirdparty/icu4c/common/cpputils.h
vendored
Normal file
97
engine/thirdparty/icu4c/common/cpputils.h
vendored
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: cpputils.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*/
|
||||
|
||||
#ifndef CPPUTILS_H
|
||||
#define CPPUTILS_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
/*==========================================================================*/
|
||||
/* Array copy utility functions */
|
||||
/*==========================================================================*/
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const double* src, double* dst, int32_t count)
|
||||
{ uprv_memcpy(dst, src, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const double* src, int32_t srcStart,
|
||||
double* dst, int32_t dstStart, int32_t count)
|
||||
{ uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const int8_t* src, int8_t* dst, int32_t count)
|
||||
{ uprv_memcpy(dst, src, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const int8_t* src, int32_t srcStart,
|
||||
int8_t* dst, int32_t dstStart, int32_t count)
|
||||
{ uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const int16_t* src, int16_t* dst, int32_t count)
|
||||
{ uprv_memcpy(dst, src, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const int16_t* src, int32_t srcStart,
|
||||
int16_t* dst, int32_t dstStart, int32_t count)
|
||||
{ uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const int32_t* src, int32_t* dst, int32_t count)
|
||||
{ uprv_memcpy(dst, src, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void uprv_arrayCopy(const int32_t* src, int32_t srcStart,
|
||||
int32_t* dst, int32_t dstStart, int32_t count)
|
||||
{ uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(*src)); }
|
||||
|
||||
static
|
||||
inline void
|
||||
uprv_arrayCopy(const char16_t *src, int32_t srcStart,
|
||||
char16_t *dst, int32_t dstStart, int32_t count)
|
||||
{ uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(*src)); }
|
||||
|
||||
/**
|
||||
* Copy an array of UnicodeString OBJECTS (not pointers).
|
||||
* @internal
|
||||
*/
|
||||
static inline void
|
||||
uprv_arrayCopy(const icu::UnicodeString *src, icu::UnicodeString *dst, int32_t count)
|
||||
{ while(count-- > 0) *dst++ = *src++; }
|
||||
|
||||
/**
|
||||
* Copy an array of UnicodeString OBJECTS (not pointers).
|
||||
* @internal
|
||||
*/
|
||||
static inline void
|
||||
uprv_arrayCopy(const icu::UnicodeString *src, int32_t srcStart,
|
||||
icu::UnicodeString *dst, int32_t dstStart, int32_t count)
|
||||
{ uprv_arrayCopy(src+srcStart, dst+dstStart, count); }
|
||||
|
||||
/**
|
||||
* Checks that the string is readable and writable.
|
||||
* Sets U_ILLEGAL_ARGUMENT_ERROR if the string isBogus() or has an open getBuffer().
|
||||
*/
|
||||
inline void
|
||||
uprv_checkCanGetBuffer(const icu::UnicodeString &s, UErrorCode &errorCode) {
|
||||
if(U_SUCCESS(errorCode) && s.isBogus()) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* CPPUTILS_H */
|
||||
54
engine/thirdparty/icu4c/common/cstr.cpp
vendored
Normal file
54
engine/thirdparty/icu4c/common/cstr.cpp
vendored
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2015-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: cstr.cpp
|
||||
*/
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
#include "cstr.h"
|
||||
|
||||
#include "charstr.h"
|
||||
#include "uinvchar.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Fills the internal CharString `s` with a char* rendition of `in`.
 * With conversion support (or a UTF-8 default charset) the text is produced by
 * UnicodeString::extract(); otherwise invariant characters are converted one at
 * a time and everything else becomes '?'. Errors are not reported to the caller.
 */
CStr::CStr(const UnicodeString &in) {
    UErrorCode errorCode = U_ZERO_ERROR;
#if !UCONFIG_NO_CONVERSION || U_CHARSET_IS_UTF8
    // First pass with a null buffer of capacity 0: the return value is used as
    // the number of chars to reserve and, later, to append.
    int32_t needed = in.extract(0, in.length(), static_cast<char *>(nullptr), static_cast<uint32_t>(0));
    int32_t capacity = 0;
    char *dest = s.getAppendBuffer(needed, needed, capacity, errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }
    // Second pass writes the chars into the buffer handed out by the CharString.
    in.extract(0, in.length(), dest, capacity);
    s.append(dest, needed, errorCode);
#else
    // No conversion available. Convert any invariant characters; substitute '?' for the rest.
    // Note: can't just call u_UCharsToChars() or CharString.appendInvariantChars() on the
    //       whole string because they require that the entire input be invariant.
    char converted[2];
    int i = 0;
    while (i < in.length()) {
        const char16_t *unit = in.getBuffer()+i;
        if (!uprv_isInvariantUString(unit, 1)) {
            converted[0] = '?';
        } else {
            u_UCharsToChars(unit, converted, 1);
        }
        s.append(converted, 1, errorCode);
        // Step by one code point, so a surrogate pair yields a single output char.
        i = in.moveIndex32(i, 1);
    }
#endif
}
|
||||
|
||||
// Nothing to release by hand: the CharString member cleans up after itself.
CStr::~CStr() = default;
|
||||
|
||||
/** Returns the char data held by the internal CharString. */
const char *CStr::operator()() const {
    const char *chars = s.data();
    return chars;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
60
engine/thirdparty/icu4c/common/cstr.h
vendored
Normal file
60
engine/thirdparty/icu4c/common/cstr.h
vendored
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* File: cstr.h
|
||||
*/
|
||||
|
||||
#ifndef CSTR_H
|
||||
#define CSTR_H
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "charstr.h"
|
||||
|
||||
/**
|
||||
* ICU-internal class CStr, a small helper class to facilitate passing UnicodeStrings
|
||||
* to functions needing (const char *) strings, such as printf().
|
||||
*
|
||||
* It is intended primarily for use in debugging or in tests. Uses platform
|
||||
* default code page conversion, which will do the best job possible,
|
||||
* but may be lossy, depending on the platform.
|
||||
*
|
||||
* If no other conversion is available, use invariant conversion and substitute
|
||||
* '?' for non-invariant characters.
|
||||
*
|
||||
* Example Usage:
|
||||
* UnicodeString s = whatever;
|
||||
* printf("%s", CStr(s)());
|
||||
*
|
||||
* The explicit call to the CStr() constructor creates a temporary object.
|
||||
* Operator () on the temporary object returns a (const char *) pointer.
|
||||
* The lifetime of the (const char *) data is that of the temporary object,
|
||||
* which works well when passing it as a parameter to another function, such as printf.
|
||||
*/
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class U_COMMON_API CStr : public UMemory {
public:
    /** Converts `in` to chars and stores the result in this object. */
    CStr(const UnicodeString &in);

    ~CStr();

    /** Returns the converted chars; the pointer is only valid while this object lives. */
    const char * operator ()() const;

private:
    // Neither copying nor assignment is supported.
    CStr(const CStr &other) = delete;
    CStr &operator =(const CStr &other) = delete;

    CharString s;   // holds the converted text
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
341
engine/thirdparty/icu4c/common/cstring.cpp
vendored
Normal file
341
engine/thirdparty/icu4c/common/cstring.cpp
vendored
Normal file
|
|
@ -0,0 +1,341 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* File CSTRING.C
|
||||
*
|
||||
* @author Helena Shih
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
* Date Name Description
|
||||
* 6/18/98 hshih Created
|
||||
* 09/08/98 stephen Added include for ctype, for Mac Port
|
||||
* 11/15/99 helena Integrated S/390 IEEE changes.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
|
||||
/*
|
||||
* We hardcode case conversion for invariant characters to match our expectation
|
||||
* and the compiler execution charset.
|
||||
* This prevents problems on systems
|
||||
* - with non-default casing behavior, like Turkish system locales where
|
||||
* tolower('I') maps to dotless i and toupper('i') maps to dotted I
|
||||
* - where there are no lowercase Latin characters at all, or using different
|
||||
* codes (some old EBCDIC codepages)
|
||||
*
|
||||
* This works because the compiler usually runs on a platform where the execution
|
||||
* charset includes all of the invariant characters at their expected
|
||||
* code positions, so that the char * string literals in ICU code match
|
||||
* the char literals here.
|
||||
*
|
||||
* Note that the set of lowercase Latin letters is discontiguous in EBCDIC
|
||||
* and the set of uppercase Latin letters is discontiguous as well.
|
||||
*/
|
||||
|
||||
/* Returns true if c is one of the letters a-z or A-Z in the execution charset. */
U_CAPI UBool U_EXPORT2
uprv_isASCIILetter(char c) {
#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    /* EBCDIC letters come in three separate runs per case. */
    const bool lowercase = ('a'<=c && c<='i') || ('j'<=c && c<='r') || ('s'<=c && c<='z');
    const bool uppercase = ('A'<=c && c<='I') || ('J'<=c && c<='R') || ('S'<=c && c<='Z');
#else
    const bool lowercase = 'a'<=c && c<='z';
    const bool uppercase = 'A'<=c && c<='Z';
#endif
    return lowercase || uppercase;
}
|
||||
|
||||
/* Maps a lowercase letter a-z to its uppercase form; any other char is returned unchanged. */
U_CAPI char U_EXPORT2
uprv_toupper(char c) {
#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    /* EBCDIC lowercase letters come in three separate runs. */
    const bool lowercase = ('a'<=c && c<='i') || ('j'<=c && c<='r') || ('s'<=c && c<='z');
#else
    const bool lowercase = 'a'<=c && c<='z';
#endif
    return lowercase ? (char)(c+('A'-'a')) : c;
}
|
||||
|
||||
|
||||
#if 0
/*
 * Disabled: cstring.h maps uprv_tolower() onto uprv_asciitolower() or
 * uprv_ebcdictolower(), which keeps the amount of code that needs test
 * coverage down.
 *
 * This generic definition would probably work for most charset families,
 * not only ASCII and EBCDIC, because its #else branch makes no assumption
 * beyond contiguous uppercase letters.
 */
U_CAPI char U_EXPORT2
uprv_tolower(char c) {
#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    const bool uppercase = ('A'<=c && c<='I') || ('J'<=c && c<='R') || ('S'<=c && c<='Z');
#else
    const bool uppercase = 'A'<=c && c<='Z';
#endif
    return uppercase ? (char)(c+('a'-'A')) : c;
}
#endif
|
||||
|
||||
/* Lowercases an ASCII uppercase letter (0x41..0x5a) by adding 0x20; other values pass through. */
U_CAPI char U_EXPORT2
uprv_asciitolower(char c) {
    const bool uppercase = 0x41<=c && c<=0x5a;
    return uppercase ? (char)(c+0x20) : c;
}
|
||||
|
||||
/*
 * Lowercases an EBCDIC uppercase letter by subtracting 0x40.
 * The uppercase letters occupy three byte ranges: 0xc1..0xc9, 0xd1..0xd9 and
 * 0xe2..0xe9; every other value is returned unchanged.
 */
U_CAPI char U_EXPORT2
uprv_ebcdictolower(char c) {
    const uint8_t byte = (uint8_t)c;
    const bool uppercase =
        (0xc1<=byte && byte<=0xc9) ||
        (0xd1<=byte && byte<=0xd9) ||
        (0xe2<=byte && byte<=0xe9);
    if(!uppercase) {
        return c;
    }
    return (char)(c-0x40);
}
|
||||
|
||||
|
||||
/*
 * Lowercases a NUL-terminated string in place with uprv_tolower().
 * A null pointer is accepted and returned as is.
 * Returns the pointer that was passed in.
 */
U_CAPI char* U_EXPORT2
T_CString_toLowerCase(char* str)
{
    if (str == nullptr) {
        return str;
    }

    /* The terminator itself is also run through uprv_tolower() before the loop stops. */
    for (char *cursor = str; ; ++cursor) {
        *cursor = (char)uprv_tolower(*cursor);
        if (*cursor == 0) {
            break;
        }
    }

    return str;
}
|
||||
|
||||
/*
 * Uppercases a NUL-terminated string in place with uprv_toupper().
 * A null pointer is accepted and returned as is.
 * Returns the pointer that was passed in.
 */
U_CAPI char* U_EXPORT2
T_CString_toUpperCase(char* str)
{
    if (str == nullptr) {
        return str;
    }

    /* The terminator itself is also run through uprv_toupper() before the loop stops. */
    for (char *cursor = str; ; ++cursor) {
        *cursor = (char)uprv_toupper(*cursor);
        if (*cursor == 0) {
            break;
        }
    }

    return str;
}
|
||||
|
||||
/*
|
||||
* Takes a int32_t and fills in a char* string with that number "radix"-based.
|
||||
* Does not handle negative values (makes an empty string for them).
|
||||
* Writes at most 12 chars ("-2147483647" plus NUL).
|
||||
* Returns the length of the string (not including the NUL).
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
T_CString_integerToString(char* buffer, int32_t v, int32_t radix)
|
||||
{
|
||||
char tbuf[30];
|
||||
int32_t tbx = sizeof(tbuf);
|
||||
uint8_t digit;
|
||||
int32_t length = 0;
|
||||
uint32_t uval;
|
||||
|
||||
U_ASSERT(radix>=2 && radix<=16);
|
||||
uval = (uint32_t) v;
|
||||
if(v<0 && radix == 10) {
|
||||
/* Only in base 10 do we conside numbers to be signed. */
|
||||
uval = (uint32_t)(-v);
|
||||
buffer[length++] = '-';
|
||||
}
|
||||
|
||||
tbx = sizeof(tbuf)-1;
|
||||
tbuf[tbx] = 0; /* We are generating the digits backwards. Null term the end. */
|
||||
do {
|
||||
digit = (uint8_t)(uval % radix);
|
||||
tbuf[--tbx] = (char)(T_CString_itosOffset(digit));
|
||||
uval = uval / radix;
|
||||
} while (uval != 0);
|
||||
|
||||
/* copy converted number into user buffer */
|
||||
uprv_strcpy(buffer+length, tbuf+tbx);
|
||||
length += sizeof(tbuf) - tbx -1;
|
||||
return length;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Takes a int64_t and fills in a char* string with that number "radix"-based.
|
||||
* Writes at most 21: chars ("-9223372036854775807" plus NUL).
|
||||
* Returns the length of the string, not including the terminating NUL.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
T_CString_int64ToString(char* buffer, int64_t v, uint32_t radix)
|
||||
{
|
||||
char tbuf[30];
|
||||
int32_t tbx = sizeof(tbuf);
|
||||
uint8_t digit;
|
||||
int32_t length = 0;
|
||||
uint64_t uval;
|
||||
|
||||
U_ASSERT(radix>=2 && radix<=16);
|
||||
uval = (uint64_t) v;
|
||||
if(v<0 && radix == 10) {
|
||||
/* Only in base 10 do we conside numbers to be signed. */
|
||||
uval = (uint64_t)(-v);
|
||||
buffer[length++] = '-';
|
||||
}
|
||||
|
||||
tbx = sizeof(tbuf)-1;
|
||||
tbuf[tbx] = 0; /* We are generating the digits backwards. Null term the end. */
|
||||
do {
|
||||
digit = (uint8_t)(uval % radix);
|
||||
tbuf[--tbx] = (char)(T_CString_itosOffset(digit));
|
||||
uval = uval / radix;
|
||||
} while (uval != 0);
|
||||
|
||||
/* copy converted number into user buffer */
|
||||
uprv_strcpy(buffer+length, tbuf+tbx);
|
||||
length += sizeof(tbuf) - tbx -1;
|
||||
return length;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Parses integerString in the given radix with uprv_strtoul() and returns the
 * result converted to int32_t. The end-of-parse position is not reported.
 */
U_CAPI int32_t U_EXPORT2
T_CString_stringToInteger(const char *integerString, int32_t radix)
{
    char *parseEnd;
    const int32_t parsed = uprv_strtoul(integerString, &parseEnd, radix);
    return parsed;
}
|
||||
|
||||
/*
 * Compares two strings after lowercasing each char with uprv_tolower().
 * A null pointer sorts before any string and two null pointers are equal;
 * likewise a string that ends first sorts before the longer one.
 * Returns 0 for equal, -1 or 1 for null/length differences, otherwise the
 * difference of the first pair of lowercased chars (as unsigned values).
 */
U_CAPI int U_EXPORT2
uprv_stricmp(const char *str1, const char *str2) {
    if(str1==nullptr || str2==nullptr) {
        if(str1==str2) {
            return 0;
        }
        return str1==nullptr ? -1 : 1;
    }

    /* both strings are non-null: walk them in lockstep */
    for(;; ++str1, ++str2) {
        const unsigned char left=(unsigned char)*str1;
        const unsigned char right=(unsigned char)*str2;
        if(left==0 || right==0) {
            /* at least one string has ended */
            if(left==right) {
                return 0;
            }
            return left==0 ? -1 : 1;
        }
        const int diff=(int)(unsigned char)uprv_tolower(left)-(int)(unsigned char)uprv_tolower(right);
        if(diff!=0) {
            return diff;
        }
    }
}
|
||||
|
||||
/*
 * Like uprv_stricmp(), but looks at no more than n chars of each string.
 * Returns 0 if the strings agree over the chars examined.
 */
U_CAPI int U_EXPORT2
uprv_strnicmp(const char *str1, const char *str2, uint32_t n) {
    if(str1==nullptr || str2==nullptr) {
        /* a null pointer sorts before any string; two null pointers are equal */
        if(str1==str2) {
            return 0;
        }
        return str1==nullptr ? -1 : 1;
    }

    /* both strings are non-null: walk them in lockstep for up to n chars */
    while(n--) {
        const unsigned char left=(unsigned char)*str1;
        const unsigned char right=(unsigned char)*str2;
        if(left==0 || right==0) {
            /* at least one string has ended */
            if(left==right) {
                return 0;
            }
            return left==0 ? -1 : 1;
        }
        const int diff=(int)(unsigned char)uprv_tolower(left)-(int)(unsigned char)uprv_tolower(right);
        if(diff!=0) {
            return diff;
        }
        ++str1;
        ++str2;
    }

    /* n chars compared without finding a difference */
    return 0;
}
|
||||
|
||||
/*
 * Returns a uprv_malloc()ed copy of the NUL-terminated string src,
 * or a null pointer if the allocation fails. src must not be null.
 */
U_CAPI char* U_EXPORT2
uprv_strdup(const char *src) {
    /* byte count includes the terminating NUL */
    const size_t bytes = uprv_strlen(src) + 1;
    char *copy = (char *) uprv_malloc(bytes);
    if (copy == nullptr) {
        return copy;
    }
    uprv_memcpy(copy, src, bytes);
    return copy;
}
|
||||
|
||||
/*
 * Returns a uprv_malloc()ed, NUL-terminated copy of the first n bytes of src.
 * A negative n means "copy the whole NUL-terminated string" (see uprv_strdup()).
 * Returns a null pointer if the allocation fails.
 */
U_CAPI char* U_EXPORT2
uprv_strndup(const char *src, int32_t n) {
    if(n < 0) {
        return uprv_strdup(src);
    }

    char *copy = (char*)uprv_malloc(n+1);
    if (copy != nullptr) {
        uprv_memcpy(copy, src, n);
        copy[n] = 0;
    }
    return copy;
}
|
||||
126
engine/thirdparty/icu4c/common/cstring.h
vendored
Normal file
126
engine/thirdparty/icu4c/common/cstring.h
vendored
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* File CSTRING.H
|
||||
*
|
||||
* Contains CString interface
|
||||
*
|
||||
* @author Helena Shih
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
* Date Name Description
|
||||
* 6/17/98 hshih Created.
|
||||
* 05/03/99 stephen Changed from functions to macros.
|
||||
* 06/14/99 stephen Added icu_strncat, icu_strncmp, icu_tolower
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef CSTRING_H
|
||||
#define CSTRING_H 1
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "cmemory.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#define uprv_strcpy(dst, src) U_STANDARD_CPP_NAMESPACE strcpy(dst, src)
|
||||
#define uprv_strlen(str) U_STANDARD_CPP_NAMESPACE strlen(str)
|
||||
#define uprv_strcmp(s1, s2) U_STANDARD_CPP_NAMESPACE strcmp(s1, s2)
|
||||
#define uprv_strcat(dst, src) U_STANDARD_CPP_NAMESPACE strcat(dst, src)
|
||||
#define uprv_strchr(s, c) U_STANDARD_CPP_NAMESPACE strchr(s, c)
|
||||
#define uprv_strstr(s, c) U_STANDARD_CPP_NAMESPACE strstr(s, c)
|
||||
#define uprv_strrchr(s, c) U_STANDARD_CPP_NAMESPACE strrchr(s, c)
|
||||
#define uprv_strncpy(dst, src, size) U_STANDARD_CPP_NAMESPACE strncpy(dst, src, size)
|
||||
#define uprv_strncmp(s1, s2, n) U_STANDARD_CPP_NAMESPACE strncmp(s1, s2, n)
|
||||
#define uprv_strncat(dst, src, n) U_STANDARD_CPP_NAMESPACE strncat(dst, src, n)
|
||||
|
||||
/**
|
||||
* Is c an ASCII-repertoire letter a-z or A-Z?
|
||||
* Note: The implementation is specific to whether ICU is compiled for
|
||||
* an ASCII-based or EBCDIC-based machine. There just does not seem to be a better name for this.
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uprv_isASCIILetter(char c);
|
||||
|
||||
// NOTE: For u_asciiToUpper that takes a UChar, see ustr_imp.h
|
||||
|
||||
U_CAPI char U_EXPORT2
|
||||
uprv_toupper(char c);
|
||||
|
||||
|
||||
U_CAPI char U_EXPORT2
|
||||
uprv_asciitolower(char c);
|
||||
|
||||
U_CAPI char U_EXPORT2
|
||||
uprv_ebcdictolower(char c);
|
||||
|
||||
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
|
||||
# define uprv_tolower uprv_asciitolower
|
||||
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
|
||||
# define uprv_tolower uprv_ebcdictolower
|
||||
#else
|
||||
# error U_CHARSET_FAMILY is not valid
|
||||
#endif
|
||||
|
||||
#define uprv_strtod(source, end) U_STANDARD_CPP_NAMESPACE strtod(source, end)
|
||||
#define uprv_strtoul(str, end, base) U_STANDARD_CPP_NAMESPACE strtoul(str, end, base)
|
||||
#define uprv_strtol(str, end, base) U_STANDARD_CPP_NAMESPACE strtol(str, end, base)
|
||||
|
||||
/* Conversion from a digit to the character with radix base from 2-19 */
|
||||
/* May need to use U_UPPER_ORDINAL*/
|
||||
#define T_CString_itosOffset(a) ((a)<=9?('0'+(a)):('A'+(a)-10))
|
||||
|
||||
U_CAPI char* U_EXPORT2
|
||||
uprv_strdup(const char *src);
|
||||
|
||||
/**
|
||||
* uprv_malloc n+1 bytes, and copy n bytes from src into the new string.
|
||||
* Terminate with a null at offset n. If n is negative, works like uprv_strdup.
|
||||
* @param src
|
||||
* @param n length of the input string, not including null.
|
||||
* @return new string (owned by caller, use uprv_free to free).
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI char* U_EXPORT2
|
||||
uprv_strndup(const char *src, int32_t n);
|
||||
|
||||
U_CAPI char* U_EXPORT2
|
||||
T_CString_toLowerCase(char* str);
|
||||
|
||||
U_CAPI char* U_EXPORT2
|
||||
T_CString_toUpperCase(char* str);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
T_CString_integerToString(char *buffer, int32_t n, int32_t radix);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
T_CString_int64ToString(char *buffer, int64_t n, uint32_t radix);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
T_CString_stringToInteger(const char *integerString, int32_t radix);
|
||||
|
||||
/**
|
||||
* Case-insensitive, language-independent string comparison
|
||||
* limited to the ASCII character repertoire.
|
||||
*/
|
||||
U_CAPI int U_EXPORT2
|
||||
uprv_stricmp(const char *str1, const char *str2);
|
||||
|
||||
/**
|
||||
* Case-insensitive, language-independent string comparison
|
||||
* limited to the ASCII character repertoire.
|
||||
*/
|
||||
U_CAPI int U_EXPORT2
|
||||
uprv_strnicmp(const char *str1, const char *str2, uint32_t n);
|
||||
|
||||
#endif /* ! CSTRING_H */
|
||||
55
engine/thirdparty/icu4c/common/cwchar.cpp
vendored
Normal file
55
engine/thirdparty/icu4c/common/cwchar.cpp
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: cwchar.c
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001may25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !U_HAVE_WCSCPY
|
||||
|
||||
#include "cwchar.h"
|
||||
|
||||
/* Appends the NUL-terminated wide string src to the end of dst; returns dst. */
U_CAPI wchar_t *uprv_wcscat(wchar_t *dst, const wchar_t *src) {
    wchar_t *out=dst;
    /* find the terminator of the existing text */
    for(; *out!=0; ++out) {
    }
    /* copy src including its terminator */
    for(;; ++out, ++src) {
        *out=*src;
        if(*out==0) {
            break;
        }
    }
    return dst;
}
|
||||
|
||||
/* Copies the NUL-terminated wide string src, terminator included, into dst; returns dst. */
U_CAPI wchar_t *uprv_wcscpy(wchar_t *dst, const wchar_t *src) {
    wchar_t *out=dst;
    for(;; ++out, ++src) {
        *out=*src;
        if(*out==0) {
            break;
        }
    }
    return dst;
}
|
||||
|
||||
/* Returns the number of wide chars in src before the terminating NUL. */
U_CAPI size_t uprv_wcslen(const wchar_t *src) {
    const wchar_t *end=src;
    for(; *end!=0; ++end) {
    }
    return end-src;
}
|
||||
|
||||
#endif
|
||||
|
||||
58
engine/thirdparty/icu4c/common/cwchar.h
vendored
Normal file
58
engine/thirdparty/icu4c/common/cwchar.h
vendored
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: cwchar.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001may25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* This file contains ICU-internal definitions of wchar_t operations.
|
||||
* These definitions were moved here from cstring.h so that fewer
|
||||
* ICU implementation files include wchar.h.
|
||||
*/
|
||||
|
||||
#ifndef __CWCHAR_H__
|
||||
#define __CWCHAR_H__
|
||||
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
/* Do this after utypes.h so that we have U_HAVE_WCHAR_H . */
|
||||
#if U_HAVE_WCHAR_H
|
||||
# include <wchar.h>
|
||||
#endif
|
||||
|
||||
/*===========================================================================*/
|
||||
/* Wide-character functions */
|
||||
/*===========================================================================*/
|
||||
|
||||
/* The following are not available on all systems, defined in wchar.h or string.h. */
|
||||
#if U_HAVE_WCSCPY
|
||||
# define uprv_wcscpy wcscpy
|
||||
# define uprv_wcscat wcscat
|
||||
# define uprv_wcslen wcslen
|
||||
#else
|
||||
U_CAPI wchar_t* U_EXPORT2
|
||||
uprv_wcscpy(wchar_t *dst, const wchar_t *src);
|
||||
U_CAPI wchar_t* U_EXPORT2
|
||||
uprv_wcscat(wchar_t *dst, const wchar_t *src);
|
||||
U_CAPI size_t U_EXPORT2
|
||||
uprv_wcslen(const wchar_t *src);
|
||||
#endif
|
||||
|
||||
/* The following are part of the ANSI C standard, defined in stdlib.h . */
|
||||
#define uprv_wcstombs(mbstr, wcstr, count) U_STANDARD_CPP_NAMESPACE wcstombs(mbstr, wcstr, count)
|
||||
#define uprv_mbstowcs(wcstr, mbstr, count) U_STANDARD_CPP_NAMESPACE mbstowcs(wcstr, mbstr, count)
|
||||
|
||||
|
||||
#endif
|
||||
1503
engine/thirdparty/icu4c/common/dictbe.cpp
vendored
Normal file
1503
engine/thirdparty/icu4c/common/dictbe.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
434
engine/thirdparty/icu4c/common/dictbe.h
vendored
Normal file
434
engine/thirdparty/icu4c/common/dictbe.h
vendored
Normal file
|
|
@ -0,0 +1,434 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006-2014, International Business Machines Corporation *
|
||||
* and others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef DICTBE_H
|
||||
#define DICTBE_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/utext.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "hash.h"
|
||||
#include "mlbe.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class DictionaryMatcher;
|
||||
class MlBreakEngine;
|
||||
class Normalizer2;
|
||||
|
||||
/*******************************************************************
|
||||
* DictionaryBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
|
||||
* dictionary to determine language-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a DictionaryBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class DictionaryBreakEngine : public LanguageBreakEngine {
|
||||
private:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fSet;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Constructor </p>
|
||||
*/
|
||||
DictionaryBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~DictionaryBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character for
|
||||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param locale The locale.
|
||||
* @return true if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c, const char* locale) const override;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The iterator is left at
|
||||
* the end of the run of characters which the engine is capable of handling
|
||||
* that starts from the first character in the range.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param foundBreaks vector of int32_t to receive the break positions
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status ) const override;
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* <p>Set the character set handled by this engine.</p>
|
||||
*
|
||||
* @param set A UnicodeSet of the set of characters handled by the engine
|
||||
*/
|
||||
virtual void setCharacters( const UnicodeSet &set );
|
||||
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* ThaiBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* dictionary and heuristics to determine Thai-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a ThaiBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class ThaiBreakEngine : public DictionaryBreakEngine {
|
||||
private:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fSuffixSet;
|
||||
UnicodeSet fMarkSet;
|
||||
DictionaryMatcher *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
||||
* engine is deleted.
|
||||
*/
|
||||
ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~ThaiBreakEngine();
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* LaoBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* dictionary and heuristics to determine Lao-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a LaoBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class LaoBreakEngine : public DictionaryBreakEngine {
|
||||
private:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
DictionaryMatcher *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
||||
* engine is deleted.
|
||||
*/
|
||||
LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~LaoBreakEngine();
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* BurmeseBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a BurmeseBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class BurmeseBreakEngine : public DictionaryBreakEngine {
|
||||
private:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
DictionaryMatcher *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
||||
* engine is deleted.
|
||||
*/
|
||||
BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~BurmeseBreakEngine();
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* KhmerBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a KhmerBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class KhmerBreakEngine : public DictionaryBreakEngine {
|
||||
private:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
DictionaryMatcher *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
||||
* engine is deleted.
|
||||
*/
|
||||
KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~KhmerBreakEngine();
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/*******************************************************************
|
||||
* CjkBreakEngine
|
||||
*/
|
||||
|
||||
// Indicates the language/script that a CjkBreakEngine will handle.
enum LanguageType {
    kKorean,
    kChineseJapanese
};
|
||||
|
||||
/**
|
||||
* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* dictionary with costs associated with each word and
|
||||
* Viterbi decoding to determine CJK-specific breaks.</p>
|
||||
*/
|
||||
class CjkBreakEngine : public DictionaryBreakEngine {
|
||||
protected:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
UnicodeSet fHangulWordSet;
|
||||
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
||||
UnicodeSet fClosePunctuationSet;
|
||||
|
||||
DictionaryMatcher *fDictionary;
|
||||
const Normalizer2 *nfkcNorm2;
|
||||
MlBreakEngine *fMlBreakEngine;
|
||||
bool isCj;
|
||||
|
||||
private:
|
||||
// Load Japanese extensions.
|
||||
void loadJapaneseExtensions(UErrorCode& error);
|
||||
// Load Japanese Hiragana.
|
||||
void loadHiragana(UErrorCode& error);
|
||||
// Initialize fSkipSet by loading Japanese Hiragana and extensions.
|
||||
void initJapanesePhraseParameter(UErrorCode& error);
|
||||
|
||||
Hashtable fSkipSet;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
||||
* engine is deleted. The DictionaryMatcher must contain costs for each word
|
||||
* in order for the dictionary to work properly.
|
||||
*/
|
||||
CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~CjkBreakEngine();
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* DICTBE_H */
|
||||
#endif
|
||||
242
engine/thirdparty/icu4c/common/dictionarydata.cpp
vendored
Normal file
242
engine/thirdparty/icu4c/common/dictionarydata.cpp
vendored
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* dictionarydata.cpp
|
||||
*
|
||||
* created on: 2012may31
|
||||
* created by: Markus W. Scherer & Maxime Serrano
|
||||
*/
|
||||
|
||||
#include "dictionarydata.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Trie type: the low bits (TRIE_TYPE_MASK) select the trie kind,
// TRIE_HAS_VALUES is a separate flag bit.
const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
const int32_t DictionaryData::TRIE_TYPE_MASK = 0x7;
const int32_t DictionaryData::TRIE_HAS_VALUES = 0x8;

// Transform specification: the type lives in the bits selected by
// TRANSFORM_TYPE_MASK, the offset code point in the low 21 bits.
const int32_t DictionaryData::TRANSFORM_NONE = 0;
const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
|
||||
|
||||
// Out-of-line virtual destructor; the base class has nothing to release.
DictionaryMatcher::~DictionaryMatcher() = default;
|
||||
|
||||
// Closes the UDataMemory stored in `file`.
UCharsDictionaryMatcher::~UCharsDictionaryMatcher()
{
    udata_close(file);
}
|
||||
|
||||
// Identifies this matcher as wrapping a UCharsTrie.
int32_t UCharsDictionaryMatcher::getType() const
{
    return DictionaryData::TRIE_TYPE_UCHARS;
}
|
||||
|
||||
// Walks the UCharsTrie with the code points read from `text`, starting at the
// UText's current position, and records every dictionary word found.
//
// For each word (up to `limit` of them) the non-null output arrays receive:
//   values[i]    - the trie value of the word
//   lengths[i]   - its length in native UText index units
//   cpLengths[i] - its length in code points
// `*prefix` (if non-null) receives the number of code points consumed.
// Returns the number of words recorded.
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                                         int32_t *lengths, int32_t *cpLengths, int32_t *values,
                                         int32_t *prefix) const {
    UCharsTrie trie(characters);
    const int32_t startIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t numWords = 0;
    int32_t numCodePoints = 0;

    UChar32 cp;
    while ((cp = utext_next32(text)) >= 0) {
        // The first code point restarts the trie; later ones continue the walk.
        const UStringTrieResult step =
            (numCodePoints == 0) ? trie.first(cp) : trie.next(cp);
        const int32_t nativeLength =
            static_cast<int32_t>(utext_getNativeIndex(text)) - startIndex;
        ++numCodePoints;

        if (USTRINGTRIE_HAS_VALUE(step)) {
            if (numWords < limit) {
                if (values != nullptr) {
                    values[numWords] = trie.getValue();
                }
                if (lengths != nullptr) {
                    lengths[numWords] = nativeLength;
                }
                if (cpLengths != nullptr) {
                    cpLengths[numWords] = numCodePoints;
                }
                ++numWords;
            }
            if (step == USTRINGTRIE_FINAL_VALUE) {
                break;  // no longer word can follow
            }
        } else if (step == USTRINGTRIE_NO_MATCH) {
            break;
        }

        if (nativeLength >= maxLength) {
            break;
        }
    }

    if (prefix != nullptr) {
        *prefix = numCodePoints;
    }
    return numWords;
}
|
||||
|
||||
// Closes the UDataMemory stored in `file`.
BytesDictionaryMatcher::~BytesDictionaryMatcher()
{
    udata_close(file);
}
|
||||
|
||||
// Maps a code point to the byte value used in the BytesTrie.
//
// When the transform type is not TRANSFORM_TYPE_OFFSET the code point is
// returned unchanged. Otherwise U+200D maps to 0xFF, U+200C maps to 0xFE, and
// any other code point maps to (c - offset) when that lies in 0..0xFD;
// everything else yields U_SENTINEL.
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
    const int32_t transformType = transformConstant & DictionaryData::TRANSFORM_TYPE_MASK;
    if (transformType != DictionaryData::TRANSFORM_TYPE_OFFSET) {
        return c;
    }

    switch (c) {
    case 0x200D:
        return 0xFF;
    case 0x200C:
        return 0xFE;
    default:
        break;
    }

    const int32_t base = transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK;
    const int32_t delta = c - base;
    return (delta >= 0 && delta <= 0xFD) ? (UChar32)delta : U_SENTINEL;
}
|
||||
|
||||
// Identifies this matcher as wrapping a BytesTrie.
int32_t BytesDictionaryMatcher::getType() const
{
    return DictionaryData::TRIE_TYPE_BYTES;
}
|
||||
|
||||
// Walks the BytesTrie with the transformed code points read from `text`,
// starting at the UText's current position, and records every dictionary
// word found.
//
// For each word (up to `limit` of them) the non-null output arrays receive:
//   values[i]    - the trie value of the word
//   lengths[i]   - its length in native UText index units
//   cpLengths[i] - its length in code points
// `*prefix` (if non-null) receives the number of code points consumed.
// Returns the number of words recorded.
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                                        int32_t *lengths, int32_t *cpLengths, int32_t *values,
                                        int32_t *prefix) const {
    BytesTrie bt(characters);
    int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
    int32_t wordCount = 0;
    int32_t codePointsMatched = 0;

    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
        // transform() yields a negative sentinel for code points that have no
        // byte mapping. BytesTrie::first()/next() treat negative inputs in
        // -0x100..-1 like 0..0xff, so feeding the sentinel to the trie would
        // alias it with byte 0xFF, which is reserved for U+200D. Such a code
        // point can never be part of a dictionary word: report no match.
        UChar32 b = transform(c);
        UStringTrieResult result;
        if (b < 0) {
            result = USTRINGTRIE_NO_MATCH;
        } else if (codePointsMatched == 0) {
            result = bt.first(b);
        } else {
            result = bt.next(b);
        }
        int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
        codePointsMatched += 1;
        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (wordCount < limit) {
                if (values != nullptr) {
                    values[wordCount] = bt.getValue();
                }
                if (lengths != nullptr) {
                    lengths[wordCount] = lengthMatched;
                }
                if (cpLengths != nullptr) {
                    cpLengths[wordCount] = codePointsMatched;
                }
                ++wordCount;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;  // no longer word can follow
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }
        if (lengthMatched >= maxLength) {
            break;
        }
    }

    if (prefix != nullptr) {
        *prefix = codePointsMatched;
    }
    return wordCount;
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/**
 * Swaps dictionary (.dict) data between platform byte orders.
 *
 * The data header is swapped via udata_swapDataHeader() and must identify the
 * data as format "Dict" (0x44 0x69 0x63 0x74), format version 1. The indexes
 * array is swapped as 32-bit units; a UCharsTrie is swapped as 16-bit units,
 * while a BytesTrie needs no swapping.
 *
 * When length is negative nothing is copied or swapped; only the total size
 * is computed from the indexes.
 *
 * All offsets read from the data are validated before anything is written to
 * outData, so a malformed file is rejected instead of causing out-of-bounds
 * copies or swaps.
 *
 * @param ds         The swapper that describes the input/output formats.
 * @param inData     The data to swap, starting with the data header.
 * @param length     Length of inData in bytes, or negative (see above).
 * @param outData    Receives the swapped data; may be the same as inData.
 * @param pErrorCode In/out error code; set on failure.
 * @return headerSize + total size of the dictionary data, or 0 when this
 *         function itself detects an error.
 */
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
           void *outData, UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;
    const uint8_t *inBytes;
    uint8_t *outBytes;
    const int32_t *inIndexes;
    int32_t indexes[DictionaryData::IX_COUNT];
    int32_t i, size;
    const int32_t indexesSize = (int32_t)sizeof(indexes);

    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) return 0;
    pInfo = (const UDataInfo *)((const char *)inData + 4);
    if (!(pInfo->dataFormat[0] == 0x44 &&
          pInfo->dataFormat[1] == 0x69 &&
          pInfo->dataFormat[2] == 0x63 &&
          pInfo->dataFormat[3] == 0x74 &&
          pInfo->formatVersion[0] == 1)) {
        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes = (const uint8_t *)inData + headerSize;
    outBytes = (outData == nullptr) ? nullptr : (uint8_t *)outData + headerSize;

    inIndexes = (const int32_t *)inBytes;
    if (length >= 0) {
        length -= headerSize;
        if (length < indexesSize) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }

    size = indexes[DictionaryData::IX_TOTAL_SIZE];
    // The total size counts from the start of indexes[], so it can never be
    // smaller than the indexes themselves. Rejecting it here also keeps a
    // negative value from reaching uprv_memcpy() below.
    if (size < indexesSize) {
        udata_printError(ds, "udict_swap(): invalid total size (%d) in dictionary data indexes\n", size);
        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    if (length >= 0) {
        if (length < size) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        // Validate everything that steers the swap before touching outData.
        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
        // The string trie starts right after the indexes and ends where the
        // first reserved section begins.
        const int32_t trieLimit = indexes[DictionaryData::IX_RESERVED1_OFFSET];

        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
            if (trieLimit < indexesSize || trieLimit > size) {
                udata_printError(ds, "udict_swap(): invalid trie end offset (%d) in dictionary data indexes\n", trieLimit);
                *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
        } else if (trieType != DictionaryData::TRIE_TYPE_BYTES) {
            udata_printError(ds, "udict_swap(): unknown trie type!\n");
            *pErrorCode = U_UNSUPPORTED_ERROR;
            return 0;
        }

        if (inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);

        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
            ds->swapArray16(ds, inBytes + indexesSize, trieLimit - indexesSize,
                            outBytes + indexesSize, pErrorCode);
        }
        // A BytesTrie needs no swapping. The two reserved sections that
        // follow the trie are empty in the current format.
    }
    return headerSize + size;
}
|
||||
#endif
|
||||
191
engine/thirdparty/icu4c/common/dictionarydata.h
vendored
Normal file
191
engine/thirdparty/icu4c/common/dictionarydata.h
vendored
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* dictionarydata.h
|
||||
*
|
||||
* created on: 2012may31
|
||||
* created by: Markus W. Scherer & Maxime Serrano
|
||||
*/
|
||||
|
||||
#ifndef __DICTIONARYDATA_H__
|
||||
#define __DICTIONARYDATA_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/utext.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "udataswp.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/ustringtrie.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UCharsTrie;
|
||||
class BytesTrie;
|
||||
|
||||
// Constants describing the layout and flags of dictionary (.dict) data.
class U_COMMON_API DictionaryData : public UMemory {
public:
    // Trie type values and flags (stored in indexes[IX_TRIE_TYPE]).
    static const int32_t TRIE_TYPE_BYTES; // = 0;
    static const int32_t TRIE_TYPE_UCHARS; // = 1;
    static const int32_t TRIE_TYPE_MASK; // = 7;
    static const int32_t TRIE_HAS_VALUES; // = 8;

    // Transform specification values and masks (stored in indexes[IX_TRANSFORM]).
    static const int32_t TRANSFORM_NONE; // = 0;
    static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000;
    static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000;
    static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff;

    // Positions within the indexes[] array that follows the generic header.
    enum {
        // Byte offsets from the start of the data, after the generic header.
        IX_STRING_TRIE_OFFSET,
        IX_RESERVED1_OFFSET,
        IX_RESERVED2_OFFSET,
        IX_TOTAL_SIZE,

        // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.
        IX_TRIE_TYPE,
        // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.
        IX_TRANSFORM,

        IX_RESERVED6,
        IX_RESERVED7,
        IX_COUNT
    };
};
|
||||
|
||||
/**
|
||||
* Wrapper class around generic dictionaries, implementing matches().
|
||||
* getType() should return a TRIE_TYPE_??? constant from DictionaryData.
|
||||
*
|
||||
* All implementations of this interface must be thread-safe if they are to be used inside of the
|
||||
* dictionary-based break iteration code.
|
||||
*/
|
||||
class U_COMMON_API DictionaryMatcher : public UMemory {
public:
    DictionaryMatcher() {}
    virtual ~DictionaryMatcher();

    // this should emulate CompactTrieDictionary::matches()
    /**
     * Look for dictionary words in the text.
     *
     * @param text      The text in which to look for matching words. Matching
     *                  begins at the current position of the UText.
     * @param maxLength The max length of match to consider. Units are the
     *                  native indexing units of the UText.
     * @param limit     Capacity of output arrays, which is also the maximum
     *                  number of matching words to be found.
     * @param lengths   Output array, filled with the lengths of the matches,
     *                  in order, from shortest to longest. Lengths are in
     *                  native indexing units of the UText. May be nullptr.
     * @param cpLengths Output array, filled with the lengths of the matches,
     *                  in order, from shortest to longest. Lengths are the
     *                  number of Unicode code points. May be nullptr.
     * @param values    Output array, filled with the values associated with
     *                  the words found. May be nullptr.
     * @param prefix    Output parameter, the code point length of the prefix
     *                  match, even if that prefix didn't lead to a complete
     *                  word. Will always be >= the cpLength of the longest
     *                  complete word matched. May be nullptr.
     * @return Number of matching words found.
     */
    virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const = 0;

    /** @return DictionaryData::TRIE_TYPE_XYZ */
    virtual int32_t getType() const = 0;
};
|
||||
|
||||
// Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary
|
||||
class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {
public:
    // Constructs a new UCharsDictionaryMatcher over the serialized trie `c`.
    // The UDataMemory * will be closed on this object's destruction.
    UCharsDictionaryMatcher(const char16_t *c, UDataMemory *f)
        : characters(c), file(f) {}

    virtual ~UCharsDictionaryMatcher();

    // See DictionaryMatcher::matches().
    virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const override;

    // Returns DictionaryData::TRIE_TYPE_UCHARS.
    virtual int32_t getType() const override;

private:
    const char16_t *characters;  // serialized trie handed to UCharsTrie
    UDataMemory *file;           // closed by the destructor
};
|
||||
|
||||
// Implementation of the DictionaryMatcher interface for a BytesTrie dictionary
|
||||
class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {
public:
    // Constructs a new BytesDictionaryMatcher over the serialized trie `c`.
    // The transform constant should be the constant read from the file, not a
    // masked version!
    // The UDataMemory * fed in here will be closed on this object's destruction.
    BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)
        : characters(c), transformConstant(t), file(f) {}

    virtual ~BytesDictionaryMatcher();

    // See DictionaryMatcher::matches().
    virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const override;

    // Returns DictionaryData::TRIE_TYPE_BYTES.
    virtual int32_t getType() const override;

private:
    // Maps a code point to the byte value used in the trie.
    UChar32 transform(UChar32 c) const;

    const char *characters;     // serialized trie handed to BytesTrie
    int32_t transformConstant;  // unmasked transform specification
    UDataMemory *file;          // closed by the destructor
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Format of dictionary .dict data files.
|
||||
* Format version 1.0.
|
||||
*
|
||||
* A dictionary .dict data file contains a byte-serialized BytesTrie or
|
||||
* a UChars-serialized UCharsTrie.
|
||||
* Such files are used in dictionary-based break iteration (DBBI).
|
||||
*
|
||||
* For a BytesTrie, a transformation type is specified for
|
||||
* transforming Unicode strings into byte sequences.
|
||||
*
|
||||
* A .dict file begins with a standard ICU data file header
|
||||
* (DataHeader, see ucmndata.h and unicode/udata.h).
|
||||
* The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).
|
||||
*
|
||||
* After the header, the file contains the following parts.
|
||||
* Constants are defined in the DictionaryData class.
|
||||
*
|
||||
* For the data structure of BytesTrie & UCharsTrie see
|
||||
* https://icu.unicode.org/design/struct/tries
|
||||
* and the bytestrie.h and ucharstrie.h header files.
|
||||
*
|
||||
* int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;
|
||||
*
|
||||
* The first four indexes are byte offsets in ascending order.
|
||||
* Each byte offset marks the start of the next part in the data file,
|
||||
* and the end of the previous one.
|
||||
* When two consecutive byte offsets are the same, then the corresponding part is empty.
|
||||
* Byte offsets are offsets from after the header,
|
||||
* that is, from the beginning of the indexes[].
|
||||
* Each part starts at an offset with proper alignment for its data.
|
||||
* If necessary, the previous part may include padding bytes to achieve this alignment.
|
||||
*
|
||||
* trieType=indexes[IX_TRIE_TYPE] defines the trie type.
|
||||
* transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.
|
||||
* If the transformation type is TRANSFORM_TYPE_OFFSET,
|
||||
* then the lower 21 bits contain the offset code point.
|
||||
* Each code point c is mapped to byte b = (c - offset).
|
||||
* Code points outside the range offset..(offset+0xfd) cannot be mapped (bytes 0xfe and 0xff are reserved for U+200C and U+200D)
|
||||
* and do not occur in the dictionary.
|
||||
*
|
||||
* stringTrie; -- a serialized BytesTrie or UCharsTrie
|
||||
*
|
||||
* The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),
|
||||
* or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).
|
||||
*/
|
||||
|
||||
#endif /* !UCONFIG_NO_BREAK_ITERATION */
|
||||
#endif /* __DICTIONARYDATA_H__ */
|
||||
63
engine/thirdparty/icu4c/common/dtintrv.cpp
vendored
Normal file
63
engine/thirdparty/icu4c/common/dtintrv.cpp
vendored
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2008, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*
|
||||
* File DTINTRV.CPP
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include "unicode/dtintrv.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DateInterval)
|
||||
|
||||
//DateInterval::DateInterval(){}
|
||||
|
||||
|
||||
// Builds an interval from its two endpoint dates.
DateInterval::DateInterval(UDate from, UDate to) : fromDate(from), toDate(to) {
}
|
||||
|
||||
|
||||
// Nothing to release: the interval only holds two dates.
DateInterval::~DateInterval() = default;
|
||||
|
||||
|
||||
// Copy constructor: duplicates both endpoint dates of `other`.
DateInterval::DateInterval(const DateInterval& other)
    : UObject(other),
      fromDate(other.fromDate),
      toDate(other.toDate) {
}
|
||||
|
||||
|
||||
// Copy assignment: takes over both endpoint dates of `other`.
DateInterval&
DateInterval::operator=(const DateInterval& other) {
    if (this == &other) {
        return *this;  // self-assignment: nothing to do
    }
    fromDate = other.fromDate;
    toDate = other.toDate;
    return *this;
}
|
||||
|
||||
|
||||
// Returns a newly allocated copy of this interval.
DateInterval*
DateInterval::clone() const {
    DateInterval *copy = new DateInterval(*this);
    return copy;
}
|
||||
|
||||
|
||||
bool
|
||||
DateInterval::operator==(const DateInterval& other) const {
|
||||
return ( fromDate == other.fromDate && toDate == other.toDate );
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
804
engine/thirdparty/icu4c/common/edits.cpp
vendored
Normal file
804
engine/thirdparty/icu4c/common/edits.cpp
vendored
Normal file
|
|
@ -0,0 +1,804 @@
|
|||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// edits.cpp
|
||||
// created: 2017feb08 Markus W. Scherer
|
||||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
#include "util.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {

// Unit layout 0000uuuuuuuuuuuu: a run of u+1 unchanged text units.
constexpr int32_t MAX_UNCHANGED_LENGTH = 0x1000;
constexpr int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;

// Unit layout 0mmmnnnccccccccc with m=1..6: ccc+1 consecutive replacements,
// each of m old text units by n new text units.
constexpr int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;
constexpr int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;
constexpr int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;
constexpr int32_t MAX_SHORT_CHANGE = 0x6fff;

// Unit layout 0111mmmmmmnnnnnn: one replacement of m text units with n.
//   m or n = 61:     the actual length follows in the next array unit.
//   m or n = 62..63: the actual length follows in the next two array units,
//                    and bit 30 of the actual length is kept in the head unit.
// Trailing units always have bit 15 set.
constexpr int32_t LENGTH_IN_1TRAIL = 61;
constexpr int32_t LENGTH_IN_2TRAIL = 62;

}  // namespace
|
||||
|
||||
void Edits::releaseArray() noexcept {
|
||||
if (array != stackArray) {
|
||||
uprv_free(array);
|
||||
}
|
||||
}
|
||||
|
||||
// Copies the array contents of `other` into this object.
// Expects length/delta/numChanges/errorCode_ to have been set already (the
// code reads `length` and `errorCode_` before touching `other`).
// - If this object carries an error, the counters are zeroed and nothing is copied.
// - If the current capacity is too small, a new array of exactly `length`
//   units is allocated; on allocation failure the counters are zeroed and
//   errorCode_ becomes U_MEMORY_ALLOCATION_ERROR.
// Returns *this.
Edits &Edits::copyArray(const Edits &other) {
    if (U_FAILURE(errorCode_)) {
        length = delta = numChanges = 0;
        return *this;
    }
    const size_t numBytes = (size_t)length * 2;  // two bytes per uint16_t unit
    if (length > capacity) {
        uint16_t *fresh = (uint16_t *)uprv_malloc(numBytes);
        if (fresh == nullptr) {
            length = delta = numChanges = 0;
            errorCode_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        releaseArray();
        array = fresh;
        capacity = length;
    }
    if (length > 0) {
        uprv_memcpy(array, other.array, numBytes);
    }
    return *this;
}
|
||||
|
||||
// Takes over the array contents of `src`.
// Expects length/delta/numChanges/errorCode_ to have been set already.
// - If this object carries an error, the counters are zeroed and `src` is left alone.
// - Contents that fit into the stack array are copied there.
// - Longer contents are moved by adopting src's array pointer; `src` is then
//   switched back to its own stack array and reset().
// Returns *this.
Edits &Edits::moveArray(Edits &src) noexcept {
    if (U_FAILURE(errorCode_)) {
        length = delta = numChanges = 0;
        return *this;
    }
    releaseArray();
    if (length <= STACK_CAPACITY) {
        // Small enough: keep the data in our own stack array.
        array = stackArray;
        capacity = STACK_CAPACITY;
        if (length > 0) {
            uprv_memcpy(array, src.array, (size_t)length * 2);
        }
        return *this;
    }
    // Adopt src's array and leave src empty.
    array = src.array;
    capacity = src.capacity;
    src.array = src.stackArray;
    src.capacity = STACK_CAPACITY;
    src.reset();
    return *this;
}
|
||||
|
||||
// Copy assignment: copies the counters and error code of `other`, then lets
// copyArray() duplicate the edits array. Self-assignment is a no-op.
Edits &Edits::operator=(const Edits &other) {
    if (this != &other) {
        errorCode_ = other.errorCode_;
        numChanges = other.numChanges;
        delta = other.delta;
        length = other.length;
        copyArray(other);
    }
    return *this;
}
|
||||
|
||||
// Move assignment: copies the counters and error code of `src`, then lets
// moveArray() either copy short contents into the stack array or adopt src's
// heap array (in which case `src` is reset).
// Returns *this.
Edits &Edits::operator=(Edits &&src) noexcept {
    // Self-move must be a no-op: moveArray() releases this object's array
    // before it reads src.array, so moving an object onto itself would throw
    // away (or copy from) storage that has already been released.
    // This matches the self-assignment guard in the copy assignment operator.
    if (this == &src) { return *this; }
    length = src.length;
    delta = src.delta;
    numChanges = src.numChanges;
    errorCode_ = src.errorCode_;
    return moveArray(src);
}
|
||||
|
||||
// Destructor: frees the edits array if it is not the built-in stack array.
Edits::~Edits() {
    if (array != stackArray) {
        uprv_free(array);
    }
}
|
||||
|
||||
// Clears all recorded edits and any stored error.
// The array pointer and capacity are left as they are.
void Edits::reset() noexcept {
    errorCode_ = U_ZERO_ERROR;
    numChanges = 0;
    delta = 0;
    length = 0;
}
|
||||
|
||||
void Edits::addUnchanged(int32_t unchangedLength) {
|
||||
if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; }
|
||||
if(unchangedLength < 0) {
|
||||
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
// Merge into previous unchanged-text record, if any.
|
||||
int32_t last = lastUnit();
|
||||
if(last < MAX_UNCHANGED) {
|
||||
int32_t remaining = MAX_UNCHANGED - last;
|
||||
if (remaining >= unchangedLength) {
|
||||
setLastUnit(last + unchangedLength);
|
||||
return;
|
||||
}
|
||||
setLastUnit(MAX_UNCHANGED);
|
||||
unchangedLength -= remaining;
|
||||
}
|
||||
// Split large lengths into multiple units.
|
||||
while(unchangedLength >= MAX_UNCHANGED_LENGTH) {
|
||||
append(MAX_UNCHANGED);
|
||||
unchangedLength -= MAX_UNCHANGED_LENGTH;
|
||||
}
|
||||
// Write a small (remaining) length.
|
||||
if(unchangedLength > 0) {
|
||||
append(unchangedLength - 1);
|
||||
}
|
||||
}
|
||||
|
||||
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
|
||||
if(U_FAILURE(errorCode_)) { return; }
|
||||
if(oldLength < 0 || newLength < 0) {
|
||||
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (oldLength == 0 && newLength == 0) {
|
||||
return;
|
||||
}
|
||||
++numChanges;
|
||||
int32_t newDelta = newLength - oldLength;
|
||||
if (newDelta != 0) {
|
||||
if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
|
||||
(newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) {
|
||||
// Integer overflow or underflow.
|
||||
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
delta += newDelta;
|
||||
}
|
||||
|
||||
if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
|
||||
newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
|
||||
// Merge into previous same-lengths short-replacement record, if any.
|
||||
int32_t u = (oldLength << 12) | (newLength << 9);
|
||||
int32_t last = lastUnit();
|
||||
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
|
||||
(last & ~SHORT_CHANGE_NUM_MASK) == u &&
|
||||
(last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
|
||||
setLastUnit(last + 1);
|
||||
return;
|
||||
}
|
||||
append(u);
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t head = 0x7000;
|
||||
if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
|
||||
head |= oldLength << 6;
|
||||
head |= newLength;
|
||||
append(head);
|
||||
} else if ((capacity - length) >= 5 || growArray()) {
|
||||
int32_t limit = length + 1;
|
||||
if(oldLength < LENGTH_IN_1TRAIL) {
|
||||
head |= oldLength << 6;
|
||||
} else if(oldLength <= 0x7fff) {
|
||||
head |= LENGTH_IN_1TRAIL << 6;
|
||||
array[limit++] = (uint16_t)(0x8000 | oldLength);
|
||||
} else {
|
||||
head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
|
||||
array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15));
|
||||
array[limit++] = (uint16_t)(0x8000 | oldLength);
|
||||
}
|
||||
if(newLength < LENGTH_IN_1TRAIL) {
|
||||
head |= newLength;
|
||||
} else if(newLength <= 0x7fff) {
|
||||
head |= LENGTH_IN_1TRAIL;
|
||||
array[limit++] = (uint16_t)(0x8000 | newLength);
|
||||
} else {
|
||||
head |= LENGTH_IN_2TRAIL + (newLength >> 30);
|
||||
array[limit++] = (uint16_t)(0x8000 | (newLength >> 15));
|
||||
array[limit++] = (uint16_t)(0x8000 | newLength);
|
||||
}
|
||||
array[length] = (uint16_t)head;
|
||||
length = limit;
|
||||
}
|
||||
}
|
||||
|
||||
void Edits::append(int32_t r) {
|
||||
if(length < capacity || growArray()) {
|
||||
array[length++] = (uint16_t)r;
|
||||
}
|
||||
}
|
||||
|
||||
// Replaces the edits array with a larger heap array and copies the existing
// units over. Returns true on success.
// On failure returns false and sets errorCode_:
// - U_INDEX_OUTOFBOUNDS_ERROR if the capacity cannot grow by at least 5 units,
// - U_MEMORY_ALLOCATION_ERROR if the allocation fails.
UBool Edits::growArray() {
    int32_t grown;
    if (array == stackArray) {
        grown = 2000;
    } else {
        if (capacity == INT32_MAX) {
            // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
            // with a result-string-buffer overflow.
            errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
            return false;
        }
        grown = (capacity >= (INT32_MAX / 2)) ? INT32_MAX : 2 * capacity;
    }
    // A maximal change record needs 5 units, so require at least that much growth.
    if ((grown - capacity) < 5) {
        errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }
    uint16_t *bigger = (uint16_t *)uprv_malloc((size_t)grown * 2);
    if (bigger == nullptr) {
        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    uprv_memcpy(bigger, array, (size_t)length * 2);
    releaseArray();
    array = bigger;
    capacity = grown;
    return true;
}
|
||||
|
||||
// Propagates this object's stored error into `outErrorCode`.
// Returns true if `outErrorCode` indicates a failure afterwards: either it did
// already on entry (then it is left untouched), or this object's error was
// copied into it. Returns false if neither side has an error.
UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const {
    if (U_SUCCESS(outErrorCode)) {
        if (U_SUCCESS(errorCode_)) {
            return false;
        }
        outErrorCode = errorCode_;
    }
    return true;
}
|
||||
|
||||
// Composes two edit sequences and appends the result to this object.
// `ab` describes edits from string a to string b, `bc` edits from string b to
// string c; the records added here (via addReplace()/addUnchanged()) describe
// the combined edits from a to c.
//
// Returns *this in all cases.
// - If `errorCode` or this object's stored error already indicates a failure
//   (copyErrorTo()), nothing is done.
// - If the two inputs disagree on the length of the intermediate string b,
//   `errorCode` is set to U_ILLEGAL_ARGUMENT_ERROR, unless copyErrorTo()
//   already reports an error. Records appended before the mismatch was
//   detected are not removed.
// - Before returning normally, any error stored in this object is copied to
//   `errorCode`.
Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) {
|
||||
if (copyErrorTo(errorCode)) { return *this; }
|
||||
// Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
|
||||
// Parallel iteration over both Edits.
|
||||
Iterator abIter = ab.getFineIterator();
|
||||
Iterator bcIter = bc.getFineIterator();
|
||||
UBool abHasNext = true, bcHasNext = true;
|
||||
// Copy iterator state into local variables, so that we can modify and subdivide spans.
|
||||
// ab old & new length, bc old & new length
|
||||
int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
|
||||
// When we have different-intermediate-length changes, we accumulate a larger change.
|
||||
int32_t pending_aLength = 0, pending_cLength = 0;
|
||||
for (;;) {
|
||||
// At this point, for each of the two iterators:
|
||||
// Either we are done with the locally cached current edit,
|
||||
// and its intermediate-string length has been reset,
|
||||
// or we will continue to work with a truncated remainder of this edit.
|
||||
//
|
||||
// If the current edit is done, and the iterator has not yet reached the end,
|
||||
// then we fetch the next edit. This is true for at least one of the iterators.
|
||||
//
|
||||
// Normally it does not matter whether we fetch from ab and then bc or vice versa.
|
||||
// However, the result is observably different when
|
||||
// ab deletions meet bc insertions at the same intermediate-string index.
|
||||
// Some users expect the bc insertions to come first, so we fetch from bc first.
|
||||
if (bc_bLength == 0) {
|
||||
if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) {
|
||||
bc_bLength = bcIter.oldLength();
|
||||
cLength = bcIter.newLength();
|
||||
if (bc_bLength == 0) {
|
||||
// insertion
|
||||
if (ab_bLength == 0 || !abIter.hasChange()) {
|
||||
addReplace(pending_aLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
} else {
|
||||
pending_cLength += cLength;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// else see if the other iterator is done, too.
|
||||
}
|
||||
if (ab_bLength == 0) {
|
||||
if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) {
|
||||
aLength = abIter.oldLength();
|
||||
ab_bLength = abIter.newLength();
|
||||
if (ab_bLength == 0) {
|
||||
// deletion
|
||||
if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
|
||||
addReplace(pending_aLength + aLength, pending_cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
} else {
|
||||
pending_aLength += aLength;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
} else if (bc_bLength == 0) {
|
||||
// Both iterators are done at the same time:
|
||||
// The intermediate-string lengths match.
|
||||
break;
|
||||
} else {
|
||||
// The ab output string is shorter than the bc input string.
|
||||
if (!copyErrorTo(errorCode)) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
if (bc_bLength == 0) {
|
||||
// The bc input string is shorter than the ab output string.
|
||||
if (!copyErrorTo(errorCode)) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
// Done fetching: ab_bLength > 0 && bc_bLength > 0
|
||||
|
||||
// The current state has two parts:
|
||||
// - Past: We accumulate a longer ac edit in the "pending" variables.
|
||||
// - Current: We have copies of the current ab/bc edits in local variables.
|
||||
// At least one side is newly fetched.
|
||||
// One side might be a truncated remainder of an edit we fetched earlier.
|
||||
|
||||
if (!abIter.hasChange() && !bcIter.hasChange()) {
|
||||
// An unchanged span all the way from string a to string c.
|
||||
if (pending_aLength != 0 || pending_cLength != 0) {
|
||||
addReplace(pending_aLength, pending_cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
}
|
||||
int32_t unchangedLength = aLength <= cLength ? aLength : cLength;
|
||||
addUnchanged(unchangedLength);
|
||||
ab_bLength = aLength -= unchangedLength;
|
||||
bc_bLength = cLength -= unchangedLength;
|
||||
// At least one of the unchanged spans is now empty.
|
||||
continue;
|
||||
}
|
||||
if (!abIter.hasChange() && bcIter.hasChange()) {
|
||||
// Unchanged a->b but changed b->c.
|
||||
if (ab_bLength >= bc_bLength) {
|
||||
// Split the longer unchanged span into change + remainder.
|
||||
addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
aLength = ab_bLength -= bc_bLength;
|
||||
bc_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
// Handle the shorter unchanged span below like a change.
|
||||
} else if (abIter.hasChange() && !bcIter.hasChange()) {
|
||||
// Changed a->b and then unchanged b->c.
|
||||
if (ab_bLength <= bc_bLength) {
|
||||
// Split the longer unchanged span into change + remainder.
|
||||
addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
cLength = bc_bLength -= ab_bLength;
|
||||
ab_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
// Handle the shorter unchanged span below like a change.
|
||||
} else { // both abIter.hasChange() && bcIter.hasChange()
|
||||
if (ab_bLength == bc_bLength) {
|
||||
// Changes on both sides up to the same position. Emit & reset.
|
||||
addReplace(pending_aLength + aLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
ab_bLength = bc_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Accumulate the a->c change, reset the shorter side,
|
||||
// keep a remainder of the longer one.
|
||||
pending_aLength += aLength;
|
||||
pending_cLength += cLength;
|
||||
if (ab_bLength < bc_bLength) {
|
||||
bc_bLength -= ab_bLength;
|
||||
cLength = ab_bLength = 0;
|
||||
} else { // ab_bLength > bc_bLength
|
||||
ab_bLength -= bc_bLength;
|
||||
aLength = bc_bLength = 0;
|
||||
}
|
||||
}
|
||||
if (pending_aLength != 0 || pending_cLength != 0) {
|
||||
addReplace(pending_aLength, pending_cLength);
|
||||
}
|
||||
copyErrorTo(errorCode);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Creates an iterator over `len` units of the edits array `a`.
// `oc` is stored as onlyChanges_ and `crs` as coarse; the iterator starts with
// no direction, no current span and all string indexes at 0.
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs)
    : array(a),
      index(0),
      length(len),
      remaining(0),
      onlyChanges_(oc),
      coarse(crs),
      dir(0),
      changed(false),
      oldLength_(0),
      newLength_(0),
      srcIndex(0),
      replIndex(0),
      destIndex(0) {
}
|
||||
|
||||
int32_t Edits::Iterator::readLength(int32_t head) {
|
||||
if (head < LENGTH_IN_1TRAIL) {
|
||||
return head;
|
||||
} else if (head < LENGTH_IN_2TRAIL) {
|
||||
U_ASSERT(index < length);
|
||||
U_ASSERT(array[index] >= 0x8000);
|
||||
return array[index++] & 0x7fff;
|
||||
} else {
|
||||
U_ASSERT((index + 2) <= length);
|
||||
U_ASSERT(array[index] >= 0x8000);
|
||||
U_ASSERT(array[index + 1] >= 0x8000);
|
||||
int32_t len = ((head & 1) << 30) |
|
||||
((int32_t)(array[index] & 0x7fff) << 15) |
|
||||
(array[index + 1] & 0x7fff);
|
||||
index += 2;
|
||||
return len;
|
||||
}
|
||||
}
|
||||
|
||||
void Edits::Iterator::updateNextIndexes() {
|
||||
srcIndex += oldLength_;
|
||||
if (changed) {
|
||||
replIndex += newLength_;
|
||||
}
|
||||
destIndex += newLength_;
|
||||
}
|
||||
|
||||
void Edits::Iterator::updatePreviousIndexes() {
|
||||
srcIndex -= oldLength_;
|
||||
if (changed) {
|
||||
replIndex -= newLength_;
|
||||
}
|
||||
destIndex -= newLength_;
|
||||
}
|
||||
|
||||
// Puts the iterator into the "no span" state used before or beyond the
// string: no direction, no change, zero-length span. Always returns false so
// that callers can write `return noNext();`.
UBool Edits::Iterator::noNext() {
    oldLength_ = 0;
    newLength_ = 0;
    changed = false;
    dir = 0;
    return false;
}
|
||||
|
||||
// Advances the iterator to the next span and fills in changed, oldLength_ and
// newLength_ for it.
//
// onlyChanges: if true, a run of unchanged units is skipped (the string
//              indexes are still advanced over it) and the following change
//              is returned instead.
// errorCode:   if it already indicates a failure, the function returns false
//              immediately; this function does not set it.
//
// Returns true if a span was produced, false at the end of the array
// (via noNext()) or on a pre-existing error.
//
// Fine-grained (non-coarse) iteration splits a compressed short-change unit
// into its individual changes; `remaining` counts how many of them are left.
// Coarse iteration merges all adjacent change records into one span.
UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
|
||||
// Forward iteration: Update the string indexes to the limit of the current span,
|
||||
// and post-increment-read array units to assemble a new span.
|
||||
// Leaves the array index one after the last unit of that span.
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
// We have an errorCode in case we need to start guarding against integer overflows.
|
||||
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
|
||||
if (dir > 0) {
|
||||
updateNextIndexes();
|
||||
} else {
|
||||
if (dir < 0) {
|
||||
// Turn around from previous() to next().
|
||||
// Post-increment-read the same span again.
|
||||
if (remaining > 0) {
|
||||
// Fine-grained iterator:
|
||||
// Stay on the current one of a sequence of compressed changes.
|
||||
++index; // next() rests on the index after the sequence unit.
|
||||
dir = 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
dir = 1;
|
||||
}
|
||||
if (remaining >= 1) {
|
||||
// Fine-grained iterator: Continue a sequence of compressed changes.
|
||||
if (remaining > 1) {
|
||||
--remaining;
|
||||
return true;
|
||||
}
|
||||
remaining = 0;
|
||||
}
|
||||
if (index >= length) {
|
||||
return noNext();
|
||||
}
|
||||
int32_t u = array[index++];
|
||||
if (u <= MAX_UNCHANGED) {
|
||||
// Combine adjacent unchanged ranges.
|
||||
changed = false;
|
||||
oldLength_ = u + 1;
|
||||
while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
|
||||
++index;
|
||||
oldLength_ += u + 1;
|
||||
}
|
||||
newLength_ = oldLength_;
|
||||
if (onlyChanges) {
|
||||
updateNextIndexes();
|
||||
if (index >= length) {
|
||||
return noNext();
|
||||
}
|
||||
// already fetched u > MAX_UNCHANGED at index
|
||||
++index;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
changed = true;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t oldLen = u >> 12;
|
||||
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
if (coarse) {
|
||||
oldLength_ = num * oldLen;
|
||||
newLength_ = num * newLen;
|
||||
} else {
|
||||
// Split a sequence of changes that was compressed into one unit.
|
||||
oldLength_ = oldLen;
|
||||
newLength_ = newLen;
|
||||
if (num > 1) {
|
||||
remaining = num; // This is the first of two or more changes.
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
U_ASSERT(u <= 0x7fff);
|
||||
oldLength_ = readLength((u >> 6) & 0x3f);
|
||||
newLength_ = readLength(u & 0x3f);
|
||||
if (!coarse) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Combine adjacent changes.
|
||||
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
|
||||
++index;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
oldLength_ += (u >> 12) * num;
|
||||
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
|
||||
} else {
|
||||
U_ASSERT(u <= 0x7fff);
|
||||
oldLength_ += readLength((u >> 6) & 0x3f);
|
||||
newLength_ += readLength(u & 0x3f);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Moves the iterator back to the preceding span and fills in changed,
// oldLength_ and newLength_ for it; the string indexes are moved to the start
// of that span.
//
// errorCode: if it already indicates a failure, the function returns false
//            immediately; this function does not set it.
//
// Returns true if a span was produced, false when the start of the array has
// been reached (via noNext()) or on a pre-existing error.
//
// Unlike next(), there is no onlyChanges handling: unchanged spans are always
// returned (see the comment in the unchanged-range branch).
// When a long-form change is reached from behind, the code first walks back
// over its trailing units (values above 0x7fff) to find the head unit.
UBool Edits::Iterator::previous(UErrorCode &errorCode) {
|
||||
// Backward iteration: Pre-decrement-read array units to assemble a new span,
|
||||
// then update the string indexes to the start of that span.
|
||||
// Leaves the array index on the head unit of that span.
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
// We have an errorCode in case we need to start guarding against integer overflows.
|
||||
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
|
||||
if (dir >= 0) {
|
||||
if (dir > 0) {
|
||||
// Turn around from next() to previous().
|
||||
// Set the string indexes to the span limit and
|
||||
// pre-decrement-read the same span again.
|
||||
if (remaining > 0) {
|
||||
// Fine-grained iterator:
|
||||
// Stay on the current one of a sequence of compressed changes.
|
||||
--index; // previous() rests on the sequence unit.
|
||||
dir = -1;
|
||||
return true;
|
||||
}
|
||||
updateNextIndexes();
|
||||
}
|
||||
dir = -1;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
// Fine-grained iterator: Continue a sequence of compressed changes.
|
||||
int32_t u = array[index];
|
||||
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
|
||||
if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
|
||||
++remaining;
|
||||
updatePreviousIndexes();
|
||||
return true;
|
||||
}
|
||||
remaining = 0;
|
||||
}
|
||||
if (index <= 0) {
|
||||
return noNext();
|
||||
}
|
||||
int32_t u = array[--index];
|
||||
if (u <= MAX_UNCHANGED) {
|
||||
// Combine adjacent unchanged ranges.
|
||||
changed = false;
|
||||
oldLength_ = u + 1;
|
||||
while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
|
||||
--index;
|
||||
oldLength_ += u + 1;
|
||||
}
|
||||
newLength_ = oldLength_;
|
||||
// No need to handle onlyChanges as long as previous() is called only from findIndex().
|
||||
updatePreviousIndexes();
|
||||
return true;
|
||||
}
|
||||
changed = true;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t oldLen = u >> 12;
|
||||
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
if (coarse) {
|
||||
oldLength_ = num * oldLen;
|
||||
newLength_ = num * newLen;
|
||||
} else {
|
||||
// Split a sequence of changes that was compressed into one unit.
|
||||
oldLength_ = oldLen;
|
||||
newLength_ = newLen;
|
||||
if (num > 1) {
|
||||
remaining = 1; // This is the last of two or more changes.
|
||||
}
|
||||
updatePreviousIndexes();
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (u <= 0x7fff) {
|
||||
// The change is encoded in u alone.
|
||||
oldLength_ = readLength((u >> 6) & 0x3f);
|
||||
newLength_ = readLength(u & 0x3f);
|
||||
} else {
|
||||
// Back up to the head of the change, read the lengths,
|
||||
// and reset the index to the head again.
|
||||
U_ASSERT(index > 0);
|
||||
while ((u = array[--index]) > 0x7fff) {}
|
||||
U_ASSERT(u > MAX_SHORT_CHANGE);
|
||||
int32_t headIndex = index++;
|
||||
oldLength_ = readLength((u >> 6) & 0x3f);
|
||||
newLength_ = readLength(u & 0x3f);
|
||||
index = headIndex;
|
||||
}
|
||||
if (!coarse) {
|
||||
updatePreviousIndexes();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Combine adjacent changes.
|
||||
while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
|
||||
--index;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
oldLength_ += (u >> 12) * num;
|
||||
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
|
||||
} else if (u <= 0x7fff) {
|
||||
// Read the lengths, and reset the index to the head again.
|
||||
int32_t headIndex = index++;
|
||||
oldLength_ += readLength((u >> 6) & 0x3f);
|
||||
newLength_ += readLength(u & 0x3f);
|
||||
index = headIndex;
|
||||
}
|
||||
}
|
||||
updatePreviousIndexes();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Positions the iterator on the span that contains string index `i`.
//
// i:          the index to look for.
// findSource: true to interpret `i` as a source-string index (srcIndex /
//             oldLength_), false for a destination-string index (destIndex /
//             newLength_).
// errorCode:  a pre-existing failure makes the function return -1.
//
// Returns:
//   -1 if errorCode indicates a failure or i is negative,
//    0 if the iterator now rests on a span containing i,
//    1 if next() ran out of spans before reaching i.
//
// Search strategy: if i lies before the current span, the code walks
// backwards with previous() when i is at least half of the current span
// start, and otherwise resets the iterator to the start and walks forwards.
// Within a compressed sequence of equal short changes, the target is computed
// arithmetically rather than stepping through each change.
int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode) || i < 0) { return -1; }
|
||||
int32_t spanStart, spanLength;
|
||||
if (findSource) { // find source index
|
||||
spanStart = srcIndex;
|
||||
spanLength = oldLength_;
|
||||
} else { // find destination index
|
||||
spanStart = destIndex;
|
||||
spanLength = newLength_;
|
||||
}
|
||||
if (i < spanStart) {
|
||||
if (i >= (spanStart / 2)) {
|
||||
// Search backwards.
|
||||
for (;;) {
|
||||
UBool hasPrevious = previous(errorCode);
|
||||
U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0
|
||||
(void)hasPrevious; // avoid unused-variable warning
|
||||
spanStart = findSource ? srcIndex : destIndex;
|
||||
if (i >= spanStart) {
|
||||
// The index is in the current span.
|
||||
return 0;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
// Is the index in one of the remaining compressed edits?
|
||||
// spanStart is the start of the current span, first of the remaining ones.
|
||||
spanLength = findSource ? oldLength_ : newLength_;
|
||||
int32_t u = array[index];
|
||||
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
|
||||
int32_t len = num * spanLength;
|
||||
if (i >= (spanStart - len)) {
|
||||
int32_t n = ((spanStart - i - 1) / spanLength) + 1;
|
||||
// 1 <= n <= num
|
||||
srcIndex -= n * oldLength_;
|
||||
replIndex -= n * newLength_;
|
||||
destIndex -= n * newLength_;
|
||||
remaining += n;
|
||||
return 0;
|
||||
}
|
||||
// Skip all of these edits at once.
|
||||
srcIndex -= num * oldLength_;
|
||||
replIndex -= num * newLength_;
|
||||
destIndex -= num * newLength_;
|
||||
remaining = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reset the iterator to the start.
|
||||
dir = 0;
|
||||
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
|
||||
} else if (i < (spanStart + spanLength)) {
|
||||
// The index is in the current span.
|
||||
return 0;
|
||||
}
|
||||
while (next(false, errorCode)) {
|
||||
if (findSource) {
|
||||
spanStart = srcIndex;
|
||||
spanLength = oldLength_;
|
||||
} else {
|
||||
spanStart = destIndex;
|
||||
spanLength = newLength_;
|
||||
}
|
||||
if (i < (spanStart + spanLength)) {
|
||||
// The index is in the current span.
|
||||
return 0;
|
||||
}
|
||||
if (remaining > 1) {
|
||||
// Is the index in one of the remaining compressed edits?
|
||||
// spanStart is the start of the current span, first of the remaining ones.
|
||||
int32_t len = remaining * spanLength;
|
||||
if (i < (spanStart + len)) {
|
||||
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
|
||||
srcIndex += n * oldLength_;
|
||||
replIndex += n * newLength_;
|
||||
destIndex += n * newLength_;
|
||||
remaining -= n;
|
||||
return 0;
|
||||
}
|
||||
// Make next() skip all of these edits at once.
|
||||
oldLength_ *= remaining;
|
||||
newLength_ *= remaining;
|
||||
remaining = 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) {
|
||||
int32_t where = findIndex(i, true, errorCode);
|
||||
if (where < 0) {
|
||||
// Error or before the string.
|
||||
return 0;
|
||||
}
|
||||
if (where > 0 || i == srcIndex) {
|
||||
// At or after string length, or at start of the found span.
|
||||
return destIndex;
|
||||
}
|
||||
if (changed) {
|
||||
// In a change span, map to its end.
|
||||
return destIndex + newLength_;
|
||||
} else {
|
||||
// In an unchanged span, offset 1:1 within it.
|
||||
return destIndex + (i - srcIndex);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) {
|
||||
int32_t where = findIndex(i, false, errorCode);
|
||||
if (where < 0) {
|
||||
// Error or before the string.
|
||||
return 0;
|
||||
}
|
||||
if (where > 0 || i == destIndex) {
|
||||
// At or after string length, or at start of the found span.
|
||||
return srcIndex;
|
||||
}
|
||||
if (changed) {
|
||||
// In a change span, map to its end.
|
||||
return srcIndex + oldLength_;
|
||||
} else {
|
||||
// In an unchanged span, offset within it.
|
||||
return srcIndex + (i - destIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// Appends a human-readable description of the current span to `sb` and
// returns `sb`. The text lists the source range and the destination range;
// for a change span it also lists the replacement range, otherwise it is
// marked "(no-change)".
UnicodeString& Edits::Iterator::toString(UnicodeString& sb) const {
    // Source range.
    sb.append(u"{ src[", -1);
    ICU_Utility::appendNumber(sb, srcIndex);
    sb.append(u"..", -1);
    ICU_Utility::appendNumber(sb, srcIndex + oldLength_);

    // Destination range, with a connector that tells change from no-change.
    const char16_t *connector = changed ? u"] ⇝ dest[" : u"] ≡ dest[";
    sb.append(connector, -1);
    ICU_Utility::appendNumber(sb, destIndex);
    sb.append(u"..", -1);
    ICU_Utility::appendNumber(sb, destIndex + newLength_);

    if (!changed) {
        sb.append(u"] (no-change) }", -1);
        return sb;
    }

    // Replacement range, only for changes.
    sb.append(u"], repl[", -1);
    ICU_Utility::appendNumber(sb, replIndex);
    sb.append(u"..", -1);
    ICU_Utility::appendNumber(sb, replIndex + newLength_);
    sb.append(u"] }", -1);
    return sb;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
220
engine/thirdparty/icu4c/common/emojiprops.cpp
vendored
Normal file
220
engine/thirdparty/icu4c/common/emojiprops.cpp
vendored
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: https://www.unicode.org/copyright.html
|
||||
|
||||
// emojiprops.cpp
|
||||
// created: 2021sep04 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/ustringtrie.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "emojiprops.h"
|
||||
#include "ucln.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "umutex.h"
|
||||
#include "uset_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {
|
||||
|
||||
EmojiProps *singleton = nullptr;
|
||||
icu::UInitOnce emojiInitOnce {};
|
||||
|
||||
UBool U_CALLCONV emojiprops_cleanup() {
|
||||
delete singleton;
|
||||
singleton = nullptr;
|
||||
emojiInitOnce.reset();
|
||||
return true;
|
||||
}
|
||||
|
||||
void U_CALLCONV initSingleton(UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
singleton = new EmojiProps(errorCode);
|
||||
if (singleton == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else if (U_FAILURE(errorCode)) {
|
||||
delete singleton;
|
||||
singleton = nullptr;
|
||||
}
|
||||
ucln_common_registerCleanup(UCLN_COMMON_EMOJIPROPS, emojiprops_cleanup);
|
||||
}
|
||||
|
||||
// TODO: turn this into a shared helper function
|
||||
// Requires the major version to match, and then requires at least the minor version.
|
||||
UBool udata_isAcceptableMajorMinor(
|
||||
const UDataInfo &info, const char16_t *dataFormat, uint8_t major, uint8_t minor) {
|
||||
return
|
||||
info.size >= 20 &&
|
||||
info.isBigEndian == U_IS_BIG_ENDIAN &&
|
||||
info.charsetFamily == U_CHARSET_FAMILY &&
|
||||
info.dataFormat[0] == dataFormat[0] &&
|
||||
info.dataFormat[1] == dataFormat[1] &&
|
||||
info.dataFormat[2] == dataFormat[2] &&
|
||||
info.dataFormat[3] == dataFormat[3] &&
|
||||
info.formatVersion[0] == major &&
|
||||
info.formatVersion[1] >= minor;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Releases the loaded data: first the data memory, then the code point trie.
EmojiProps::~EmojiProps() {
    udata_close(memory);
    ucptrie_close(cpTrie);
}
|
||||
|
||||
// Returns the shared EmojiProps instance, creating it on first use through
// the init-once guard. Returns nullptr if errorCode already indicates a
// failure on entry; otherwise returns whatever the singleton pointer holds
// after initialization.
const EmojiProps *
EmojiProps::getSingleton(UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(emojiInitOnce, &initSingleton, errorCode);
        return singleton;
    }
    return nullptr;
}
|
||||
|
||||
// Data-acceptance callback passed to udata_openChoice() in load(): accepts
// data with format "Emoj", major format version 1 and any minor version >= 0.
// The context, type and name arguments are ignored.
UBool U_CALLCONV
EmojiProps::isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/,
                         const UDataInfo *pInfo) {
    const UDataInfo &info = *pInfo;
    return udata_isAcceptableMajorMinor(info, u"Emoj", 1, 0);
}
|
||||
|
||||
void
|
||||
EmojiProps::load(UErrorCode &errorCode) {
|
||||
memory = udata_openChoice(nullptr, "icu", "uemoji", isAcceptable, this, &errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
const uint8_t *inBytes = (const uint8_t *)udata_getMemory(memory);
|
||||
const int32_t *inIndexes = (const int32_t *)inBytes;
|
||||
int32_t indexesLength = inIndexes[IX_CPTRIE_OFFSET] / 4;
|
||||
if (indexesLength <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET) {
|
||||
errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t i = IX_CPTRIE_OFFSET;
|
||||
int32_t offset = inIndexes[i++];
|
||||
int32_t nextOffset = inIndexes[i];
|
||||
cpTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_8,
|
||||
inBytes + offset, nextOffset - offset, nullptr, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = IX_BASIC_EMOJI_TRIE_OFFSET; i <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET; ++i) {
|
||||
offset = inIndexes[i];
|
||||
nextOffset = inIndexes[i + 1];
|
||||
// Set/leave nullptr if there is no UCharsTrie.
|
||||
const char16_t *p = nextOffset > offset ? (const char16_t *)(inBytes + offset) : nullptr;
|
||||
stringTries[getStringTrieIndex(i)] = p;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
EmojiProps::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
|
||||
// Add the start code point of each same-value range of the trie.
|
||||
UChar32 start = 0, end;
|
||||
uint32_t value;
|
||||
while ((end = ucptrie_getRange(cpTrie, start, UCPMAP_RANGE_NORMAL, 0,
|
||||
nullptr, nullptr, &value)) >= 0) {
|
||||
sa->add(sa->set, start);
|
||||
start = end + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Static convenience: looks up a binary emoji property of a code point on
 * the singleton. Returns false when the singleton cannot be obtained.
 */
UBool
EmojiProps::hasBinaryProperty(UChar32 c, UProperty which) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const EmojiProps *props = getSingleton(errorCode);
    if (U_FAILURE(errorCode)) {
        return false;
    }
    return props->hasBinaryPropertyImpl(c, which);
}
|
||||
|
||||
/**
 * Tests one binary emoji property of a single code point.
 *
 * @param c     the code point to test.
 * @param which a property in the range UCHAR_EMOJI..UCHAR_RGI_EMOJI.
 * @return the property bit stored in the code point trie for c, or false
 *         for properties outside that range or without a bit in the table.
 */
UBool
EmojiProps::hasBinaryPropertyImpl(UChar32 c, UProperty which) const {
    if (!(UCHAR_EMOJI <= which && which <= UCHAR_RGI_EMOJI)) {
        return false;
    }
    // Maps (which - UCHAR_EMOJI) to the bit number in the trie value;
    // -1 marks a property that this function does not handle.
    // Note: UCHAR_REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.
    static constexpr int8_t bitFlags[] = {
        BIT_EMOJI,                  // UCHAR_EMOJI=57
        BIT_EMOJI_PRESENTATION,     // UCHAR_EMOJI_PRESENTATION=58
        BIT_EMOJI_MODIFIER,         // UCHAR_EMOJI_MODIFIER=59
        BIT_EMOJI_MODIFIER_BASE,    // UCHAR_EMOJI_MODIFIER_BASE=60
        BIT_EMOJI_COMPONENT,        // UCHAR_EMOJI_COMPONENT=61
        -1,                         // UCHAR_REGIONAL_INDICATOR=62
        -1,                         // UCHAR_PREPENDED_CONCATENATION_MARK=63
        BIT_EXTENDED_PICTOGRAPHIC,  // UCHAR_EXTENDED_PICTOGRAPHIC=64
        BIT_BASIC_EMOJI,            // UCHAR_BASIC_EMOJI=65
        -1,                         // UCHAR_EMOJI_KEYCAP_SEQUENCE=66
        -1,                         // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67
        -1,                         // UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68
        -1,                         // UCHAR_RGI_EMOJI_TAG_SEQUENCE=69
        -1,                         // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70
        BIT_BASIC_EMOJI,            // UCHAR_RGI_EMOJI=71
    };
    const int32_t shift = bitFlags[which - UCHAR_EMOJI];
    if (shift < 0) {
        return false;  // not a property that we support in this function
    }
    const uint8_t flags = UCPTRIE_FAST_GET(cpTrie, UCPTRIE_8, c);
    return (flags >> shift) & 1;
}
|
||||
|
||||
/**
 * Static convenience: looks up a binary emoji property of a string on the
 * singleton. Returns false when the singleton cannot be obtained.
 */
UBool
EmojiProps::hasBinaryProperty(const char16_t *s, int32_t length, UProperty which) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const EmojiProps *props = getSingleton(errorCode);
    if (U_FAILURE(errorCode)) {
        return false;
    }
    return props->hasBinaryPropertyImpl(s, length, which);
}
|
||||
|
||||
/**
 * Tests whether a string has one of the emoji properties of strings.
 *
 * @param s      the string; may be nullptr only together with length 0.
 * @param length number of code units, or negative for a NUL-terminated string.
 * @param which  a property in the range UCHAR_BASIC_EMOJI..UCHAR_RGI_EMOJI.
 * @return true if the whole string is a key with a value in the string trie
 *         of the property (for UCHAR_RGI_EMOJI: of any of the properties it
 *         is the union of); false for empty input or unsupported properties.
 */
UBool
EmojiProps::hasBinaryPropertyImpl(const char16_t *s, int32_t length, UProperty which) const {
    if (s == nullptr && length != 0) { return false; }
    // Empty string, given either by explicit length or by an immediate NUL.
    if (length == 0) { return false; }
    if (length < 0 && *s == 0) { return false; }
    // The caller should have delegated single code points to hasBinaryProperty(c, which).
    if (!(UCHAR_BASIC_EMOJI <= which && which <= UCHAR_RGI_EMOJI)) {
        return false;
    }
    // RGI_Emoji is the union of the other emoji properties of strings.
    const bool isUnion = (which == UCHAR_RGI_EMOJI);
    const int32_t firstProp = isUnion ? UCHAR_BASIC_EMOJI : which;
    const int32_t lastProp = isUnion ? UCHAR_RGI_EMOJI_ZWJ_SEQUENCE : which;
    for (int32_t prop = firstProp; prop <= lastProp; ++prop) {
        const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];
        if (trieUChars == nullptr) {
            continue;  // no trie stored for this property
        }
        UCharsTrie trie(trieUChars);
        if (USTRINGTRIE_HAS_VALUE(trie.next(s, length))) {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
void
|
||||
EmojiProps::addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) {
|
||||
return;
|
||||
}
|
||||
UProperty firstProp = which, lastProp = which;
|
||||
if (which == UCHAR_RGI_EMOJI) {
|
||||
// RGI_Emoji is the union of the other emoji properties of strings.
|
||||
firstProp = UCHAR_BASIC_EMOJI;
|
||||
lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE;
|
||||
}
|
||||
for (int32_t prop = firstProp; prop <= lastProp; ++prop) {
|
||||
const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];
|
||||
if (trieUChars != nullptr) {
|
||||
UCharsTrie::Iterator iter(trieUChars, 0, errorCode);
|
||||
while (iter.next(errorCode)) {
|
||||
const UnicodeString &s = iter.getString();
|
||||
sa->addString(sa->set, s.getBuffer(), s.length());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
90
engine/thirdparty/icu4c/common/emojiprops.h
vendored
Normal file
90
engine/thirdparty/icu4c/common/emojiprops.h
vendored
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: https://www.unicode.org/copyright.html
|
||||
|
||||
// emojiprops.h
|
||||
// created: 2021sep03 Markus W. Scherer
|
||||
|
||||
#ifndef __EMOJIPROPS_H__
|
||||
#define __EMOJIPROPS_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uset_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class EmojiProps : public UMemory {
|
||||
public:
|
||||
// @internal
|
||||
EmojiProps(UErrorCode &errorCode) { load(errorCode); }
|
||||
~EmojiProps();
|
||||
|
||||
static const EmojiProps *getSingleton(UErrorCode &errorCode);
|
||||
static UBool hasBinaryProperty(UChar32 c, UProperty which);
|
||||
static UBool hasBinaryProperty(const char16_t *s, int32_t length, UProperty which);
|
||||
|
||||
void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
|
||||
void addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const;
|
||||
|
||||
enum {
|
||||
// Byte offsets from the start of the data, after the generic header,
|
||||
// in ascending order.
|
||||
// UCPTrie=CodePointTrie, follows the indexes
|
||||
IX_CPTRIE_OFFSET,
|
||||
IX_RESERVED1,
|
||||
IX_RESERVED2,
|
||||
IX_RESERVED3,
|
||||
|
||||
// UCharsTrie=CharsTrie
|
||||
IX_BASIC_EMOJI_TRIE_OFFSET,
|
||||
IX_EMOJI_KEYCAP_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_MODIFIER_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_FLAG_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_TAG_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RESERVED10,
|
||||
IX_RESERVED11,
|
||||
IX_RESERVED12,
|
||||
IX_TOTAL_SIZE,
|
||||
|
||||
// Not initially byte offsets.
|
||||
IX_RESERVED14,
|
||||
IX_RESERVED15,
|
||||
IX_COUNT // 16
|
||||
};
|
||||
|
||||
// Properties in the code point trie.
|
||||
enum {
|
||||
// https://www.unicode.org/reports/tr51/#Emoji_Properties
|
||||
BIT_EMOJI,
|
||||
BIT_EMOJI_PRESENTATION,
|
||||
BIT_EMOJI_MODIFIER,
|
||||
BIT_EMOJI_MODIFIER_BASE,
|
||||
BIT_EMOJI_COMPONENT,
|
||||
BIT_EXTENDED_PICTOGRAPHIC,
|
||||
// https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
BIT_BASIC_EMOJI
|
||||
};
|
||||
|
||||
private:
|
||||
static UBool U_CALLCONV
|
||||
isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
|
||||
/** Input i: One of the IX_..._TRIE_OFFSET indexes into the data file indexes[] array. */
|
||||
static int32_t getStringTrieIndex(int32_t i) {
|
||||
return i - IX_BASIC_EMOJI_TRIE_OFFSET;
|
||||
}
|
||||
|
||||
void load(UErrorCode &errorCode);
|
||||
UBool hasBinaryPropertyImpl(UChar32 c, UProperty which) const;
|
||||
UBool hasBinaryPropertyImpl(const char16_t *s, int32_t length, UProperty which) const;
|
||||
|
||||
UDataMemory *memory = nullptr;
|
||||
UCPTrie *cpTrie = nullptr;
|
||||
const char16_t *stringTries[6] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __EMOJIPROPS_H__
|
||||
42
engine/thirdparty/icu4c/common/errorcode.cpp
vendored
Normal file
42
engine/thirdparty/icu4c/common/errorcode.cpp
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: errorcode.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009mar10
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/errorcode.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Destructor defined out of line; it has nothing to release.
ErrorCode::~ErrorCode() {
}
|
||||
|
||||
/**
 * Clears the stored error code to U_ZERO_ERROR.
 * @return the code that was stored before the reset.
 */
UErrorCode ErrorCode::reset() {
    const UErrorCode previous = errorCode;
    errorCode = U_ZERO_ERROR;
    return previous;
}
|
||||
|
||||
void ErrorCode::assertSuccess() const {
|
||||
if(isFailure()) {
|
||||
handleFailure();
|
||||
}
|
||||
}
|
||||
|
||||
const char* ErrorCode::errorName() const {
|
||||
return u_errorName(errorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
736
engine/thirdparty/icu4c/common/filteredbrk.cpp
vendored
Normal file
736
engine/thirdparty/icu4c/common/filteredbrk.cpp
vendored
Normal file
|
|
@ -0,0 +1,736 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
|
||||
#include "cmemory.h"
|
||||
|
||||
#include "unicode/filteredbrk.h"
|
||||
#include "unicode/ucharstriebuilder.h"
|
||||
#include "unicode/ures.h"
|
||||
|
||||
#include "uresimp.h" // ures_getByKeyWithFallback
|
||||
#include "ubrkimpl.h" // U_ICUDATA_BRKITR
|
||||
#include "uvector.h"
|
||||
#include "cmemory.h"
|
||||
#include "umutex.h"
|
||||
|
||||
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
// System headers must be included at global scope, i.e. before the ICU
// namespace is opened below.
#include <stdio.h>
#endif

U_NAMESPACE_BEGIN

#if FB_DEBUG
/**
 * Debug-only trace helper behind FB_TRACE(); prints one line to stderr.
 *
 * @param m message text
 * @param s string to print, or nullptr (printed as "nullptr")
 * @param b flag, printed as 'T' or 'F'
 * @param d integer detail value
 * @param f source file name of the call site
 * @param l source line number of the call site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
    char buf[2048];
    // Point at a literal instead of copying it, so no <string.h> function is needed.
    const char *text = "nullptr";
    if(s) {
        s->extract(0,s->length(),buf,2048);
        text = buf;
    }
    fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
            f, l, m, text, (const void*)s, b?'T':'F',(int)d);
}

#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
// Tracing disabled: FB_TRACE expands to nothing.
#define FB_TRACE(m,s,b,d)
#endif
|
||||
|
||||
/**
|
||||
* Used with sortedInsert()
|
||||
*/
|
||||
static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
|
||||
const UnicodeString &a = *(const UnicodeString*)t1.pointer;
|
||||
const UnicodeString &b = *(const UnicodeString*)t2.pointer;
|
||||
return a.compare(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* A UVector which implements a set of strings.
|
||||
*/
|
||||
class UStringSet : public UVector {
|
||||
public:
|
||||
UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
|
||||
uhash_compareUnicodeString,
|
||||
1,
|
||||
status) {}
|
||||
virtual ~UStringSet();
|
||||
/**
|
||||
* Is this UnicodeSet contained?
|
||||
*/
|
||||
inline UBool contains(const UnicodeString& s) {
|
||||
return contains((void*) &s);
|
||||
}
|
||||
using UVector::contains;
|
||||
/**
|
||||
* Return the ith UnicodeString alias
|
||||
*/
|
||||
inline const UnicodeString* getStringAt(int32_t i) const {
|
||||
return (const UnicodeString*)elementAt(i);
|
||||
}
|
||||
/**
|
||||
* Adopt the UnicodeString if not already contained.
|
||||
* Caller no longer owns the pointer in any case.
|
||||
* @return true if adopted successfully, false otherwise (error, or else duplicate)
|
||||
*/
|
||||
inline UBool adopt(UnicodeString *str, UErrorCode &status) {
|
||||
if(U_FAILURE(status) || contains(*str)) {
|
||||
delete str;
|
||||
return false;
|
||||
} else {
|
||||
sortedInsert(str, compareUnicodeString, status);
|
||||
if(U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Add by value.
|
||||
* @return true if successfully adopted.
|
||||
*/
|
||||
inline UBool add(const UnicodeString& str, UErrorCode &status) {
|
||||
if(U_FAILURE(status)) return false;
|
||||
UnicodeString *t = new UnicodeString(str);
|
||||
if(t==nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR; return false;
|
||||
}
|
||||
return adopt(t, status);
|
||||
}
|
||||
/**
|
||||
* Remove this string.
|
||||
* @return true if successfully removed, false otherwise (error, or else it wasn't there)
|
||||
*/
|
||||
inline UBool remove(const UnicodeString &s, UErrorCode &status) {
|
||||
if(U_FAILURE(status)) return false;
|
||||
return removeElement((void*) &s);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Virtual, won't be inlined
|
||||
*/
|
||||
UStringSet::~UStringSet() {}
|
||||
|
||||
/* ----------------------------------------------------------- */
|
||||
|
||||
|
||||
/* Filtered Break constants */
|
||||
static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
|
||||
static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
|
||||
static const int32_t kSuppressInReverse = (1<<0);
|
||||
static const int32_t kAddToForward = (1<<1);
|
||||
static const char16_t kFULLSTOP = 0x002E; // '.'
|
||||
|
||||
/**
|
||||
* Shared data for SimpleFilteredSentenceBreakIterator
|
||||
*/
|
||||
class SimpleFilteredSentenceBreakData : public UMemory {
|
||||
public:
|
||||
SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
|
||||
: fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
|
||||
SimpleFilteredSentenceBreakData *incr() {
|
||||
umtx_atomic_inc(&refcount);
|
||||
return this;
|
||||
}
|
||||
SimpleFilteredSentenceBreakData *decr() {
|
||||
if(umtx_atomic_dec(&refcount) <= 0) {
|
||||
delete this;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
virtual ~SimpleFilteredSentenceBreakData();
|
||||
|
||||
bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
|
||||
bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
|
||||
|
||||
const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
|
||||
const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
|
||||
|
||||
private:
|
||||
// These tries own their data arrays.
|
||||
// They are shared and must therefore not be modified.
|
||||
LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
|
||||
LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
|
||||
u_atomic_int32_t refcount;
|
||||
};
|
||||
|
||||
// Out-of-line virtual destructor; the LocalPointer members release the tries.
SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {
}
|
||||
|
||||
/**
|
||||
* Concrete implementation
|
||||
*/
|
||||
class SimpleFilteredSentenceBreakIterator : public BreakIterator {
|
||||
public:
|
||||
SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
|
||||
SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
|
||||
virtual ~SimpleFilteredSentenceBreakIterator();
|
||||
private:
|
||||
SimpleFilteredSentenceBreakData *fData;
|
||||
LocalPointer<BreakIterator> fDelegate;
|
||||
LocalUTextPointer fText;
|
||||
|
||||
/* -- subclass interface -- */
|
||||
public:
|
||||
/* -- cloning and other subclass stuff -- */
|
||||
virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
|
||||
int32_t &/*BufferSize*/,
|
||||
UErrorCode &status) override {
|
||||
// for now - always deep clone
|
||||
status = U_SAFECLONE_ALLOCATED_WARNING;
|
||||
return clone();
|
||||
}
|
||||
virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); }
|
||||
virtual UClassID getDynamicClassID() const override { return nullptr; }
|
||||
virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; }
|
||||
|
||||
/* -- text modifying -- */
|
||||
virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); }
|
||||
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; }
|
||||
virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); }
|
||||
virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); }
|
||||
|
||||
/* -- other functions that are just delegated -- */
|
||||
virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); }
|
||||
virtual CharacterIterator& getText() const override { return fDelegate->getText(); }
|
||||
|
||||
/* -- ITERATION -- */
|
||||
virtual int32_t first() override;
|
||||
virtual int32_t preceding(int32_t offset) override;
|
||||
virtual int32_t previous() override;
|
||||
virtual UBool isBoundary(int32_t offset) override;
|
||||
virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
|
||||
|
||||
virtual int32_t next() override;
|
||||
|
||||
virtual int32_t next(int32_t n) override;
|
||||
virtual int32_t following(int32_t offset) override;
|
||||
virtual int32_t last() override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Given that the fDelegate has already given its "initial" answer,
|
||||
* find the NEXT actual (non-excepted) break.
|
||||
* @param n initial position from delegate
|
||||
* @return new break position or UBRK_DONE
|
||||
*/
|
||||
int32_t internalNext(int32_t n);
|
||||
/**
|
||||
* Given that the fDelegate has already given its "initial" answer,
|
||||
* find the PREV actual (non-excepted) break.
|
||||
* @param n initial position from delegate
|
||||
* @return new break position or UBRK_DONE
|
||||
*/
|
||||
int32_t internalPrev(int32_t n);
|
||||
/**
|
||||
* set up the UText with the value of the fDelegate.
|
||||
* Call this before calling breakExceptionAt.
|
||||
* May be able to avoid excess calls
|
||||
*/
|
||||
void resetState(UErrorCode &status);
|
||||
/**
|
||||
* Is there a match (exception) at this spot?
|
||||
*/
|
||||
enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
|
||||
/**
|
||||
* Determine if there is an exception at this spot
|
||||
* @param n spot to check
|
||||
* @return kNoExceptionHere or kExceptionHere
|
||||
**/
|
||||
enum EFBMatchResult breakExceptionAt(int32_t n);
|
||||
};
|
||||
|
||||
/**
 * Copy constructor: takes another reference on the shared trie data and
 * clones the delegate iterator. fText is not copied; it starts out empty.
 */
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(
        const SimpleFilteredSentenceBreakIterator &other)
    : BreakIterator(other),
      fData(other.fData->incr()),
      fDelegate(other.fDelegate->clone()) {
}
|
||||
|
||||
|
||||
/**
 * Builds a filtered iterator around an adopted delegate.
 *
 * @param adopt     delegate iterator; stored in fDelegate, which owns it from now on.
 * @param forwards  forwards partial trie, handed to the shared data object.
 * @param backwards backwards trie, handed to the shared data object.
 * @param status    in/out ICU error code; set to U_MEMORY_ALLOCATION_ERROR if
 *                  the shared data object could not be allocated and no
 *                  earlier failure is recorded.
 */
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(
        BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status)
    : BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE, status),
                    adopt->getLocale(ULOC_ACTUAL_LOCALE, status)),
      fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
      fDelegate(adopt) {
    if (fData != nullptr) {
        return;  // the data object now holds both tries
    }
    // Allocation failed, so nothing took over the tries: free them here.
    delete forwards;
    delete backwards;
    if (U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
|
||||
|
||||
/**
 * Drops this iterator's reference on the shared trie data.
 */
SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
    // fData is nullptr when the constructor failed to allocate the data
    // object; such an iterator must still be destructible.
    if (fData != nullptr) {
        fData = fData->decr();
    }
}
|
||||
|
||||
/**
 * Refreshes fText from the delegate's current text. The UText currently
 * held by fText is passed to the delegate's getUText() as the fill-in
 * object, and whatever that call returns is adopted.
 */
void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
    UText *fillIn = fText.orphan();
    fText.adoptInstead(fDelegate->getUText(fillIn, status));
}
|
||||
|
||||
/**
 * Decides whether the boundary at n is suppressed by an exception.
 *
 * The text before n is fed backwards through the backwards trie. The last
 * position at which the trie reported a value decides the outcome:
 *  - value kMATCH: the boundary is an exception;
 *  - value kPARTIAL (and a forwards trie exists): the text is re-read
 *    forwards from that position through the forwards partial trie, and the
 *    boundary is an exception only if that trie ends on a match;
 *  - anything else: no exception.
 *
 * resetState() must have been called so that fText is set up.
 *
 * @param n boundary position to examine
 * @return kExceptionHere or kNoExceptionHere
 */
SimpleFilteredSentenceBreakIterator::EFBMatchResult
SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
    UText *text = fText.getAlias();
    utext_setNativeIndex(text, n);  // from n..

    // Assume a space is following the '.' (so we handle the case: "Mr. /Brown"):
    // step back over one space; any other character is un-read again.
    // TODO: skip a class of chars here??
    if (utext_previous32(text) != u' ') {
        utext_next32(text);
    }

    int64_t bestPosn = -1;
    int32_t bestValue = -1;
    {
        // Do not modify the shared trie! Work on a copy.
        UCharsTrie backwards(fData->getBackwardsTrie());
        for (UChar32 c = utext_previous32(text); c != U_SENTINEL; c = utext_previous32(text)) {
            const UStringTrieResult step = backwards.nextForCodePoint(c);
            if (USTRINGTRIE_HAS_VALUE(step)) {  // remember the best match so far
                bestPosn = utext_getNativeIndex(text);
                bestValue = backwards.getValue();
            }
            if (!USTRINGTRIE_HAS_NEXT(step)) {
                break;  // the trie cannot continue from here
            }
        }
    }

    if (bestPosn < 0) {
        return kNoExceptionHere;  // No match - not an exception.
    }
    if (bestValue == kMATCH) {
        return kExceptionHere;  // exact backward match
    }
    if (bestValue != kPARTIAL || !fData->hasForwardsPartialTrie()) {
        return kNoExceptionHere;  // internal error and/or no forwards trie
    }

    // We matched the "Ph." in "Ph.D." - now we need to run everything through
    // the forwards trie to see if it matches something going forward.
    utext_setNativeIndex(text, bestPosn);
    // Do not modify the shared trie! Work on a copy.
    UCharsTrie forwards(fData->getForwardsPartialTrie());
    UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
    for (;;) {
        const UChar32 c = utext_next32(text);
        if (c == U_SENTINEL) {
            break;  // end of text
        }
        rfwd = forwards.nextForCodePoint(c);
        if (!USTRINGTRIE_HAS_NEXT(rfwd)) {
            break;
        }
    }
    // Only full matches count here; otherwise keep the underlying break.
    return USTRINGTRIE_MATCHES(rfwd) ? kExceptionHere : kNoExceptionHere;
}
|
||||
|
||||
// the workhorse single next.
|
||||
int32_t
|
||||
SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
|
||||
if(n == UBRK_DONE || // at end or
|
||||
!fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
|
||||
return n;
|
||||
}
|
||||
// OK, do we need to break here?
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// refresh text
|
||||
resetState(status);
|
||||
if(U_FAILURE(status)) return UBRK_DONE; // bail out
|
||||
int64_t utextLen = utext_nativeLength(fText.getAlias());
|
||||
|
||||
//if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
|
||||
while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
|
||||
SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
|
||||
|
||||
switch(m) {
|
||||
case kExceptionHere:
|
||||
n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
|
||||
continue;
|
||||
|
||||
default:
|
||||
case kNoExceptionHere:
|
||||
return n;
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
 * Backwards counterpart of internalNext(): starting from the delegate's
 * answer n, keeps asking the delegate for its previous boundary for as long
 * as the current one is an exception.
 *
 * @param n initial position from the delegate
 * @return n unchanged if it is 0 or UBRK_DONE or there is no backwards trie;
 *         UBRK_DONE if the text could not be refreshed; otherwise the first
 *         boundary that is not an exception, or the position at which the
 *         delegate stopped (0 / UBRK_DONE).
 */
int32_t
SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
    const bool atEdge = (n == 0 || n == UBRK_DONE);
    if (atEdge || !fData->hasBackwardsTrie()) {  // no backwards table loaded == no exceptions
        return n;
    }
    // OK, do we need to break here?
    UErrorCode status = U_ZERO_ERROR;
    resetState(status);  // refresh text
    if (U_FAILURE(status)) {
        return UBRK_DONE;  // bail out
    }

    // The loop runs once per underlying break (from fDelegate).
    while (n != UBRK_DONE && n != 0) {
        if (breakExceptionAt(n) != kExceptionHere) {
            return n;
        }
        n = fDelegate->previous();  // skip this one. Find the next lowerlevel break.
    }
    return n;
}
|
||||
|
||||
|
||||
/** Advances the delegate by one boundary, then filters the result through internalNext(). */
int32_t
SimpleFilteredSentenceBreakIterator::next() {
    const int32_t candidate = fDelegate->next();
    return internalNext(candidate);
}
|
||||
|
||||
/** Returns the delegate's first boundary without any filtering. */
int32_t
SimpleFilteredSentenceBreakIterator::first() {
    // Don't suppress a break opportunity at the beginning of text.
    const int32_t start = fDelegate->first();
    return start;
}
|
||||
|
||||
/** Asks the delegate for the boundary preceding offset, then filters the result through internalPrev(). */
int32_t
SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
    const int32_t candidate = fDelegate->preceding(offset);
    return internalPrev(candidate);
}
|
||||
|
||||
/** Moves the delegate back by one boundary, then filters the result through internalPrev(). */
int32_t
SimpleFilteredSentenceBreakIterator::previous() {
    const int32_t candidate = fDelegate->previous();
    return internalPrev(candidate);
}
|
||||
|
||||
/**
 * Tests whether offset is a boundary that is not suppressed by an exception.
 *
 * @param offset position to test
 * @return false if the delegate does not report a boundary at offset or the
 *         boundary is an exception; true otherwise. If the text cannot be
 *         refreshed for the exception check, the delegate's answer (true)
 *         is returned unfiltered.
 */
UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
    if (!fDelegate->isBoundary(offset)) return false; // no break to suppress

    if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions

    UErrorCode status = U_ZERO_ERROR;
    resetState(status);
    // breakExceptionAt() reads fText; do not call it if the refresh failed.
    if (U_FAILURE(status)) return true;

    return breakExceptionAt(offset) != kExceptionHere;
}
|
||||
|
||||
/** Forwards the argument to the delegate's next(int32_t), then filters the result through internalNext(). */
int32_t
SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
    const int32_t candidate = fDelegate->next(offset);
    return internalNext(candidate);
}
|
||||
|
||||
/** Asks the delegate for the boundary following offset, then filters the result through internalNext(). */
int32_t
SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
    const int32_t candidate = fDelegate->following(offset);
    return internalNext(candidate);
}
|
||||
|
||||
/** Returns the delegate's last boundary without any filtering. */
int32_t
SimpleFilteredSentenceBreakIterator::last() {
    // Don't suppress a break opportunity at the end of text.
    const int32_t end = fDelegate->last();
    return end;
}
|
||||
|
||||
|
||||
/**
|
||||
* Concrete implementation of builder class.
|
||||
*/
|
||||
class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
|
||||
public:
|
||||
virtual ~SimpleFilteredBreakIteratorBuilder();
|
||||
SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
|
||||
SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
|
||||
virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
|
||||
virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
|
||||
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override;
|
||||
private:
|
||||
UStringSet fSet;
|
||||
};
|
||||
|
||||
// Out-of-line virtual destructor with an empty body.
SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() {
}
|
||||
|
||||
/**
 * Creates a builder with an empty exception set; status is passed on to the
 * set's constructor.
 */
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
    : fSet(status) {
}
|
||||
|
||||
/**
 * Creates a builder pre-loaded with the sentence break exceptions of a locale.
 *
 * Opens the break iterator bundle for fromLocale.getBaseName(), descends to
 * "exceptions" / "SentenceBreak" and adds every string found there via
 * suppressBreakAfter(). If any of the three lookups fails or reports
 * U_USING_DEFAULT_WARNING, that code is copied to status and the builder is
 * left empty.
 *
 * @param fromLocale locale whose exception list is loaded
 * @param status     in/out ICU error code; nothing is loaded if it already
 *                   indicates a failure.
 */
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
    : fSet(status) {
    if (U_FAILURE(status)) {
        return;
    }

    UErrorCode subStatus = U_ZERO_ERROR;
    LocalUResourceBundlePointer bundle(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
    if (U_FAILURE(subStatus) || subStatus == U_USING_DEFAULT_WARNING) {
        status = subStatus;  // copy the failing status
#if FB_DEBUG
        fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
#endif
        return;  // leaves the builder empty, if you try to use it.
    }

    LocalUResourceBundlePointer exceptions(
        ures_getByKeyWithFallback(bundle.getAlias(), "exceptions", nullptr, &subStatus));
    if (U_FAILURE(subStatus) || subStatus == U_USING_DEFAULT_WARNING) {
        status = subStatus;  // copy the failing status
#if FB_DEBUG
        fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
#endif
        return;  // leaves the builder empty, if you try to use it.
    }

    LocalUResourceBundlePointer breaks(
        ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", nullptr, &subStatus));

#if FB_DEBUG
    {
        UErrorCode subsub = subStatus;
        fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
    }
#endif

    if (U_FAILURE(subStatus) || subStatus == U_USING_DEFAULT_WARNING) {
        status = subStatus;  // copy the failing status
#if FB_DEBUG
        fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
#endif
        return;  // leaves the builder empty, if you try to use it.
    }

    LocalUResourceBundlePointer item;
    subStatus = status;  // Pick up inherited warning status now
    for (;;) {
        // The previous item is handed back to ICU as the fill-in object.
        item.adoptInstead(ures_getNextResource(breaks.getAlias(), item.orphan(), &subStatus));
        if (!item.isValid() || U_FAILURE(subStatus)) {
            break;  // iteration finished or failed
        }
        UnicodeString str(ures_getUnicodeString(item.getAlias(), &status));
        suppressBreakAfter(str, status);  // load the string
    }
    // U_INDEX_OUTOFBOUNDS_ERROR from the iteration is not reported to the caller.
    if (U_FAILURE(subStatus) && subStatus != U_INDEX_OUTOFBOUNDS_ERROR && U_SUCCESS(status)) {
        status = subStatus;
    }
}
|
||||
|
||||
/**
 * Adds an exception string to the set.
 * @return the result of UStringSet::add(): true if the string was added.
 */
UBool
SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{
    const UBool added = fSet.add(exception, status);
    FB_TRACE("suppressBreakAfter",&exception,added,0);
    return added;
}
|
||||
|
||||
/**
 * Removes an exception string from the set.
 * @return the result of UStringSet::remove(): true if the string was removed.
 */
UBool
SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{
    const UBool removed = fSet.remove(exception, status);
    FB_TRACE("unsuppressBreakAfter",&exception,removed,0);
    return removed;
}
|
||||
|
||||
/**
|
||||
* Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
|
||||
* Work around this.
|
||||
*
|
||||
* Note: "new UnicodeString[subCount]" ends up calling global operator new
|
||||
* on MSVC2012 for some reason.
|
||||
*/
|
||||
static inline UnicodeString* newUnicodeStringArray(size_t count) {
|
||||
return new UnicodeString[count ? count : 1];
|
||||
}
|
||||
|
||||
/**
 * Builds a filtered sentence break iterator from the current suppression set.
 *
 * The strings in fSet are split over two tries:
 *  - a backwards trie holding reversed strings (e.g. ".srM" for "Mrs."), and
 *  - a forwards trie for "partial" entries, i.e. strings that contain a
 *    full stop somewhere other than at the very end (e.g. "a.M.").
 *
 * @param adoptBreakIterator the underlying iterator; it is held in a
 *        LocalPointer from the first statement on, and handed to the
 *        returned object on success
 * @param status error code; set to U_MEMORY_ALLOCATION_ERROR when one of the
 *        working arrays cannot be allocated or a set entry is unavailable
 * @return the new iterator, or nullptr on failure
 */
BreakIterator *
SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
  LocalPointer<BreakIterator> adopt(adoptBreakIterator);

  LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
  LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
  if(U_FAILURE(status)) {
    return nullptr;
  }

  int32_t revCount = 0;
  int32_t fwdCount = 0;

  int32_t subCount = fSet.size();

  UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);

  LocalArray<UnicodeString> ustrs(ustrs_ptr);

  LocalMemory<int> partials;
  partials.allocateInsteadAndReset(subCount);

  // Both working arrays are indexed below without further checks, so a failed
  // allocation has to be reported here. partials legitimately stays null when
  // subCount is 0 (nothing is allocated for an empty request).
  if(ustrs.isNull() || (subCount > 0 && partials.isNull())) {
    status = U_MEMORY_ALLOCATION_ERROR;
    return nullptr;
  }

  LocalPointer<UCharsTrie>    backwardsTrie; //  i.e. ".srM" for Mrs.
  LocalPointer<UCharsTrie>    forwardsPartialTrie; //  Has ".a" for "a.M."

  // copy every string of the set into ustrs[]
  int n=0;
  for ( int32_t i = 0;
        i<fSet.size();
        i++) {
    const UnicodeString *abbr = fSet.getStringAt(i);
    if(abbr) {
      FB_TRACE("build",abbr,true,i);
      ustrs[n] = *abbr; // copy by value
      FB_TRACE("ustrs[n]",&ustrs[n],true,i);
    } else {
      FB_TRACE("build",abbr,false,i);
      status = U_MEMORY_ALLOCATION_ERROR;
      return nullptr;
    }
    partials[n] = 0; // default: not partial
    n++;
  }

  // first pass - find partials.
  for(int i=0;i<subCount;i++) {
    int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
    if(nn>-1 && (nn+1)!=ustrs[i].length()) {
      FB_TRACE("partial",&ustrs[i],false,i);
      // is partial.
      // is it unique?
      int sameAs = -1;
      for(int j=0;j<subCount;j++) {
        if(j==i) continue;
        if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
          FB_TRACE("prefix",&ustrs[j],false,nn+1);
          if(partials[j]==0) { // hasn't been processed yet
            partials[j] = kSuppressInReverse | kAddToForward;
            FB_TRACE("suppressing",&ustrs[j],false,j);
          } else if(partials[j] & kSuppressInReverse) {
            sameAs = j; // the other entry is already in the reverse table.
          }
        }
      }
      FB_TRACE("for partial same-",&ustrs[i],false,sameAs);
      FB_TRACE(" == partial #",&ustrs[i],false,partials[i]);
      UnicodeString prefix(ustrs[i], 0, nn+1);
      if(sameAs == -1 && partials[i] == 0) {
        // first one - add the prefix to the reverse table.
        prefix.reverse();
        builder->add(prefix, kPARTIAL, status);
        revCount++;
        FB_TRACE("Added partial",&prefix,false, i);
        FB_TRACE(u_errorName(status),&ustrs[i],false,i);
        partials[i] = kSuppressInReverse | kAddToForward;
      } else {
        FB_TRACE("NOT adding partial",&prefix,false, i);
        FB_TRACE(u_errorName(status),&ustrs[i],false,i);
      }
    }
  }

  // second pass - distribute the strings over the two trie builders.
  for(int i=0;i<subCount;i++) {
    if(partials[i]==0) {
      ustrs[i].reverse();
      builder->add(ustrs[i], kMATCH, status);
      revCount++;
      FB_TRACE(u_errorName(status), &ustrs[i], false, i);
    } else {
      FB_TRACE("Adding fwd",&ustrs[i], false, i);

      // an optimization would be to only add the portion after the '.'
      // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
      // instead of "Ph.D." since we already know the "Ph." part is a match.
      // would need the trie to be able to hold 0-length strings, though.
      builder2->add(ustrs[i], kMATCH, status); // forward
      fwdCount++;
    }
  }
  FB_TRACE("AbbrCount",nullptr,false, subCount);

  // a trie is only built when something was added to its builder
  if(revCount>0) {
    backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
    if(U_FAILURE(status)) {
      FB_TRACE(u_errorName(status),nullptr,false, -1);
      return nullptr;
    }
  }

  if(fwdCount>0) {
    forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
    if(U_FAILURE(status)) {
      FB_TRACE(u_errorName(status),nullptr,false, -1);
      return nullptr;
    }
  }

  return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
}
|
||||
|
||||
|
||||
// ----------- Base class implementation
|
||||
|
||||
/** Base-class constructor; there is nothing to initialize. */
FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {}
|
||||
|
||||
/** Base-class destructor; there is nothing to release. */
FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {}
|
||||
|
||||
/**
 * Creates a SimpleFilteredBreakIteratorBuilder for the given locale.
 * @param where  locale passed to the builder's constructor
 * @param status error code; nothing is created if it already indicates failure
 * @return the new builder, or nullptr when status indicates failure
 */
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
  if(U_FAILURE(status)) {
    return nullptr;
  }
  LocalPointer<FilteredBreakIteratorBuilder> created(new SimpleFilteredBreakIteratorBuilder(where, status), status);
  if(U_FAILURE(status)) {
    // the builder stays owned by `created` and is not handed out
    return nullptr;
  }
  return created.orphan();
}
|
||||
|
||||
/**
 * Locale-less overload; simply forwards to createEmptyInstance().
 * @param status error code, passed through
 * @return the result of createEmptyInstance(status)
 */
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
  FilteredBreakIteratorBuilder *emptyBuilder = createEmptyInstance(status);
  return emptyBuilder;
}
|
||||
|
||||
/**
 * Creates a SimpleFilteredBreakIteratorBuilder using its status-only constructor.
 * @param status error code; nothing is created if it already indicates failure
 * @return the new builder, or nullptr when status indicates failure
 */
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
  if(U_FAILURE(status)) {
    return nullptr;
  }
  LocalPointer<FilteredBreakIteratorBuilder> created(new SimpleFilteredBreakIteratorBuilder(status), status);
  if(U_FAILURE(status)) {
    // the builder stays owned by `created` and is not handed out
    return nullptr;
  }
  return created.orphan();
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
363
engine/thirdparty/icu4c/common/filterednormalizer2.cpp
vendored
Normal file
363
engine/thirdparty/icu4c/common/filterednormalizer2.cpp
vendored
Normal file
|
|
@ -0,0 +1,363 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: filterednormalizer2.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009dec10
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cpputils.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Out-of-line destructor; the body is intentionally empty.
FilteredNormalizer2::~FilteredNormalizer2() {
}
|
||||
|
||||
// Public entry point: validates the arguments, empties dest, then delegates
// to the span-based overload starting with USET_SPAN_SIMPLE.
// - If src fails the buffer check (or errorCode was already a failure),
//   dest is set to bogus.
// - If dest and src are the same object, U_ILLEGAL_ARGUMENT_ERROR is set and
//   dest is left untouched.
// Always returns dest.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(src, errorCode);
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
    } else if(&src==&dest) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        dest.remove();
        normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
    }
    return dest;
}
|
||||
|
||||
// Internal: No argument checking, and appends to dest.
|
||||
// Pass as input spanCondition the one that is likely to yield a non-zero
|
||||
// span length at the start of src.
|
||||
// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
|
||||
// USET_SPAN_SIMPLE should be passed in for the start of src
|
||||
// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
|
||||
// an in-filter prefix.
|
||||
UnicodeString &
|
||||
FilteredNormalizer2::normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
USetSpanCondition spanCondition,
|
||||
UErrorCode &errorCode) const {
|
||||
UnicodeString tempDest; // Don't throw away destination buffer between iterations.
|
||||
for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
|
||||
int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
|
||||
int32_t spanLength=spanLimit-prevSpanLimit;
|
||||
if(spanCondition==USET_SPAN_NOT_CONTAINED) {
|
||||
if(spanLength!=0) {
|
||||
dest.append(src, prevSpanLimit, spanLength);
|
||||
}
|
||||
spanCondition=USET_SPAN_SIMPLE;
|
||||
} else {
|
||||
if(spanLength!=0) {
|
||||
// Not norm2.normalizeSecondAndAppend() because we do not want
|
||||
// to modify the non-filter part of dest.
|
||||
dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
|
||||
tempDest, errorCode));
|
||||
if(U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
spanCondition=USET_SPAN_NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit=spanLimit;
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
void
|
||||
FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
options |= U_EDITS_NO_RESET; // Do not reset for each span.
|
||||
normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
|
||||
ByteSink &sink, Edits *edits,
|
||||
USetSpanCondition spanCondition,
|
||||
UErrorCode &errorCode) const {
|
||||
while (length > 0) {
|
||||
int32_t spanLength = set.spanUTF8(src, length, spanCondition);
|
||||
if (spanCondition == USET_SPAN_NOT_CONTAINED) {
|
||||
if (spanLength != 0) {
|
||||
if (edits != nullptr) {
|
||||
edits->addUnchanged(spanLength);
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
sink.Append(src, spanLength);
|
||||
}
|
||||
}
|
||||
spanCondition = USET_SPAN_SIMPLE;
|
||||
} else {
|
||||
if (spanLength != 0) {
|
||||
// Not norm2.normalizeSecondAndAppend() because we do not want
|
||||
// to modify the non-filter part of dest.
|
||||
norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
spanCondition = USET_SPAN_NOT_CONTAINED;
|
||||
}
|
||||
src += spanLength;
|
||||
length -= spanLength;
|
||||
}
|
||||
}
|
||||
|
||||
// Public overload: forwards to the four-argument worker with doNormalize=true.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UErrorCode &errorCode) const {
    const UBool doNormalize = true;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
|
||||
|
||||
// Forwards to the four-argument normalizeSecondAndAppend() worker with
// doNormalize=false.
UnicodeString &
FilteredNormalizer2::append(UnicodeString &first,
                            const UnicodeString &second,
                            UErrorCode &errorCode) const {
    const UBool doNormalize = false;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
|
||||
|
||||
// Shared worker for normalizeSecondAndAppend() and append().
// Appends second to first. The in-filter suffix of first and the in-filter
// prefix of second are merged through norm2 (normalizeSecondAndAppend when
// doNormalize is true, append otherwise); whatever remains of second is then
// either normalized span by span or appended as is.
// Sets U_ILLEGAL_ARGUMENT_ERROR when first and second are the same object.
// Always returns first.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UBool doNormalize,
                                              UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(first, errorCode);
    uprv_checkCanGetBuffer(second, errorCode);
    if(U_FAILURE(errorCode)) {
        return first;
    }
    if(&first==&second) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return first;
    }

    // Nothing to merge with: the result is just (normalized) second.
    if(first.isEmpty()) {
        if(doNormalize) {
            normalize(second, first, errorCode);
        } else {
            first=second;
        }
        return first;
    }

    // merge the in-filter suffix of the first string with the in-filter prefix of the second
    const int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    if(prefixLimit!=0) {
        UnicodeString prefix(second.tempSubString(0, prefixLimit));
        const int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
        if(suffixStart!=0) {
            // Only the tail of first is in the filter: work on a copy of that
            // tail and splice the result back.
            UnicodeString middle(first, suffixStart, INT32_MAX);
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
            } else {
                norm2.append(middle, prefix, errorCode);
            }
            first.replace(suffixStart, INT32_MAX, middle);
        } else if(doNormalize) {
            // All of first is in the filter: operate on it directly.
            norm2.normalizeSecondAndAppend(first, prefix, errorCode);
        } else {
            norm2.append(first, prefix, errorCode);
        }
    }

    // Whatever follows the in-filter prefix of second starts outside the filter.
    if(prefixLimit<second.length()) {
        UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
        if(doNormalize) {
            normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
        } else {
            first.append(rest);
        }
    }
    return first;
}
|
||||
|
||||
// norm2 is only consulted for code points inside the filter set;
// for any other code point the result is false.
UBool
FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    const UBool inFilter = set.contains(c);
    return inFilter && norm2.getDecomposition(c, decomposition);
}
|
||||
|
||||
// norm2 is only consulted for code points inside the filter set;
// for any other code point the result is false.
UBool
FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    const UBool inFilter = set.contains(c);
    return inFilter && norm2.getRawDecomposition(c, decomposition);
}
|
||||
|
||||
// Composes a and b through norm2 only when both are in the filter set;
// otherwise returns U_SENTINEL.
UChar32
FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    if(!set.contains(a) || !set.contains(b)) {
        return U_SENTINEL;
    }
    return norm2.composePair(a, b);
}
|
||||
|
||||
uint8_t
|
||||
FilteredNormalizer2::getCombiningClass(UChar32 c) const {
|
||||
return set.contains(c) ? norm2.getCombiningClass(c) : 0;
|
||||
}
|
||||
|
||||
// Returns true when every in-filter span of s is normalized according to
// norm2; spans outside the filter set are skipped. Returns false on the
// first span that is not normalized, or as soon as errorCode is a failure.
UBool
FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return false;
    }
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t spanStart=0;
    while(spanStart<s.length()) {
        const int32_t spanEnd=set.span(s, spanStart, spanCondition);
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            // outside the filter: nothing to verify
            spanCondition=USET_SPAN_SIMPLE;
        } else {
            const UBool spanOk=norm2.isNormalized(s.tempSubStringBetween(spanStart, spanEnd), errorCode);
            if(!spanOk || U_FAILURE(errorCode)) {
                return false;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        }
        spanStart=spanEnd;
    }
    return true;
}
|
||||
|
||||
// UTF-8 counterpart of isNormalized(): walks sp as alternating spans and
// asks norm2 about each in-filter span. Returns false on the first span
// that is not normalized, or as soon as errorCode is a failure.
UBool
FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return false;
    }
    const char *cursor = sp.data();
    int32_t remaining = sp.length();
    USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
    while (remaining > 0) {
        const int32_t spanLength = set.spanUTF8(cursor, remaining, spanCondition);
        if (spanCondition == USET_SPAN_NOT_CONTAINED) {
            // outside the filter: nothing to verify
            spanCondition = USET_SPAN_SIMPLE;
        } else {
            const UBool spanOk = norm2.isNormalizedUTF8(StringPiece(cursor, spanLength), errorCode);
            if (!spanOk || U_FAILURE(errorCode)) {
                return false;
            }
            spanCondition = USET_SPAN_NOT_CONTAINED;
        }
        cursor += spanLength;
        remaining -= spanLength;
    }
    return true;
}
|
||||
|
||||
// Quick check over the in-filter spans of s.
// - UNORM_MAYBE if s fails the buffer check or errorCode was already a failure.
// - The span's own result as soon as norm2 reports UNORM_NO or an error.
// - UNORM_MAYBE if any span reported UNORM_MAYBE, otherwise UNORM_YES.
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    UNormalizationCheckResult overall=UNORM_YES;
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t spanStart=0;
    while(spanStart<s.length()) {
        const int32_t spanEnd=set.span(s, spanStart, spanCondition);
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_SIMPLE;
        } else {
            const UNormalizationCheckResult spanResult=
                norm2.quickCheck(s.tempSubStringBetween(spanStart, spanEnd), errorCode);
            if(U_FAILURE(errorCode) || spanResult==UNORM_NO) {
                return spanResult;
            }
            if(spanResult==UNORM_MAYBE) {
                overall=spanResult;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        }
        spanStart=spanEnd;
    }
    return overall;
}
|
||||
|
||||
int32_t
|
||||
FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
uprv_checkCanGetBuffer(s, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
|
||||
for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
|
||||
int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
|
||||
if(spanCondition==USET_SPAN_NOT_CONTAINED) {
|
||||
spanCondition=USET_SPAN_SIMPLE;
|
||||
} else {
|
||||
int32_t yesLimit=
|
||||
prevSpanLimit+
|
||||
norm2.spanQuickCheckYes(
|
||||
s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
|
||||
if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
|
||||
return yesLimit;
|
||||
}
|
||||
spanCondition=USET_SPAN_NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit=spanLimit;
|
||||
}
|
||||
return s.length();
|
||||
}
|
||||
|
||||
// True for every code point outside the filter set; otherwise norm2 decides.
UBool
FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    const UBool inFilter = set.contains(c);
    return !inFilter || norm2.hasBoundaryBefore(c);
}
|
||||
|
||||
// True for every code point outside the filter set; otherwise norm2 decides.
UBool
FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    const UBool inFilter = set.contains(c);
    return !inFilter || norm2.hasBoundaryAfter(c);
}
|
||||
|
||||
// True for every code point outside the filter set; otherwise norm2 decides.
UBool
FilteredNormalizer2::isInert(UChar32 c) const {
    const UBool inFilter = set.contains(c);
    return !inFilter || norm2.isInert(c);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/**
 * C API: wraps a normalizer in a FilteredNormalizer2 restricted to filterSet.
 *
 * @param norm2      the normalizer to wrap; must not be nullptr
 * @param filterSet  the set of code points to be normalized; must not be nullptr
 * @param pErrorCode in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when
 *                   norm2 or filterSet is nullptr, and to
 *                   U_MEMORY_ALLOCATION_ERROR when the wrapper cannot be allocated.
 * @return the new filtered normalizer, or nullptr on failure
 *
 * NOTE(review): the wrapper is constructed from references to *norm2 and
 * *filterSet; presumably both must outlive the returned object — confirm
 * against the FilteredNormalizer2 declaration.
 */
U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    // Both pointers are dereferenced below, so reject either being null.
    if(norm2==nullptr || filterSet==nullptr) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
                                             *UnicodeSet::fromUSet(filterSet));
    if(fn2==nullptr) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return (UNormalizer2 *)fn2;
}
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
267
engine/thirdparty/icu4c/common/hash.h
vendored
Normal file
267
engine/thirdparty/icu4c/common/hash.h
vendored
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1997-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
* Date Name Description
|
||||
* 03/28/00 aliu Creation.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef HASH_H
|
||||
#define HASH_H
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "cmemory.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Hashtable is a thin C++ wrapper around UHashtable, a general-purpose void*
|
||||
* hashtable implemented in C. Hashtable is designed to be idiomatic and
|
||||
* easy-to-use in C++.
|
||||
*
|
||||
* Hashtable is an INTERNAL CLASS.
|
||||
*/
|
||||
class U_COMMON_API Hashtable : public UMemory {
|
||||
UHashtable* hash;
|
||||
UHashtable hashObj;
|
||||
|
||||
inline void init(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status);
|
||||
|
||||
inline void initSize(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t size, UErrorCode& status);
|
||||
|
||||
public:
|
||||
/**
|
||||
* Construct a hashtable
|
||||
* @param ignoreKeyCase If true, keys are case insensitive.
|
||||
* @param status Error code
|
||||
*/
|
||||
inline Hashtable(UBool ignoreKeyCase, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a hashtable
|
||||
* @param ignoreKeyCase If true, keys are case insensitive.
|
||||
* @param size initial size allocation
|
||||
* @param status Error code
|
||||
*/
|
||||
inline Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a hashtable
|
||||
* @param keyComp Comparator for comparing the keys
|
||||
* @param valueComp Comparator for comparing the values
|
||||
* @param status Error code
|
||||
*/
|
||||
inline Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a hashtable
|
||||
* @param status Error code
|
||||
*/
|
||||
inline Hashtable(UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a hashtable, _disregarding any error_. Use this constructor
|
||||
* with caution.
|
||||
*/
|
||||
inline Hashtable();
|
||||
|
||||
/**
|
||||
* Non-virtual destructor; make this virtual if Hashtable is subclassed
|
||||
* in the future.
|
||||
*/
|
||||
inline ~Hashtable();
|
||||
|
||||
inline UObjectDeleter *setValueDeleter(UObjectDeleter *fn);
|
||||
|
||||
inline int32_t count() const;
|
||||
|
||||
inline void* put(const UnicodeString& key, void* value, UErrorCode& status);
|
||||
|
||||
inline int32_t puti(const UnicodeString& key, int32_t value, UErrorCode& status);
|
||||
|
||||
inline int32_t putiAllowZero(const UnicodeString& key, int32_t value, UErrorCode& status);
|
||||
|
||||
inline void* get(const UnicodeString& key) const;
|
||||
|
||||
inline int32_t geti(const UnicodeString& key) const;
|
||||
|
||||
inline int32_t getiAndFound(const UnicodeString& key, UBool &found) const;
|
||||
|
||||
inline void* remove(const UnicodeString& key);
|
||||
|
||||
inline int32_t removei(const UnicodeString& key);
|
||||
|
||||
inline void removeAll();
|
||||
|
||||
inline UBool containsKey(const UnicodeString& key) const;
|
||||
|
||||
inline const UHashElement* find(const UnicodeString& key) const;
|
||||
|
||||
/**
|
||||
* @param pos - must be UHASH_FIRST on first call, and untouched afterwards.
|
||||
* @see uhash_nextElement
|
||||
*/
|
||||
inline const UHashElement* nextElement(int32_t& pos) const;
|
||||
|
||||
inline UKeyComparator* setKeyComparator(UKeyComparator*keyComp);
|
||||
|
||||
inline UValueComparator* setValueComparator(UValueComparator* valueComp);
|
||||
|
||||
inline UBool equals(const Hashtable& that) const;
|
||||
private:
|
||||
Hashtable(const Hashtable &other) = delete; // forbid copying of this class
|
||||
Hashtable &operator=(const Hashtable &other) = delete; // forbid copying of this class
|
||||
};
|
||||
|
||||
/*********************************************************************
|
||||
* Implementation
|
||||
********************************************************************/
|
||||
|
||||
// Initializes hashObj with the given hash/compare functions. Only on success
// does `hash` start pointing at hashObj, with uprv_deleteUObject installed as
// key deleter. Does nothing when status already indicates failure.
inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
                            UValueComparator *valueComp, UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    uhash_init(&hashObj, keyHash, keyComp, valueComp, &status);
    if (U_FAILURE(status)) {
        return; // hash keeps its previous value
    }
    hash = &hashObj;
    uhash_setKeyDeleter(hash, uprv_deleteUObject);
}
|
||||
|
||||
// Same as init(), but passes an initial size on to uhash_initSize().
inline void Hashtable::initSize(UHashFunction *keyHash, UKeyComparator *keyComp,
                                UValueComparator *valueComp, int32_t size, UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    uhash_initSize(&hashObj, keyHash, keyComp, valueComp, size, &status);
    if (U_FAILURE(status)) {
        return; // hash keeps its previous value
    }
    hash = &hashObj;
    uhash_setKeyDeleter(hash, uprv_deleteUObject);
}
|
||||
|
||||
// Caller-supplied key/value comparators; keys are hashed with
// uhash_hashUnicodeString.
inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp,
                            UErrorCode& status)
    : hash(nullptr)
{
    init(uhash_hashUnicodeString, keyComp, valueComp, status);
}
|
||||
|
||||
// Picks the caseless or the case-sensitive hash/compare pair depending on
// ignoreKeyCase; no value comparator is installed.
inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status)
    : hash(nullptr)
{
    if (ignoreKeyCase) {
        init(uhash_hashCaselessUnicodeString, uhash_compareCaselessUnicodeString, nullptr, status);
    } else {
        init(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, status);
    }
}
|
||||
|
||||
// Like the (ignoreKeyCase, status) constructor, with an initial size.
inline Hashtable::Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status)
    : hash(nullptr)
{
    if (ignoreKeyCase) {
        initSize(uhash_hashCaselessUnicodeString, uhash_compareCaselessUnicodeString, nullptr, size, status);
    } else {
        initSize(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, size, status);
    }
}
|
||||
|
||||
// Case-sensitive UnicodeString keys, no value comparator.
inline Hashtable::Hashtable(UErrorCode& status)
    : hash(nullptr)
{
    init(uhash_hashUnicodeString,
         uhash_compareUnicodeString,
         nullptr,
         status);
}
|
||||
|
||||
// Error-ignoring constructor: the status is a local that is discarded, so a
// failed init() only shows up as `hash` remaining nullptr.
inline Hashtable::Hashtable()
    : hash(nullptr)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;
    init(uhash_hashUnicodeString,
         uhash_compareUnicodeString,
         nullptr,
         ignoredStatus);
}
|
||||
|
||||
// Closes the table only if initialization succeeded (hash was set).
inline Hashtable::~Hashtable() {
    if (hash == nullptr) {
        return;
    }
    uhash_close(hash);
}
|
||||
|
||||
// Forwards to uhash_setValueDeleter() and returns its result.
inline UObjectDeleter *Hashtable::setValueDeleter(UObjectDeleter *fn) {
    UObjectDeleter *result = uhash_setValueDeleter(hash, fn);
    return result;
}
|
||||
|
||||
inline int32_t Hashtable::count() const {
|
||||
return uhash_count(hash);
|
||||
}
|
||||
|
||||
inline void* Hashtable::put(const UnicodeString& key, void* value, UErrorCode& status) {
|
||||
return uhash_put(hash, new UnicodeString(key), value, &status);
|
||||
}
|
||||
|
||||
inline int32_t Hashtable::puti(const UnicodeString& key, int32_t value, UErrorCode& status) {
|
||||
return uhash_puti(hash, new UnicodeString(key), value, &status);
|
||||
}
|
||||
|
||||
inline int32_t Hashtable::putiAllowZero(const UnicodeString& key, int32_t value,
|
||||
UErrorCode& status) {
|
||||
return uhash_putiAllowZero(hash, new UnicodeString(key), value, &status);
|
||||
}
|
||||
|
||||
inline void* Hashtable::get(const UnicodeString& key) const {
|
||||
return uhash_get(hash, &key);
|
||||
}
|
||||
|
||||
inline int32_t Hashtable::geti(const UnicodeString& key) const {
|
||||
return uhash_geti(hash, &key);
|
||||
}
|
||||
|
||||
inline int32_t Hashtable::getiAndFound(const UnicodeString& key, UBool &found) const {
|
||||
return uhash_getiAndFound(hash, &key, &found);
|
||||
}
|
||||
|
||||
inline void* Hashtable::remove(const UnicodeString& key) {
|
||||
return uhash_remove(hash, &key);
|
||||
}
|
||||
|
||||
inline int32_t Hashtable::removei(const UnicodeString& key) {
|
||||
return uhash_removei(hash, &key);
|
||||
}
|
||||
|
||||
// Forwards to uhash_containsKey().
inline UBool Hashtable::containsKey(const UnicodeString& key) const {
    const UnicodeString *keyPtr = &key;
    return uhash_containsKey(hash, keyPtr);
}
|
||||
|
||||
// Forwards to uhash_find() and returns the element it reports.
inline const UHashElement* Hashtable::find(const UnicodeString& key) const {
    const UnicodeString *keyPtr = &key;
    return uhash_find(hash, keyPtr);
}
|
||||
|
||||
// Forwards to uhash_nextElement(); pos is passed by address so the
// underlying call can update it.
inline const UHashElement* Hashtable::nextElement(int32_t& pos) const {
    int32_t *posPtr = &pos;
    return uhash_nextElement(hash, posPtr);
}
|
||||
|
||||
// Forwards to uhash_removeAll().
inline void Hashtable::removeAll()
{
    uhash_removeAll(hash);
}
|
||||
|
||||
// Forwards to uhash_setKeyComparator() and returns its result.
inline UKeyComparator* Hashtable::setKeyComparator(UKeyComparator*keyComp){
    UKeyComparator *result = uhash_setKeyComparator(hash, keyComp);
    return result;
}
|
||||
|
||||
// Forwards to uhash_setValueComparator() and returns its result.
inline UValueComparator* Hashtable::setValueComparator(UValueComparator* valueComp){
    UValueComparator *result = uhash_setValueComparator(hash, valueComp);
    return result;
}
|
||||
|
||||
// Compares the two underlying tables with uhash_equals().
inline UBool Hashtable::equals(const Hashtable& that)const{
    const UHashtable *otherHash = that.hash;
    return uhash_equals(hash, otherHash);
}
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
31
engine/thirdparty/icu4c/common/icudataver.cpp
vendored
Normal file
31
engine/thirdparty/icu4c/common/icudataver.cpp
vendored
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/icudataver.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "uresimp.h" /* for ures_getVersionByKey */
|
||||
|
||||
/**
 * Fills in the version stored under U_ICU_DATA_KEY in the
 * U_ICU_VERSION_BUNDLE resource bundle.
 *
 * Nothing is done when *status already indicates failure or when
 * dataVersionFillin is nullptr. A failure to open the bundle is reported
 * through *status; the fill-in is only written when the open succeeded.
 *
 * @param dataVersionFillin receives the version
 * @param status            in/out error code
 */
U_CAPI void U_EXPORT2 u_getDataVersion(UVersionInfo dataVersionFillin, UErrorCode *status) {
    if (U_FAILURE(*status) || dataVersionFillin == nullptr) {
        return;
    }

    UResourceBundle *icudatares = ures_openDirect(nullptr, U_ICU_VERSION_BUNDLE , status);
    if (U_SUCCESS(*status)) {
        ures_getVersionByKey(icudatares, U_ICU_DATA_KEY, dataVersionFillin, status);
    }
    // closed whether or not the open succeeded, as before
    ures_close(icudatares);
}
|
||||
884
engine/thirdparty/icu4c/common/icuplug.cpp
vendored
Normal file
884
engine/thirdparty/icu4c/common/icuplug.cpp
vendored
Normal file
|
|
@ -0,0 +1,884 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* FILE NAME : icuplug.cpp
|
||||
*
|
||||
* Date Name Description
|
||||
* 10/29/2009 sl New.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/icuplug.h"
|
||||
|
||||
|
||||
#if UCONFIG_ENABLE_PLUGINS
|
||||
|
||||
|
||||
#include "icuplugimp.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "putilimp.h"
|
||||
#include "ucln.h"
|
||||
#include <stdio.h>
|
||||
#ifdef __MVS__ /* defined by z/OS compiler */
|
||||
#define _POSIX_SOURCE
|
||||
#include <cics.h> /* 12 Nov 2011 JAM iscics() function */
|
||||
#endif
|
||||
#include "charstr.h"
|
||||
|
||||
using namespace icu;
|
||||
|
||||
#ifndef UPLUG_TRACE
|
||||
#define UPLUG_TRACE 0
|
||||
#endif
|
||||
|
||||
#if UPLUG_TRACE
|
||||
#include <stdio.h>
|
||||
#define DBG(x) fprintf(stderr, "%s:%d: ",__FILE__,__LINE__); fprintf x
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Internal structure of an ICU plugin.
|
||||
*/
|
||||
|
||||
struct UPlugData {
|
||||
UPlugEntrypoint *entrypoint; /**< plugin entrypoint */
|
||||
uint32_t structSize; /**< initialized to the size of this structure */
|
||||
uint32_t token; /**< must be U_PLUG_TOKEN */
|
||||
void *lib; /**< plugin library, or nullptr */
|
||||
char libName[UPLUG_NAME_MAX]; /**< library name */
|
||||
char sym[UPLUG_NAME_MAX]; /**< plugin symbol, or nullptr */
|
||||
char config[UPLUG_NAME_MAX]; /**< configuration data */
|
||||
void *context; /**< user context data */
|
||||
char name[UPLUG_NAME_MAX]; /**< name of plugin */
|
||||
UPlugLevel level; /**< level of plugin */
|
||||
UBool awaitingLoad; /**< true if the plugin is awaiting a load call */
|
||||
UBool dontUnload; /**< true if plugin must stay resident (leak plugin and lib) */
|
||||
UErrorCode pluginStatus; /**< status code of plugin */
|
||||
};
|
||||
|
||||
|
||||
|
||||
#define UPLUG_LIBRARY_INITIAL_COUNT 8
|
||||
#define UPLUG_PLUGIN_INITIAL_COUNT 12
|
||||
|
||||
/**
 * Remove one entry from a packed array of fixed-size members, shifting every
 * later entry down by one slot so the remaining entries stay contiguous.
 *
 * @param list the full list
 * @param listSize the number of entries in the list
 * @param memberSize the size of one member, in bytes
 * @param itemToRemove the index of the member to remove
 * @return the new list size (listSize-1, or listSize unchanged if the list was empty)
 */
static int32_t uplug_removeEntryAt(void *list, int32_t listSize, int32_t memberSize, int32_t itemToRemove) {
    uint8_t *bytePtr = (uint8_t *)list;

    /* get rid of some bad cases first */
    if(listSize<1) {
        return listSize;
    }

    /* is there anything to move? */
    if(listSize > itemToRemove+1) {
        /* Move ALL trailing entries, not just the one right after the gap;
           moving a single member would duplicate it and drop the rest. */
        const int32_t trailingEntries = listSize - (itemToRemove+1);
        memmove(bytePtr+(itemToRemove*memberSize),
                bytePtr+((itemToRemove+1)*memberSize),
                (size_t)trailingEntries*(size_t)memberSize);
    }

    return listSize-1;
}
|
||||
|
||||
|
||||
|
||||
|
||||
#if U_ENABLE_DYLOAD
|
||||
/**
|
||||
* Library management. Internal.
|
||||
* @internal
|
||||
*/
|
||||
struct UPlugLibrary;
|
||||
|
||||
/**
|
||||
* Library management. Internal.
|
||||
* @internal
|
||||
*/
|
||||
typedef struct UPlugLibrary {
|
||||
void *lib; /**< library ptr */
|
||||
char name[UPLUG_NAME_MAX]; /**< library name */
|
||||
uint32_t ref; /**< reference count */
|
||||
} UPlugLibrary;
|
||||
|
||||
static UPlugLibrary staticLibraryList[UPLUG_LIBRARY_INITIAL_COUNT];
|
||||
static UPlugLibrary * libraryList = staticLibraryList;
|
||||
static int32_t libraryCount = 0;
|
||||
static int32_t libraryMax = UPLUG_LIBRARY_INITIAL_COUNT;
|
||||
|
||||
/**
|
||||
* Search for a library. Doesn't lock
|
||||
* @param libName libname to search for
|
||||
* @return the library's struct
|
||||
*/
|
||||
static int32_t searchForLibraryName(const char *libName) {
|
||||
int32_t i;
|
||||
|
||||
for(i=0;i<libraryCount;i++) {
|
||||
if(!uprv_strcmp(libName, libraryList[i].name)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
 * Look up an open library by its handle. Doesn't lock.
 * @param lib library handle to search for
 * @return index of the first matching entry in libraryList, or -1 if none matches
 */
static int32_t searchForLibrary(void *lib) {
    int32_t found = -1;
    for(int32_t idx=0; idx<libraryCount && found<0; ++idx) {
        if(libraryList[idx].lib == lib) {
            found = idx;
        }
    }
    return found;
}
|
||||
|
||||
/**
 * Return the stored name of an open library.
 * @param lib library handle to look up
 * @param status set to U_MISSING_RESOURCE_ERROR if the handle is not in the table
 * @return the library's name, or nullptr on failure
 */
U_CAPI char * U_EXPORT2
uplug_findLibrary(void *lib, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return nullptr;
    }

    const int32_t entry = searchForLibrary(lib);
    if(entry == -1) {
        *status = U_MISSING_RESOURCE_ERROR;
        return nullptr;
    }
    return libraryList[entry].name;
}
|
||||
|
||||
/**
 * Open a library by name, or add a reference to it if it is already open.
 *
 * @param libName library name to load
 * @param status set to U_MEMORY_ALLOCATION_ERROR when the library table is full;
 *               otherwise carries whatever uprv_dl_open() reports
 * @return the library handle, or nullptr on failure
 */
U_CAPI void * U_EXPORT2
uplug_openLibrary(const char *libName, UErrorCode *status) {
    int32_t libEntry = -1;
    void *lib = nullptr;

    if(U_FAILURE(*status)) return nullptr;

    libEntry = searchForLibraryName(libName);
    if(libEntry == -1) {
        /* Check capacity BEFORE claiming a slot: bumping libraryCount first and
           bailing out would leave the count past the end of the table, and the
           search loops would then read out of bounds. */
        if(libraryCount >= libraryMax) {
            /* Ran out of library slots. Statically allocated because we can't depend on allocating memory.. */
            *status = U_MEMORY_ALLOCATION_ERROR;
#if UPLUG_TRACE
            DBG((stderr, "uplug_openLibrary() - out of library slots (max %d)\n", libraryMax));
#endif
            return nullptr;
        }
        libEntry = libraryCount++;

        /* Some operating systems don't want
           DL operations from multiple threads. */
        libraryList[libEntry].lib = uprv_dl_open(libName, status);
#if UPLUG_TRACE
        DBG((stderr, "uplug_openLibrary(%s,%s) libEntry %d, lib %p\n", libName, u_errorName(*status), libEntry, libraryList[libEntry].lib));
#endif

        if(libraryList[libEntry].lib == nullptr || U_FAILURE(*status)) {
            /* cleanup. */
            libraryList[libEntry].lib = nullptr; /* failure with open */
            libraryList[libEntry].name[0] = 0;
#if UPLUG_TRACE
            DBG((stderr, "uplug_openLibrary(%s,%s) libEntry %d, lib %p\n", libName, u_errorName(*status), libEntry, lib));
#endif
            /* no need to free - just won't increase the count. */
            libraryCount--;
        } else { /* is it still there? */
            /* link it in; strncpy does not terminate on truncation, so do it explicitly */
            uprv_strncpy(libraryList[libEntry].name,libName,UPLUG_NAME_MAX-1);
            libraryList[libEntry].name[UPLUG_NAME_MAX-1] = 0;
            libraryList[libEntry].ref=1;
            lib = libraryList[libEntry].lib;
        }

    } else {
        lib = libraryList[libEntry].lib;
        libraryList[libEntry].ref++;
    }
    return lib;
}
|
||||
|
||||
/**
 * Drop one reference to an open library; when the count reaches zero the
 * library is closed and its table entry removed.
 *
 * @param lib library handle to release
 * @param status set to U_INTERNAL_PROGRAM_ERROR if the handle is not in the table
 */
U_CAPI void U_EXPORT2
uplug_closeLibrary(void *lib, UErrorCode *status) {
#if UPLUG_TRACE
    DBG((stderr, "uplug_closeLibrary(%p,%s) list %p\n", lib, u_errorName(*status), (void*)libraryList));
#endif
    if(U_FAILURE(*status)) return;

    const int32_t entry = searchForLibrary(lib);
    if(entry < 0) {
        *status = U_INTERNAL_PROGRAM_ERROR; /* could not find the entry! */
        return;
    }

    UPlugLibrary &slot = libraryList[entry];
    slot.ref -= 1;
    if(slot.ref == 0) {
        uprv_dl_close(slot.lib, status);
        libraryCount = uplug_removeEntryAt(libraryList, libraryCount, sizeof(*libraryList), entry);
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
static UPlugData pluginList[UPLUG_PLUGIN_INITIAL_COUNT];
|
||||
static int32_t pluginCount = 0;
|
||||
|
||||
|
||||
|
||||
|
||||
/**
 * Map a plugin pointer to its index in pluginList.
 * Pointers at or before the first entry map to 0; pointers at or past the end
 * of the active entries map to pluginCount.
 *
 * @param d pointer to a plugin slot
 * @return the index of d within pluginList, clamped as described above
 */
static int32_t uplug_pluginNumber(UPlugData* d) {
    UPlugData *pastPlug = &pluginList[pluginCount];
    if(d<=pluginList) {
        return 0;
    } else if(d>=pastPlug) {
        return pluginCount;
    } else {
        /* Pointer subtraction already yields a count of elements; dividing by
           sizeof() as well would always produce 0 for this small table. */
        return (int32_t)(d-pluginList);
    }
}
|
||||
|
||||
|
||||
/**
 * Iterate over the plugin list.
 * @param prior the previously returned plugin, or nullptr to start
 * @return pluginList when prior is nullptr; otherwise the entry after prior,
 *         or nullptr when that would be past the active entries
 */
U_CAPI UPlugData * U_EXPORT2
uplug_nextPlug(UPlugData *prior) {
    if(prior==nullptr) {
        return pluginList;
    }

    UPlugData *candidate = prior + 1;
    UPlugData *listEnd = pluginList + pluginCount;
    return (candidate < listEnd) ? candidate : nullptr;
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Call the plugin with some params
|
||||
*/
|
||||
static void uplug_callPlug(UPlugData *plug, UPlugReason reason, UErrorCode *status) {
|
||||
UPlugTokenReturn token;
|
||||
if(plug==nullptr||U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
token = (*(plug->entrypoint))(plug, reason, status);
|
||||
if(token!=UPLUG_TOKEN) {
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Send the unload request to a plugin.
 * A plugin still awaiting load is an error; a plugin whose recorded status is
 * a failure is skipped because it never actually loaded.
 */
static void uplug_unloadPlug(UPlugData *plug, UErrorCode *status) {
    if(plug->awaitingLoad) {  /* shouldn't happen. Plugin hasn't been loaded yet.*/
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    if(U_FAILURE(plug->pluginStatus)) {
        /* Don't unload a plug which has a failing load status - means it didn't actually load. */
        return;
    }
    uplug_callPlug(plug, UPLUG_REASON_UNLOAD, status);
}
|
||||
|
||||
/**
 * Send the query request to a freshly allocated plugin so it can report its level.
 * The plugin must still be awaiting load with an unknown level; otherwise
 * U_INTERNAL_PROGRAM_ERROR is set. If the call fails, or the plugin leaves the
 * level untouched, the failure is recorded in pluginStatus and the plugin is
 * no longer marked as awaiting load.
 */
static void uplug_queryPlug(UPlugData *plug, UErrorCode *status) {
    const bool readyForQuery = plug->awaitingLoad && (plug->level == UPLUG_LEVEL_UNKNOWN);
    if(!readyForQuery) {  /* shouldn't happen. Plugin hasn't been loaded yet.*/
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    /* Sentinel: if it is still INVALID after the call, the plugin never set a level. */
    plug->level = UPLUG_LEVEL_INVALID;
    uplug_callPlug(plug, UPLUG_REASON_QUERY, status);

    if(U_FAILURE(*status)) {
        plug->pluginStatus = U_INTERNAL_PROGRAM_ERROR;
        plug->awaitingLoad = false;
    } else if(plug->level == UPLUG_LEVEL_INVALID) {
        plug->pluginStatus = U_PLUGIN_DIDNT_SET_LEVEL;
        plug->awaitingLoad = false;
    }
}
|
||||
|
||||
|
||||
/**
 * Send the load request to a plugin that has been queried.
 * The plugin must be awaiting load and have a level of at least UPLUG_LEVEL_LOW;
 * otherwise U_INTERNAL_PROGRAM_ERROR is set. After the call the plugin is no
 * longer awaiting load, and a failed call is recorded in pluginStatus.
 */
static void uplug_loadPlug(UPlugData *plug, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }

    const bool loadable = plug->awaitingLoad && !(plug->level < UPLUG_LEVEL_LOW);
    if(!loadable) {  /* shouldn't happen. Plugin hasn't been loaded yet.*/
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    uplug_callPlug(plug, UPLUG_REASON_LOAD, status);
    plug->awaitingLoad = false;
    if(U_FAILURE(*status)) {
        plug->pluginStatus = U_INTERNAL_PROGRAM_ERROR;
    }
}
|
||||
|
||||
/**
 * Claim the next free slot in pluginList and reset it to the "empty" state.
 *
 * @param status set to U_MEMORY_ALLOCATION_ERROR if every slot is in use
 * @return the new plugin slot, or nullptr on failure
 */
static UPlugData *uplug_allocateEmptyPlug(UErrorCode *status)
{
    UPlugData *plug = nullptr;

    if(U_FAILURE(*status)) {
        return nullptr;
    }

    if(pluginCount == UPLUG_PLUGIN_INITIAL_COUNT) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    plug = &pluginList[pluginCount++];

    plug->token = UPLUG_TOKEN;
    plug->structSize = sizeof(UPlugData);
    plug->name[0]=0;
    plug->level = UPLUG_LEVEL_UNKNOWN; /* initialize to null state */
    plug->awaitingLoad = true;
    plug->dontUnload = false;
    plug->pluginStatus = U_ZERO_ERROR;
    plug->libName[0] = 0;
    plug->config[0]=0;
    plug->sym[0]=0;
    plug->lib=nullptr;
    plug->entrypoint=nullptr;
    /* Slots are reused after removal; clear the context so a new plugin
       never sees the previous occupant's pointer. */
    plug->context=nullptr;

    return plug;
}
|
||||
|
||||
/**
 * Allocate a plugin slot, fill in its entrypoint, library, symbol name and
 * configuration, and send it the query request.
 *
 * @param entrypoint plugin entrypoint function
 * @param config configuration string, or nullptr
 * @param lib library handle, or nullptr
 * @param symName entrypoint symbol name, or nullptr
 * @param status error code; on entry failure or allocation failure nullptr is returned
 * @return the plugin slot (even if the query failed), or nullptr if none could be allocated
 */
static UPlugData *uplug_allocatePlug(UPlugEntrypoint *entrypoint, const char *config, void *lib, const char *symName,
                                     UErrorCode *status) {
    UPlugData *plug = uplug_allocateEmptyPlug(status);
    if(plug == nullptr || U_FAILURE(*status)) {
        return nullptr;
    }

    /* strncpy does not terminate when the source fills the buffer, so copy
       one byte less and terminate explicitly. */
    if(config!=nullptr) {
        uprv_strncpy(plug->config, config, UPLUG_NAME_MAX-1);
        plug->config[UPLUG_NAME_MAX-1] = 0;
    } else {
        plug->config[0] = 0;
    }

    if(symName!=nullptr) {
        uprv_strncpy(plug->sym, symName, UPLUG_NAME_MAX-1);
        plug->sym[UPLUG_NAME_MAX-1] = 0;
    } else {
        plug->sym[0] = 0;
    }

    plug->entrypoint = entrypoint;
    plug->lib = lib;
    uplug_queryPlug(plug, status);

    return plug;
}
|
||||
|
||||
/**
 * Release a plugin slot.
 * Closes the plugin's library unless dontUnload is set. If everything
 * succeeded the slot is removed from pluginList; otherwise the slot is kept,
 * with its entrypoint cleared, so the failure stays visible.
 */
static void uplug_deallocatePlug(UPlugData *plug, UErrorCode *status) {
    UErrorCode closeStatus = U_ZERO_ERROR;
#if U_ENABLE_DYLOAD
    if(!plug->dontUnload) {
        uplug_closeLibrary(plug->lib, &closeStatus);
    }
#endif
    plug->lib = nullptr;

    /* Report the close failure only if nothing failed earlier. */
    if(U_SUCCESS(*status) && U_FAILURE(closeStatus)) {
        *status = closeStatus;
    }

    if(U_FAILURE(*status)) {
        /* not ok- leave as a message. */
        plug->awaitingLoad=false;
        plug->entrypoint=nullptr;
        plug->dontUnload=true;
        return;
    }

    /* all ok- remove: shift plugins up and decrement count. */
    pluginCount = uplug_removeEntryAt(pluginList, pluginCount, sizeof(plug[0]), uplug_pluginNumber(plug));
}
|
||||
|
||||
/**
 * Unload a plugin and then release its slot. A nullptr plugin is ignored.
 */
static void uplug_doUnloadPlug(UPlugData *plugToRemove, UErrorCode *status) {
    if(plugToRemove == nullptr) {
        return;
    }
    uplug_unloadPlug(plugToRemove, status);
    uplug_deallocatePlug(plugToRemove, status);
}
|
||||
|
||||
/**
 * Unload and remove a plugin, but only if the pointer refers to an entry
 * reachable by iterating the plugin list; unknown pointers are ignored.
 */
U_CAPI void U_EXPORT2
uplug_removePlug(UPlugData *plug, UErrorCode *status)  {
    if(U_FAILURE(*status)) return;

    UPlugData *match = nullptr;
    for(UPlugData *cursor = pluginList; cursor != nullptr; cursor = uplug_nextPlug(cursor)) {
        if(cursor == plug) {
            match = cursor;
            break;
        }
    }

    uplug_doUnloadPlug(match, status);
}
|
||||
|
||||
|
||||
|
||||
|
||||
/** Set whether the plugin (and its library) must stay resident instead of being unloaded. */
U_CAPI void U_EXPORT2
uplug_setPlugNoUnload(UPlugData *data, UBool dontUnload) {
    data->dontUnload = dontUnload;
}
|
||||
|
||||
|
||||
/** Record the level of a plugin. */
U_CAPI void U_EXPORT2
uplug_setPlugLevel(UPlugData *data, UPlugLevel level) {
    data->level = level;
}
|
||||
|
||||
|
||||
/** @return the level currently recorded for the plugin. */
U_CAPI UPlugLevel U_EXPORT2
uplug_getPlugLevel(UPlugData *data) {
    return data->level;
}
|
||||
|
||||
|
||||
/**
 * Set the plugin's name.
 * At most UPLUG_NAME_MAX-1 characters are kept; longer names are truncated.
 */
U_CAPI void U_EXPORT2
uplug_setPlugName(UPlugData *data, const char *name) {
    /* strncpy does not terminate on truncation; terminate explicitly so
       uplug_getPlugName() always returns a valid C string. */
    uprv_strncpy(data->name, name, UPLUG_NAME_MAX-1);
    data->name[UPLUG_NAME_MAX-1] = 0;
}
|
||||
|
||||
|
||||
/** @return the plugin's name buffer. */
U_CAPI const char * U_EXPORT2
uplug_getPlugName(UPlugData *data) {
    return data->name;
}
|
||||
|
||||
|
||||
/** @return the plugin's entrypoint symbol name buffer. */
U_CAPI const char * U_EXPORT2
uplug_getSymbolName(UPlugData *data) {
    return data->sym;
}
|
||||
|
||||
/**
 * Return the name of the library a plugin came from.
 * Uses the name stored on the plugin when there is one; otherwise, when
 * dynamic loading is enabled, looks the library handle up in the library table.
 */
U_CAPI const char * U_EXPORT2
uplug_getLibraryName(UPlugData *data, UErrorCode *status) {
    if(data->libName[0] != 0) {
        return data->libName;
    }
#if U_ENABLE_DYLOAD
    return uplug_findLibrary(data->lib, status);
#else
    return nullptr;
#endif
}
|
||||
|
||||
/** @return the library handle stored on the plugin (may be nullptr). */
U_CAPI void * U_EXPORT2
uplug_getLibrary(UPlugData *data) {
    return data->lib;
}
|
||||
|
||||
/** @return the user context pointer stored on the plugin. */
U_CAPI void * U_EXPORT2
uplug_getContext(UPlugData *data) {
    return data->context;
}
|
||||
|
||||
|
||||
/** Store a user context pointer on the plugin. */
U_CAPI void U_EXPORT2
uplug_setContext(UPlugData *data, void *context) {
    data->context = context;
}
|
||||
|
||||
/** @return the plugin's configuration string buffer. */
U_CAPI const char* U_EXPORT2
uplug_getConfiguration(UPlugData *data) {
    return data->config;
}
|
||||
|
||||
/**
 * Get the n-th active plugin slot.
 * @return the slot, or nullptr when n is outside [0, pluginCount)
 */
U_CAPI UPlugData* U_EXPORT2
uplug_getPlugInternal(int32_t n) {
    const bool inRange = (n >= 0) && (n < pluginCount);
    return inRange ? &(pluginList[n]) : nullptr;
}
|
||||
|
||||
|
||||
/** @return the status code recorded for the plugin. */
U_CAPI UErrorCode U_EXPORT2
uplug_getPlugLoadStatus(UPlugData *plug) {
    return plug->pluginStatus;
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initialize a plugin from an entrypoint and library - but don't load it.
|
||||
*/
|
||||
static UPlugData* uplug_initPlugFromEntrypointAndLibrary(UPlugEntrypoint *entrypoint, const char *config, void *lib, const char *sym,
|
||||
UErrorCode *status) {
|
||||
UPlugData *plug = nullptr;
|
||||
|
||||
plug = uplug_allocatePlug(entrypoint, config, lib, sym, status);
|
||||
|
||||
if(U_SUCCESS(*status)) {
|
||||
return plug;
|
||||
} else {
|
||||
uplug_deallocatePlug(plug, status);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Create a plugin from an in-process entrypoint (no library) and load it.
 * @return the plugin, or nullptr if it could not be initialized
 */
U_CAPI UPlugData* U_EXPORT2
uplug_loadPlugFromEntrypoint(UPlugEntrypoint *entrypoint, const char *config, UErrorCode *status) {
    UPlugData *created = uplug_initPlugFromEntrypointAndLibrary(entrypoint, config, nullptr, nullptr, status);
    uplug_loadPlug(created, status);
    return created;
}
|
||||
|
||||
#if U_ENABLE_DYLOAD
|
||||
|
||||
/**
 * Create a placeholder plugin entry that records a load failure.
 * The entry is marked as not awaiting load and as not unloadable.
 *
 * @param libName library name to record, or nullptr
 * @param sym symbol name to record, or nullptr
 * @param config configuration string to record, or nullptr
 * @param nameOrError text stored as the plugin name (typically an error message), or nullptr
 * @param loadStatus status code stored as the plugin's status
 * @param status error code for the allocation itself
 * @return the placeholder plugin, or nullptr if no slot could be allocated
 */
static UPlugData*
uplug_initErrorPlug(const char *libName, const char *sym, const char *config, const char *nameOrError, UErrorCode loadStatus, UErrorCode *status)
{
    UPlugData *plug = uplug_allocateEmptyPlug(status);
    if(U_FAILURE(*status)) return nullptr;

    plug->pluginStatus = loadStatus;
    plug->awaitingLoad = false; /* Won't load. */
    plug->dontUnload = true; /* cannot unload. */

    /* strncpy does not terminate when the source fills the buffer, so each
       copy keeps one byte back and terminates explicitly. */
    if(sym!=nullptr) {
        uprv_strncpy(plug->sym, sym, UPLUG_NAME_MAX-1);
        plug->sym[UPLUG_NAME_MAX-1] = 0;
    }

    if(libName!=nullptr) {
        uprv_strncpy(plug->libName, libName, UPLUG_NAME_MAX-1);
        plug->libName[UPLUG_NAME_MAX-1] = 0;
    }

    if(nameOrError!=nullptr) {
        uprv_strncpy(plug->name, nameOrError, UPLUG_NAME_MAX-1);
        plug->name[UPLUG_NAME_MAX-1] = 0;
    }

    if(config!=nullptr) {
        uprv_strncpy(plug->config, config, UPLUG_NAME_MAX-1);
        plug->config[UPLUG_NAME_MAX-1] = 0;
    }

    return plug;
}
|
||||
|
||||
/**
|
||||
* Fetch a plugin from DLL, and then initialize it from a library- but don't load it.
|
||||
*/
|
||||
static UPlugData*
|
||||
uplug_initPlugFromLibrary(const char *libName, const char *sym, const char *config, UErrorCode *status) {
|
||||
void *lib = nullptr;
|
||||
UPlugData *plug = nullptr;
|
||||
if(U_FAILURE(*status)) { return nullptr; }
|
||||
lib = uplug_openLibrary(libName, status);
|
||||
if(lib!=nullptr && U_SUCCESS(*status)) {
|
||||
UPlugEntrypoint *entrypoint = nullptr;
|
||||
entrypoint = (UPlugEntrypoint*)uprv_dlsym_func(lib, sym, status);
|
||||
|
||||
if(entrypoint!=nullptr&&U_SUCCESS(*status)) {
|
||||
plug = uplug_initPlugFromEntrypointAndLibrary(entrypoint, config, lib, sym, status);
|
||||
if(plug!=nullptr&&U_SUCCESS(*status)) {
|
||||
plug->lib = lib; /* plug takes ownership of library */
|
||||
lib = nullptr; /* library is now owned by plugin. */
|
||||
}
|
||||
} else {
|
||||
UErrorCode subStatus = U_ZERO_ERROR;
|
||||
plug = uplug_initErrorPlug(libName,sym,config,"ERROR: Could not load entrypoint",(lib==nullptr)?U_MISSING_RESOURCE_ERROR:*status,&subStatus);
|
||||
}
|
||||
if(lib!=nullptr) { /* still need to close the lib */
|
||||
UErrorCode subStatus = U_ZERO_ERROR;
|
||||
uplug_closeLibrary(lib, &subStatus); /* don't care here */
|
||||
}
|
||||
} else {
|
||||
UErrorCode subStatus = U_ZERO_ERROR;
|
||||
plug = uplug_initErrorPlug(libName,sym,config,"ERROR: could not load library",(lib==nullptr)?U_MISSING_RESOURCE_ERROR:*status,&subStatus);
|
||||
}
|
||||
return plug;
|
||||
}
|
||||
|
||||
/**
 * Create a plugin from a named library and entrypoint symbol, then load it.
 * @return the plugin (or error placeholder) produced by initialization
 */
U_CAPI UPlugData* U_EXPORT2
uplug_loadPlugFromLibrary(const char *libName, const char *sym, const char *config, UErrorCode *status) {
    if(U_FAILURE(*status)) { return nullptr; }

    UPlugData *created = uplug_initPlugFromLibrary(libName, sym, config, status);
    uplug_loadPlug(created, status);
    return created;
}
|
||||
|
||||
#endif
|
||||
|
||||
static UPlugLevel gCurrentLevel = UPLUG_LEVEL_LOW;
|
||||
|
||||
/** @return the current value of gCurrentLevel. */
U_CAPI UPlugLevel U_EXPORT2
uplug_getCurrentLevel() {
    return gCurrentLevel;
}
|
||||
|
||||
/**
 * Cleanup callback: unload and deallocate every plugin, then reset the
 * current level to UPLUG_LEVEL_LOW.
 * @return always true
 */
static UBool U_CALLCONV uplug_cleanup()
{
    /* cleanup plugs.
       Walk from the last entry to the first: a successful unload removes the
       entry and shifts later entries down, so a forward walk would skip the
       entry that slides into the current index. Removing the last visited
       entry never disturbs the entries still to be visited. */
    for(int32_t i=pluginCount-1; i>=0; i--) {
        UErrorCode subStatus = U_ZERO_ERROR;
        UPlugData *pluginToRemove = &pluginList[i];
        /* unload and deallocate */
        uplug_doUnloadPlug(pluginToRemove, &subStatus);
    }
    /* close other held libs? */
    gCurrentLevel = UPLUG_LEVEL_LOW;
    return true;
}
|
||||
|
||||
#if U_ENABLE_DYLOAD
|
||||
|
||||
/**
 * Load every plugin that is still awaiting load, in two passes:
 * first the UPLUG_LEVEL_LOW plugins, then everything else.
 * Per-plugin failures are recorded on the plugin, not in status.
 */
static void uplug_loadWaitingPlugs(UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }

    UPlugLevel currentLevel = uplug_getCurrentLevel();
#if UPLUG_TRACE
    DBG((stderr,  "uplug_loadWaitingPlugs() Level: %d\n", currentLevel));
#endif

    /* pass #1: low level plugs */
    for(int32_t idx=0; idx<pluginCount; idx++) {
        UErrorCode subStatus = U_ZERO_ERROR;
        UPlugData *candidate = &pluginList[idx];
        if(!candidate->awaitingLoad || candidate->level != UPLUG_LEVEL_LOW) {
            continue;
        }

        if(currentLevel > UPLUG_LEVEL_LOW) {
            /* too late for a low-level plugin */
            candidate->pluginStatus = U_PLUGIN_TOO_HIGH;
        } else {
            uplug_loadPlug(candidate, &subStatus);
            const UPlugLevel newLevel = uplug_getCurrentLevel();
            if(newLevel > currentLevel) {
                candidate->pluginStatus = U_PLUGIN_CHANGED_LEVEL_WARNING;
                currentLevel = newLevel;
            }
        }
        candidate->awaitingLoad = false;
    }

    /* pass #2: everything that is still waiting */
    for(int32_t idx=0; idx<pluginCount; idx++) {
        UErrorCode subStatus = U_ZERO_ERROR;
        UPlugData *candidate = &pluginList[idx];
        if(!candidate->awaitingLoad) {
            continue;
        }

        switch(candidate->level) {
        case UPLUG_LEVEL_INVALID:
            candidate->pluginStatus = U_PLUGIN_DIDNT_SET_LEVEL;
            break;
        case UPLUG_LEVEL_UNKNOWN:
            candidate->pluginStatus = U_INTERNAL_PROGRAM_ERROR;
            break;
        default:
            uplug_loadPlug(candidate, &subStatus);
            break;
        }
        candidate->awaitingLoad = false;
    }

#if UPLUG_TRACE
    DBG((stderr,  " Done Loading Plugs. Level: %d\n", (int32_t)uplug_getCurrentLevel()));
#endif
}
|
||||
|
||||
/* Name of the plugin config file */
|
||||
static char plugin_file[2048] = "";
|
||||
#endif
|
||||
|
||||
/**
 * Get the name of the plugin config file.
 * @return the plugin_file buffer when dynamic loading and file I/O are both
 *         enabled; nullptr otherwise
 */
U_CAPI const char* U_EXPORT2
uplug_getPluginFile() {
#if U_ENABLE_DYLOAD && !UCONFIG_NO_FILE_IO
    return plugin_file;
#else
    return nullptr;
#endif
}
|
||||
|
||||
|
||||
// uplug_init() is called first thing from u_init().
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uplug_init(UErrorCode *status) {
|
||||
#if !U_ENABLE_DYLOAD
|
||||
(void)status; /* unused */
|
||||
#elif !UCONFIG_NO_FILE_IO
|
||||
CharString plugin_dir;
|
||||
const char *env = getenv("ICU_PLUGINS");
|
||||
|
||||
if(U_FAILURE(*status)) return;
|
||||
if(env != nullptr) {
|
||||
plugin_dir.append(env, -1, *status);
|
||||
}
|
||||
if(U_FAILURE(*status)) return;
|
||||
|
||||
#if defined(DEFAULT_ICU_PLUGINS)
|
||||
if(plugin_dir.isEmpty()) {
|
||||
plugin_dir.append(DEFAULT_ICU_PLUGINS, -1, *status);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if UPLUG_TRACE
|
||||
DBG((stderr, "ICU_PLUGINS=%s\n", plugin_dir.data()));
|
||||
#endif
|
||||
|
||||
if(!plugin_dir.isEmpty()) {
|
||||
FILE *f;
|
||||
|
||||
CharString pluginFile;
|
||||
#ifdef OS390BATCH
|
||||
/* There are potentially a lot of ways to implement a plugin directory on OS390/zOS */
|
||||
/* Keeping in mind that unauthorized file access is logged, monitored, and enforced */
|
||||
/* I've chosen to open a DDNAME if BATCH and leave it alone for (presumably) UNIX */
|
||||
/* System Services. Alternative techniques might be allocating a member in */
|
||||
/* SYS1.PARMLIB or setting an environment variable "ICU_PLUGIN_PATH" (?). The */
|
||||
/* DDNAME can be connected to a file in the HFS if need be. */
|
||||
|
||||
pluginFile.append("//DD:ICUPLUG", -1, *status); /* JAM 20 Oct 2011 */
|
||||
#else
|
||||
pluginFile.append(plugin_dir, *status);
|
||||
pluginFile.append(U_FILE_SEP_STRING, -1, *status);
|
||||
pluginFile.append("icuplugins", -1, *status);
|
||||
pluginFile.append(U_ICU_VERSION_SHORT, -1, *status);
|
||||
pluginFile.append(".txt", -1, *status);
|
||||
#endif
|
||||
|
||||
#if UPLUG_TRACE
|
||||
DBG((stderr, "status=%s\n", u_errorName(*status)));
|
||||
#endif
|
||||
|
||||
if(U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
if((size_t)pluginFile.length() > (sizeof(plugin_file)-1)) {
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
#if UPLUG_TRACE
|
||||
DBG((stderr, "status=%s\n", u_errorName(*status)));
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
/* plugin_file is not used for processing - it is only used
|
||||
so that uplug_getPluginFile() works (i.e. icuinfo)
|
||||
*/
|
||||
pluginFile.extract(plugin_file, sizeof(plugin_file), *status);
|
||||
|
||||
#if UPLUG_TRACE
|
||||
DBG((stderr, "pluginfile= %s len %d/%d\n", plugin_file, (int)strlen(plugin_file), (int)sizeof(plugin_file)));
|
||||
#endif
|
||||
|
||||
#ifdef __MVS__
|
||||
if (iscics()) /* 12 Nov 2011 JAM */
|
||||
{
|
||||
f = nullptr;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
f = fopen(pluginFile.data(), "r");
|
||||
}
|
||||
|
||||
if(f != nullptr) {
|
||||
char linebuf[1024];
|
||||
char *p, *libName=nullptr, *symName=nullptr, *config=nullptr;
|
||||
int32_t line = 0;
|
||||
|
||||
|
||||
while(fgets(linebuf,1023,f)) {
|
||||
line++;
|
||||
|
||||
if(!*linebuf || *linebuf=='#') {
|
||||
continue;
|
||||
} else {
|
||||
p = linebuf;
|
||||
while(*p&&isspace((int)*p))
|
||||
p++;
|
||||
if(!*p || *p=='#') continue;
|
||||
libName = p;
|
||||
while(*p&&!isspace((int)*p)) {
|
||||
p++;
|
||||
}
|
||||
if(!*p || *p=='#') continue; /* no tab after libname */
|
||||
*p=0; /* end of libname */
|
||||
p++;
|
||||
while(*p&&isspace((int)*p)) {
|
||||
p++;
|
||||
}
|
||||
if(!*p||*p=='#') continue; /* no symname after libname +tab */
|
||||
symName = p;
|
||||
while(*p&&!isspace((int)*p)) {
|
||||
p++;
|
||||
}
|
||||
|
||||
if(*p) { /* has config */
|
||||
*p=0;
|
||||
++p;
|
||||
while(*p&&isspace((int)*p)) {
|
||||
p++;
|
||||
}
|
||||
if(*p) {
|
||||
config = p;
|
||||
}
|
||||
}
|
||||
|
||||
/* chop whitespace at the end of the config */
|
||||
if(config!=nullptr&&*config!=0) {
|
||||
p = config+strlen(config);
|
||||
while(p>config&&isspace((int)*(--p))) {
|
||||
*p=0;
|
||||
}
|
||||
}
|
||||
|
||||
/* OK, we're good. */
|
||||
{
|
||||
UErrorCode subStatus = U_ZERO_ERROR;
|
||||
UPlugData *plug = uplug_initPlugFromLibrary(libName, symName, config, &subStatus);
|
||||
if(U_FAILURE(subStatus) && U_SUCCESS(*status)) {
|
||||
*status = subStatus;
|
||||
}
|
||||
#if UPLUG_TRACE
|
||||
DBG((stderr, "PLUGIN libName=[%s], sym=[%s], config=[%s]\n", libName, symName, config));
|
||||
DBG((stderr, " -> %p, %s\n", (void*)plug, u_errorName(subStatus)));
|
||||
#else
|
||||
(void)plug; /* unused */
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
} else {
|
||||
#if UPLUG_TRACE
|
||||
DBG((stderr, "Can't open plugin file %s\n", plugin_file));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
uplug_loadWaitingPlugs(status);
|
||||
#endif /* U_ENABLE_DYLOAD */
|
||||
gCurrentLevel = UPLUG_LEVEL_HIGH;
|
||||
ucln_registerCleanup(UCLN_UPLUG, uplug_cleanup);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
93
engine/thirdparty/icu4c/common/icuplugimp.h
vendored
Normal file
93
engine/thirdparty/icu4c/common/icuplugimp.h
vendored
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* FILE NAME : icuplugimp.h
|
||||
*
|
||||
* Internal functions for the ICU plugin system
|
||||
*
|
||||
* Date Name Description
|
||||
* 10/29/2009 sl New.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#ifndef ICUPLUGIMP_H
|
||||
#define ICUPLUGIMP_H
|
||||
|
||||
#include "unicode/icuplug.h"
|
||||
|
||||
#if UCONFIG_ENABLE_PLUGINS
|
||||
|
||||
/*========================*/
|
||||
/** @{ Library Manipulation
|
||||
*/
|
||||
|
||||
/**
|
||||
* Open a library, adding a reference count if needed.
|
||||
* @param libName library name to load
|
||||
* @param status error code
|
||||
* @return the library pointer, or NULL
|
||||
* @internal internal use only
|
||||
*/
|
||||
U_CAPI void * U_EXPORT2
|
||||
uplug_openLibrary(const char *libName, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Release one reference to a library; the library is closed when its reference count reaches 0
|
||||
* @param lib the library to close
|
||||
* @param status error code
|
||||
* @internal internal use only
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uplug_closeLibrary(void *lib, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get a library's name, or NULL if not found.
|
||||
* @param lib the library pointer to look up
|
||||
* @param status error code
|
||||
* @return the library name, or NULL if not found.
|
||||
* @internal internal use only
|
||||
*/
|
||||
U_CAPI char * U_EXPORT2
|
||||
uplug_findLibrary(void *lib, UErrorCode *status);
|
||||
|
||||
/** @} */
|
||||
|
||||
/*========================*/
|
||||
/** @{ ICU Plugin internal interfaces
|
||||
*/
|
||||
|
||||
/**
|
||||
* Initialize the plugins
|
||||
* @param status error result
|
||||
* @internal - Internal use only.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uplug_init(UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get raw plug N
|
||||
* @internal - Internal use only
|
||||
*/
|
||||
U_CAPI UPlugData* U_EXPORT2
|
||||
uplug_getPlugInternal(int32_t n);
|
||||
|
||||
/**
|
||||
* Get the name of the plugin file.
|
||||
* @internal - Internal use only.
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2
|
||||
uplug_getPluginFile(void);
|
||||
|
||||
/** @} */
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
447
engine/thirdparty/icu4c/common/loadednormalizer2impl.cpp
vendored
Normal file
447
engine/thirdparty/icu4c/common/loadednormalizer2impl.cpp
vendored
Normal file
|
|
@ -0,0 +1,447 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* loadednormalizer2impl.cpp
|
||||
*
|
||||
* created on: 2014sep03
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cstring.h"
|
||||
#include "mutex.h"
|
||||
#include "norm2allmodes.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class LoadedNormalizer2Impl : public Normalizer2Impl {
|
||||
public:
|
||||
LoadedNormalizer2Impl() : memory(nullptr), ownedTrie(nullptr) {}
|
||||
virtual ~LoadedNormalizer2Impl();
|
||||
|
||||
void load(const char *packageName, const char *name, UErrorCode &errorCode);
|
||||
|
||||
private:
|
||||
static UBool U_CALLCONV
|
||||
isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
|
||||
|
||||
UDataMemory *memory;
|
||||
UCPTrie *ownedTrie;
|
||||
};
|
||||
|
||||
// Releases the data memory and the trie opened by load().
LoadedNormalizer2Impl::~LoadedNormalizer2Impl() {
    udata_close(memory);
    ucptrie_close(ownedTrie);
}
|
||||
|
||||
// Accepts a data item only if its header matches this platform's endianness and
// charset family, carries the "Nrm2" data format, and has format version 4.
UBool U_CALLCONV
LoadedNormalizer2Impl::isAcceptable(void * /*context*/,
                                    const char * /* type */, const char * /*name*/,
                                    const UDataInfo *pInfo) {
    const bool platformMatches =
        pInfo->size>=20 &&
        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily==U_CHARSET_FAMILY;
    const bool formatMatches =
        pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
        pInfo->dataFormat[1]==0x72 &&
        pInfo->dataFormat[2]==0x6d &&
        pInfo->dataFormat[3]==0x32 &&
        pInfo->formatVersion[0]==4;
    // Normalizer2Impl *me=(Normalizer2Impl *)context;
    // uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
    return platformMatches && formatMatches;
}
|
||||
|
||||
void
|
||||
LoadedNormalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
|
||||
const int32_t *inIndexes=(const int32_t *)inBytes;
|
||||
int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
|
||||
if(indexesLength<=IX_MIN_LCCC_CP) {
|
||||
errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
|
||||
int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
|
||||
ownedTrie=ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16,
|
||||
inBytes+offset, nextOffset-offset, nullptr,
|
||||
&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
offset=nextOffset;
|
||||
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
|
||||
const uint16_t *inExtraData=(const uint16_t *)(inBytes+offset);
|
||||
|
||||
// smallFCD: new in formatVersion 2
|
||||
offset=nextOffset;
|
||||
const uint8_t *inSmallFCD=inBytes+offset;
|
||||
|
||||
init(inIndexes, ownedTrie, inExtraData, inSmallFCD);
|
||||
}
|
||||
|
||||
// instance cache ---------------------------------------------------------- ***
|
||||
|
||||
// Creates a LoadedNormalizer2Impl, loads the named data into it, and hands it
// to the createInstance(impl, errorCode) overload.
Norm2AllModes *
Norm2AllModes::createInstance(const char *packageName,
                              const char *name,
                              UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }

    LoadedNormalizer2Impl *loaded=new LoadedNormalizer2Impl;
    if(loaded==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    loaded->load(packageName, name, errorCode);
    return createInstance(loaded, errorCode);
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV uprv_loaded_normalizer2_cleanup();
|
||||
U_CDECL_END
|
||||
|
||||
#if !NORM2_HARDCODE_NFC_DATA
|
||||
static Norm2AllModes *nfcSingleton;
|
||||
static icu::UInitOnce nfcInitOnce {};
|
||||
#endif
|
||||
|
||||
static Norm2AllModes *nfkcSingleton;
|
||||
static icu::UInitOnce nfkcInitOnce {};
|
||||
|
||||
static Norm2AllModes *nfkc_cfSingleton;
|
||||
static icu::UInitOnce nfkc_cfInitOnce {};
|
||||
|
||||
static Norm2AllModes *nfkc_scfSingleton;
|
||||
static icu::UInitOnce nfkc_scfInitOnce {};
|
||||
|
||||
static UHashtable *cache=nullptr;
|
||||
|
||||
// UInitOnce singleton initialization function
|
||||
/**
 * UInitOnce callback: creates the singleton selected by `what`
 * ("nfc" when NFC data is not hardcoded, "nfkc", "nfkc_cf" or "nfkc_scf")
 * and registers the cleanup function for this module.
 * Any other name hits UPRV_UNREACHABLE_EXIT.
 */
static void U_CALLCONV initSingletons(const char *what, UErrorCode &errorCode) {
    bool handled = false;
#if !NORM2_HARDCODE_NFC_DATA
    if (0 == uprv_strcmp(what, "nfc")) {
        nfcSingleton = Norm2AllModes::createInstance(nullptr, "nfc", errorCode);
        handled = true;
    }
#endif
    if (!handled) {
        if (0 == uprv_strcmp(what, "nfkc")) {
            nfkcSingleton = Norm2AllModes::createInstance(nullptr, "nfkc", errorCode);
        } else if (0 == uprv_strcmp(what, "nfkc_cf")) {
            nfkc_cfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_cf", errorCode);
        } else if (0 == uprv_strcmp(what, "nfkc_scf")) {
            nfkc_scfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_scf", errorCode);
        } else {
            UPRV_UNREACHABLE_EXIT;   // Unknown singleton
        }
    }
    ucln_common_registerCleanup(UCLN_COMMON_LOADED_NORMALIZER2, uprv_loaded_normalizer2_cleanup);
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/** Value deleter for the cache hashtable: destroys one Norm2AllModes. */
static void U_CALLCONV deleteNorm2AllModes(void *allModes) {
    Norm2AllModes *modes = static_cast<Norm2AllModes *>(allModes);
    delete modes;
}
|
||||
|
||||
/**
 * Library cleanup hook: deletes every singleton created by initSingletons(),
 * resets the matching UInitOnce objects so they can be created again, and
 * closes the by-name instance cache. Always returns true.
 */
static UBool U_CALLCONV uprv_loaded_normalizer2_cleanup() {
#if !NORM2_HARDCODE_NFC_DATA
    // NFC singleton (only exists when the NFC data is not hardcoded).
    delete nfcSingleton;
    nfcSingleton = nullptr;
    nfcInitOnce.reset();
#endif

    // NFKC singleton.
    delete nfkcSingleton;
    nfkcSingleton = nullptr;
    nfkcInitOnce.reset();

    // NFKC_Casefold singleton.
    delete nfkc_cfSingleton;
    nfkc_cfSingleton = nullptr;
    nfkc_cfInitOnce.reset();

    // NFKC_SimpleCasefold singleton.
    delete nfkc_scfSingleton;
    nfkc_scfSingleton = nullptr;
    nfkc_scfInitOnce.reset();

    // Instances loaded by name through Normalizer2::getInstance().
    uhash_close(cache);
    cache = nullptr;

    return true;
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
#if !NORM2_HARDCODE_NFC_DATA
|
||||
/** Returns the lazily created NFC singleton, or nullptr if errorCode is already a failure. */
const Norm2AllModes *
Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfcInitOnce, &initSingletons, "nfc", errorCode);
        return nfcSingleton;
    }
    return nullptr;
}
|
||||
#endif
|
||||
|
||||
/** Returns the lazily created NFKC singleton, or nullptr if errorCode is already a failure. */
const Norm2AllModes *
Norm2AllModes::getNFKCInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfkcInitOnce, &initSingletons, "nfkc", errorCode);
        return nfkcSingleton;
    }
    return nullptr;
}
|
||||
|
||||
/** Returns the lazily created NFKC_Casefold singleton, or nullptr if errorCode is already a failure. */
const Norm2AllModes *
Norm2AllModes::getNFKC_CFInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfkc_cfInitOnce, &initSingletons, "nfkc_cf", errorCode);
        return nfkc_cfSingleton;
    }
    return nullptr;
}
|
||||
|
||||
/** Returns the lazily created NFKC_SimpleCasefold singleton, or nullptr if errorCode is already a failure. */
const Norm2AllModes *
Norm2AllModes::getNFKC_SCFInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfkc_scfInitOnce, &initSingletons, "nfkc_scf", errorCode);
        return nfkc_scfSingleton;
    }
    return nullptr;
}
|
||||
|
||||
#if !NORM2_HARDCODE_NFC_DATA
|
||||
/** Returns the `comp` member of the NFC singleton, or nullptr if it is unavailable. */
const Normalizer2 *
Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->comp;
}
|
||||
|
||||
/** Returns the `decomp` member of the NFC singleton, or nullptr if it is unavailable. */
const Normalizer2 *
Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->decomp;
}
|
||||
|
||||
/** Returns the `fcd` member of the NFC singleton, or nullptr if it is unavailable. */
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->fcd;
}
|
||||
|
||||
/** Returns the `fcc` member of the NFC singleton, or nullptr if it is unavailable. */
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->fcc;
}
|
||||
|
||||
/** Returns the `impl` pointer of the NFC singleton, or nullptr if it is unavailable. */
const Normalizer2Impl *
Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return modes->impl;
}
|
||||
#endif
|
||||
|
||||
/** Returns the `comp` member of the NFKC singleton, or nullptr if it is unavailable. */
const Normalizer2 *
Normalizer2::getNFKCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFKCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->comp;
}
|
||||
|
||||
/** Returns the `decomp` member of the NFKC singleton, or nullptr if it is unavailable. */
const Normalizer2 *
Normalizer2::getNFKDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFKCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->decomp;
}
|
||||
|
||||
/** Returns the `comp` member of the NFKC_Casefold singleton, or nullptr if it is unavailable. */
const Normalizer2 *
Normalizer2::getNFKCCasefoldInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFKC_CFInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->comp;
}
|
||||
|
||||
/** Returns the `comp` member of the NFKC_SimpleCasefold singleton, or nullptr if it is unavailable. */
const Normalizer2 *
Normalizer2::getNFKCSimpleCasefoldInstance(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFKC_SCFInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return &modes->comp;
}
|
||||
|
||||
/**
 * Returns the normalizer for data file `name` (in `packageName`) and `mode`.
 *
 * With a null packageName the names "nfc", "nfkc", "nfkc_cf" and "nfkc_scf"
 * resolve to the built-in singletons. Anything else is looked up in, and on a
 * miss loaded into, the global by-name cache.
 *
 * Returns nullptr on failure (errorCode set) or for an unrecognized mode.
 * A null or empty name yields U_ILLEGAL_ARGUMENT_ERROR.
 */
const Normalizer2 *
Normalizer2::getInstance(const char *packageName,
                         const char *name,
                         UNormalization2Mode mode,
                         UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    if(name==nullptr || *name==0) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    const Norm2AllModes *modes=nullptr;
    if(packageName==nullptr) {
        // Built-in data: use the dedicated singletons.
        if(uprv_strcmp(name, "nfc")==0) {
            modes=Norm2AllModes::getNFCInstance(errorCode);
        } else if(uprv_strcmp(name, "nfkc")==0) {
            modes=Norm2AllModes::getNFKCInstance(errorCode);
        } else if(uprv_strcmp(name, "nfkc_cf")==0) {
            modes=Norm2AllModes::getNFKC_CFInstance(errorCode);
        } else if(uprv_strcmp(name, "nfkc_scf")==0) {
            modes=Norm2AllModes::getNFKC_SCFInstance(errorCode);
        }
    }

    if(modes==nullptr && U_SUCCESS(errorCode)) {
        // First attempt: look in the cache while holding the mutex.
        {
            Mutex lock;
            if(cache!=nullptr) {
                modes=static_cast<Norm2AllModes *>(uhash_get(cache, name));
            }
        }
        if(modes==nullptr) {
            // Cache miss: load the data outside the mutex, then publish it.
            ucln_common_registerCleanup(UCLN_COMMON_LOADED_NORMALIZER2, uprv_loaded_normalizer2_cleanup);
            LocalPointer<Norm2AllModes> created(
                Norm2AllModes::createInstance(packageName, name, errorCode));
            if(U_SUCCESS(errorCode)) {
                Mutex lock;
                if(cache==nullptr) {
                    cache=uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &errorCode);
                    if(U_FAILURE(errorCode)) {
                        return nullptr;
                    }
                    uhash_setKeyDeleter(cache, uprv_free);
                    uhash_setValueDeleter(cache, deleteNorm2AllModes);
                }
                void *existing=uhash_get(cache, name);
                if(existing!=nullptr) {
                    // race condition: another thread cached this name first;
                    // use its instance and let `created` delete ours.
                    modes=static_cast<Norm2AllModes *>(existing);
                } else {
                    // The cache owns its keys (uprv_free deleter), so store a copy of the name.
                    int32_t keyLength= static_cast<int32_t>(uprv_strlen(name)+1);
                    char *nameCopy=static_cast<char *>(uprv_malloc(keyLength));
                    if(nameCopy==nullptr) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return nullptr;
                    }
                    uprv_memcpy(nameCopy, name, keyLength);
                    modes=created.getAlias();
                    uhash_put(cache, nameCopy, created.orphan(), &errorCode);
                }
            }
        }
    }

    if(modes==nullptr || U_FAILURE(errorCode)) {
        return nullptr;
    }
    switch(mode) {
    case UNORM2_COMPOSE:
        return &modes->comp;
    case UNORM2_DECOMPOSE:
        return &modes->decomp;
    case UNORM2_FCD:
        return &modes->fcd;
    case UNORM2_COMPOSE_CONTIGUOUS:
        return &modes->fcc;
    default:
        return nullptr;  // unrecognized mode
    }
}
|
||||
|
||||
/**
 * Maps a legacy UNormalizationMode to the matching Normalizer2 instance.
 * Modes without an explicit case (such as UNORM_NONE) get the no-op instance.
 * Returns nullptr if errorCode is already a failure.
 */
const Normalizer2 *
Normalizer2Factory::getInstance(UNormalizationMode mode, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    switch(mode) {
    case UNORM_NFC:
        return Normalizer2::getNFCInstance(errorCode);
    case UNORM_NFD:
        return Normalizer2::getNFDInstance(errorCode);
    case UNORM_NFKC:
        return Normalizer2::getNFKCInstance(errorCode);
    case UNORM_NFKD:
        return Normalizer2::getNFKDInstance(errorCode);
    case UNORM_FCD:
        return getFCDInstance(errorCode);
    default:  // UNORM_NONE
        return getNoopInstance(errorCode);
    }
}
|
||||
|
||||
/** Returns the `impl` pointer of the NFKC singleton, or nullptr if it is unavailable. */
const Normalizer2Impl *
Normalizer2Factory::getNFKCImpl(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFKCInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return modes->impl;
}
|
||||
|
||||
/** Returns the `impl` pointer of the NFKC_Casefold singleton, or nullptr if it is unavailable. */
const Normalizer2Impl *
Normalizer2Factory::getNFKC_CFImpl(UErrorCode &errorCode) {
    const Norm2AllModes *modes=Norm2AllModes::getNFKC_CFInstance(errorCode);
    if(modes==nullptr) {
        return nullptr;
    }
    return modes->impl;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/** C wrapper: forwards to Normalizer2::getNFKCInstance() and returns the result as a UNormalizer2 pointer. */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *norm2=Normalizer2::getNFKCInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(norm2);
}
|
||||
|
||||
/** C wrapper: forwards to Normalizer2::getNFKDInstance() and returns the result as a UNormalizer2 pointer. */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKDInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *norm2=Normalizer2::getNFKDInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(norm2);
}
|
||||
|
||||
/** C wrapper: forwards to Normalizer2::getNFKCCasefoldInstance() and returns the result as a UNormalizer2 pointer. */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *norm2=Normalizer2::getNFKCCasefoldInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(norm2);
}
|
||||
|
||||
/** C wrapper: forwards to Normalizer2::getNFKCSimpleCasefoldInstance() and returns the result as a UNormalizer2 pointer. */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCSimpleCasefoldInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *norm2=Normalizer2::getNFKCSimpleCasefoldInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(norm2);
}
|
||||
|
||||
/** C wrapper: forwards all arguments to Normalizer2::getInstance() and returns the result as a UNormalizer2 pointer. */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getInstance(const char *packageName,
                   const char *name,
                   UNormalization2Mode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *norm2=Normalizer2::getInstance(packageName, name, mode, *pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(norm2);
}
|
||||
|
||||
/**
 * Quick-check value of code point `c` for a legacy normalization mode.
 * Modes outside the open range (UNORM_NONE, UNORM_FCD) always yield UNORM_YES.
 * If the normalizer cannot be obtained the answer is UNORM_MAYBE.
 */
U_CFUNC UNormalizationCheckResult
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
    const bool modeHasQuickCheck = UNORM_NONE<mode && mode<UNORM_FCD;
    if(!modeHasQuickCheck) {
        return UNORM_YES;
    }
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *norm2=Normalizer2Factory::getInstance(mode, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    return ((const Normalizer2WithImpl *)norm2)->getQuickCheck(c);
}
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
480
engine/thirdparty/icu4c/common/localebuilder.cpp
vendored
Normal file
480
engine/thirdparty/icu4c/common/localebuilder.cpp
vendored
Normal file
|
|
@ -0,0 +1,480 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "bytesinkutil.h" // StringByteSink<CharString>
|
||||
#include "charstr.h"
|
||||
#include "cstring.h"
|
||||
#include "ulocimp.h"
|
||||
#include "unicode/localebuilder.h"
|
||||
#include "unicode/locid.h"
|
||||
|
||||
namespace {
|
||||
|
||||
inline bool UPRV_ISDIGIT(char c) { return c >= '0' && c <= '9'; }
|
||||
inline bool UPRV_ISALPHANUM(char c) { return uprv_isASCIILetter(c) || UPRV_ISDIGIT(c); }
|
||||
|
||||
constexpr const char* kAttributeKey = "attribute";
|
||||
|
||||
bool _isExtensionSubtags(char key, const char* s, int32_t len) {
|
||||
switch (uprv_tolower(key)) {
|
||||
case 'u':
|
||||
return ultag_isUnicodeExtensionSubtags(s, len);
|
||||
case 't':
|
||||
return ultag_isTransformedExtensionSubtags(s, len);
|
||||
case 'x':
|
||||
return ultag_isPrivateuseValueSubtags(s, len);
|
||||
default:
|
||||
return ultag_isExtensionSubtags(s, len);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/** Creates an empty builder: no error, empty language/script/region, no variant, no extensions. */
LocaleBuilder::LocaleBuilder() : UObject(), status_(U_ZERO_ERROR), language_(),
    script_(), region_(), variant_(nullptr), extensions_(nullptr)
{
    // Make the empty state explicit for each fixed-size subtag buffer.
    language_[0] = '\0';
    script_[0] = '\0';
    region_[0] = '\0';
}
|
||||
|
||||
/** Releases the heap-allocated variant string and extensions locale. */
LocaleBuilder::~LocaleBuilder()
{
    delete extensions_;
    delete variant_;
}
|
||||
|
||||
/**
 * Resets the builder (including its error status) and copies language,
 * script, region and variant from `locale`. A clone of the whole locale is
 * kept as the source of extensions.
 * Records U_MEMORY_ALLOCATION_ERROR if the clone fails.
 */
LocaleBuilder& LocaleBuilder::setLocale(const Locale& locale)
{
    clear();

    // Each setter validates its subtag and records a failure in status_.
    setLanguage(locale.getLanguage());
    setScript(locale.getScript());
    setRegion(locale.getCountry());
    setVariant(locale.getVariant());

    extensions_ = locale.clone();
    if (extensions_ != nullptr) {
        return *this;
    }
    status_ = U_MEMORY_ALLOCATION_ERROR;
    return *this;
}
|
||||
|
||||
/**
 * Parses `tag` as a language tag and, on success, replaces the builder's
 * state with the parsed locale. A parse failure is left in status_.
 */
LocaleBuilder& LocaleBuilder::setLanguageTag(StringPiece tag)
{
    Locale parsed = Locale::forLanguageTag(tag, status_);
    // setLocale() resets status_, so it must only run when parsing succeeded;
    // otherwise the parse error would be lost.
    if (U_SUCCESS(status_)) {
        setLocale(parsed);
    }
    return *this;
}
|
||||
|
||||
namespace {
|
||||
|
||||
void setField(StringPiece input, char* dest, UErrorCode& errorCode,
|
||||
bool (*test)(const char*, int32_t)) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
if (input.empty()) {
|
||||
dest[0] = '\0';
|
||||
} else if (test(input.data(), input.length())) {
|
||||
uprv_memcpy(dest, input.data(), input.length());
|
||||
dest[input.length()] = '\0';
|
||||
} else {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/** Sets (or, for empty input, clears) the language subtag, validated by ultag_isLanguageSubtag. */
LocaleBuilder& LocaleBuilder::setLanguage(StringPiece language)
{
    bool (*const validator)(const char*, int32_t) = &ultag_isLanguageSubtag;
    setField(language, language_, status_, validator);
    return *this;
}
|
||||
|
||||
/** Sets (or, for empty input, clears) the script subtag, validated by ultag_isScriptSubtag. */
LocaleBuilder& LocaleBuilder::setScript(StringPiece script)
{
    bool (*const validator)(const char*, int32_t) = &ultag_isScriptSubtag;
    setField(script, script_, status_, validator);
    return *this;
}
|
||||
|
||||
/** Sets (or, for empty input, clears) the region subtag, validated by ultag_isRegionSubtag. */
LocaleBuilder& LocaleBuilder::setRegion(StringPiece region)
{
    bool (*const validator)(const char*, int32_t) = &ultag_isRegionSubtag;
    setField(region, region_, status_, validator);
    return *this;
}
|
||||
|
||||
namespace {
|
||||
|
||||
void transform(char* data, int32_t len) {
|
||||
for (int32_t i = 0; i < len; i++, data++) {
|
||||
if (*data == '_') {
|
||||
*data = '-';
|
||||
} else {
|
||||
*data = uprv_tolower(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Sets the variant subtags, or clears them when `variant` is empty.
 *
 * The text is normalized first ('_' becomes '-', letters are lowercased) and
 * must then pass ultag_isVariantSubtags; otherwise status_ becomes
 * U_ILLEGAL_ARGUMENT_ERROR and the previous variant is kept.
 * Does nothing if the builder already holds an error.
 */
LocaleBuilder& LocaleBuilder::setVariant(StringPiece variant)
{
    if (U_FAILURE(status_)) { return *this; }
    if (variant.empty()) {
        delete variant_;
        variant_ = nullptr;
        return *this;
    }
    CharString* new_variant = new CharString(variant, status_);
    if (new_variant == nullptr) {
        status_ = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    if (U_FAILURE(status_)) {
        // The allocation succeeded but copying the text failed:
        // release the object instead of leaking it.
        delete new_variant;
        return *this;
    }
    transform(new_variant->data(), new_variant->length());
    if (!ultag_isVariantSubtags(new_variant->data(), new_variant->length())) {
        delete new_variant;
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    // Only replace the old value once the new one is known to be valid.
    delete variant_;
    variant_ = new_variant;
    return *this;
}
|
||||
|
||||
namespace {
|
||||
|
||||
bool
|
||||
_isKeywordValue(const char* key, const char* value, int32_t value_len)
|
||||
{
|
||||
if (key[1] == '\0') {
|
||||
// one char key
|
||||
return (UPRV_ISALPHANUM(uprv_tolower(key[0])) &&
|
||||
_isExtensionSubtags(key[0], value, value_len));
|
||||
} else if (uprv_strcmp(key, kAttributeKey) == 0) {
|
||||
// unicode attributes
|
||||
return ultag_isUnicodeLocaleAttributes(value, value_len);
|
||||
}
|
||||
// otherwise: unicode extension value
|
||||
// We need to convert from legacy key/value to unicode
|
||||
// key/value
|
||||
const char* unicode_locale_key = uloc_toUnicodeLocaleKey(key);
|
||||
const char* unicode_locale_type = uloc_toUnicodeLocaleType(key, value);
|
||||
|
||||
return unicode_locale_key && unicode_locale_type &&
|
||||
ultag_isUnicodeLocaleKey(unicode_locale_key, -1) &&
|
||||
ultag_isUnicodeLocaleType(unicode_locale_type, -1);
|
||||
}
|
||||
|
||||
void
|
||||
_copyExtensions(const Locale& from, icu::StringEnumeration *keywords,
|
||||
Locale& to, bool validate, UErrorCode& errorCode)
|
||||
{
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
LocalPointer<icu::StringEnumeration> ownedKeywords;
|
||||
if (keywords == nullptr) {
|
||||
ownedKeywords.adoptInstead(from.createKeywords(errorCode));
|
||||
if (U_FAILURE(errorCode) || ownedKeywords.isNull()) { return; }
|
||||
keywords = ownedKeywords.getAlias();
|
||||
}
|
||||
const char* key;
|
||||
while ((key = keywords->next(nullptr, errorCode)) != nullptr) {
|
||||
auto value = from.getKeywordValue<CharString>(key, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
if (uprv_strcmp(key, kAttributeKey) == 0) {
|
||||
transform(value.data(), value.length());
|
||||
}
|
||||
if (validate &&
|
||||
!_isKeywordValue(key, value.data(), value.length())) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
to.setKeywordValue(key, value.data(), errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_clearUAttributesAndKeyType(Locale& locale, UErrorCode& errorCode)
|
||||
{
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
// Clear Unicode attributes
|
||||
locale.setKeywordValue(kAttributeKey, "", errorCode);
|
||||
|
||||
// Clear all Unicode keyword values
|
||||
LocalPointer<icu::StringEnumeration> iter(locale.createUnicodeKeywords(errorCode));
|
||||
if (U_FAILURE(errorCode) || iter.isNull()) { return; }
|
||||
const char* key;
|
||||
while ((key = iter->next(nullptr, errorCode)) != nullptr) {
|
||||
locale.setUnicodeKeywordValue(key, nullptr, errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_setUnicodeExtensions(Locale& locale, const CharString& value, UErrorCode& errorCode)
|
||||
{
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
// Add the unicode extensions to extensions_
|
||||
CharString locale_str("und-u-", errorCode);
|
||||
locale_str.append(value, errorCode);
|
||||
_copyExtensions(
|
||||
Locale::forLanguageTag(locale_str.data(), errorCode), nullptr,
|
||||
locale, false, errorCode);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Sets the extension for singleton `key` to `value`.
 *
 * The key must be an ASCII letter or digit. The value is normalized with
 * transform() and, unless empty, validated for that singleton. For 'u' the
 * existing Unicode attributes and keywords are cleared and, if the value is
 * non-empty, replaced by the parsed ones. Every other singleton is stored as
 * a plain keyword. Failures are recorded in status_.
 */
LocaleBuilder& LocaleBuilder::setExtension(char key, StringPiece value)
{
    if (U_FAILURE(status_)) { return *this; }
    if (!UPRV_ISALPHANUM(key)) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    CharString normalized(value, status_);
    if (U_FAILURE(status_)) { return *this; }
    transform(normalized.data(), normalized.length());

    const bool valueIsValid =
        normalized.isEmpty() ||
        _isExtensionSubtags(key, normalized.data(), normalized.length());
    if (!valueIsValid) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }

    // Lazily create the locale that carries the extensions.
    if (extensions_ == nullptr) {
        extensions_ = Locale::getRoot().clone();
        if (extensions_ == nullptr) {
            status_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
    }

    if (uprv_tolower(key) == 'u') {
        _clearUAttributesAndKeyType(*extensions_, status_);
        if (U_FAILURE(status_)) { return *this; }
        if (!value.empty()) {
            _setUnicodeExtensions(*extensions_, normalized, status_);
        }
        return *this;
    }

    // for t, x and others extension.
    extensions_->setKeywordValue(StringPiece(&key, 1), normalized.data(),
                                 status_);
    return *this;
}
|
||||
|
||||
/**
 * Sets the Unicode locale keyword `key` to `type`.
 * The key must be a well-formed Unicode locale key and a non-empty type must
 * be a well-formed Unicode locale type; otherwise status_ becomes
 * U_ILLEGAL_ARGUMENT_ERROR. An empty type is passed through unvalidated.
 */
LocaleBuilder& LocaleBuilder::setUnicodeLocaleKeyword(
      StringPiece key, StringPiece type)
{
    if (U_FAILURE(status_)) { return *this; }

    const bool keyOk = ultag_isUnicodeLocaleKey(key.data(), key.length());
    const bool typeOk = keyOk &&
        (type.empty() || ultag_isUnicodeLocaleType(type.data(), type.length()));
    if (!typeOk) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }

    // Lazily create the locale that carries the extensions.
    if (extensions_ == nullptr) {
        extensions_ = Locale::getRoot().clone();
        if (extensions_ == nullptr) {
            status_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
    }
    extensions_->setUnicodeKeywordValue(key, type, status_);
    return *this;
}
|
||||
|
||||
/**
 * Adds one Unicode locale attribute to the builder.
 *
 * The value is normalized with transform() and must be a well-formed
 * attribute, otherwise status_ becomes U_ILLEGAL_ARGUMENT_ERROR. If the
 * attribute is already present nothing changes. Otherwise it is inserted
 * before the first existing attribute that compares greater, and the list is
 * stored back joined with '_'.
 */
LocaleBuilder& LocaleBuilder::addUnicodeLocaleAttribute(
    StringPiece value)
{
    CharString value_str(value, status_);
    if (U_FAILURE(status_)) { return *this; }
    transform(value_str.data(), value_str.length());
    if (!ultag_isUnicodeLocaleAttribute(value_str.data(), value_str.length())) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    if (extensions_ == nullptr) {
        extensions_ = Locale::getRoot().clone();
        if (extensions_ == nullptr) {
            status_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        // Fresh extensions locale: the new attribute is the whole list.
        extensions_->setKeywordValue(kAttributeKey, value_str.data(), status_);
        return *this;
    }

    UErrorCode localErrorCode = U_ZERO_ERROR;
    auto attributes = extensions_->getKeywordValue<CharString>(kAttributeKey, localErrorCode);
    if (U_FAILURE(localErrorCode)) {
        CharString new_attributes(value_str.data(), status_);
        // No attributes, set the attribute.
        extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_);
        return *this;
    }

    // Split the list in place: each '_' or '-' separator becomes a NUL so that
    // every attribute is its own C string and uprv_strcmp/uprv_strlen below
    // operate on one attribute at a time (same scheme as
    // removeUnicodeLocaleAttribute). Without this the comparison would see
    // the whole remaining list as a single string.
    char* p = attributes.data();
    for (int32_t i = 0; i < attributes.length(); i++, p++) {
        *p = (*p == '_' || *p == '-') ? '\0' : uprv_tolower(*p);
    }

    const char* start = attributes.data();
    const char* limit = attributes.data() + attributes.length();
    CharString new_attributes;
    bool inserted = false;
    while (start < limit) {
        if (!inserted) {
            int cmp = uprv_strcmp(start, value_str.data());
            if (cmp == 0) { return *this; }  // Found it in attributes: Just return
            if (cmp > 0) {
                // First existing attribute greater than the new one: insert here.
                if (!new_attributes.isEmpty()) new_attributes.append('_', status_);
                new_attributes.append(value_str.data(), status_);
                inserted = true;
            }
        }
        if (!new_attributes.isEmpty()) {
            new_attributes.append('_', status_);
        }
        new_attributes.append(start, status_);
        start += uprv_strlen(start) + 1;  // skip this attribute and its NUL
    }
    if (!inserted) {
        // Greater than every existing attribute: append at the end.
        if (!new_attributes.isEmpty()) {
            new_attributes.append('_', status_);
        }
        new_attributes.append(value_str.data(), status_);
    }
    // Not yet in the attributes, set the attribute.
    extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_);
    return *this;
}
|
||||
|
||||
/**
 * Removes one Unicode locale attribute from the builder.
 *
 * The value is normalized with transform() and must be a well-formed
 * attribute, otherwise status_ becomes U_ILLEGAL_ARGUMENT_ERROR. If there are
 * no extensions, no attributes, or the attribute is not present, nothing
 * changes. Otherwise the remaining attributes are stored back joined with '_'.
 */
LocaleBuilder& LocaleBuilder::removeUnicodeLocaleAttribute(
    StringPiece value)
{
    CharString target(value, status_);
    if (U_FAILURE(status_)) { return *this; }
    transform(target.data(), target.length());
    if (!ultag_isUnicodeLocaleAttribute(target.data(), target.length())) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    if (extensions_ == nullptr) { return *this; }

    UErrorCode localErrorCode = U_ZERO_ERROR;
    auto attributes = extensions_->getKeywordValue<CharString>(kAttributeKey, localErrorCode);
    // Lookup failed or the list is empty: nothing to remove.
    if (U_FAILURE(localErrorCode) || attributes.isEmpty()) { return *this; }

    // Turn each '_' / '-' separator into a NUL terminator (lowercasing the
    // rest) so that every attribute can be compared with uprv_strcmp.
    char* const chars = attributes.data();
    const int32_t length = attributes.length();
    for (int32_t i = 0; i < length; ++i) {
        const char c = chars[i];
        chars[i] = (c == '_' || c == '-') ? '\0' : uprv_tolower(c);
    }

    CharString kept;
    bool found = false;
    const char* const limit = chars + length;
    for (const char* item = chars; item < limit; item += uprv_strlen(item) + 1) {
        if (uprv_strcmp(item, target.data()) == 0) {
            found = true;
            continue;
        }
        if (!kept.isEmpty()) {
            kept.append('_', status_);
        }
        kept.append(item, status_);
    }

    // Only touch the locale when something was actually removed.
    if (found) {
        extensions_->setKeywordValue(kAttributeKey, kept.data(), status_);
    }
    return *this;
}
|
||||
|
||||
/** Resets the builder to its empty state: clears the error, all subtags, the variant and the extensions. */
LocaleBuilder& LocaleBuilder::clear()
{
    status_ = U_ZERO_ERROR;
    language_[0] = '\0';
    script_[0] = '\0';
    region_[0] = '\0';

    delete variant_;
    variant_ = nullptr;

    return clearExtensions();
}
|
||||
|
||||
/** Discards all extensions; the other fields and the error status are left untouched. */
LocaleBuilder& LocaleBuilder::clearExtensions()
{
    Locale* const old = extensions_;
    extensions_ = nullptr;
    delete old;
    return *this;
}
|
||||
|
||||
/** Returns a default-constructed Locale that has been marked bogus via setToBogus(). */
Locale makeBogusLocale() {
  Locale result;
  result.setToBogus();
  return result;
}
|
||||
|
||||
void LocaleBuilder::copyExtensionsFrom(const Locale& src, UErrorCode& errorCode)
|
||||
{
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
LocalPointer<icu::StringEnumeration> keywords(src.createKeywords(errorCode));
|
||||
if (U_FAILURE(errorCode) || keywords.isNull() || keywords->count(errorCode) == 0) {
|
||||
// Error, or no extensions to copy.
|
||||
return;
|
||||
}
|
||||
if (extensions_ == nullptr) {
|
||||
extensions_ = Locale::getRoot().clone();
|
||||
if (extensions_ == nullptr) {
|
||||
status_ = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
_copyExtensions(src, keywords.getAlias(), *extensions_, false, errorCode);
|
||||
}
|
||||
|
||||
/**
 * Builds a Locale from the current state.
 *
 * Language, script, region and variant are joined with '-' into a locale
 * string; the extensions are then copied in with validation. Returns a bogus
 * locale if errorCode is already a failure, if the builder holds an error
 * (which is copied to errorCode), or if assembling the locale fails.
 */
Locale LocaleBuilder::build(UErrorCode& errorCode)
{
    if (U_FAILURE(errorCode)) {
        return makeBogusLocale();
    }
    if (U_FAILURE(status_)) {
        // Report the error accumulated by earlier setter calls.
        errorCode = status_;
        return makeBogusLocale();
    }

    CharString tag(language_, errorCode);
    if (script_[0] != '\0') {
        tag.append('-', errorCode).append(StringPiece(script_), errorCode);
    }
    if (region_[0] != '\0') {
        tag.append('-', errorCode).append(StringPiece(region_), errorCode);
    }
    if (variant_ != nullptr) {
        tag.append('-', errorCode).append(StringPiece(variant_->data()), errorCode);
    }
    if (U_FAILURE(errorCode)) {
        return makeBogusLocale();
    }

    Locale product(tag.data());
    if (extensions_ != nullptr) {
        // validate=true: malformed keyword values fail the build.
        _copyExtensions(*extensions_, nullptr, product, true, errorCode);
    }
    if (U_FAILURE(errorCode)) {
        return makeBogusLocale();
    }
    return product;
}
|
||||
|
||||
/**
 * Copies the builder's status into outErrorCode unless outErrorCode already
 * holds a failure (an older error is never overwritten).
 * Returns true if outErrorCode is a failure afterwards.
 */
UBool LocaleBuilder::copyErrorTo(UErrorCode &outErrorCode) const {
    if (U_SUCCESS(outErrorCode)) {
        outErrorCode = status_;
        return U_FAILURE(outErrorCode);
    }
    // Do not overwrite the older error code
    return true;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
1338
engine/thirdparty/icu4c/common/localefallback_data.h
vendored
Normal file
1338
engine/thirdparty/icu4c/common/localefallback_data.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
834
engine/thirdparty/icu4c/common/localematcher.cpp
vendored
Normal file
834
engine/thirdparty/icu4c/common/localematcher.cpp
vendored
Normal file
|
|
@ -0,0 +1,834 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// localematcher.cpp
|
||||
// created: 2019may08 Markus W. Scherer
|
||||
|
||||
#include <optional>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/localebuilder.h"
|
||||
#include "unicode/localematcher.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "cstring.h"
|
||||
#include "localeprioritylist.h"
|
||||
#include "loclikelysubtags.h"
|
||||
#include "locdistance.h"
|
||||
#include "lsr.h"
|
||||
#include "uassert.h"
|
||||
#include "uhash.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "uvector.h"
|
||||
|
||||
#define UND_LSR LSR("und", "", "", LSR::EXPLICIT_LSR)
|
||||
|
||||
/**
 * Tells the LocaleMatcher how long the desired-locale objects passed to it
 * remain valid.
 *
 * @draft ICU 65
 */
enum ULocMatchLifetime {
    /**
     * The Locale objects are temporary: if the matcher needs a locale beyond
     * a single function call, it makes its own copy.
     *
     * @draft ICU 65
     */
    ULOCMATCH_TEMPORARY_LOCALES,
    /**
     * The Locale objects are stored at least as long as the matcher is used:
     * the matcher keeps only a pointer to a locale it needs beyond a single
     * function call, which avoids a copy.
     *
     * @draft ICU 65
     */
    ULOCMATCH_STORED_LOCALES  // TODO: permanent? cached? clone?
};
|
||||
#ifndef U_IN_DOXYGEN
|
||||
typedef enum ULocMatchLifetime ULocMatchLifetime;
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Move constructor: copies all fields from `src`. If `src` owned its desired
 * locale, ownership is transferred and `src` is reset (null desired locale,
 * index -1, not owning) so that the object is deleted only once.
 */
LocaleMatcher::Result::Result(LocaleMatcher::Result &&src) noexcept :
        desiredLocale(src.desiredLocale),
        supportedLocale(src.supportedLocale),
        desiredIndex(src.desiredIndex),
        supportedIndex(src.supportedIndex),
        desiredIsOwned(src.desiredIsOwned) {
    if (!desiredIsOwned) {
        return;  // nothing owned, so `src` may keep its aliases
    }
    src.desiredIsOwned = false;
    src.desiredIndex = -1;
    src.desiredLocale = nullptr;
}
|
||||
|
||||
/** Deletes the desired locale, but only when this Result owns it. */
LocaleMatcher::Result::~Result() {
    if (!desiredIsOwned) {
        return;
    }
    delete desiredLocale;
}
|
||||
|
||||
/**
 * Move assignment: releases whatever this Result owns, then takes over all
 * fields of `src`. If `src` owned its desired locale, ownership is
 * transferred and `src` is reset.
 */
LocaleMatcher::Result &LocaleMatcher::Result::operator=(LocaleMatcher::Result &&src) noexcept {
    // Run the destructor explicitly to free an owned desired locale;
    // every field is overwritten right below.
    this->~Result();

    desiredIsOwned = src.desiredIsOwned;
    desiredLocale = src.desiredLocale;
    desiredIndex = src.desiredIndex;
    supportedLocale = src.supportedLocale;
    supportedIndex = src.supportedIndex;

    if (desiredIsOwned) {
        src.desiredIsOwned = false;
        src.desiredIndex = -1;
        src.desiredLocale = nullptr;
    }
    return *this;
}
|
||||
|
||||
/**
 * Combines the best-matching supported locale with the region, variants and
 * extensions of the best-matching desired locale.
 *
 * Returns the root locale on error or when there is no supported locale, and
 * the supported locale unchanged when there is no desired locale or both are
 * equal.
 */
Locale LocaleMatcher::Result::makeResolvedLocale(UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode) || supportedLocale == nullptr) {
        return Locale::getRoot();
    }
    const Locale *desired = getDesiredLocale();
    if (desired == nullptr || *supportedLocale == *desired) {
        return *supportedLocale;
    }

    LocaleBuilder builder;
    builder.setLocale(*supportedLocale);

    // Copy the region from the desired locale, if there is one.
    const char *desiredRegion = desired->getCountry();
    if (desiredRegion[0] != 0) {
        builder.setRegion(desiredRegion);
    }

    // Copy the variants from the desired locale, if there are any.
    // Note that this will override any supportedLocale variants.
    // For example, "sco-ulster-fonipa" + "...-fonupa" => "sco-fonupa" (replacing ulster).
    const char *desiredVariants = desired->getVariant();
    if (desiredVariants[0] != 0) {
        builder.setVariant(desiredVariants);
    }

    // Copy the extensions from the desired locale, if there are any.
    // C++ note: The following note, copied from Java, may not be true,
    // as long as C++ copies by legacy ICU keyword, not by extension singleton.
    // Note that this will override any supportedLocale extensions.
    // For example, "th-u-nu-latn-ca-buddhist" + "...-u-nu-native" => "th-u-nu-native"
    // (replacing calendar).
    builder.copyExtensionsFrom(*desired, errorCode);
    return builder.build(errorCode);
}
|
||||
|
||||
/**
 * Move constructor.
 * Copies the scalar settings and takes over every heap object that the
 * Builder destructor deletes: the supported-locales vector, the default
 * locale, and the two max-distance locales.
 *
 * @param src the Builder to move from; its owned pointers are set to nullptr
 */
LocaleMatcher::Builder::Builder(LocaleMatcher::Builder &&src) noexcept :
        errorCode_(src.errorCode_),
        supportedLocales_(src.supportedLocales_),
        thresholdDistance_(src.thresholdDistance_),
        demotion_(src.demotion_),
        defaultLocale_(src.defaultLocale_),
        withDefault_(src.withDefault_),
        favor_(src.favor_),
        direction_(src.direction_) {
    // The max-distance pair set via setMaxDistance() must travel with the builder too;
    // otherwise the moved-to builder silently loses that configuration.
    // Assigned here rather than in the initializer list so that this does not depend
    // on the member declaration order in the header.
    maxDistanceDesired_ = src.maxDistanceDesired_;
    maxDistanceSupported_ = src.maxDistanceSupported_;

    // src no longer owns anything.
    src.supportedLocales_ = nullptr;
    src.defaultLocale_ = nullptr;
    src.maxDistanceDesired_ = nullptr;
    src.maxDistanceSupported_ = nullptr;
}
|
||||
|
||||
/**
 * Destructor: deletes the four heap objects owned by the builder.
 * Deleting a nullptr member is a no-op.
 */
LocaleMatcher::Builder::~Builder() {
    // Supported-locales vector and explicit default locale.
    delete supportedLocales_;
    delete defaultLocale_;
    // Locale pair from setMaxDistance().
    delete maxDistanceDesired_;
    delete maxDistanceSupported_;
}
|
||||
|
||||
/**
 * Move assignment.
 * Releases everything this builder owns, then takes over the settings and
 * the owned objects of src.
 *
 * @param src the Builder to move from; its owned pointers are set to nullptr
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::operator=(LocaleMatcher::Builder &&src) noexcept {
    // Self-move must not free the objects that we would then "take over".
    if (this == &src) { return *this; }

    // Release the same four objects that the destructor deletes.
    delete supportedLocales_;
    delete defaultLocale_;
    delete maxDistanceDesired_;
    delete maxDistanceSupported_;

    errorCode_ = src.errorCode_;
    supportedLocales_ = src.supportedLocales_;
    thresholdDistance_ = src.thresholdDistance_;
    demotion_ = src.demotion_;
    defaultLocale_ = src.defaultLocale_;
    withDefault_ = src.withDefault_;
    favor_ = src.favor_;
    direction_ = src.direction_;
    // These two were deleted above. They must be replaced, or they would dangle
    // and be deleted a second time when this builder is destroyed.
    maxDistanceDesired_ = src.maxDistanceDesired_;
    maxDistanceSupported_ = src.maxDistanceSupported_;

    // src no longer owns anything.
    src.supportedLocales_ = nullptr;
    src.defaultLocale_ = nullptr;
    src.maxDistanceDesired_ = nullptr;
    src.maxDistanceSupported_ = nullptr;
    return *this;
}
|
||||
|
||||
void LocaleMatcher::Builder::clearSupportedLocales() {
|
||||
if (supportedLocales_ != nullptr) {
|
||||
supportedLocales_->removeAllElements();
|
||||
}
|
||||
}
|
||||
|
||||
bool LocaleMatcher::Builder::ensureSupportedLocaleVector() {
|
||||
if (U_FAILURE(errorCode_)) { return false; }
|
||||
if (supportedLocales_ != nullptr) { return true; }
|
||||
LocalPointer<UVector> lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_);
|
||||
if (U_FAILURE(errorCode_)) { return false; }
|
||||
supportedLocales_ = lpSupportedLocales.orphan();
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Parses a locale list string and replaces the supported locales with its entries.
 * On a parse error the previous supported locales are left untouched.
 *
 * @param locales the list string handed to LocalePriorityList
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListString(
        StringPiece locales) {
    LocalePriorityList list(locales, errorCode_);
    if (U_FAILURE(errorCode_)) { return *this; }
    clearSupportedLocales();
    if (!ensureSupportedLocaleVector()) { return *this; }
    const int32_t count = list.getLengthIncludingRemoved();
    // Stop at the first failed adoption.
    for (int32_t i = 0; i < count && U_SUCCESS(errorCode_); ++i) {
        Locale *entry = list.orphanLocaleAt(i);
        if (entry != nullptr) {  // slots without a locale are skipped
            supportedLocales_->adoptElement(entry, errorCode_);
        }
    }
    return *this;
}
|
||||
|
||||
/**
 * Replaces the supported locales with clones of the locales from the iterator.
 *
 * @param locales iterator over the new supported locales
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) {
    if (!ensureSupportedLocaleVector()) { return *this; }
    clearSupportedLocales();
    while (locales.hasNext() && U_SUCCESS(errorCode_)) {
        const Locale &next = locales.next();
        // The LocalPointer sets errorCode_ if clone() returned nullptr.
        LocalPointer<Locale> copy(next.clone(), errorCode_);
        supportedLocales_->adoptElement(copy.orphan(), errorCode_);
    }
    return *this;
}
|
||||
|
||||
/**
 * Appends a clone of the given locale to the supported locales.
 *
 * @param locale the locale to add
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) {
    if (!ensureSupportedLocaleVector()) { return *this; }
    // The LocalPointer sets errorCode_ if clone() returned nullptr.
    LocalPointer<Locale> copy(locale.clone(), errorCode_);
    supportedLocales_->adoptElement(copy.orphan(), errorCode_);
    return *this;
}
|
||||
|
||||
/**
 * Removes any explicit default locale and turns off the default.
 * Does nothing when the builder is already in an error state.
 *
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::setNoDefaultLocale() {
    if (U_SUCCESS(errorCode_)) {
        delete defaultLocale_;
        defaultLocale_ = nullptr;
        withDefault_ = false;
    }
    return *this;
}
|
||||
|
||||
/**
 * Sets the default locale to a clone of the argument and turns the default on.
 * A nullptr argument clears the explicit default locale but keeps the default on.
 * If cloning fails, errorCode_ is set and the previous default is kept.
 *
 * @param defaultLocale the new default locale, or nullptr
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::setDefaultLocale(const Locale *defaultLocale) {
    if (U_FAILURE(errorCode_)) { return *this; }
    Locale *replacement = (defaultLocale == nullptr) ? nullptr : defaultLocale->clone();
    if (defaultLocale != nullptr && replacement == nullptr) {
        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    // Only drop the old default once the replacement is secured.
    delete defaultLocale_;
    defaultLocale_ = replacement;
    withDefault_ = true;
    return *this;
}
|
||||
|
||||
/**
 * Stores the favor-subtag setting unless the builder is in an error state.
 *
 * @param subtag the new setting
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag subtag) {
    if (U_SUCCESS(errorCode_)) {
        favor_ = subtag;
    }
    return *this;
}
|
||||
|
||||
/**
 * Stores the demotion setting unless the builder is in an error state.
 *
 * @param demotion the new setting
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion demotion) {
    if (U_SUCCESS(errorCode_)) {
        demotion_ = demotion;
    }
    return *this;
}
|
||||
|
||||
/**
 * Stores clones of a (desired, supported) locale pair; the LocaleMatcher
 * constructor derives its distance threshold from this pair.
 * If either clone fails, errorCode_ is set and the previous pair is kept.
 *
 * @param desired   the desired locale of the pair
 * @param supported the supported locale of the pair
 * @return *this
 */
LocaleMatcher::Builder &LocaleMatcher::Builder::setMaxDistance(const Locale &desired,
                                                               const Locale &supported) {
    if (U_FAILURE(errorCode_)) { return *this; }
    Locale *newDesired = desired.clone();
    Locale *newSupported = supported.clone();
    const bool bothCloned = newDesired != nullptr && newSupported != nullptr;
    if (!bothCloned) {
        // At most one of these is non-null; deleting nullptr is harmless.
        delete newDesired;
        delete newSupported;
        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    delete maxDistanceDesired_;
    delete maxDistanceSupported_;
    maxDistanceDesired_ = newDesired;
    maxDistanceSupported_ = newSupported;
    return *this;
}
|
||||
|
||||
#if 0
|
||||
/**
|
||||
* <i>Internal only!</i>
|
||||
*
|
||||
* @param thresholdDistance the thresholdDistance to set, with -1 = default
|
||||
* @return this Builder object
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
LocaleMatcher::Builder &LocaleMatcher::Builder::internalSetThresholdDistance(int32_t thresholdDistance) {
|
||||
if (U_FAILURE(errorCode_)) { return *this; }
|
||||
if (thresholdDistance > 100) {
|
||||
thresholdDistance = 100;
|
||||
}
|
||||
thresholdDistance_ = thresholdDistance;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * Propagates the builder's error code, if any, to the caller.
 * An error code that already indicates failure is never overwritten.
 *
 * @param outErrorCode in/out error code of the caller
 * @return true if outErrorCode indicates a failure afterwards
 */
UBool LocaleMatcher::Builder::copyErrorTo(UErrorCode &outErrorCode) const {
    if (U_SUCCESS(outErrorCode)) {
        if (U_SUCCESS(errorCode_)) { return false; }
        outErrorCode = errorCode_;
    }
    return true;
}
|
||||
|
||||
/**
 * Creates a LocaleMatcher from the current builder state.
 * A pending builder error is reported through errorCode first, unless
 * errorCode already indicates a failure.
 *
 * @param errorCode in/out ICU error code
 * @return the new matcher
 */
LocaleMatcher LocaleMatcher::Builder::build(UErrorCode &errorCode) const {
    const bool reportBuilderError = U_SUCCESS(errorCode) && U_FAILURE(errorCode_);
    if (reportBuilderError) {
        errorCode = errorCode_;
    }
    return LocaleMatcher(*this, errorCode);
}
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Returns the maximized LSR for the locale, or UND_LSR when errorCode already
 * indicates a failure, the locale is bogus, or its name is empty ("und").
 */
LSR getMaximalLsrOrUnd(const LikelySubtags &likelySubtags, const Locale &locale,
                       UErrorCode &errorCode) {
    const bool useUnd =
        U_FAILURE(errorCode) || locale.isBogus() || locale.getName()[0] == 0 /* "und" */;
    if (useUnd) {
        return UND_LSR;
    }
    return likelySubtags.makeMaximizedLsrFrom(locale, false, errorCode);
}
|
||||
|
||||
/** UHashtable key hasher: returns the hashCode stored in the LSR key. */
int32_t hashLSR(const UHashTok token) {
    return static_cast<const LSR *>(token.pointer)->hashCode;
}
|
||||
|
||||
/** UHashtable key comparator: compares the two LSR keys with LSR::operator==. */
UBool compareLSRs(const UHashTok t1, const UHashTok t2) {
    const LSR &left = *static_cast<const LSR *>(t1.pointer);
    const LSR &right = *static_cast<const LSR *>(t2.pointer);
    return left == right;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Registers an LSR under supported index i, unless an equal LSR is already
 * in the hash table. A new LSR is also appended to the parallel
 * supportedLSRs / supportedIndexes arrays.
 *
 * @param lsr        the LSR; its address is stored, not a copy
 * @param i          index of the supported locale that the LSR belongs to
 * @param suppLength number of entries used so far in the parallel arrays
 * @param errorCode  in/out ICU error code
 * @return the new number of used entries (suppLength or suppLength + 1)
 */
int32_t LocaleMatcher::putIfAbsent(const LSR &lsr, int32_t i, int32_t suppLength,
                                   UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || uhash_containsKey(supportedLsrToIndex, &lsr)) {
        return suppLength;
    }
    uhash_putiAllowZero(supportedLsrToIndex, const_cast<LSR *>(&lsr), i, &errorCode);
    if (U_FAILURE(errorCode)) {
        return suppLength;
    }
    supportedLSRs[suppLength] = &lsr;
    supportedIndexes[suppLength] = i;
    return suppLength + 1;
}
|
||||
|
||||
/**
 * Builds a matcher from the builder's settings.
 *
 * Clones the default locale and all supported locales, computes their maximized
 * LSRs, fills the LSR-to-index hash table and the ordered (LSR, index) arrays,
 * and finally determines the demotion and the distance threshold.
 *
 * On any failure the constructor sets errorCode and returns early; the object
 * must then still be safe to destroy.
 *
 * @param builder   source of the settings; not modified
 * @param errorCode in/out ICU error code
 */
LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) :
        likelySubtags(*LikelySubtags::getSingleton(errorCode)),
        localeDistance(*LocaleDistance::getSingleton(errorCode)),
        thresholdDistance(builder.thresholdDistance_),
        demotionPerDesiredLocale(0),
        favorSubtag(builder.favor_),
        direction(builder.direction_),
        supportedLocales(nullptr), lsrs(nullptr), supportedLocalesLength(0),
        supportedLsrToIndex(nullptr),
        supportedLSRs(nullptr), supportedIndexes(nullptr), supportedLSRsLength(0),
        ownedDefaultLocale(nullptr), defaultLocale(nullptr) {
    if (U_FAILURE(errorCode)) { return; }
    const Locale *def = builder.defaultLocale_;
    LSR builderDefaultLSR;
    const LSR *defLSR = nullptr;
    if (def != nullptr) {
        ownedDefaultLocale = def->clone();
        if (ownedDefaultLocale == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        def = ownedDefaultLocale;
        builderDefaultLSR = getMaximalLsrOrUnd(likelySubtags, *def, errorCode);
        if (U_FAILURE(errorCode)) { return; }
        defLSR = &builderDefaultLSR;
    }
    supportedLocalesLength = builder.supportedLocales_ != nullptr ?
        builder.supportedLocales_->size() : 0;
    if (supportedLocalesLength > 0) {
        // Store the supported locales in input order,
        // so that when different types are used (e.g., language tag strings)
        // we can return those by parallel index.
        supportedLocales = static_cast<const Locale **>(
            uprv_malloc(supportedLocalesLength * sizeof(const Locale *)));
        // Supported LSRs in input order.
        // In C++, we store these permanently to simplify ownership management
        // in the hash tables. Duplicate LSRs (if any) are unused overhead.
        lsrs = new LSR[supportedLocalesLength];
        if (supportedLocales == nullptr || lsrs == nullptr) {
            // The destructor deletes supportedLocales[0..supportedLocalesLength).
            // At this point that array is either missing or still uninitialized,
            // so drop it and reset the length to keep the object destructible.
            // (lsrs, if allocated, is released by the destructor.)
            uprv_free(supportedLocales);
            supportedLocales = nullptr;
            supportedLocalesLength = 0;
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        // If the constructor fails partway, we need null pointers for destructibility.
        uprv_memset(supportedLocales, 0, supportedLocalesLength * sizeof(const Locale *));
        for (int32_t i = 0; i < supportedLocalesLength; ++i) {
            const Locale &locale = *static_cast<Locale *>(builder.supportedLocales_->elementAt(i));
            supportedLocales[i] = locale.clone();
            if (supportedLocales[i] == nullptr) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            const Locale &supportedLocale = *supportedLocales[i];
            LSR &lsr = lsrs[i] = getMaximalLsrOrUnd(likelySubtags, supportedLocale, errorCode);
            lsr.setHashCode();
            if (U_FAILURE(errorCode)) { return; }
        }

        // We need an unordered map from LSR to first supported locale with that LSR,
        // and an ordered list of (LSR, supported index) for
        // the supported locales in the following order:
        // 1. Default locale, if it is supported.
        // 2. Priority locales (aka "paradigm locales") in builder order.
        // 3. Remaining locales in builder order.
        supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong,
                                             supportedLocalesLength, &errorCode);
        if (U_FAILURE(errorCode)) { return; }
        supportedLSRs = static_cast<const LSR **>(
            uprv_malloc(supportedLocalesLength * sizeof(const LSR *)));
        supportedIndexes = static_cast<int32_t *>(
            uprv_malloc(supportedLocalesLength * sizeof(int32_t)));
        if (supportedLSRs == nullptr || supportedIndexes == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        int32_t suppLength = 0;
        // Determine insertion order.
        // Add locales immediately that are equivalent to the default.
        MaybeStackArray<int8_t, 100> order(supportedLocalesLength, errorCode);
        if (U_FAILURE(errorCode)) { return; }
        int32_t numParadigms = 0;
        for (int32_t i = 0; i < supportedLocalesLength; ++i) {
            const Locale &locale = *supportedLocales[i];
            const LSR &lsr = lsrs[i];
            if (defLSR == nullptr && builder.withDefault_) {
                // Implicit default locale = first supported locale, if not turned off.
                U_ASSERT(i == 0);
                def = &locale;
                defLSR = &lsr;
                order[i] = 1;
                suppLength = putIfAbsent(lsr, 0, suppLength, errorCode);
            } else if (defLSR != nullptr && lsr.isEquivalentTo(*defLSR)) {
                order[i] = 1;
                suppLength = putIfAbsent(lsr, i, suppLength, errorCode);
            } else if (localeDistance.isParadigmLSR(lsr)) {
                order[i] = 2;
                ++numParadigms;
            } else {
                order[i] = 3;
            }
            if (U_FAILURE(errorCode)) { return; }
        }
        // Add supported paradigm locales.
        int32_t paradigmLimit = suppLength + numParadigms;
        for (int32_t i = 0; i < supportedLocalesLength && suppLength < paradigmLimit; ++i) {
            if (order[i] == 2) {
                suppLength = putIfAbsent(lsrs[i], i, suppLength, errorCode);
            }
        }
        // Add remaining supported locales.
        for (int32_t i = 0; i < supportedLocalesLength; ++i) {
            if (order[i] == 3) {
                suppLength = putIfAbsent(lsrs[i], i, suppLength, errorCode);
            }
        }
        supportedLSRsLength = suppLength;
        // If supportedLSRsLength < supportedLocalesLength then
        // we waste as many array slots as there are duplicate supported LSRs,
        // but the amount of wasted space is small as long as there are few duplicates.
    }

    defaultLocale = def;

    if (builder.demotion_ == ULOCMATCH_DEMOTION_REGION) {
        demotionPerDesiredLocale = localeDistance.getDefaultDemotionPerDesiredLocale();
    }

    if (thresholdDistance >= 0) {
        // already copied
    } else if (builder.maxDistanceDesired_ != nullptr) {
        // Derive the threshold from the distance between the max-distance locale pair.
        LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, *builder.maxDistanceSupported_, errorCode);
        const LSR *pSuppLSR = &suppLSR;
        int32_t indexAndDistance = localeDistance.getBestIndexAndDistance(
            getMaximalLsrOrUnd(likelySubtags, *builder.maxDistanceDesired_, errorCode),
            &pSuppLSR, 1,
            LocaleDistance::shiftDistance(100), favorSubtag, direction);
        if (U_SUCCESS(errorCode)) {
            // +1 for an exclusive threshold from an inclusive max.
            thresholdDistance = LocaleDistance::getDistanceFloor(indexAndDistance) + 1;
        } else {
            thresholdDistance = 0;
        }
    } else {
        thresholdDistance = localeDistance.getDefaultScriptDistance();
    }
}
|
||||
|
||||
/**
 * Move constructor.
 * Shares the two reference members with src, copies the scalar settings and
 * takes over all arrays, the hash table and the owned default locale.
 *
 * @param src the matcher to move from; it is left empty (null pointers, zero lengths)
 */
LocaleMatcher::LocaleMatcher(LocaleMatcher &&src) noexcept :
        likelySubtags(src.likelySubtags),
        localeDistance(src.localeDistance),
        thresholdDistance(src.thresholdDistance),
        demotionPerDesiredLocale(src.demotionPerDesiredLocale),
        favorSubtag(src.favorSubtag),
        direction(src.direction),
        supportedLocales(src.supportedLocales),
        lsrs(src.lsrs),
        supportedLocalesLength(src.supportedLocalesLength),
        supportedLsrToIndex(src.supportedLsrToIndex),
        supportedLSRs(src.supportedLSRs),
        supportedIndexes(src.supportedIndexes),
        supportedLSRsLength(src.supportedLSRsLength),
        ownedDefaultLocale(src.ownedDefaultLocale),
        defaultLocale(src.defaultLocale) {
    // Supported locales and their LSRs.
    src.supportedLocales = nullptr;
    src.lsrs = nullptr;
    src.supportedLocalesLength = 0;
    // Hash table and ordered (LSR, index) arrays.
    src.supportedLsrToIndex = nullptr;
    src.supportedLSRs = nullptr;
    src.supportedIndexes = nullptr;
    src.supportedLSRsLength = 0;
    // Default locale.
    src.ownedDefaultLocale = nullptr;
    src.defaultLocale = nullptr;
}
|
||||
|
||||
/**
 * Destructor: deletes the cloned supported locales, then releases the arrays,
 * the hash table and the owned default locale.
 */
LocaleMatcher::~LocaleMatcher() {
    int32_t i = 0;
    while (i < supportedLocalesLength) {
        delete supportedLocales[i];
        ++i;
    }
    uprv_free(supportedLocales);
    delete[] lsrs;
    uhash_close(supportedLsrToIndex);
    uprv_free(supportedLSRs);
    uprv_free(supportedIndexes);
    delete ownedDefaultLocale;
}
|
||||
|
||||
/**
 * Move assignment.
 * Releases this matcher's resources, then takes over the settings and resources
 * of src. The reference members (likelySubtags, localeDistance) are not reassigned.
 *
 * @param src the matcher to move from; it is left empty (null pointers, zero lengths)
 * @return *this
 */
LocaleMatcher &LocaleMatcher::operator=(LocaleMatcher &&src) noexcept {
    // Self-move must not free the data that we would then "take over".
    if (this == &src) { return *this; }

    // Release everything we currently own; all owning members are overwritten below.
    this->~LocaleMatcher();

    thresholdDistance = src.thresholdDistance;
    demotionPerDesiredLocale = src.demotionPerDesiredLocale;
    favorSubtag = src.favorSubtag;
    direction = src.direction;
    supportedLocales = src.supportedLocales;
    lsrs = src.lsrs;
    supportedLocalesLength = src.supportedLocalesLength;
    supportedLsrToIndex = src.supportedLsrToIndex;
    supportedLSRs = src.supportedLSRs;
    supportedIndexes = src.supportedIndexes;
    supportedLSRsLength = src.supportedLSRsLength;
    ownedDefaultLocale = src.ownedDefaultLocale;
    defaultLocale = src.defaultLocale;

    // src no longer owns anything.
    src.supportedLocales = nullptr;
    src.lsrs = nullptr;
    src.supportedLocalesLength = 0;
    src.supportedLsrToIndex = nullptr;
    src.supportedLSRs = nullptr;
    src.supportedIndexes = nullptr;
    src.supportedLSRsLength = 0;
    src.ownedDefaultLocale = nullptr;
    src.defaultLocale = nullptr;
    return *this;
}
|
||||
|
||||
class LocaleLsrIterator {
|
||||
public:
|
||||
LocaleLsrIterator(const LikelySubtags &likelySubtags, Locale::Iterator &locales,
|
||||
ULocMatchLifetime lifetime) :
|
||||
likelySubtags(likelySubtags), locales(locales), lifetime(lifetime) {}
|
||||
|
||||
~LocaleLsrIterator() {
|
||||
if (lifetime == ULOCMATCH_TEMPORARY_LOCALES) {
|
||||
delete remembered;
|
||||
}
|
||||
}
|
||||
|
||||
bool hasNext() const {
|
||||
return locales.hasNext();
|
||||
}
|
||||
|
||||
LSR next(UErrorCode &errorCode) {
|
||||
current = &locales.next();
|
||||
return getMaximalLsrOrUnd(likelySubtags, *current, errorCode);
|
||||
}
|
||||
|
||||
void rememberCurrent(int32_t desiredIndex, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
bestDesiredIndex = desiredIndex;
|
||||
if (lifetime == ULOCMATCH_STORED_LOCALES) {
|
||||
remembered = current;
|
||||
} else {
|
||||
// ULOCMATCH_TEMPORARY_LOCALES
|
||||
delete remembered;
|
||||
remembered = new Locale(*current);
|
||||
if (remembered == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const Locale *orphanRemembered() {
|
||||
const Locale *rem = remembered;
|
||||
remembered = nullptr;
|
||||
return rem;
|
||||
}
|
||||
|
||||
int32_t getBestDesiredIndex() const {
|
||||
return bestDesiredIndex;
|
||||
}
|
||||
|
||||
private:
|
||||
const LikelySubtags &likelySubtags;
|
||||
Locale::Iterator &locales;
|
||||
ULocMatchLifetime lifetime;
|
||||
const Locale *current = nullptr, *remembered = nullptr;
|
||||
int32_t bestDesiredIndex = -1;
|
||||
};
|
||||
|
||||
/**
 * Finds the supported locale that best matches one desired locale.
 *
 * @param desiredLocale the desired locale
 * @param errorCode     in/out ICU error code
 * @return nullptr if errorCode indicated a failure on entry; the default locale
 *         if matching failed or found nothing; otherwise the best supported locale
 */
const Locale *LocaleMatcher::getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return nullptr; }
    std::optional<int32_t> bestIndex = getBestSuppIndex(
        getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode),
        nullptr, errorCode);
    if (U_FAILURE(errorCode) || !bestIndex.has_value()) {
        return defaultLocale;
    }
    return supportedLocales[*bestIndex];
}
|
||||
|
||||
/**
 * Finds the supported locale that best matches a list of desired locales.
 *
 * @param desiredLocales iterator over the desired locales
 * @param errorCode      in/out ICU error code
 * @return nullptr if errorCode indicated a failure on entry; the default locale
 *         if the list is empty, matching failed, or nothing matched;
 *         otherwise the best supported locale
 */
const Locale *LocaleMatcher::getBestMatch(Locale::Iterator &desiredLocales,
                                          UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return nullptr; }
    if (!desiredLocales.hasNext()) {
        return defaultLocale;
    }
    LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES);
    std::optional<int32_t> bestIndex =
        getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode);
    if (U_FAILURE(errorCode) || !bestIndex.has_value()) {
        return defaultLocale;
    }
    return supportedLocales[*bestIndex];
}
|
||||
|
||||
/**
 * Parses a locale list string and returns the best match for it.
 *
 * @param desiredLocaleList the list string handed to LocalePriorityList
 * @param errorCode         in/out ICU error code
 * @return nullptr on failure, otherwise the result of getBestMatch() for the parsed list
 */
const Locale *LocaleMatcher::getBestMatchForListString(
        StringPiece desiredLocaleList, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return nullptr; }
    LocalePriorityList parsed(desiredLocaleList, errorCode);
    LocalePriorityList::Iterator desired = parsed.iterator();
    // A parse failure is caught by the error check at the top of getBestMatch().
    return getBestMatch(desired, errorCode);
}
|
||||
|
||||
/**
 * Like getBestMatch() for one desired locale, but returns a Result object.
 *
 * @param desiredLocale the desired locale; the Result points to it without owning it
 * @param errorCode     in/out ICU error code
 * @return on failure or no match, a Result with no desired locale, the default
 *         locale and indexes -1; otherwise a Result for desired index 0
 */
LocaleMatcher::Result LocaleMatcher::getBestMatchResult(
        const Locale &desiredLocale, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return Result(nullptr, defaultLocale, -1, -1, false);
    }
    std::optional<int32_t> bestIndex = getBestSuppIndex(
        getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode),
        nullptr, errorCode);
    const bool matched = U_SUCCESS(errorCode) && bestIndex.has_value();
    if (!matched) {
        return Result(nullptr, defaultLocale, -1, -1, false);
    }
    return Result(&desiredLocale, supportedLocales[*bestIndex], 0, *bestIndex, false);
}
|
||||
|
||||
/**
 * Like getBestMatch() for a list of desired locales, but returns a Result object.
 *
 * @param desiredLocales iterator over the desired locales
 * @param errorCode      in/out ICU error code
 * @return on failure, an empty list, or no match, a Result with no desired locale,
 *         the default locale and indexes -1; otherwise a Result that owns a copy
 *         of the best desired locale
 */
LocaleMatcher::Result LocaleMatcher::getBestMatchResult(
        Locale::Iterator &desiredLocales, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode) || !desiredLocales.hasNext()) {
        return Result(nullptr, defaultLocale, -1, -1, false);
    }
    LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES);
    std::optional<int32_t> bestIndex =
        getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode);
    const bool matched = U_SUCCESS(errorCode) && bestIndex.has_value();
    if (!matched) {
        return Result(nullptr, defaultLocale, -1, -1, false);
    }
    // The iterator copied the best desired locale; pass ownership of it to the Result.
    return Result(lsrIter.orphanRemembered(), supportedLocales[*bestIndex],
                  lsrIter.getBestDesiredIndex(), *bestIndex, true);
}
|
||||
|
||||
/**
 * Core matching loop.
 * Starts with desiredLSR and, if remainingIter is given, continues with the
 * LSRs that it yields. An exact LSR hit in the hash table ends the search at
 * once. Otherwise the best distance found so far is lowered by the demotion
 * after each desired locale, so that a later desired locale must match better.
 *
 * @param desiredLSR    LSR of the first desired locale
 * @param remainingIter iterator for further desired locales, or nullptr
 * @param errorCode     in/out ICU error code
 * @return the index of the best supported locale, or std::nullopt on failure
 *         or when nothing matched within the threshold
 */
std::optional<int32_t> LocaleMatcher::getBestSuppIndex(LSR desiredLSR,
                                                       LocaleLsrIterator *remainingIter,
                                                       UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return std::nullopt; }
    int32_t desiredIndex = 0;
    int32_t bestLsrIndex = -1;
    int32_t bestShiftedDistance = LocaleDistance::shiftDistance(thresholdDistance);
    while (true) {
        // Quick check for exact maximized LSR.
        if (supportedLsrToIndex != nullptr) {
            desiredLSR.setHashCode();
            UBool found = false;
            int32_t suppIndex = uhash_getiAndFound(supportedLsrToIndex, &desiredLSR, &found);
            if (found) {
                if (remainingIter != nullptr) {
                    remainingIter->rememberCurrent(desiredIndex, errorCode);
                }
                return suppIndex;
            }
        }
        int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance(
            desiredLSR, supportedLSRs, supportedLSRsLength,
            bestShiftedDistance, favorSubtag, direction);
        if (bestIndexAndDistance >= 0) {
            // Better than anything so far: tighten the limit and record the match.
            bestShiftedDistance = LocaleDistance::getShiftedDistance(bestIndexAndDistance);
            if (remainingIter != nullptr) {
                remainingIter->rememberCurrent(desiredIndex, errorCode);
                if (U_FAILURE(errorCode)) { return std::nullopt; }
            }
            bestLsrIndex = LocaleDistance::getIndex(bestIndexAndDistance);
        }
        // Demote the following desired locales; stop when none of them could win.
        bestShiftedDistance -= LocaleDistance::shiftDistance(demotionPerDesiredLocale);
        if (bestShiftedDistance <= 0) {
            break;
        }
        if (remainingIter == nullptr || !remainingIter->hasNext()) {
            break;
        }
        desiredLSR = remainingIter->next(errorCode);
        if (U_FAILURE(errorCode)) { return std::nullopt; }
        ++desiredIndex;
    }
    if (bestLsrIndex < 0) {
        // no good match
        return std::nullopt;
    }
    // Map the position in the ordered LSR list back to the supported-locale index.
    return supportedIndexes[bestLsrIndex];
}
|
||||
|
||||
/**
 * Tests whether the distance between the two locales is within this matcher's threshold.
 *
 * @param desired   the desired locale
 * @param supported the supported locale
 * @param errorCode in/out ICU error code
 * @return true if getBestIndexAndDistance() finds the supported locale within the threshold
 */
UBool LocaleMatcher::isMatch(const Locale &desired, const Locale &supported,
                             UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return false; }
    LSR supportedLsr = getMaximalLsrOrUnd(likelySubtags, supported, errorCode);
    if (U_FAILURE(errorCode)) { return false; }
    // The distance function takes an array of LSR pointers; build one with one entry.
    const LSR *supportedLsrPtr = &supportedLsr;
    const int32_t result = localeDistance.getBestIndexAndDistance(
        getMaximalLsrOrUnd(likelySubtags, desired, errorCode),
        &supportedLsrPtr, 1,
        LocaleDistance::shiftDistance(thresholdDistance), favorSubtag, direction);
    return result >= 0;
}
|
||||
|
||||
double LocaleMatcher::internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const {
|
||||
if (U_FAILURE(errorCode)) { return 0.; }
|
||||
// Returns the inverse of the distance: That is, 1-distance(desired, supported).
|
||||
LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return 0.; }
|
||||
const LSR *pSuppLSR = &suppLSR;
|
||||
int32_t indexAndDistance = localeDistance.getBestIndexAndDistance(
|
||||
getMaximalLsrOrUnd(likelySubtags, desired, errorCode),
|
||||
&pSuppLSR, 1,
|
||||
LocaleDistance::shiftDistance(thresholdDistance), favorSubtag, direction);
|
||||
double distance = LocaleDistance::getDistanceDouble(indexAndDistance);
|
||||
return (100.0 - distance) / 100.0;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// uloc_acceptLanguage() --------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
namespace {
|
||||
|
||||
class LocaleFromTag {
|
||||
public:
|
||||
LocaleFromTag() : locale(Locale::getRoot()) {}
|
||||
const Locale &operator()(const char *tag) { return locale = Locale(tag); }
|
||||
|
||||
private:
|
||||
// Store the locale in the converter, rather than return a reference to a temporary,
|
||||
// or a value which could go out of scope with the caller's reference to it.
|
||||
Locale locale;
|
||||
};
|
||||
|
||||
/**
 * Shared implementation of uloc_acceptLanguage() and uloc_acceptLanguageFromHTTP().
 * Builds a matcher from the supported locales, finds the best match for the
 * desired locales, and writes the name of the matched supported locale to dest.
 *
 * @param supportedLocales enumeration of supported locale IDs
 * @param desiredLocales   iterator over the desired locales
 * @param dest             output buffer; may be nullptr if capacity is 0
 * @param capacity         size of dest
 * @param acceptResult     optional output: VALID, FALLBACK or FAILED
 * @param errorCode        in/out ICU error code
 * @return the length of the matched locale name (0 if nothing matched or on failure)
 */
int32_t acceptLanguage(UEnumeration &supportedLocales, Locale::Iterator &desiredLocales,
                       char *dest, int32_t capacity, UAcceptResult *acceptResult,
                       UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return 0; }
    LocaleMatcher::Builder builder;
    const char *locString;
    while ((locString = uenum_next(&supportedLocales, nullptr, &errorCode)) != nullptr) {
        Locale loc(locString);
        if (loc.isBogus()) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        builder.addSupportedLocale(loc);
    }
    // uenum_next() also returns nullptr when it fails. Stop here instead of
    // constructing a matcher with an error code that already indicates failure.
    if (U_FAILURE(errorCode)) { return 0; }
    LocaleMatcher matcher = builder.build(errorCode);
    LocaleMatcher::Result result = matcher.getBestMatchResult(desiredLocales, errorCode);
    if (U_FAILURE(errorCode)) { return 0; }
    if (result.getDesiredIndex() >= 0) {
        if (acceptResult != nullptr) {
            *acceptResult = *result.getDesiredLocale() == *result.getSupportedLocale() ?
                ULOC_ACCEPT_VALID : ULOC_ACCEPT_FALLBACK;
        }
        const char *bestStr = result.getSupportedLocale()->getName();
        int32_t bestLength = static_cast<int32_t>(uprv_strlen(bestStr));
        // Skip the copy for an empty name: dest may be nullptr when capacity is 0
        // (preflighting), and memcpy must not be handed a null pointer even for 0 bytes.
        if (bestLength > 0 && bestLength <= capacity) {
            uprv_memcpy(dest, bestStr, bestLength);
        }
        return u_terminateChars(dest, capacity, bestLength, &errorCode);
    } else {
        if (acceptResult != nullptr) {
            *acceptResult = ULOC_ACCEPT_FAILED;
        }
        return u_terminateChars(dest, capacity, 0, &errorCode);
    }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * C API: picks the best available locale for an array of acceptable locale IDs.
 * Sets U_ILLEGAL_ARGUMENT_ERROR when a buffer or list pointer is nullptr with a
 * non-zero length, when a length is negative, or when availableLocales is nullptr.
 *
 * @return the length of the locale name written to result, or 0
 */
U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char *result, int32_t resultAvailable,
                    UAcceptResult *outResult,
                    const char **acceptList, int32_t acceptListCount,
                    UEnumeration *availableLocales,
                    UErrorCode *status) {
    if (U_FAILURE(*status)) { return 0; }
    // A nullptr buffer/list is only acceptable together with a length of 0.
    const bool badResult =
        result == nullptr ? resultAvailable != 0 : resultAvailable < 0;
    const bool badAcceptList =
        acceptList == nullptr ? acceptListCount != 0 : acceptListCount < 0;
    if (badResult || badAcceptList || availableLocales == nullptr) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    LocaleFromTag converter;
    Locale::ConvertingIterator<const char **, LocaleFromTag> desiredLocales(
        acceptList, acceptList + acceptListCount, converter);
    return acceptLanguage(*availableLocales, desiredLocales,
                          result, resultAvailable, outResult, *status);
}
|
||||
|
||||
/**
 * C API: picks the best available locale for an HTTP Accept-Language style string.
 * Sets U_ILLEGAL_ARGUMENT_ERROR when result is nullptr with a non-zero length,
 * when the length is negative, or when one of the other pointers is nullptr.
 *
 * @return the length of the locale name written to result, or 0
 */
U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable,
                            UAcceptResult *outResult,
                            const char *httpAcceptLanguage,
                            UEnumeration *availableLocales,
                            UErrorCode *status) {
    if (U_FAILURE(*status)) { return 0; }
    // A nullptr buffer is only acceptable together with a length of 0.
    const bool badResult =
        result == nullptr ? resultAvailable != 0 : resultAvailable < 0;
    if (badResult || httpAcceptLanguage == nullptr || availableLocales == nullptr) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // A parse failure is reported through *status; acceptLanguage() then returns 0.
    LocalePriorityList list(httpAcceptLanguage, *status);
    LocalePriorityList::Iterator desiredLocales = list.iterator();
    return acceptLanguage(*availableLocales, desiredLocales,
                          result, resultAvailable, outResult, *status);
}
|
||||
240
engine/thirdparty/icu4c/common/localeprioritylist.cpp
vendored
Normal file
240
engine/thirdparty/icu4c/common/localeprioritylist.cpp
vendored
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// localeprioritylist.cpp
|
||||
// created: 2019jul11 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "localeprioritylist.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uassert.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {
|
||||
|
||||
/** UHashtable key hasher: delegates to Locale::hashCode() of the key. */
int32_t hashLocale(const UHashTok token) {
    return static_cast<const Locale*>(token.pointer)->hashCode();
}
|
||||
|
||||
/** UHashtable key comparator: compares the two Locale keys with Locale::operator==. */
UBool compareLocales(const UHashTok t1, const UHashTok t2) {
    const Locale& left = *static_cast<const Locale*>(t1.pointer);
    const Locale& right = *static_cast<const Locale*>(t2.pointer);
    return left == right;
}
|
||||
|
||||
// Weights are stored in thousandths; this is the weight 1.0.
constexpr int32_t WEIGHT_ONE = 1000;
|
||||
|
||||
struct LocaleAndWeight {
|
||||
Locale *locale;
|
||||
int32_t weight; // 0..1000 = 0.0..1.0
|
||||
int32_t index; // force stable sort
|
||||
|
||||
int32_t compare(const LocaleAndWeight &other) const {
|
||||
int32_t diff = other.weight - weight; // descending: other-this
|
||||
if (diff != 0) { return diff; }
|
||||
return index - other.index;
|
||||
}
|
||||
};
|
||||
|
||||
/** Sort callback: adapts LocaleAndWeight::compare() to untyped element pointers. */
int32_t U_CALLCONV
compareLocaleAndWeight(const void * /*context*/, const void *left, const void *right) {
    const LocaleAndWeight &lhs = *static_cast<const LocaleAndWeight *>(left);
    const LocaleAndWeight &rhs = *static_cast<const LocaleAndWeight *>(right);
    return lhs.compare(rhs);
}
|
||||
|
||||
/**
 * Skips ASCII space characters.
 * @return pointer to the first non-space character at or after p, or limit
 */
const char *skipSpaces(const char *p, const char *limit) {
    for (; p < limit; ++p) {
        if (*p != ' ') { break; }
    }
    return p;
}
|
||||
|
||||
/**
 * Returns the number of characters from p up to the next accept-language
 * delimiter (space, comma or semicolon) or up to limit.
 * Other validation is left to the Locale constructor.
 */
int32_t findTagLength(const char *p, const char *limit) {
    const char *end = p;
    while (end < limit && *end != ' ' && *end != ',' && *end != ';') {
        ++end;
    }
    return static_cast<int32_t>(end - p);
}
|
||||
|
||||
/**
|
||||
* Parses and returns a qvalue weight in millis.
|
||||
* Advances p to after the parsed substring.
|
||||
* Returns a negative value if parsing fails.
|
||||
*/
|
||||
int32_t parseWeight(const char *&p, const char *limit) {
|
||||
p = skipSpaces(p, limit);
|
||||
char c;
|
||||
if (p == limit || ((c = *p) != '0' && c != '1')) { return -1; }
|
||||
int32_t weight = (c - '0') * 1000;
|
||||
if (++p == limit || *p != '.') { return weight; }
|
||||
int32_t multiplier = 100;
|
||||
while (++p != limit && '0' <= (c = *p) && c <= '9') {
|
||||
c -= '0';
|
||||
if (multiplier > 0) {
|
||||
weight += c * multiplier;
|
||||
multiplier /= 10;
|
||||
} else if (multiplier == 0) {
|
||||
// round up
|
||||
if (c >= 5) { ++weight; }
|
||||
multiplier = -1;
|
||||
} // else ignore further fraction digits
|
||||
}
|
||||
return weight <= WEIGHT_ONE ? weight : -1; // bad if > 1.0
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* Nothing but a wrapper over a MaybeStackArray of LocaleAndWeight.
|
||||
*
|
||||
* This wrapper exists (and is not in an anonymous namespace)
|
||||
* so that we can forward-declare it in the header file and
|
||||
* don't have to expose the MaybeStackArray specialization and
|
||||
* the LocaleAndWeight to code (like the test) that #includes localeprioritylist.h.
|
||||
* Also, otherwise we would have to do a platform-specific
|
||||
* template export declaration of some kind for the MaybeStackArray specialization
|
||||
* to be properly exported from the common DLL.
|
||||
*/
|
||||
struct LocaleAndWeightArray : public UMemory {
|
||||
MaybeStackArray<LocaleAndWeight, 20> array;
|
||||
};
|
||||
|
||||
/**
 * Parses an accept-language style string into a list of locales.
 *
 * Each list item is a language tag, optionally followed by ";q=<qvalue>".
 * Items are separated by commas; empty items and extra spaces are accepted.
 * After parsing, the list is sorted by descending weight if any weight
 * other than 1.0 was seen.
 *
 * @param s         the string to parse
 * @param errorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                  malformed input and U_MEMORY_ALLOCATION_ERROR if the
 *                  item array cannot be allocated. Once add() reports a
 *                  failure, parsing stops so that the failure is not
 *                  overwritten by a later syntax error.
 */
LocalePriorityList::LocalePriorityList(StringPiece s, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }
    list = new LocaleAndWeightArray();
    if (list == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    const char *p = s.data();
    const char *limit = p + s.length();
    for (;;) {
        p = skipSpaces(p, limit);
        if (p == limit) { break; }
        if (*p == ',') {  // empty range field
            ++p;
            continue;
        }
        const int32_t tagLength = findTagLength(p, limit);
        if (tagLength == 0) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        CharString tag(p, tagLength, errorCode);
        if (U_FAILURE(errorCode)) { return; }
        Locale locale = Locale(tag.data());
        if (locale.isBogus()) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        int32_t weight = WEIGHT_ONE;
        p = skipSpaces(p + tagLength, limit);
        if (p != limit && *p == ';') {
            // Expect: "q", then "=", then a qvalue, with optional spaces between them.
            p = skipSpaces(p + 1, limit);
            bool ok = p != limit && *p == 'q';
            if (ok) {
                p = skipSpaces(p + 1, limit);
                ok = p != limit && *p == '=';
            }
            if (ok) {
                ++p;
                weight = parseWeight(p, limit);
                ok = weight >= 0;
            }
            if (!ok) {
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            p = skipSpaces(p, limit);
        }
        if (p != limit && *p != ',') {  // trailing junk
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        // add() returns false only when errorCode indicates a failure.
        // Stop here so that the failure is reported as is.
        if (!add(locale, weight, errorCode)) { return; }
        if (p == limit) { break; }
        ++p;
    }
    sort(errorCode);
}
|
||||
|
||||
/**
 * Deletes the locales still held by the list, then the list itself,
 * and closes the lookup hashtable.
 */
LocalePriorityList::~LocalePriorityList() {
    if (list != nullptr) {
        LocaleAndWeight *items = list->array.getAlias();
        for (int32_t i = 0; i < listLength; ++i) {
            // Removed or orphaned slots hold nullptr; deleting nullptr is harmless.
            delete items[i].locale;
        }
        delete list;
    }
    uhash_close(map);
}
|
||||
|
||||
/**
 * Returns the locale stored in slot i; nullptr if that slot has been
 * removed or orphaned. Does not check i or whether the list exists.
 */
const Locale *LocalePriorityList::localeAt(int32_t i) const {
    const LocaleAndWeight &entry = list->array[i];
    return entry.locale;
}
|
||||
|
||||
/**
 * Hands the locale in slot i over to the caller and clears the slot.
 * @return the locale previously stored in slot i, or nullptr if there is no list
 */
Locale *LocalePriorityList::orphanLocaleAt(int32_t i) {
    if (list == nullptr) { return nullptr; }
    Locale *&slot = list->array[i].locale;
    Locale *orphan = slot;
    slot = nullptr;
    return orphan;
}
|
||||
|
||||
bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
if (map == nullptr) {
|
||||
if (weight <= 0) { return true; } // do not add q=0
|
||||
map = uhash_open(hashLocale, compareLocales, uhash_compareLong, &errorCode);
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
}
|
||||
LocalPointer<Locale> clone;
|
||||
UBool found = false;
|
||||
int32_t index = uhash_getiAndFound(map, &locale, &found);
|
||||
if (found) {
|
||||
// Duplicate: Remove the old item and append it anew.
|
||||
LocaleAndWeight &lw = list->array[index];
|
||||
clone.adoptInstead(lw.locale);
|
||||
lw.locale = nullptr;
|
||||
lw.weight = 0;
|
||||
++numRemoved;
|
||||
}
|
||||
if (weight <= 0) { // do not add q=0
|
||||
if (found) {
|
||||
// Not strictly necessary but cleaner.
|
||||
uhash_removei(map, &locale);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (clone.isNull()) {
|
||||
clone.adoptInstead(locale.clone());
|
||||
if (clone.isNull() || (clone->isBogus() && !locale.isBogus())) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (listLength == list->array.getCapacity()) {
|
||||
int32_t newCapacity = listLength < 50 ? 100 : 4 * listLength;
|
||||
if (list->array.resize(newCapacity, listLength) == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
uhash_putiAllowZero(map, clone.getAlias(), listLength, &errorCode);
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
LocaleAndWeight &lw = list->array[listLength];
|
||||
lw.locale = clone.orphan();
|
||||
lw.weight = weight;
|
||||
lw.index = listLength++;
|
||||
if (weight < WEIGHT_ONE) { hasWeights = true; }
|
||||
U_ASSERT(uhash_count(map) == getLength());
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Sorts the items by descending weight.
 * Does nothing on a prior failure, for fewer than two live items,
 * or when every item has weight 1.0.
 * The comparator breaks ties by item index, which makes the result stable.
 */
void LocalePriorityList::sort(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }
    if (!hasWeights || getLength() <= 1) { return; }
    uprv_sortArray(list->array.getAlias(), listLength, sizeof(LocaleAndWeight),
                   compareLocaleAndWeight, nullptr, false, &errorCode);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
115
engine/thirdparty/icu4c/common/localeprioritylist.h
vendored
Normal file
115
engine/thirdparty/icu4c/common/localeprioritylist.h
vendored
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// localeprioritylist.h
|
||||
// created: 2019jul11 Markus W. Scherer
|
||||
|
||||
#ifndef __LOCALEPRIORITYLIST_H__
|
||||
#define __LOCALEPRIORITYLIST_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
struct UHashtable;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct LocaleAndWeightArray;
|
||||
|
||||
/**
|
||||
* Parses a list of locales from an accept-language string.
|
||||
* We are a bit more lenient than the spec:
|
||||
* We accept extra whitespace in more places, empty range fields,
|
||||
* and any number of qvalue fraction digits.
|
||||
*
|
||||
* https://tools.ietf.org/html/rfc2616#section-14.4
|
||||
* 14.4 Accept-Language
|
||||
*
|
||||
* Accept-Language = "Accept-Language" ":"
|
||||
* 1#( language-range [ ";" "q" "=" qvalue ] )
|
||||
* language-range = ( ( 1*8ALPHA *( "-" 1*8ALPHA ) ) | "*" )
|
||||
*
|
||||
* Each language-range MAY be given an associated quality value which
|
||||
* represents an estimate of the user's preference for the languages
|
||||
* specified by that range. The quality value defaults to "q=1". For
|
||||
* example,
|
||||
*
|
||||
* Accept-Language: da, en-gb;q=0.8, en;q=0.7
|
||||
*
|
||||
* https://tools.ietf.org/html/rfc2616#section-3.9
|
||||
* 3.9 Quality Values
|
||||
*
|
||||
* HTTP content negotiation (section 12) uses short "floating point"
|
||||
* numbers to indicate the relative importance ("weight") of various
|
||||
* negotiable parameters. A weight is normalized to a real number in
|
||||
* the range 0 through 1, where 0 is the minimum and 1 the maximum
|
||||
* value. If a parameter has a quality value of 0, then content with
|
||||
* this parameter is `not acceptable' for the client. HTTP/1.1
|
||||
* applications MUST NOT generate more than three digits after the
|
||||
* decimal point. User configuration of these values SHOULD also be
|
||||
* limited in this fashion.
|
||||
*
|
||||
* qvalue = ( "0" [ "." 0*3DIGIT ] )
|
||||
* | ( "1" [ "." 0*3("0") ] )
|
||||
*/
|
||||
class U_COMMON_API LocalePriorityList : public UMemory {
public:
    /**
     * Iterates over the locales of a LocalePriorityList,
     * skipping slots whose locale is nullptr.
     */
    class Iterator : public Locale::Iterator {
    public:
        /** @return true while fewer locales have been returned than the list had at construction */
        UBool hasNext() const override { return count < length; }

        /** Returns the next non-null locale. Call only while hasNext() is true. */
        const Locale &next() override {
            const Locale *locale;
            do {
                locale = list.localeAt(index++);
            } while (locale == nullptr);
            ++count;
            return *locale;
        }

    private:
        friend class LocalePriorityList;

        Iterator(const LocalePriorityList &list) : list(list), length(list.getLength()) {}

        const LocalePriorityList &list;
        int32_t index = 0;     // next slot to look at
        int32_t count = 0;     // number of locales returned so far
        const int32_t length;  // list.getLength() at construction time
    };

    /** Parses the accept-language style string s; see the .cpp file for details. */
    LocalePriorityList(StringPiece s, UErrorCode &errorCode);

    ~LocalePriorityList();

    /** @return the number of items, not counting removed ones */
    int32_t getLength() const { return listLength - numRemoved; }

    /** @return the number of slots, including removed ones */
    int32_t getLengthIncludingRemoved() const { return listLength; }

    /** @return an iterator over the locales of this list */
    Iterator iterator() const { return Iterator(*this); }

    /** @return the locale in slot i, which may be nullptr */
    const Locale *localeAt(int32_t i) const;

    /** Returns the locale in slot i and sets that slot to nullptr. */
    Locale *orphanLocaleAt(int32_t i);

private:
    // Not copyable.
    LocalePriorityList(const LocalePriorityList &) = delete;
    LocalePriorityList &operator=(const LocalePriorityList &) = delete;

    bool add(const Locale &locale, int32_t weight, UErrorCode &errorCode);

    void sort(UErrorCode &errorCode);

    LocaleAndWeightArray *list = nullptr;
    int32_t listLength = 0;
    int32_t numRemoved = 0;
    bool hasWeights = false;  // other than 1.0
    UHashtable *map = nullptr;
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __LOCALEPRIORITYLIST_H__
|
||||
27
engine/thirdparty/icu4c/common/localsvc.h
vendored
Normal file
27
engine/thirdparty/icu4c/common/localsvc.h
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2006 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef LOCALSVC_H
|
||||
#define LOCALSVC_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if defined(U_LOCAL_SERVICE_HOOK) && U_LOCAL_SERVICE_HOOK
|
||||
/**
|
||||
* Prototype for user-supplied service hook. This function is expected to return
|
||||
* a type of factory object specific to the requested service.
|
||||
*
|
||||
* @param what service-specific string identifying the specific user hook
|
||||
* @param status error status
|
||||
* @return a service-specific hook, or NULL on failure.
|
||||
*/
|
||||
U_CAPI void* uprv_svc_hook(const char *what, UErrorCode *status);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
265
engine/thirdparty/icu4c/common/locavailable.cpp
vendored
Normal file
265
engine/thirdparty/icu4c/common/locavailable.cpp
vendored
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: locavailable.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010feb25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Code for available locales, separated out from other .cpp files
|
||||
* that then do not depend on resource bundle code and res_index bundles.
|
||||
*/
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "uassert.h"
|
||||
#include "umutex.h"
|
||||
#include "uresimp.h"
|
||||
|
||||
// C++ API ----------------------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Array of Locale objects allocated by locale_available_init() and
// released by locale_available_cleanup(); nullptr while not initialized.
static icu::Locale* availableLocaleList = nullptr;
|
||||
// Number of elements in availableLocaleList.
static int32_t availableLocaleListCount;
|
||||
// Guards the one-time call of locale_available_init() via umtx_initOnce().
static icu::UInitOnce gInitOnceLocale {};
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Cleanup callback: frees the cached Locale array, zeroes its count,
 * and resets the init-once flag so the array can be built again.
 * @return always true
 */
UBool U_CALLCONV locale_available_cleanup()
{
    if (availableLocaleList != nullptr) {
        delete[] availableLocaleList;
    }
    availableLocaleList = nullptr;
    availableLocaleListCount = 0;
    gInitOnceLocale.reset();

    return true;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Builds the cached array of available Locale objects from the C API list
 * (uloc_countAvailable() / uloc_getAvailable()) and registers the cleanup.
 *
 * This function is a friend of class Locale.
 * It is only invoked via umtx_initOnce().
 */
void U_CALLCONV locale_available_init() {
    availableLocaleListCount = uloc_countAvailable();
    if (availableLocaleListCount) {
        availableLocaleList = new Locale[availableLocaleListCount];
    }
    if (availableLocaleList == nullptr) {
        // No list (none available, or allocation failed): report zero locales.
        availableLocaleListCount = 0;
    }
    // Fill from the last element down to the first.
    for (int32_t i = availableLocaleListCount; i > 0;) {
        --i;
        availableLocaleList[i].setFromPOSIXID(uloc_getAvailable(i));
    }
    ucln_common_registerCleanup(UCLN_COMMON_LOCALE_AVAILABLE, locale_available_cleanup);
}
|
||||
|
||||
/**
 * Returns the cached array of available locales, building it on first use.
 * @param count output: the number of elements in the returned array
 * @return the array, or nullptr if it is empty or could not be allocated
 */
const Locale* U_EXPORT2
Locale::getAvailableLocales(int32_t& count)
{
    // Build the cache at most once.
    umtx_initOnce(gInitOnceLocale, &locale_available_init);
    const Locale* const locales = availableLocaleList;
    count = availableLocaleListCount;
    return locales;
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/* ### Constants **************************************************/
|
||||
|
||||
namespace {
|
||||
|
||||
// Enough capacity for the two lists in the res_index.res file.
// Both arrays are indexed by ULocAvailableType
// (ULOC_AVAILABLE_DEFAULT and ULOC_AVAILABLE_ONLY_LEGACY_ALIASES).
|
||||
// Per type: uprv_malloc'ed array of locale name pointers, freed in uloc_cleanup().
const char** gAvailableLocaleNames[2] = {};
|
||||
// Per type: number of entries in the corresponding gAvailableLocaleNames array.
int32_t gAvailableLocaleCounts[2] = {};
|
||||
// Guards the one-time call of loadInstalledLocales().
icu::UInitOnce ginstalledLocalesInitOnce {};
|
||||
|
||||
class AvailableLocalesSink : public ResourceSink {
|
||||
public:
|
||||
void put(const char *key, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) override {
|
||||
if (U_FAILURE(status)) { return; }
|
||||
ResourceTable resIndexTable = value.getTable(status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
for (int32_t i = 0; resIndexTable.getKeyAndValue(i, key, value); ++i) {
|
||||
ULocAvailableType type;
|
||||
if (uprv_strcmp(key, "InstalledLocales") == 0) {
|
||||
type = ULOC_AVAILABLE_DEFAULT;
|
||||
} else if (uprv_strcmp(key, "AliasLocales") == 0) {
|
||||
type = ULOC_AVAILABLE_ONLY_LEGACY_ALIASES;
|
||||
} else {
|
||||
// CLDRVersion, etc.
|
||||
continue;
|
||||
}
|
||||
ResourceTable availableLocalesTable = value.getTable(status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
gAvailableLocaleCounts[type] = availableLocalesTable.getSize();
|
||||
gAvailableLocaleNames[type] = static_cast<const char**>(
|
||||
uprv_malloc(gAvailableLocaleCounts[type] * sizeof(const char*)));
|
||||
if (gAvailableLocaleNames[type] == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
for (int32_t j = 0; availableLocalesTable.getKeyAndValue(j, key, value); ++j) {
|
||||
gAvailableLocaleNames[type][j] = key;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class AvailableLocalesStringEnumeration : public StringEnumeration {
|
||||
public:
|
||||
AvailableLocalesStringEnumeration(ULocAvailableType type) : fType(type) {
|
||||
}
|
||||
|
||||
const char* next(int32_t *resultLength, UErrorCode &status) override {
|
||||
if (U_FAILURE(status)) { return nullptr; }
|
||||
ULocAvailableType actualType = fType;
|
||||
int32_t actualIndex = fIndex++;
|
||||
|
||||
// If the "combined" list was requested, resolve that now
|
||||
if (fType == ULOC_AVAILABLE_WITH_LEGACY_ALIASES) {
|
||||
int32_t defaultLocalesCount = gAvailableLocaleCounts[ULOC_AVAILABLE_DEFAULT];
|
||||
if (actualIndex < defaultLocalesCount) {
|
||||
actualType = ULOC_AVAILABLE_DEFAULT;
|
||||
} else {
|
||||
actualIndex -= defaultLocalesCount;
|
||||
actualType = ULOC_AVAILABLE_ONLY_LEGACY_ALIASES;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the requested string
|
||||
int32_t count = gAvailableLocaleCounts[actualType];
|
||||
const char* result;
|
||||
if (actualIndex < count) {
|
||||
result = gAvailableLocaleNames[actualType][actualIndex];
|
||||
if (resultLength != nullptr) {
|
||||
*resultLength = static_cast<int32_t>(uprv_strlen(result));
|
||||
}
|
||||
} else {
|
||||
result = nullptr;
|
||||
if (resultLength != nullptr) {
|
||||
*resultLength = 0;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void reset(UErrorCode &status) override {
|
||||
if (U_FAILURE(status)) { return; }
|
||||
fIndex = 0;
|
||||
}
|
||||
|
||||
int32_t count(UErrorCode &status) const override {
|
||||
if (U_FAILURE(status)) { return 0; }
|
||||
if (fType == ULOC_AVAILABLE_WITH_LEGACY_ALIASES) {
|
||||
return gAvailableLocaleCounts[ULOC_AVAILABLE_DEFAULT]
|
||||
+ gAvailableLocaleCounts[ULOC_AVAILABLE_ONLY_LEGACY_ALIASES];
|
||||
} else {
|
||||
return gAvailableLocaleCounts[fType];
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
ULocAvailableType fType;
|
||||
int32_t fIndex = 0;
|
||||
};
|
||||
|
||||
/* ### Get available **************************************************/
|
||||
|
||||
/**
 * Cleanup callback: frees the locale name arrays, zeroes their counts,
 * and resets the init-once flag.
 * @return always true
 */
UBool U_CALLCONV uloc_cleanup() {
    for (const char **&names : gAvailableLocaleNames) {
        uprv_free(names);
        names = nullptr;
    }
    for (int32_t &count : gAvailableLocaleCounts) {
        count = 0;
    }
    ginstalledLocalesInitOnce.reset();
    return true;
}
|
||||
|
||||
// Load Installed Locales. This function will be called exactly once
|
||||
// via the initOnce mechanism.
|
||||
|
||||
void U_CALLCONV loadInstalledLocales(UErrorCode& status) {
|
||||
ucln_common_registerCleanup(UCLN_COMMON_ULOC, uloc_cleanup);
|
||||
|
||||
icu::LocalUResourceBundlePointer rb(ures_openDirect(nullptr, "res_index", &status));
|
||||
AvailableLocalesSink sink;
|
||||
ures_getAllItemsWithFallback(rb.getAlias(), "", sink, status);
|
||||
}
|
||||
|
||||
/** Runs loadInstalledLocales() at most once, passing status through. */
void _load_installedLocales(UErrorCode& status) {
    umtx_initOnce(ginstalledLocalesInitOnce, &loadInstalledLocales, status);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Returns the name of the installed locale at the given position.
 *
 * @param offset zero-based index into the list of installed locales
 * @return the locale name, or nullptr if the list could not be loaded
 *         or offset is outside 0..uloc_countAvailable()-1
 */
U_CAPI const char* U_EXPORT2
uloc_getAvailable(int32_t offset) {
    icu::ErrorCode status;
    _load_installedLocales(status);
    if (status.isFailure()) {
        return nullptr;
    }
    // The name array has exactly gAvailableLocaleCounts[0] entries, so
    // offset == count and negative offsets are out of range as well.
    if (offset < 0 || offset >= gAvailableLocaleCounts[0]) {
        // *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    return gAvailableLocaleNames[0][offset];
}
|
||||
|
||||
/**
 * Returns the number of installed locales, or 0 if the list could not be loaded.
 */
U_CAPI int32_t U_EXPORT2
uloc_countAvailable() {
    icu::ErrorCode status;
    _load_installedLocales(status);
    return status.isFailure() ? 0 : gAvailableLocaleCounts[0];
}
|
||||
|
||||
/**
 * Opens an enumeration over the available locale names of the given type.
 *
 * @param type   which list to enumerate; must be in 0..ULOC_AVAILABLE_COUNT-1,
 *               otherwise U_ILLEGAL_ARGUMENT_ERROR is set
 * @param status in/out error code
 * @return the new enumeration, or nullptr on failure
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openAvailableByType(ULocAvailableType type, UErrorCode* status) {
    if (U_FAILURE(*status)) {
        return nullptr;
    }
    const bool isKnownType = type >= 0 && type < ULOC_AVAILABLE_COUNT;
    if (!isKnownType) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    _load_installedLocales(*status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }
    // LocalPointer sets *status if the allocation failed.
    LocalPointer<AvailableLocalesStringEnumeration> names(
        new AvailableLocalesStringEnumeration(type), *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }
    return uenum_openFromStringEnumeration(names.orphan(), status);
}
|
||||
55
engine/thirdparty/icu4c/common/locbased.cpp
vendored
Normal file
55
engine/thirdparty/icu4c/common/locbased.cpp
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2004-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
* Created: January 16 2004
|
||||
* Since: ICU 2.8
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "locbased.h"
|
||||
#include "cstring.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Returns the valid or actual locale as a Locale object.
 * If getLocaleID() yields nullptr, the Locale is built from the empty string.
 */
Locale LocaleBased::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    const char* id = getLocaleID(type, status);
    if (id == nullptr) {
        id = "";
    }
    return Locale(id);
}
|
||||
|
||||
/**
 * Returns the wrapped valid or actual locale ID.
 *
 * @param type   ULOC_VALID_LOCALE or ULOC_ACTUAL_LOCALE; any other value
 *               sets U_ILLEGAL_ARGUMENT_ERROR
 * @param status in/out error code
 * @return the requested ID, or nullptr on failure
 */
const char* LocaleBased::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    if (type == ULOC_VALID_LOCALE) {
        return valid;
    }
    if (type == ULOC_ACTUAL_LOCALE) {
        return actual;
    }
    status = U_ILLEGAL_ARGUMENT_ERROR;
    return nullptr;
}
|
||||
|
||||
void LocaleBased::setLocaleIDs(const char* validID, const char* actualID) {
|
||||
if (validID != nullptr) {
|
||||
uprv_strncpy(valid, validID, ULOC_FULLNAME_CAPACITY);
|
||||
valid[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
|
||||
}
|
||||
if (actualID != nullptr) {
|
||||
uprv_strncpy(actual, actualID, ULOC_FULLNAME_CAPACITY);
|
||||
actual[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Copies the names of the given locales into the wrapped buffers.
 *
 * The buffers hold ULOC_FULLNAME_CAPACITY chars, so the copy goes through
 * the const char* overload, which truncates over-long names and always
 * NUL-terminates, instead of an unbounded string copy.
 */
void LocaleBased::setLocaleIDs(const Locale& validID, const Locale& actualID) {
    setLocaleIDs(validID.getName(), actualID.getName());
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
107
engine/thirdparty/icu4c/common/locbased.h
vendored
Normal file
107
engine/thirdparty/icu4c/common/locbased.h
vendored
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2004-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
* Created: January 16 2004
|
||||
* Since: ICU 2.8
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef LOCBASED_H
|
||||
#define LOCBASED_H
|
||||
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
/**
 * Declares a LocaleBased wrapper object named varname for the object objname.
 * objname must have two members named `validLocale' and `actualLocale',
 * each of size ULOC_FULLNAME_CAPACITY.
 */
|
||||
#define U_LOCALE_BASED(varname, objname) \
|
||||
LocaleBased varname((objname).validLocale, (objname).actualLocale)
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* A utility class that unifies the implementation of getLocale() by
|
||||
* various ICU services. This class is likely to be removed in the
|
||||
* ICU 3.0 time frame in favor of an integrated approach with the
|
||||
* services framework.
|
||||
* @since ICU 2.8
|
||||
*/
|
||||
class U_COMMON_API LocaleBased : public UMemory {
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Construct a LocaleBased wrapper around the two pointers. These
|
||||
* will be aliased for the lifetime of this object.
|
||||
*/
|
||||
inline LocaleBased(char* validAlias, char* actualAlias);
|
||||
|
||||
/**
|
||||
* Construct a LocaleBased wrapper around the two const pointers.
|
||||
* These will be aliased for the lifetime of this object.
|
||||
*/
|
||||
inline LocaleBased(const char* validAlias, const char* actualAlias);
|
||||
|
||||
/**
|
||||
* Return locale meta-data for the service object wrapped by this
|
||||
* object. Either the valid or the actual locale may be
|
||||
* retrieved.
|
||||
* @param type either ULOC_VALID_LOCALE or ULOC_ACTUAL_LOCALE
|
||||
* @param status input-output error code
|
||||
* @return the indicated locale
|
||||
*/
|
||||
Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
|
||||
|
||||
/**
|
||||
* Return the locale ID for the service object wrapped by this
|
||||
* object. Either the valid or the actual locale may be
|
||||
* retrieved.
|
||||
* @param type either ULOC_VALID_LOCALE or ULOC_ACTUAL_LOCALE
|
||||
* @param status input-output error code
|
||||
* @return the indicated locale ID
|
||||
*/
|
||||
const char* getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
|
||||
|
||||
/**
|
||||
* Set the locale meta-data for the service object wrapped by this
|
||||
* object. If either parameter is zero, it is ignored.
|
||||
* @param valid the ID of the valid locale
|
||||
* @param actual the ID of the actual locale
|
||||
*/
|
||||
void setLocaleIDs(const char* valid, const char* actual);
|
||||
|
||||
/**
|
||||
* Set the locale meta-data for the service object wrapped by this
|
||||
* object.
|
||||
* @param valid the ID of the valid locale
|
||||
* @param actual the ID of the actual locale
|
||||
*/
|
||||
void setLocaleIDs(const Locale& valid, const Locale& actual);
|
||||
|
||||
private:
|
||||
|
||||
char* valid;
|
||||
|
||||
char* actual;
|
||||
};
|
||||
|
||||
// Stores the two buffer pointers; nothing is copied.
inline LocaleBased::LocaleBased(char* validAlias, char* actualAlias)
    : valid(validAlias),
      actual(actualAlias) {
}
|
||||
|
||||
// Stores the two buffer pointers; nothing is copied.
// The members are non-const pointers, so constness is cast away here.
inline LocaleBased::LocaleBased(const char* validAlias,
                                const char* actualAlias)
    : valid(const_cast<char*>(validAlias)),
      actual(const_cast<char*>(actualAlias)) {
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
897
engine/thirdparty/icu4c/common/locdispnames.cpp
vendored
Normal file
897
engine/thirdparty/icu4c/common/locdispnames.cpp
vendored
Normal file
|
|
@ -0,0 +1,897 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: locdispnames.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010feb25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Code for locale display names, separated out from other .cpp files
|
||||
* that then do not depend on resource bundle code and display name data.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/uenum.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "putilimp.h"
|
||||
#include "ulocimp.h"
|
||||
#include "uresimp.h"
|
||||
#include "ureslocs.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
// C++ API ----------------------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Display language of this locale, localized for the default locale.
UnicodeString&
Locale::getDisplayLanguage(UnicodeString& dispLang) const
{
    const Locale &inLocale = getDefault();
    return getDisplayLanguage(inLocale, dispLang);
}
|
||||
|
||||
/* We cannot make any assumptions about the size of the output display strings.
 * Yet, since we are calling through to a C API, we need to set limits on the
 * buffer size. Each of the following getDisplay functions first tries to fill
 * a buffer of ULOC_FULLNAME_CAPACITY code units obtained from the result string.
 * If that is too small, the function asks the result string for a buffer of
 * exactly the required length and calls the C API a second time. */
|
||||
|
||||
// Display language of this locale, localized for displayLocale.
// On any failure the result is left empty.
UnicodeString&
Locale::getDisplayLanguage(const Locale &displayLocale,
                           UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *dest = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t needed = uloc_getDisplayLanguage(fullName, displayLocale.fullName,
                                             dest, result.getCapacity(),
                                             &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Second attempt: the first call reported the length that is needed.
    dest = result.getBuffer(needed);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    needed = uloc_getDisplayLanguage(fullName, displayLocale.fullName,
                                     dest, result.getCapacity(),
                                     &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    return result;
}
|
||||
|
||||
// Display script of this locale, localized for the default locale.
UnicodeString&
Locale::getDisplayScript(UnicodeString& dispScript) const
{
    const Locale &inLocale = getDefault();
    return getDisplayScript(inLocale, dispScript);
}
|
||||
|
||||
// Display script of this locale, localized for displayLocale.
// On any failure the result is left empty.
UnicodeString&
Locale::getDisplayScript(const Locale &displayLocale,
                         UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *dest = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t needed = uloc_getDisplayScript(fullName, displayLocale.fullName,
                                           dest, result.getCapacity(),
                                           &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Second attempt: the first call reported the length that is needed.
    dest = result.getBuffer(needed);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    needed = uloc_getDisplayScript(fullName, displayLocale.fullName,
                                   dest, result.getCapacity(),
                                   &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    return result;
}
|
||||
|
||||
// Display country of this locale, localized for the default locale.
UnicodeString&
Locale::getDisplayCountry(UnicodeString& dispCntry) const
{
    const Locale &inLocale = getDefault();
    return getDisplayCountry(inLocale, dispCntry);
}
|
||||
|
||||
// Display country of this locale, localized for displayLocale.
// On any failure the result is left empty.
UnicodeString&
Locale::getDisplayCountry(const Locale &displayLocale,
                          UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *dest = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t needed = uloc_getDisplayCountry(fullName, displayLocale.fullName,
                                            dest, result.getCapacity(),
                                            &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Second attempt: the first call reported the length that is needed.
    dest = result.getBuffer(needed);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    needed = uloc_getDisplayCountry(fullName, displayLocale.fullName,
                                    dest, result.getCapacity(),
                                    &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    return result;
}
|
||||
|
||||
// Display variant of this locale, localized for the default locale.
UnicodeString&
Locale::getDisplayVariant(UnicodeString& dispVar) const
{
    const Locale &inLocale = getDefault();
    return getDisplayVariant(inLocale, dispVar);
}
|
||||
|
||||
// Display variant of this locale, localized for displayLocale.
// On any failure the result is left empty.
UnicodeString&
Locale::getDisplayVariant(const Locale &displayLocale,
                          UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *dest = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t needed = uloc_getDisplayVariant(fullName, displayLocale.fullName,
                                            dest, result.getCapacity(),
                                            &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Second attempt: the first call reported the length that is needed.
    dest = result.getBuffer(needed);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    needed = uloc_getDisplayVariant(fullName, displayLocale.fullName,
                                    dest, result.getCapacity(),
                                    &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    return result;
}
|
||||
|
||||
// Display name of this locale, localized for the default locale.
UnicodeString&
Locale::getDisplayName( UnicodeString& name ) const
{
    const Locale &inLocale = getDefault();
    return getDisplayName(inLocale, name);
}
|
||||
|
||||
// Display name of this locale, localized for displayLocale.
// On any failure the result is left empty.
UnicodeString&
Locale::getDisplayName(const Locale &displayLocale,
                       UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *dest = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t needed = uloc_getDisplayName(fullName, displayLocale.fullName,
                                         dest, result.getCapacity(),
                                         &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Second attempt: the first call reported the length that is needed.
    dest = result.getBuffer(needed);
    if (dest == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    needed = uloc_getDisplayName(fullName, displayLocale.fullName,
                                 dest, result.getCapacity(),
                                 &status);
    result.releaseBuffer(U_SUCCESS(status) ? needed : 0);
    return result;
}
|
||||
|
||||
#if ! UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
// -------------------------------------
|
||||
// Gets the objectLocale display name in the default locale language.
|
||||
UnicodeString& U_EXPORT2
|
||||
BreakIterator::getDisplayName(const Locale& objectLocale,
|
||||
UnicodeString& name)
|
||||
{
|
||||
return objectLocale.getDisplayName(name);
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
// Gets the objectLocale display name in the displayLocale language.
|
||||
UnicodeString& U_EXPORT2
|
||||
BreakIterator::getDisplayName(const Locale& objectLocale,
|
||||
const Locale& displayLocale,
|
||||
UnicodeString& name)
|
||||
{
|
||||
return objectLocale.getDisplayName(displayLocale, name);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
namespace {
|
||||
|
||||
/* ### Constants **************************************************/

/* Keys of the resources this file tries to load from the locale
   ResourceBundle data. */

/* Tables holding display names for the locale's subtags. */
constexpr char _kLanguages[]            = "Languages";
constexpr char _kScripts[]              = "Scripts";
constexpr char _kScriptsStandAlone[]    = "Scripts%stand-alone";
constexpr char _kCountries[]            = "Countries";
constexpr char _kVariants[]             = "Variants";

/* Tables holding display names for keywords and their values. */
constexpr char _kKeys[]                 = "Keys";
constexpr char _kTypes[]                = "Types";
//constexpr char _kRootName[]         = "root";
constexpr char _kCurrency[]             = "currency";
constexpr char _kCurrencies[]           = "Currencies";

/* Table and item keys read by uloc_getDisplayName() to obtain the
   pattern and the separator. */
constexpr char _kLocaleDisplayPattern[] = "localeDisplayPattern";
constexpr char _kPattern[]              = "pattern";
constexpr char _kSeparator[]            = "separator";
|
||||
|
||||
/* ### Display name **************************************************/
|
||||
|
||||
/**
 * Looks up a display string in resource data and copies it to `dest`; when
 * the lookup fails, copies `substitute` instead.
 *
 * @param path         Resource bundle path handed to the lookup functions.
 * @param locale       Locale ID of the bundle to open.
 * @param tableKey     Key of the table (or, when itemKey is nullptr, of the
 *                     top-level string).
 * @param subTableKey  Passed through to uloc_getTableStringWithFallback().
 * @param itemKey      Item to look up inside the table; nullptr selects a
 *                     plain ures_getStringByKey() on `tableKey`.
 * @param substitute   Invariant-character string copied out when no
 *                     resource string was obtained.
 * @param dest         Output buffer.
 * @param destCapacity Capacity of `dest` in char16_t units.
 * @param errorCode    In/out status. On entry a failure returns 0 at once.
 *                     When the substitute is used it is set to
 *                     U_USING_DEFAULT_WARNING before termination.
 * @return The value of u_terminateUChars() for the full (untruncated)
 *         length of the chosen string.
 */
int32_t
_getStringOrCopyKey(const char *path, const char *locale,
                    const char *tableKey,
                    const char* subTableKey,
                    const char *itemKey,
                    const char *substitute,
                    char16_t *dest, int32_t destCapacity,
                    UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return 0;
    }

    const char16_t *found = nullptr;
    int32_t length = 0;

    if (itemKey != nullptr) {
        /* Does tableKey start with "Languages"? */
        const bool languageTable =
            uprv_strncmp(tableKey, _kLanguages, sizeof(_kLanguages) - 1) == 0;

        if (languageTable && uprv_strtol(itemKey, nullptr, 10) != 0) {
            /* A language code must not parse as a non-zero number. */
            errorCode = U_MISSING_RESOURCE_ERROR;
        } else {
            /* second-level item, use special fallback */
            found = uloc_getTableStringWithFallback(path, locale,
                                                    tableKey,
                                                    subTableKey,
                                                    itemKey,
                                                    &length,
                                                    &errorCode);
            if (languageTable && U_FAILURE(errorCode)) {
                // ICU-20870: canonicalize the item key and look it up again.
                errorCode = U_ZERO_ERROR;
                const Locale canonical = Locale::createCanonical(itemKey);
                found = uloc_getTableStringWithFallback(path, locale,
                                                        tableKey,
                                                        subTableKey,
                                                        canonical.getName(),
                                                        &length,
                                                        &errorCode);
            }
        }
    } else {
        /* top-level item: normal resource bundle access */
        icu::LocalUResourceBundlePointer bundle(ures_open(path, locale, &errorCode));
        if (U_SUCCESS(errorCode)) {
            found = ures_getStringByKey(bundle.getAlias(), tableKey, &length, &errorCode);
            /* `bundle` is closed at the end of this scope while `found` is still
               read below; see the comment about closing rb near "return item;"
               in _res_getTableStringWithFallback() */
        }
    }

    if (U_FAILURE(errorCode)) {
        /* no string from a resource bundle: convert the substitute */
        length = static_cast<int32_t>(uprv_strlen(substitute));
        u_charsToUChars(substitute, dest, uprv_min(length, destCapacity));
        errorCode = U_USING_DEFAULT_WARNING;
    } else if (found != nullptr) {
        /* Copy only what fits; `length` still reports the full size. */
        const int32_t copyLength = uprv_min(length, destCapacity);
        if (copyLength > 0) {
            u_memcpy(dest, found, copyLength);
        }
    }

    return u_terminateUChars(dest, destCapacity, length, &errorCode);
}
|
||||
|
||||
using UDisplayNameGetter = icu::CharString(const char*, UErrorCode&);
|
||||
|
||||
int32_t
|
||||
_getDisplayNameForComponent(const char *locale,
|
||||
const char *displayLocale,
|
||||
char16_t *dest, int32_t destCapacity,
|
||||
UDisplayNameGetter *getter,
|
||||
const char *tag,
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return 0; }
|
||||
UErrorCode localStatus;
|
||||
const char* root = nullptr;
|
||||
|
||||
if(destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
localStatus = U_ZERO_ERROR;
|
||||
icu::CharString localeBuffer = (*getter)(locale, localStatus);
|
||||
if (U_FAILURE(localStatus)) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if (localeBuffer.isEmpty()) {
|
||||
// For the display name, we treat this as unknown language (ICU-20273).
|
||||
if (getter == ulocimp_getLanguage) {
|
||||
localeBuffer.append("und", errorCode);
|
||||
} else {
|
||||
return u_terminateUChars(dest, destCapacity, 0, &errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
root = tag == _kCountries ? U_ICUDATA_REGION : U_ICUDATA_LANG;
|
||||
|
||||
return _getStringOrCopyKey(root, displayLocale,
|
||||
tag, nullptr, localeBuffer.data(),
|
||||
localeBuffer.data(),
|
||||
dest, destCapacity,
|
||||
errorCode);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Returns the display name of the language component of `locale`, looked
 * up in the "Languages" table for `displayLocale`.
 *
 * @param locale         Locale ID whose language is wanted.
 * @param displayLocale  Locale whose data supplies the name.
 * @param dest           Output buffer.
 * @param destCapacity   Capacity of `dest` in char16_t units.
 * @param pErrorCode     In/out ICU status. A null pointer makes the call
 *                       return 0 without touching `dest`, matching the
 *                       guard in uloc_getDisplayName().
 * @return Length reported by _getDisplayNameForComponent(), or 0.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayLanguage(const char *locale,
                        const char *displayLocale,
                        char16_t *dest, int32_t destCapacity,
                        UErrorCode *pErrorCode) {
    /* The helper takes the status by reference, so null must be rejected here. */
    if (pErrorCode == nullptr) {
        return 0;
    }
    return _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                ulocimp_getLanguage, _kLanguages, *pErrorCode);
}
|
||||
|
||||
/**
 * Returns the display name of the script component of `locale`.
 *
 * The "Scripts%stand-alone" table is tried first with a private status.
 * If that lookup ends with U_USING_DEFAULT_WARNING, the "Scripts" table is
 * used instead. For a preflight call (destCapacity == 0 with
 * U_BUFFER_OVERFLOW_ERROR) both tables are queried and the larger length
 * is returned.
 *
 * @param locale         Locale ID whose script is wanted.
 * @param displayLocale  Locale whose data supplies the name.
 * @param dest           Output buffer.
 * @param destCapacity   Capacity of `dest` in char16_t units.
 * @param pErrorCode     In/out ICU status. A null pointer or an incoming
 *                       failure makes the call return 0.
 * @return Length of the display name (or the preflight maximum), or 0.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayScript(const char* locale,
                      const char* displayLocale,
                      char16_t *dest, int32_t destCapacity,
                      UErrorCode *pErrorCode)
{
    /* Guard against null before dereferencing, as uloc_getDisplayName() does. */
    if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) { return 0; }

    UErrorCode err = U_ZERO_ERROR;
    int32_t res = _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                ulocimp_getScript, _kScriptsStandAlone, err);

    if (destCapacity == 0 && err == U_BUFFER_OVERFLOW_ERROR) {
        // For preflight, return the max of the value and the fallback.
        int32_t fallback_res = _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                                           ulocimp_getScript, _kScripts, *pErrorCode);
        return (fallback_res > res) ? fallback_res : res;
    }
    if ( err == U_USING_DEFAULT_WARNING ) {
        /* No stand-alone form was found: fall back to the "Scripts" table. */
        return _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                           ulocimp_getScript, _kScripts, *pErrorCode);
    } else {
        /* Publish the private status of the stand-alone lookup. */
        *pErrorCode = err;
        return res;
    }
}
|
||||
|
||||
/**
 * File-local variant of the script lookup that reads the "Scripts" table
 * directly, without first trying "Scripts%stand-alone".
 *
 * @param pErrorCode  In/out ICU status; dereferenced unconditionally, so the
 *                    caller must pass a valid pointer.
 * @return Length reported by _getDisplayNameForComponent().
 */
static int32_t
uloc_getDisplayScriptInContext(const char* locale,
                      const char* displayLocale,
                      char16_t *dest, int32_t destCapacity,
                      UErrorCode *pErrorCode)
{
    UErrorCode &status = *pErrorCode;
    return _getDisplayNameForComponent(locale, displayLocale,
                                       dest, destCapacity,
                                       ulocimp_getScript, _kScripts,
                                       status);
}
|
||||
|
||||
/**
 * Returns the display name of the region component of `locale`, looked up
 * in the "Countries" table for `displayLocale`.
 *
 * @param locale         Locale ID whose region is wanted.
 * @param displayLocale  Locale whose data supplies the name.
 * @param dest           Output buffer.
 * @param destCapacity   Capacity of `dest` in char16_t units.
 * @param pErrorCode     In/out ICU status. A null pointer makes the call
 *                       return 0 without touching `dest`, matching the
 *                       guard in uloc_getDisplayName().
 * @return Length reported by _getDisplayNameForComponent(), or 0.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayCountry(const char *locale,
                       const char *displayLocale,
                       char16_t *dest, int32_t destCapacity,
                       UErrorCode *pErrorCode) {
    /* The helper takes the status by reference, so null must be rejected here. */
    if (pErrorCode == nullptr) {
        return 0;
    }
    return _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                ulocimp_getRegion, _kCountries, *pErrorCode);
}
|
||||
|
||||
/*
|
||||
* TODO separate variant1_variant2_variant3...
|
||||
* by getting each tag's display string and concatenating them with ", "
|
||||
* in between - similar to uloc_getDisplayName()
|
||||
*/
|
||||
/**
 * Returns the display name of the variant component of `locale`, looked up
 * in the "Variants" table for `displayLocale`. The whole variant string is
 * used as a single lookup key.
 *
 * @param locale         Locale ID whose variant is wanted.
 * @param displayLocale  Locale whose data supplies the name.
 * @param dest           Output buffer.
 * @param destCapacity   Capacity of `dest` in char16_t units.
 * @param pErrorCode     In/out ICU status. A null pointer makes the call
 *                       return 0 without touching `dest`, matching the
 *                       guard in uloc_getDisplayName().
 * @return Length reported by _getDisplayNameForComponent(), or 0.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayVariant(const char *locale,
                       const char *displayLocale,
                       char16_t *dest, int32_t destCapacity,
                       UErrorCode *pErrorCode) {
    /* The helper takes the status by reference, so null must be rejected here. */
    if (pErrorCode == nullptr) {
        return 0;
    }
    return _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                ulocimp_getVariant, _kVariants, *pErrorCode);
}
|
||||
|
||||
/* Instead of having a separate pass for 'special' patterns, reintegrate the two
|
||||
* so we don't get bitten by preflight bugs again. We can be reasonably efficient
|
||||
* without two separate code paths, this code isn't that performance-critical.
|
||||
*
|
||||
* This code is general enough to deal with patterns that have a prefix or swap the
|
||||
* language and remainder components, since we gave developers enough rope to do such
|
||||
* things if they futz with the pattern data. But since we don't give them a way to
|
||||
* specify a pattern for arbitrary combinations of components, there's not much use in
|
||||
 * that. I don't think our data includes such patterns, the only variable I know of is
|
||||
* whether there is a space before the open paren, or not. Oh, and zh uses different
|
||||
* chars than the standard open/close paren (which ja and ko use, btw).
|
||||
*/
|
||||
/**
 * Builds the full display name of `locale` for `displayLocale` directly in
 * `dest`.
 *
 * The result combines the display language ({0}) with "everything else"
 * ({1}): script, country, variant and each keyword as "key=value", joined
 * by the separator. The pattern and separator are read from the
 * "localeDisplayPattern" table of the display locale; the built-in
 * defaults "{0} ({1})" and "{0}, {1}" are used when the data is missing.
 * Parentheses inside the {1} components are replaced by square brackets
 * (fullwidth ones when the pattern contains U+FF08). If only one of the
 * two parts exists, the result is that part alone, without pattern text.
 *
 * @param locale         Locale ID to describe.
 * @param displayLocale  Locale whose data supplies names and patterns.
 * @param dest           Output buffer (may be nullptr only if destCapacity is 0).
 * @param destCapacity   Capacity of `dest` in char16_t units.
 * @param pErrorCode     In/out ICU status. Null or an incoming failure
 *                       returns 0; bad buffer arguments or a separator or
 *                       pattern lacking {0}/{1} set U_ILLEGAL_ARGUMENT_ERROR.
 * @return The value of u_terminateUChars() for the full length of the
 *         formatted name, or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayName(const char *locale,
                    const char *displayLocale,
                    char16_t *dest, int32_t destCapacity,
                    UErrorCode *pErrorCode)
{
    static const char16_t defaultSeparator[9] = { 0x007b, 0x0030, 0x007d, 0x002c, 0x0020, 0x007b, 0x0031, 0x007d, 0x0000 }; /* "{0}, {1}" */
    static const char16_t sub0[4] = { 0x007b, 0x0030, 0x007d , 0x0000 } ; /* {0} */
    static const char16_t sub1[4] = { 0x007b, 0x0031, 0x007d , 0x0000 } ; /* {1} */
    static const int32_t subLen = 3;
    static const char16_t defaultPattern[10] = {
        0x007b, 0x0030, 0x007d, 0x0020, 0x0028, 0x007b, 0x0031, 0x007d, 0x0029, 0x0000
    }; /* {0} ({1}) */
    static const int32_t defaultPatLen = 9;
    static const int32_t defaultSub0Pos = 0;
    static const int32_t defaultSub1Pos = 5;

    int32_t length; /* of formatted result */

    const char16_t *separator;
    int32_t sepLen = 0;
    const char16_t *pattern;
    int32_t patLen = 0;
    int32_t sub0Pos, sub1Pos;

    char16_t formatOpenParen         = 0x0028; // (
    char16_t formatReplaceOpenParen  = 0x005B; // [
    char16_t formatCloseParen        = 0x0029; // )
    char16_t formatReplaceCloseParen = 0x005D; // ]

    UBool haveLang = true; /* assume true, set false if we find we don't have
                              a lang component in the locale */
    UBool haveRest = true; /* assume true, set false if we find we don't have
                              any other component in the locale */
    UBool retry = false; /* set true if we need to retry, see below */

    int32_t langi = 0; /* index of the language substitution (0 or 1), virtually always 0 */

    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    {
        /* A private status is used here: failing to find the pattern data is
           not an error, it just selects the defaults below. */
        UErrorCode status = U_ZERO_ERROR;

        icu::LocalUResourceBundlePointer locbundle(
                ures_open(U_ICUDATA_LANG, displayLocale, &status));
        icu::LocalUResourceBundlePointer dspbundle(
                ures_getByKeyWithFallback(locbundle.getAlias(), _kLocaleDisplayPattern, nullptr, &status));

        separator=ures_getStringByKeyWithFallback(dspbundle.getAlias(), _kSeparator, &sepLen, &status);
        pattern=ures_getStringByKeyWithFallback(dspbundle.getAlias(), _kPattern, &patLen, &status);
    }

    /* If we couldn't find any data, then use the defaults */
    if(sepLen == 0) {
       separator = defaultSeparator;
    }
    /* #10244: Even though separator is now a pattern, it is awkward to handle it as such
     * here since we are trying to build the display string in place in the dest buffer,
     * and to handle it as a pattern would entail having separate storage for the
     * substrings that need to be combined (the first of which may be the result of
     * previous such combinations). So for now we continue to treat the portion between
     * {0} and {1} as a string to be appended when joining substrings, ignoring anything
     * that is before {0} or after {1} (no existing separator pattern has any such thing).
     * This is similar to how pattern is handled below.
     */
    {
        char16_t *p0=u_strstr(separator, sub0);
        char16_t *p1=u_strstr(separator, sub1);
        if (p0==nullptr || p1==nullptr || p1<p0) {
            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        /* From here on, separator/sepLen describe only the text between {0} and {1}. */
        separator = (const char16_t *)p0 + subLen;
        sepLen = static_cast<int32_t>(p1 - separator);
    }

    if(patLen==0 || (patLen==defaultPatLen && !u_strncmp(pattern, defaultPattern, patLen))) {
        pattern=defaultPattern;
        patLen=defaultPatLen;
        sub0Pos=defaultSub0Pos;
        sub1Pos=defaultSub1Pos;
        // use default formatOpenParen etc. set above
    } else { /* non-default pattern */
        char16_t *p0=u_strstr(pattern, sub0);
        char16_t *p1=u_strstr(pattern, sub1);
        if (p0==nullptr || p1==nullptr) {
            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        sub0Pos = static_cast<int32_t>(p0-pattern);
        sub1Pos = static_cast<int32_t>(p1-pattern);
        if (sub1Pos < sub0Pos) { /* a very odd pattern */
            /* Keep sub0Pos as the earlier position and remember that the
               language substitution is the second one. */
            int32_t t=sub0Pos; sub0Pos=sub1Pos; sub1Pos=t;
            langi=1;
        }
        if (u_strchr(pattern, 0xFF08) != nullptr) {
            formatOpenParen         = 0xFF08; // fullwidth (
            formatReplaceOpenParen  = 0xFF3B; // fullwidth [
            formatCloseParen        = 0xFF09; // fullwidth )
            formatReplaceCloseParen = 0xFF3D; // fullwidth ]
        }
    }

    /* We loop here because there is one case in which after the first pass we could need to
     * reextract the data.  If there's initial padding before the first element, we put in
     * the padding and then write that element.  If it turns out there's no second element,
     * we didn't need the padding.  If we do need the data (no preflight), and the first element
     * would have fit but for the padding, we need to reextract.  In this case (only) we
     * adjust the parameters so padding is not added, and repeat.
     */
    do {
        /* Start every pass from a clean state. Without clearing `retry` the
         * loop would never terminate once a retry was requested, and stale
         * haveLang/haveRest values from the first pass would make the retry
         * pass discard the component it is supposed to re-extract. On the
         * first pass these assignments repeat the initial values. */
        retry=false;
        haveLang=true;
        haveRest=true;

        char16_t* p=dest;
        int32_t patPos=0; /* position in the pattern, used for non-substitution portions */
        int32_t langLen=0; /* length of language substitution */
        int32_t langPos=0; /* position in output of language substitution */
        int32_t restLen=0; /* length of 'everything else' substitution */
        int32_t restPos=0; /* position in output of 'everything else' substitution */
        icu::LocalUEnumerationPointer kenum; /* keyword enumeration */

        /* prefix of pattern, extremely likely to be empty */
        if(sub0Pos) {
            if(destCapacity >= sub0Pos) {
                while (patPos < sub0Pos) {
                    *p++ = pattern[patPos++];
                }
            } else {
                patPos=sub0Pos;
            }
            length=sub0Pos;
        } else {
            length=0;
        }

        for(int32_t subi=0,resti=0;subi<2;) { /* iterate through patterns 0 and 1*/
            UBool subdone = false; /* set true when ready to move to next substitution */

            /* prep p and cap for calls to get display components, pin cap to 0 since
               they complain if cap is negative */
            int32_t cap=destCapacity-length;
            if (cap <= 0) {
                cap=0;
            } else {
                p=dest+length;
            }

            if (subi == langi) { /* {0}*/
                if(haveLang) {
                    langPos=length;
                    langLen=uloc_getDisplayLanguage(locale, displayLocale, p, cap, pErrorCode);
                    length+=langLen;
                    haveLang=langLen>0;
                }
                subdone=true;
            } else { /* {1} */
                if(!haveRest) {
                    subdone=true;
                } else {
                    int32_t len; /* length of component (plus other stuff) we just fetched */
                    /* resti walks script, country, variant, then one keyword per
                       iteration until the enumeration is exhausted. */
                    switch(resti++) {
                        case 0:
                            restPos=length;
                            len=uloc_getDisplayScriptInContext(locale, displayLocale, p, cap, pErrorCode);
                            break;
                        case 1:
                            len=uloc_getDisplayCountry(locale, displayLocale, p, cap, pErrorCode);
                            break;
                        case 2:
                            len=uloc_getDisplayVariant(locale, displayLocale, p, cap, pErrorCode);
                            break;
                        case 3:
                            kenum.adoptInstead(uloc_openKeywords(locale, pErrorCode));
                            U_FALLTHROUGH;
                        default: {
                            const char* kw=uenum_next(kenum.getAlias(), &len, pErrorCode);
                            if (kw == nullptr) {
                                len=0; /* mark that we didn't add a component */
                                subdone=true;
                            } else {
                                /* incorporating this behavior into the loop made it even more complex,
                                   so just special case it here */
                                len = uloc_getDisplayKeyword(kw, displayLocale, p, cap, pErrorCode);
                                if(len) {
                                    if(len < cap) {
                                        p[len]=0x3d; /* '=', assume we'll need it */
                                    }
                                    len+=1;

                                    /* adjust for call to get keyword */
                                    cap-=len;
                                    if(cap <= 0) {
                                        cap=0;
                                    } else {
                                        p+=len;
                                    }
                                }
                                /* reset for call below */
                                if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
                                    *pErrorCode=U_ZERO_ERROR;
                                }
                                int32_t vlen = uloc_getDisplayKeywordValue(locale, kw, displayLocale,
                                                                           p, cap, pErrorCode);
                                if(len) {
                                    if(vlen==0) {
                                        --len; /* remove unneeded '=' */
                                    }
                                    /* restore cap and p to what they were at start */
                                    cap=destCapacity-length;
                                    if(cap <= 0) {
                                        cap=0;
                                    } else {
                                        p=dest+length;
                                    }
                                }
                                len+=vlen; /* total we added for key + '=' + value */
                            }
                        } break;
                    } /* end switch */

                    if (len>0) {
                        /* we added a component, so add separator and write it if there's room. */
                        if(len+sepLen<=cap) {
                            const char16_t * plimit = p + len;
                            /* Parentheses inside a component would clash with the
                               pattern's own, so swap them for brackets. */
                            for (; p < plimit; p++) {
                                if (*p == formatOpenParen) {
                                    *p = formatReplaceOpenParen;
                                } else if (*p == formatCloseParen) {
                                    *p = formatReplaceCloseParen;
                                }
                            }
                            for(int32_t i=0;i<sepLen;++i) {
                                *p++=separator[i];
                            }
                        }
                        length+=len+sepLen;
                    } else if(subdone) {
                        /* remove separator if we added it */
                        if (length!=restPos) {
                            length-=sepLen;
                        }
                        restLen=length-restPos;
                        haveRest=restLen>0;
                    }
                }
            }

            /* Overflow is expected while measuring; only the final
               u_terminateUChars() call decides the reported status. */
            if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
                *pErrorCode=U_ZERO_ERROR;
            }

            if(subdone) {
                if(haveLang && haveRest) {
                    /* append internal portion of pattern, the first time,
                       or last portion of pattern the second time */
                    int32_t padLen;
                    patPos+=subLen;
                    padLen=(subi==0 ? sub1Pos : patLen)-patPos;
                    if(length+padLen <= destCapacity) {
                        p=dest+length;
                        for(int32_t i=0;i<padLen;++i) {
                            *p++=pattern[patPos++];
                        }
                    } else {
                        patPos+=padLen;
                    }
                    length+=padLen;
                } else if(subi==0) {
                    /* don't have first component, reset for second component */
                    sub0Pos=0;
                    length=0;
                } else if(length>0) {
                    /* true length is the length of just the component we got. */
                    length=haveLang?langLen:restLen;
                    if(dest && sub0Pos!=0) {
                        if (sub0Pos+length<=destCapacity) {
                            /* first component not at start of result,
                               but we have full component in buffer. */
                            u_memmove(dest, dest+(haveLang?langPos:restPos), length);
                        } else {
                            /* would have fit, but didn't because of pattern prefix. */
                            sub0Pos=0; /* stops initial padding (and a second retry,
                                          so we won't end up here again) */
                            retry=true;
                        }
                    }
                }

                ++subi; /* move on to next substitution */
            }
        }
    } while(retry);

    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
}
|
||||
|
||||
/**
 * Returns the display name of a locale keyword, looked up in the "Keys"
 * table for `displayLocale`. The keyword itself is copied out when no
 * resource string is found.
 *
 * @param keyword        Keyword to describe; also used as the fallback text.
 * @param displayLocale  Locale whose data supplies the name.
 * @param dest           Output buffer (may be nullptr only if destCapacity is 0).
 * @param destCapacity   Capacity of `dest` in char16_t units.
 * @param status         In/out ICU status. Null or an incoming failure
 *                       returns 0; bad buffer arguments set
 *                       U_ILLEGAL_ARGUMENT_ERROR.
 * @return Length reported by _getStringOrCopyKey(), or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayKeyword(const char* keyword,
                       const char* displayLocale,
                       char16_t* dest,
                       int32_t destCapacity,
                       UErrorCode* status){

    /* argument checking */
    if (status == nullptr || U_FAILURE(*status)) {
        return 0;
    }

    const bool badBuffer = destCapacity < 0 || (dest == nullptr && destCapacity > 0);
    if (badBuffer) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* No sub-table is passed (subTableKey=nullptr); the keyword is the
       item key inside the "Keys" table. */
    UErrorCode &errorCode = *status;
    return _getStringOrCopyKey(U_ICUDATA_LANG, displayLocale,
                               _kKeys, nullptr,
                               keyword,
                               keyword,
                               dest, destCapacity,
                               errorCode);
}
|
||||
|
||||
|
||||
#define UCURRENCY_DISPLAY_NAME_INDEX 1
|
||||
|
||||
/**
 * Returns the display name of the value that `keyword` has in `locale`.
 *
 * For every keyword except "currency" (compared case-insensitively) the
 * value is looked up in the "Types" table under the keyword's sub-table.
 * For "currency" the name is read from the "Currencies" table of the
 * currency data, taking the string at UCURRENCY_DISPLAY_NAME_INDEX; when
 * that is missing the raw keyword value is copied out instead.
 *
 * @param locale         Locale ID carrying the keyword.
 * @param keyword        Keyword whose value is to be described.
 * @param displayLocale  Locale whose data supplies the name.
 * @param dest           Output buffer (may be nullptr only if destCapacity is 0).
 * @param destCapacity   Capacity of `dest` in char16_t units.
 * @param status         In/out ICU status. Null or an incoming failure
 *                       returns 0; bad buffer arguments set
 *                       U_ILLEGAL_ARGUMENT_ERROR. In the currency path a
 *                       too-small buffer sets U_BUFFER_OVERFLOW_ERROR and
 *                       the needed length is returned.
 * @return Length of the display string, or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayKeywordValue(   const char* locale,
                               const char* keyword,
                               const char* displayLocale,
                               char16_t* dest,
                               int32_t destCapacity,
                               UErrorCode* status){

    /* argument checking */
    if (status == nullptr || U_FAILURE(*status)) {
        return 0;
    }

    if (destCapacity < 0 || (dest == nullptr && destCapacity > 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* get the keyword value */
    CharString keywordValue = ulocimp_getKeywordValue(locale, keyword, *status);

    /* Every keyword other than "currency" uses the generic table lookup. */
    if (uprv_stricmp(keyword, _kCurrency) != 0) {
        return _getStringOrCopyKey(U_ICUDATA_LANG, displayLocale,
                                   _kTypes, keyword,
                                   keywordValue.data(),
                                   keywordValue.data(),
                                   dest, destCapacity,
                                   *status);
    }

    /*
     * Currency: the display name lives in the currency data, so the
     * fallback to the raw value is done here by hand.
     */
    icu::LocalUResourceBundlePointer bundle(
            ures_open(U_ICUDATA_CURR, displayLocale, status));
    icu::LocalUResourceBundlePointer currencies(
            ures_getByKey(bundle.getAlias(), _kCurrencies, nullptr, status));
    icu::LocalUResourceBundlePointer currency(
            ures_getByKeyWithFallback(currencies.getAlias(), keywordValue.data(), nullptr, status));

    int32_t dispNameLen = 0;
    const char16_t *dispName =
            ures_getStringByIndex(currency.getAlias(), UCURRENCY_DISPLAY_NAME_INDEX, &dispNameLen, status);

    if (U_FAILURE(*status)) {
        if (*status != U_MISSING_RESOURCE_ERROR) {
            return 0;
        }
        /* we just want to write the value over if nothing is available */
        *status = U_USING_DEFAULT_WARNING;
    }

    if (dispName != nullptr) {
        /* a display name was found: copy it if it fits */
        if (dispNameLen > destCapacity) {
            *status = U_BUFFER_OVERFLOW_ERROR;
            return dispNameLen;
        }
        u_memcpy(dest, dispName, dispNameLen);
        return u_terminateUChars(dest, destCapacity, dispNameLen, status);
    }

    /* we have not found the display name for the value .. just copy over */
    const int32_t valueLen = keywordValue.length();
    if (valueLen > destCapacity) {
        *status = U_BUFFER_OVERFLOW_ERROR;
        return valueLen;
    }
    u_charsToUChars(keywordValue.data(), dest, valueLen);
    return u_terminateUChars(dest, destCapacity, valueLen, status);
}
|
||||
415
engine/thirdparty/icu4c/common/locdistance.cpp
vendored
Normal file
415
engine/thirdparty/icu4c/common/locdistance.cpp
vendored
Normal file
|
|
@ -0,0 +1,415 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// locdistance.cpp
|
||||
// created: 2019may08 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/localematcher.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "cstring.h"
|
||||
#include "locdistance.h"
|
||||
#include "loclikelysubtags.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "uinvchar.h"
|
||||
#include "umutex.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Flag bit carried by the last character of a subtag in the trie.
 * The builder and the lookup code must agree on it.
 */
constexpr int32_t END_OF_SUBTAG = 0x80;

/** Flag bit in a distance value; set by the builder. */
constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;

/** Flag bit in a distance value; set by trieNext(). */
constexpr int32_t DISTANCE_IS_FINAL = 0x100;

/** Mask covering both distance flag bits. */
constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_SKIP_SCRIPT | DISTANCE_IS_FINAL;

constexpr int32_t ABOVE_THRESHOLD = 100;

// Positions of the individual values in the array of distances.
enum {
    IX_DEF_LANG_DISTANCE = 0,
    IX_DEF_SCRIPT_DISTANCE = 1,
    IX_DEF_REGION_DISTANCE = 2,
    IX_MIN_REGION_DISTANCE = 3,
    IX_LIMIT = 4
};
|
||||
|
||||
LocaleDistance *gLocaleDistance = nullptr;
|
||||
UInitOnce gInitOnce {};
|
||||
|
||||
UBool U_CALLCONV cleanup() {
|
||||
delete gLocaleDistance;
|
||||
gLocaleDistance = nullptr;
|
||||
gInitOnce.reset();
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Creates the LocaleDistance singleton from the distance data owned by the
 * LikelySubtags singleton and registers cleanup() for it.
 *
 * @param errorCode  In/out ICU status. Set to U_MISSING_RESOURCE_ERROR when
 *                   a required data pointer is null, and to
 *                   U_MEMORY_ALLOCATION_ERROR when allocation fails.
 */
void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
    // This function is invoked only via umtx_initOnce().
    U_ASSERT(gLocaleDistance == nullptr);

    // Check the status before dereferencing: the pointer returned by
    // getSingleton() must not be bound to a reference when the call failed.
    const LikelySubtags *likelyPtr = LikelySubtags::getSingleton(errorCode);
    if (U_FAILURE(errorCode)) { return; }
    const LikelySubtags &likely = *likelyPtr;

    const LocaleDistanceData &data = likely.getDistanceData();
    if (data.distanceTrieBytes == nullptr ||
            data.regionToPartitions == nullptr || data.partitions == nullptr ||
            // ok if no paradigms
            data.distances == nullptr) {
        errorCode = U_MISSING_RESOURCE_ERROR;
        return;
    }

    gLocaleDistance = new LocaleDistance(data, likely);
    if (gLocaleDistance == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
}
|
||||
|
||||
/**
 * Returns the LocaleDistance singleton, running initLocaleDistance()
 * through umtx_initOnce() first.
 *
 * @param errorCode  In/out ICU status; an incoming failure returns nullptr
 *                   without attempting initialization.
 * @return The value of gLocaleDistance after initialization, or nullptr.
 */
const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
        return gLocaleDistance;
    }
    return nullptr;
}
|
||||
|
||||
/**
 * Captures the pointers and default distances from `data`, then derives
 * defaultDemotionPerDesiredLocale.
 *
 * @param data    Distance data; `data.distances` is indexed with the IX_*
 *                constants.
 * @param likely  LikelySubtags instance stored by reference.
 */
LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely) :
        likelySubtags(likely),
        trie(data.distanceTrieBytes),
        regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
        paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
        defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
        defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
        defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
        minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
    // The default demotion value is the region distance between two
    // unrelated Englishes (en-Latn-US vs. en-Latn-GB).
    // So, unless demotion is turned off, a mere region difference for one
    // desired locale is as good as a perfect match for the next desired locale.
    // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
    const LSR desiredUS("en", "Latn", "US", LSR::EXPLICIT_LSR);
    const LSR supportedGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
    const LSR *supportedList[] = { &supportedGB };

    const int32_t bestMatch = getBestIndexAndDistance(
            desiredUS, supportedList, 1,
            shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
    defaultDemotionPerDesiredLocale = getDistanceFloor(bestMatch);
}
|
||||
|
||||
int32_t LocaleDistance::getBestIndexAndDistance(
|
||||
const LSR &desired,
|
||||
const LSR **supportedLSRs, int32_t supportedLSRsLength,
|
||||
int32_t shiftedThreshold,
|
||||
ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
|
||||
BytesTrie iter(trie);
|
||||
// Look up the desired language only once for all supported LSRs.
|
||||
// Its "distance" is either a match point value of 0, or a non-match negative value.
|
||||
// Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
|
||||
int32_t desLangDistance = trieNext(iter, desired.language, false);
|
||||
uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
|
||||
// Index of the supported LSR with the lowest distance.
|
||||
int32_t bestIndex = -1;
|
||||
// Cached lookup info from LikelySubtags.compareLikely().
|
||||
int32_t bestLikelyInfo = -1;
|
||||
for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
|
||||
const LSR &supported = *supportedLSRs[slIndex];
|
||||
bool star = false;
|
||||
int32_t distance = desLangDistance;
|
||||
if (distance >= 0) {
|
||||
U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
|
||||
if (slIndex != 0) {
|
||||
iter.resetToState64(desLangState);
|
||||
}
|
||||
distance = trieNext(iter, supported.language, true);
|
||||
}
|
||||
// Note: The data builder verifies that there are no rules with "any" (*) language and
|
||||
// real (non *) script or region subtags.
|
||||
// This means that if the lookup for either language fails we can use
|
||||
// the default distances without further lookups.
|
||||
int32_t flags;
|
||||
if (distance >= 0) {
|
||||
flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
|
||||
distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
|
||||
} else { // <*, *>
|
||||
if (uprv_strcmp(desired.language, supported.language) == 0) {
|
||||
distance = 0;
|
||||
} else {
|
||||
distance = defaultLanguageDistance;
|
||||
}
|
||||
flags = 0;
|
||||
star = true;
|
||||
}
|
||||
U_ASSERT(0 <= distance && distance <= 100);
|
||||
// Round up the shifted threshold (if fraction bits are not 0)
|
||||
// for comparison with un-shifted distances until we need fraction bits.
|
||||
// (If we simply shifted non-zero fraction bits away, then we might ignore a language
|
||||
// when it's really still a micro distance below the threshold.)
|
||||
int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
|
||||
// We implement "favor subtag" by reducing the language subtag distance
|
||||
// (unscientifically reducing it to a quarter of the normal value),
|
||||
// so that the script distance is relatively more important.
|
||||
// For example, given a default language distance of 80, we reduce it to 20,
|
||||
// which is below the default threshold of 50, which is the default script distance.
|
||||
if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
|
||||
distance >>= 2;
|
||||
}
|
||||
// Let distance == roundedThreshold pass until the tie-breaker logic
|
||||
// at the end of the loop.
|
||||
if (distance > roundedThreshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int32_t scriptDistance;
|
||||
if (star || flags != 0) {
|
||||
if (uprv_strcmp(desired.script, supported.script) == 0) {
|
||||
scriptDistance = 0;
|
||||
} else {
|
||||
scriptDistance = defaultScriptDistance;
|
||||
}
|
||||
} else {
|
||||
scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
|
||||
desired.script, supported.script);
|
||||
flags = scriptDistance & DISTANCE_IS_FINAL;
|
||||
scriptDistance &= ~DISTANCE_IS_FINAL;
|
||||
}
|
||||
distance += scriptDistance;
|
||||
if (distance > roundedThreshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (uprv_strcmp(desired.region, supported.region) == 0) {
|
||||
// regionDistance = 0
|
||||
} else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
|
||||
distance += defaultRegionDistance;
|
||||
} else {
|
||||
int32_t remainingThreshold = roundedThreshold - distance;
|
||||
if (minRegionDistance > remainingThreshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// From here on we know the regions are not equal.
|
||||
// Map each region to zero or more partitions. (zero = one non-matching string)
|
||||
// (Each array of single-character partition strings is encoded as one string.)
|
||||
// If either side has more than one, then we find the maximum distance.
|
||||
// This could be optimized by adding some more structure, but probably not worth it.
|
||||
distance += getRegionPartitionsDistance(
|
||||
iter, iter.getState64(),
|
||||
partitionsForRegion(desired),
|
||||
partitionsForRegion(supported),
|
||||
remainingThreshold);
|
||||
}
|
||||
int32_t shiftedDistance = shiftDistance(distance);
|
||||
if (shiftedDistance == 0) {
|
||||
// Distinguish between equivalent but originally unequal locales via an
|
||||
// additional micro distance.
|
||||
shiftedDistance |= (desired.flags ^ supported.flags);
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
|
||||
// Is there also a match when we swap desired/supported?
|
||||
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
|
||||
if (shiftedDistance == 0) {
|
||||
return slIndex << INDEX_SHIFT;
|
||||
}
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
bestLikelyInfo = -1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
|
||||
// Is there also a match when we swap desired/supported?
|
||||
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
bestLikelyInfo = -1;
|
||||
}
|
||||
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
|
||||
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
|
||||
// Is there also a match when we swap desired/supported?
|
||||
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
|
||||
bestLikelyInfo = likelySubtags.compareLikely(
|
||||
supported, *supportedLSRs[bestIndex], bestLikelyInfo);
|
||||
if ((bestLikelyInfo & 1) != 0) {
|
||||
// This supported locale matches as well as the previous best match,
|
||||
// and neither matches perfectly,
|
||||
// but this one is "more likely" (has more-default subtags).
|
||||
bestIndex = slIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return bestIndex >= 0 ?
|
||||
(bestIndex << INDEX_SHIFT) | shiftedThreshold :
|
||||
INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
|
||||
}
|
||||
|
||||
/**
 * Looks up the distance for a (desired, supported) script pair.
 *
 * First tries the explicit <desired, supported> entry reachable from the
 * current trie position. If there is none, rewinds the trie to startState and
 * uses the <*, *> entry: 0 for identical scripts, otherwise the stored value.
 *
 * @param iter       trie iterator; its position is changed by this call
 * @param startState trie state to rewind to for the <*, *> fallback
 * @param desired    desired script subtag
 * @param supported  supported script subtag
 * @return the distance, with DISTANCE_IS_FINAL set when the trie entry that
 *         was used is a final value
 */
int32_t LocaleDistance::getDesSuppScriptDistance(
        BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
    // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
    int32_t pairDistance = -1;
    if (trieNext(iter, desired, false) >= 0) {
        pairDistance = trieNext(iter, supported, true);
    }
    if (pairDistance >= 0) {
        return pairDistance;
    }

    // No explicit rule for this pair: fall back to <*, *>.
    const UStringTrieResult starResult = iter.resetToState64(startState).next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_VALUE(starResult));
    int32_t fallbackDistance = 0;  // same script
    if (uprv_strcmp(desired, supported) != 0) {
        fallbackDistance = iter.getValue();
        U_ASSERT(fallbackDistance >= 0);
    }
    if (starResult == USTRINGTRIE_FINAL_VALUE) {
        fallbackDistance |= DISTANCE_IS_FINAL;
    }
    return fallbackDistance;
}
|
||||
|
||||
/**
 * Computes the region distance between two sets of region partitions.
 *
 * Each partitions argument is a NUL-terminated string in which every character
 * is one partition. The result is the maximum distance over all
 * (desired, supported) partition pairs, except that the first pair distance
 * found to exceed threshold is returned immediately.
 *
 * @param iter                trie iterator; its position is changed by this call
 * @param startState          trie state from which each desired partition is looked up
 * @param desiredPartitions   non-empty partition string for the desired region
 * @param supportedPartitions non-empty partition string for the supported region
 * @param threshold           distance above which the search stops early
 * @return the maximum pair distance, or the first one that exceeds threshold
 */
int32_t LocaleDistance::getRegionPartitionsDistance(
        BytesTrie &iter, uint64_t startState,
        const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
    const char *des = desiredPartitions;
    const char *const suppFirst = supportedPartitions;  // restart point for the inner loop
    U_ASSERT(*des != 0 && *suppFirst != 0);
    // The partition strings are NUL-terminated without explicit length,
    // so peek at the second character to see whether there is more than one partition.
    const bool multipleDesired = des[1] != 0;
    const bool multipleSupported = suppFirst[1] != 0;

    if (!multipleDesired && !multipleSupported) {
        // Fastpath for single desired/supported partitions.
        if (USTRINGTRIE_HAS_NEXT(iter.next(uprv_invCharToAscii(*des) | END_OF_SUBTAG))) {
            const UStringTrieResult suppResult =
                iter.next(uprv_invCharToAscii(*suppFirst) | END_OF_SUBTAG);
            if (USTRINGTRIE_HAS_VALUE(suppResult)) {
                return iter.getValue();
            }
        }
        return getFallbackRegionDistance(iter, startState);
    }

    int32_t maxDistance = 0;
    // Fall back to * only once, not for each pair of partition strings.
    bool usedFallback = false;
    for (;;) {
        // Look up each desired-partition string only once,
        // not for each (desired, supported) pair.
        if (USTRINGTRIE_HAS_NEXT(iter.next(uprv_invCharToAscii(*des) | END_OF_SUBTAG))) {
            // The saved state is only needed if the inner loop runs more than once.
            const uint64_t afterDesired = multipleSupported ? iter.getState64() : 0;
            const char *supp = suppFirst;
            for (;;) {
                const UStringTrieResult suppResult =
                    iter.next(uprv_invCharToAscii(*supp) | END_OF_SUBTAG);
                int32_t pairDistance;
                if (USTRINGTRIE_HAS_VALUE(suppResult)) {
                    pairDistance = iter.getValue();
                } else if (usedFallback) {
                    pairDistance = 0;
                } else {
                    pairDistance = getFallbackRegionDistance(iter, startState);
                    usedFallback = true;
                }
                if (pairDistance > threshold) {
                    return pairDistance;
                }
                if (maxDistance < pairDistance) {
                    maxDistance = pairDistance;
                }
                if (*++supp == 0) {
                    break;
                }
                iter.resetToState64(afterDesired);
            }
        } else if (!usedFallback) {
            const int32_t fallbackDistance = getFallbackRegionDistance(iter, startState);
            if (fallbackDistance > threshold) {
                return fallbackDistance;
            }
            if (maxDistance < fallbackDistance) {
                maxDistance = fallbackDistance;
            }
            usedFallback = true;
        }
        if (*++des == 0) {
            break;
        }
        // Next desired partition: start over from the common state.
        iter.resetToState64(startState);
    }
    return maxDistance;
}
|
||||
|
||||
/**
 * Returns the region distance stored under the <*, *> entry.
 *
 * @param iter       trie iterator; rewound to startState and then advanced by '*'
 * @param startState trie state from which the '*' entry is looked up
 * @return the value of the <*, *> entry
 */
int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
    const UStringTrieResult starResult = iter.resetToState64(startState).next(u'*');  // <*, *>
    (void)starResult;  // only inspected by the assertion below
    U_ASSERT(USTRINGTRIE_HAS_VALUE(starResult));
    const int32_t fallbackDistance = iter.getValue();
    U_ASSERT(fallbackDistance >= 0);
    return fallbackDistance;
}
|
||||
|
||||
/**
 * Advances the trie over one subtag.
 *
 * Every character is converted with uprv_invCharToAscii(); the last character
 * of the subtag is additionally combined with END_OF_SUBTAG.
 *
 * @param iter      trie iterator to advance
 * @param s         NUL-terminated subtag
 * @param wantValue true to fetch the value stored at the end of the subtag,
 *                  false to only check that the trie continues after it
 * @return -1 if s is empty or does not match;
 *         otherwise, with wantValue: the trie value, with DISTANCE_IS_FINAL set
 *         for a final value; without wantValue: 0
 */
int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
    if (*s == 0) {
        return -1;  // no empty subtags in the distance data
    }
    // All characters except the last one must lead further into the trie.
    // EBCDIC: If *s is not an invariant character,
    // then the converted unit is 0 and will simply not match anything, which is harmless.
    for (; s[1] != 0; ++s) {
        uint8_t unit = *s;
        unit = uprv_invCharToAscii(unit);
        if (!USTRINGTRIE_HAS_NEXT(iter.next(unit))) {
            return -1;
        }
    }

    // last character of this subtag
    uint8_t lastUnit = *s;
    lastUnit = uprv_invCharToAscii(lastUnit);
    const UStringTrieResult result = iter.next(lastUnit | END_OF_SUBTAG);
    if (!wantValue) {
        return USTRINGTRIE_HAS_NEXT(result) ? 0 : -1;
    }
    if (!USTRINGTRIE_HAS_VALUE(result)) {
        return -1;
    }
    int32_t value = iter.getValue();
    if (result == USTRINGTRIE_FINAL_VALUE) {
        value |= DISTANCE_IS_FINAL;
    }
    return value;
}
|
||||
|
||||
bool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
|
||||
// Linear search for a very short list (length 6 as of 2019),
|
||||
// because we look for equivalence not equality, and
|
||||
// because it's easy.
|
||||
// If there are many paradigm LSRs we should use a hash set
|
||||
// with custom comparator and hasher.
|
||||
U_ASSERT(paradigmLSRsLength <= 15);
|
||||
for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
|
||||
if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
151
engine/thirdparty/icu4c/common/locdistance.h
vendored
Normal file
151
engine/thirdparty/icu4c/common/locdistance.h
vendored
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// locdistance.h
|
||||
// created: 2019may08 Markus W. Scherer
|
||||
|
||||
#ifndef __LOCDISTANCE_H__
|
||||
#define __LOCDISTANCE_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/localematcher.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "lsr.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct LocaleDistanceData;
|
||||
|
||||
/**
|
||||
* Offline-built data for LocaleMatcher.
|
||||
* Mostly but not only the data for mapping locales to their maximized forms.
|
||||
*/
|
||||
class LocaleDistance final : public UMemory {
|
||||
public:
|
||||
static const LocaleDistance *getSingleton(UErrorCode &errorCode);
|
||||
|
||||
static int32_t shiftDistance(int32_t distance) {
|
||||
return distance << DISTANCE_SHIFT;
|
||||
}
|
||||
|
||||
static int32_t getShiftedDistance(int32_t indexAndDistance) {
|
||||
return indexAndDistance & DISTANCE_MASK;
|
||||
}
|
||||
|
||||
static double getDistanceDouble(int32_t indexAndDistance) {
|
||||
double shiftedDistance = getShiftedDistance(indexAndDistance);
|
||||
return shiftedDistance / (1 << DISTANCE_SHIFT);
|
||||
}
|
||||
|
||||
static int32_t getDistanceFloor(int32_t indexAndDistance) {
|
||||
return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
|
||||
}
|
||||
|
||||
static int32_t getIndex(int32_t indexAndDistance) {
|
||||
// assert indexAndDistance >= 0;
|
||||
return indexAndDistance >> INDEX_SHIFT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the supported LSR with the smallest distance from the desired one.
|
||||
* Equivalent LSR subtags must be normalized into a canonical form.
|
||||
*
|
||||
* <p>Returns the index of the lowest-distance supported LSR in the high bits
|
||||
* (negative if none has a distance below the threshold),
|
||||
* and its distance (0..ABOVE_THRESHOLD) in the low bits.
|
||||
*/
|
||||
int32_t getBestIndexAndDistance(const LSR &desired,
|
||||
const LSR **supportedLSRs, int32_t supportedLSRsLength,
|
||||
int32_t shiftedThreshold,
|
||||
ULocMatchFavorSubtag favorSubtag,
|
||||
ULocMatchDirection direction) const;
|
||||
|
||||
bool isParadigmLSR(const LSR &lsr) const;
|
||||
|
||||
int32_t getDefaultScriptDistance() const {
|
||||
return defaultScriptDistance;
|
||||
}
|
||||
|
||||
int32_t getDefaultDemotionPerDesiredLocale() const {
|
||||
return defaultDemotionPerDesiredLocale;
|
||||
}
|
||||
|
||||
private:
|
||||
// The distance is shifted left to gain some fraction bits.
|
||||
static constexpr int32_t DISTANCE_SHIFT = 3;
|
||||
static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
|
||||
// 7 bits for 0..100
|
||||
static constexpr int32_t DISTANCE_INT_SHIFT = 7;
|
||||
static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
|
||||
static constexpr int32_t DISTANCE_MASK = 0x3ff;
|
||||
// tic constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit
|
||||
static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;
|
||||
|
||||
LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely);
|
||||
LocaleDistance(const LocaleDistance &other) = delete;
|
||||
LocaleDistance &operator=(const LocaleDistance &other) = delete;
|
||||
|
||||
static void initLocaleDistance(UErrorCode &errorCode);
|
||||
|
||||
bool isMatch(const LSR &desired, const LSR &supported,
|
||||
int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
|
||||
const LSR *pSupp = &supported;
|
||||
return getBestIndexAndDistance(
|
||||
desired, &pSupp, 1,
|
||||
shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0;
|
||||
}
|
||||
|
||||
static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState,
|
||||
const char *desired, const char *supported);
|
||||
|
||||
static int32_t getRegionPartitionsDistance(
|
||||
BytesTrie &iter, uint64_t startState,
|
||||
const char *desiredPartitions, const char *supportedPartitions,
|
||||
int32_t threshold);
|
||||
|
||||
static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState);
|
||||
|
||||
static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue);
|
||||
|
||||
const char *partitionsForRegion(const LSR &lsr) const {
|
||||
// ill-formed region -> one non-matching string
|
||||
int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex];
|
||||
return partitionArrays[pIndex];
|
||||
}
|
||||
|
||||
int32_t getDefaultRegionDistance() const {
|
||||
return defaultRegionDistance;
|
||||
}
|
||||
|
||||
const LikelySubtags &likelySubtags;
|
||||
|
||||
// The trie maps each dlang+slang+dscript+sscript+dregion+sregion
|
||||
// (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
|
||||
// There is also a trie value for each subsequence of whole subtags.
|
||||
// One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"".
|
||||
BytesTrie trie;
|
||||
|
||||
/**
|
||||
* Maps each region to zero or more single-character partitions.
|
||||
*/
|
||||
const uint8_t *regionToPartitionsIndex;
|
||||
const char **partitionArrays;
|
||||
|
||||
/**
|
||||
* Used to get the paradigm region for a cluster, if there is one.
|
||||
*/
|
||||
const LSR *paradigmLSRs;
|
||||
int32_t paradigmLSRsLength;
|
||||
|
||||
int32_t defaultLanguageDistance;
|
||||
int32_t defaultScriptDistance;
|
||||
int32_t defaultRegionDistance;
|
||||
int32_t minRegionDistance;
|
||||
int32_t defaultDemotionPerDesiredLocale;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __LOCDISTANCE_H__
|
||||
1108
engine/thirdparty/icu4c/common/locdspnm.cpp
vendored
Normal file
1108
engine/thirdparty/icu4c/common/locdspnm.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
2742
engine/thirdparty/icu4c/common/locid.cpp
vendored
Normal file
2742
engine/thirdparty/icu4c/common/locid.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
437
engine/thirdparty/icu4c/common/loclikely.cpp
vendored
Normal file
437
engine/thirdparty/icu4c/common/loclikely.cpp
vendored
Normal file
|
|
@ -0,0 +1,437 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: loclikely.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010feb25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Code for likely and minimized locale subtags, separated out from other .cpp files
|
||||
* that then do not depend on resource bundle code and likely-subtags data.
|
||||
*/
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "bytesinkutil.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "loclikelysubtags.h"
|
||||
#include "ulocimp.h"
|
||||
|
||||
namespace {
|
||||
|
||||
/**
|
||||
* Create a tag string from the supplied parameters. The lang, script and region
|
||||
* parameters may be nullptr pointers. If they are, their corresponding length parameters
|
||||
* must be less than or equal to 0.
|
||||
*
|
||||
* If an illegal argument is provided, the function returns the error
|
||||
* U_ILLEGAL_ARGUMENT_ERROR.
|
||||
*
|
||||
* @param lang The language tag to use.
|
||||
* @param langLength The length of the language tag.
|
||||
* @param script The script tag to use.
|
||||
* @param scriptLength The length of the script tag.
|
||||
* @param region The region tag to use.
|
||||
* @param regionLength The length of the region tag.
|
||||
* @param variant The region tag to use.
|
||||
* @param variantLength The length of the region tag.
|
||||
* @param trailing Any trailing data to append to the new tag.
|
||||
* @param trailingLength The length of the trailing data.
|
||||
* @param sink The output sink receiving the tag string.
|
||||
* @param err A pointer to a UErrorCode for error reporting.
|
||||
**/
|
||||
void U_CALLCONV
|
||||
createTagStringWithAlternates(
|
||||
const char* lang,
|
||||
int32_t langLength,
|
||||
const char* script,
|
||||
int32_t scriptLength,
|
||||
const char* region,
|
||||
int32_t regionLength,
|
||||
const char* variant,
|
||||
int32_t variantLength,
|
||||
const char* trailing,
|
||||
int32_t trailingLength,
|
||||
icu::ByteSink& sink,
|
||||
UErrorCode& err) {
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (langLength >= ULOC_LANG_CAPACITY ||
|
||||
scriptLength >= ULOC_SCRIPT_CAPACITY ||
|
||||
regionLength >= ULOC_COUNTRY_CAPACITY) {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (langLength > 0) {
|
||||
sink.Append(lang, langLength);
|
||||
}
|
||||
|
||||
if (scriptLength > 0) {
|
||||
sink.Append("_", 1);
|
||||
sink.Append(script, scriptLength);
|
||||
}
|
||||
|
||||
if (regionLength > 0) {
|
||||
sink.Append("_", 1);
|
||||
sink.Append(region, regionLength);
|
||||
}
|
||||
|
||||
if (variantLength > 0) {
|
||||
if (regionLength == 0) {
|
||||
/* extra separator is required */
|
||||
sink.Append("_", 1);
|
||||
}
|
||||
sink.Append("_", 1);
|
||||
sink.Append(variant, variantLength);
|
||||
}
|
||||
|
||||
if (trailingLength > 0) {
|
||||
/*
|
||||
* Copy the trailing data into the supplied buffer.
|
||||
*/
|
||||
sink.Append(trailing, trailingLength);
|
||||
}
|
||||
}
|
||||
|
||||
bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
|
||||
int32_t count = 0;
|
||||
for (int32_t i = 0; i < variantLength; i++) {
|
||||
if (_isIDSeparator(variant[i])) {
|
||||
count = 0;
|
||||
} else if (count == 8) {
|
||||
return false;
|
||||
} else {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
_uloc_addLikelySubtags(const char* localeID,
|
||||
icu::ByteSink& sink,
|
||||
UErrorCode& err) {
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (localeID == nullptr) {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
icu::CharString lang;
|
||||
icu::CharString script;
|
||||
icu::CharString region;
|
||||
icu::CharString variant;
|
||||
const char* trailing = nullptr;
|
||||
ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err);
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (lang.length() == 4) {
|
||||
if (script.isEmpty()) {
|
||||
script = std::move(lang);
|
||||
lang.clear();
|
||||
} else {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
} else if (lang.length() > 8) {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t trailingLength = (int32_t)uprv_strlen(trailing);
|
||||
|
||||
const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
// We need to keep l on the stack because lsr may point into internal
|
||||
// memory of l.
|
||||
icu::Locale l = icu::Locale::createFromName(localeID);
|
||||
if (l.isBogus()) {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
const char* language = lsr.language;
|
||||
if (uprv_strcmp(language, "und") == 0) {
|
||||
language = "";
|
||||
}
|
||||
createTagStringWithAlternates(
|
||||
language,
|
||||
(int32_t)uprv_strlen(language),
|
||||
lsr.script,
|
||||
(int32_t)uprv_strlen(lsr.script),
|
||||
lsr.region,
|
||||
(int32_t)uprv_strlen(lsr.region),
|
||||
variant.data(),
|
||||
variant.length(),
|
||||
trailing,
|
||||
trailingLength,
|
||||
sink,
|
||||
err);
|
||||
}
|
||||
|
||||
void
|
||||
_uloc_minimizeSubtags(const char* localeID,
|
||||
icu::ByteSink& sink,
|
||||
bool favorScript,
|
||||
UErrorCode& err) {
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (localeID == nullptr) {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
icu::CharString lang;
|
||||
icu::CharString script;
|
||||
icu::CharString region;
|
||||
icu::CharString variant;
|
||||
const char* trailing = nullptr;
|
||||
ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err);
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
|
||||
err = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t trailingLength = (int32_t)uprv_strlen(trailing);
|
||||
|
||||
const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
icu::LSR lsr = likelySubtags->minimizeSubtags(
|
||||
lang.toStringPiece(),
|
||||
script.toStringPiece(),
|
||||
region.toStringPiece(),
|
||||
favorScript,
|
||||
err);
|
||||
if (U_FAILURE(err)) {
|
||||
return;
|
||||
}
|
||||
const char* language = lsr.language;
|
||||
if (uprv_strcmp(language, "und") == 0) {
|
||||
language = "";
|
||||
}
|
||||
createTagStringWithAlternates(
|
||||
language,
|
||||
(int32_t)uprv_strlen(language),
|
||||
lsr.script,
|
||||
(int32_t)uprv_strlen(lsr.script),
|
||||
lsr.region,
|
||||
(int32_t)uprv_strlen(lsr.region),
|
||||
variant.data(),
|
||||
variant.length(),
|
||||
trailing,
|
||||
trailingLength,
|
||||
sink,
|
||||
err);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * C API entry point: writes the maximized form of localeID into a
 * caller-supplied char buffer via ByteSinkUtil::viaByteSinkToTerminatedChars().
 *
 * @param localeID                  the locale ID to maximize
 * @param maximizedLocaleID         destination buffer
 * @param maximizedLocaleIDCapacity capacity of the destination buffer
 * @param status                    in/out error code; dereferenced unconditionally
 * @return the value returned by viaByteSinkToTerminatedChars()
 */
U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char* localeID,
                      char* maximizedLocaleID,
                      int32_t maximizedLocaleIDCapacity,
                      UErrorCode* status) {
    auto maximizeInto = [localeID](icu::ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_addLikelySubtags(localeID, sink, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        maximizedLocaleID, maximizedLocaleIDCapacity, maximizeInto, *status);
}
|
||||
|
||||
/**
 * Returns the maximized form of localeID as a CharString, by running the
 * ByteSink overload through ByteSinkUtil::viaByteSinkToCharString().
 *
 * @param localeID the locale ID to maximize
 * @param status   in/out error code
 */
U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char* localeID,
                         UErrorCode& status) {
    auto maximizeInto = [localeID](icu::ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_addLikelySubtags(localeID, sink, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(maximizeInto, status);
}
|
||||
|
||||
/**
 * Canonicalizes localeID and writes its maximized form to sink.
 *
 * @param localeID the locale ID to maximize
 * @param sink     receives the maximized locale ID
 * @param status   in/out error code; nothing is done if it already indicates failure
 */
U_EXPORT void
ulocimp_addLikelySubtags(const char* localeID,
                         icu::ByteSink& sink,
                         UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    // The canonical form must outlive the call below, which reads it through data().
    icu::CharString canonicalID = ulocimp_canonicalize(localeID, status);
    _uloc_addLikelySubtags(canonicalID.data(), sink, status);
}
|
||||
|
||||
/**
 * C API entry point: writes the minimized form of localeID into a
 * caller-supplied char buffer via ByteSinkUtil::viaByteSinkToTerminatedChars().
 * Minimization is requested with favorScript = false.
 *
 * @param localeID                  the locale ID to minimize
 * @param minimizedLocaleID         destination buffer
 * @param minimizedLocaleIDCapacity capacity of the destination buffer
 * @param status                    in/out error code; dereferenced unconditionally
 * @return the value returned by viaByteSinkToTerminatedChars()
 */
U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char* localeID,
                     char* minimizedLocaleID,
                     int32_t minimizedLocaleIDCapacity,
                     UErrorCode* status) {
    auto minimizeInto = [localeID](icu::ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_minimizeSubtags(localeID, sink, false, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        minimizedLocaleID, minimizedLocaleIDCapacity, minimizeInto, *status);
}
|
||||
|
||||
/**
 * Returns the minimized form of localeID as a CharString, by running the
 * ByteSink overload through ByteSinkUtil::viaByteSinkToCharString().
 *
 * @param localeID    the locale ID to minimize
 * @param favorScript passed through to the ByteSink overload
 * @param status      in/out error code
 */
U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char* localeID,
                        bool favorScript,
                        UErrorCode& status) {
    auto minimizeInto = [localeID, favorScript](icu::ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_minimizeSubtags(localeID, sink, favorScript, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(minimizeInto, status);
}
|
||||
|
||||
/**
 * Canonicalizes localeID and writes its minimized form to sink.
 *
 * @param localeID    the locale ID to minimize
 * @param sink        receives the minimized locale ID
 * @param favorScript passed through to _uloc_minimizeSubtags()
 * @param status      in/out error code; nothing is done if it already indicates failure
 */
U_EXPORT void
ulocimp_minimizeSubtags(const char* localeID,
                        icu::ByteSink& sink,
                        bool favorScript,
                        UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    // The canonical form must outlive the call below, which reads it through data().
    icu::CharString canonicalID = ulocimp_canonicalize(localeID, status);
    _uloc_minimizeSubtags(canonicalID.data(), sink, favorScript, status);
}
|
||||
|
||||
// Pairs of (language subtag, + or -) for finding out fast if common languages
|
||||
// are LTR (minus) or RTL (plus).
|
||||
static const char LANG_DIR_STRING[] =
|
||||
"root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
|
||||
|
||||
// Implemented here because this calls ulocimp_addLikelySubtags().
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uloc_isRightToLeft(const char *locale) {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
icu::CharString lang;
|
||||
icu::CharString script;
|
||||
ulocimp_getSubtags(locale, &lang, &script, nullptr, nullptr, nullptr, errorCode);
|
||||
if (U_FAILURE(errorCode) || script.isEmpty()) {
|
||||
// Fastpath: We know the likely scripts and their writing direction
|
||||
// for some common languages.
|
||||
if (!lang.isEmpty()) {
|
||||
const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
|
||||
if (langPtr != nullptr) {
|
||||
switch (langPtr[lang.length()]) {
|
||||
case '-': return false;
|
||||
case '+': return true;
|
||||
default: break; // partial match of a longer code
|
||||
}
|
||||
}
|
||||
}
|
||||
// Otherwise, find the likely script.
|
||||
errorCode = U_ZERO_ERROR;
|
||||
icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return false;
|
||||
}
|
||||
ulocimp_getSubtags(likely.data(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
|
||||
if (U_FAILURE(errorCode) || script.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
|
||||
return uscript_isRightToLeft(scriptCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Reports whether this locale is right-to-left, by passing its base name
 * to uloc_isRightToLeft().
 */
UBool
Locale::isRightToLeft() const {
    const char *baseName = getBaseName();
    return uloc_isRightToLeft(baseName);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
namespace {
|
||||
/**
 * Derives a region code from the value of a locale keyword.
 *
 * Only keyword values of length 3..7 are used. If the value starts with an
 * ASCII letter, the result is its first two characters uppercased; otherwise
 * the first three characters are copied as-is.
 *
 * @param localeID the locale ID whose keyword is read
 * @param key      the keyword name
 * @param status   in/out error code
 * @return the region code, or an empty string if none could be derived
 */
icu::CharString
GetRegionFromKey(const char* localeID, const char* key, UErrorCode& status) {
    icu::CharString region;

    // First check for keyword value
    icu::CharString keywordValue = ulocimp_getKeywordValue(localeID, key, status);
    const int32_t valueLength = keywordValue.length();
    if (U_FAILURE(status) || valueLength < 3 || valueLength > 7) {
        return region;
    }

    // chop off the subdivision code (which will generally be "zzzz" anyway)
    const char* const chars = keywordValue.data();
    if (uprv_isASCIILetter(chars[0])) {
        region.append(uprv_toupper(chars[0]), status);
        region.append(uprv_toupper(chars[1]), status);
    } else {
        // assume three-digit region code
        region.append(chars, 3, status);
    }
    return region;
}
|
||||
} // namespace
|
||||
|
||||
/**
 * Determines the region to use for supplemental-data lookups.
 *
 * Sources are tried in this order, stopping at the first non-empty result
 * or at the first failure recorded in status:
 *   1. the "rg" keyword value,
 *   2. the region subtag of localeID,
 *   3. (only if inferRegion) the "sd" keyword value,
 *   4. (only if inferRegion) the region of the maximized locale.
 * A failure while maximizing in step 4 is not reported through status.
 *
 * @param localeID    the locale ID to inspect
 * @param inferRegion whether steps 3 and 4 may be used
 * @param status      in/out error code
 * @return the region, possibly empty
 */
U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
                                     UErrorCode& status) {
    if (U_FAILURE(status)) {
        return {};
    }

    icu::CharString region = GetRegionFromKey(localeID, "rg", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }

    // No valid rg keyword value, try for unicode_region_subtag
    region = ulocimp_getRegion(localeID, status);
    if (U_FAILURE(status) || !region.isEmpty() || !inferRegion) {
        return region;
    }

    // Second check for sd keyword value
    region = GetRegionFromKey(localeID, "sd", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }

    // no unicode_region_subtag but inferRegion true, try likely subtags
    UErrorCode likelyStatus = U_ZERO_ERROR;
    icu::CharString maximized = ulocimp_addLikelySubtags(localeID, likelyStatus);
    if (U_SUCCESS(likelyStatus)) {
        region = ulocimp_getRegion(maximized.data(), status);
    }
    return region;
}
|
||||
976
engine/thirdparty/icu4c/common/loclikelysubtags.cpp
vendored
Normal file
976
engine/thirdparty/icu4c/common/loclikelysubtags.cpp
vendored
Normal file
|
|
@ -0,0 +1,976 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// loclikelysubtags.cpp
|
||||
// created: 2019may08 Markus W. Scherer
|
||||
|
||||
#include <utility>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "charstr.h"
|
||||
#include "cstring.h"
|
||||
#include "loclikelysubtags.h"
|
||||
#include "lsr.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "uhash.h"
|
||||
#include "uinvchar.h"
|
||||
#include "umutex.h"
|
||||
#include "uniquecharstr.h"
|
||||
#include "uresdata.h"
|
||||
#include "uresimp.h"
|
||||
#include "uvector.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT
|
||||
constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI
|
||||
constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Move constructor: takes over all pointers and lengths from data.
 * partitions and paradigms are released by the destructor, so the source's
 * copies are cleared to keep them from being released twice.
 */
LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) :
        distanceTrieBytes(data.distanceTrieBytes),
        regionToPartitions(data.regionToPartitions),
        partitions(std::exchange(data.partitions, nullptr)),
        paradigms(std::exchange(data.paradigms, nullptr)),
        paradigmsLength(data.paradigmsLength),
        distances(data.distances) {
}
|
||||
|
||||
/**
 * Releases the two arrays this object frees itself:
 * partitions via uprv_free() and paradigms via delete[].
 */
LocaleDistanceData::~LocaleDistanceData() {
    uprv_free(partitions);  // no-op for a moved-from object (pointer is nullptr)
    delete[] paradigms;
}
|
||||
|
||||
struct LikelySubtagsData {
|
||||
UResourceBundle *langInfoBundle = nullptr;
|
||||
UniqueCharStrings strings;
|
||||
CharStringMap languageAliases;
|
||||
CharStringMap regionAliases;
|
||||
const uint8_t *trieBytes = nullptr;
|
||||
LSR *lsrs = nullptr;
|
||||
int32_t lsrsLength = 0;
|
||||
|
||||
LocaleDistanceData distanceData;
|
||||
|
||||
LikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {}
|
||||
|
||||
~LikelySubtagsData() {
|
||||
ures_close(langInfoBundle);
|
||||
delete[] lsrs;
|
||||
}
|
||||
|
||||
void load(UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
langInfoBundle = ures_openDirect(nullptr, "langInfo", &errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
StackUResourceBundle stackTempBundle;
|
||||
ResourceDataValue value;
|
||||
ures_getValueWithFallback(langInfoBundle, "likely", stackTempBundle.getAlias(),
|
||||
value, errorCode);
|
||||
ResourceTable likelyTable = value.getTable(errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
// Read all strings in the resource bundle and convert them to invariant char *.
|
||||
LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
|
||||
int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
|
||||
ResourceArray m49Array;
|
||||
if (likelyTable.findValue("m49", value)) {
|
||||
m49Array = value.getArray(errorCode);
|
||||
} else {
|
||||
errorCode = U_MISSING_RESOURCE_ERROR;
|
||||
return;
|
||||
}
|
||||
if (!readStrings(likelyTable, "languageAliases", value,
|
||||
languageIndexes, languagesLength, errorCode) ||
|
||||
!readStrings(likelyTable, "regionAliases", value,
|
||||
regionIndexes, regionsLength, errorCode) ||
|
||||
!readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array,
|
||||
lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
|
||||
return;
|
||||
}
|
||||
if ((languagesLength & 1) != 0 ||
|
||||
(regionsLength & 1) != 0 ||
|
||||
(lsrSubtagsLength % 3) != 0) {
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (lsrSubtagsLength == 0) {
|
||||
errorCode = U_MISSING_RESOURCE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!likelyTable.findValue("trie", value)) {
|
||||
errorCode = U_MISSING_RESOURCE_ERROR;
|
||||
return;
|
||||
}
|
||||
int32_t length;
|
||||
trieBytes = value.getBinary(length, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
// Also read distance/matcher data if available,
|
||||
// to open & keep only one resource bundle pointer
|
||||
// and to use one single UniqueCharStrings.
|
||||
UErrorCode matchErrorCode = U_ZERO_ERROR;
|
||||
ures_getValueWithFallback(langInfoBundle, "match", stackTempBundle.getAlias(),
|
||||
value, matchErrorCode);
|
||||
LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes;
|
||||
int32_t partitionsLength = 0, paradigmSubtagsLength = 0;
|
||||
if (U_SUCCESS(matchErrorCode)) {
|
||||
ResourceTable matchTable = value.getTable(errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
if (matchTable.findValue("trie", value)) {
|
||||
distanceData.distanceTrieBytes = value.getBinary(length, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
}
|
||||
|
||||
if (matchTable.findValue("regionToPartitions", value)) {
|
||||
distanceData.regionToPartitions = value.getBinary(length, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
if (length < LSR::REGION_INDEX_LIMIT) {
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!readStrings(matchTable, "partitions", value,
|
||||
partitionIndexes, partitionsLength, errorCode) ||
|
||||
!readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array,
|
||||
paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
|
||||
return;
|
||||
}
|
||||
if ((paradigmSubtagsLength % 3) != 0) {
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (matchTable.findValue("distances", value)) {
|
||||
distanceData.distances = value.getIntVector(length, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
if (length < 4) { // LocaleDistance IX_LIMIT
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) {
|
||||
// ok for likely subtags
|
||||
} else { // error other than missing resource
|
||||
errorCode = matchErrorCode;
|
||||
return;
|
||||
}
|
||||
|
||||
// Fetch & store invariant-character versions of strings
|
||||
// only after we have collected and de-duplicated all of them.
|
||||
strings.freeze();
|
||||
|
||||
languageAliases = CharStringMap(languagesLength / 2, errorCode);
|
||||
for (int32_t i = 0; i < languagesLength; i += 2) {
|
||||
languageAliases.put(strings.get(languageIndexes[i]),
|
||||
strings.get(languageIndexes[i + 1]), errorCode);
|
||||
}
|
||||
|
||||
regionAliases = CharStringMap(regionsLength / 2, errorCode);
|
||||
for (int32_t i = 0; i < regionsLength; i += 2) {
|
||||
regionAliases.put(strings.get(regionIndexes[i]),
|
||||
strings.get(regionIndexes[i + 1]), errorCode);
|
||||
}
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
lsrsLength = lsrSubtagsLength / 3;
|
||||
lsrs = new LSR[lsrsLength];
|
||||
if (lsrs == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) {
|
||||
lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]),
|
||||
strings.get(lsrSubtagIndexes[i + 1]),
|
||||
strings.get(lsrSubtagIndexes[i + 2]),
|
||||
LSR::IMPLICIT_LSR);
|
||||
}
|
||||
|
||||
if (partitionsLength > 0) {
|
||||
distanceData.partitions = static_cast<const char **>(
|
||||
uprv_malloc(partitionsLength * sizeof(const char *)));
|
||||
if (distanceData.partitions == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
for (int32_t i = 0; i < partitionsLength; ++i) {
|
||||
distanceData.partitions[i] = strings.get(partitionIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (paradigmSubtagsLength > 0) {
|
||||
distanceData.paradigmsLength = paradigmSubtagsLength / 3;
|
||||
LSR *paradigms = new LSR[distanceData.paradigmsLength];
|
||||
if (paradigms == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) {
|
||||
paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]),
|
||||
strings.get(paradigmSubtagIndexes[i + 1]),
|
||||
strings.get(paradigmSubtagIndexes[i + 2]),
|
||||
LSR::DONT_CARE_FLAGS);
|
||||
}
|
||||
distanceData.paradigms = paradigms;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value,
|
||||
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
if (table.findValue(key, value)) {
|
||||
ResourceArray stringArray = value.getArray(errorCode);
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
length = stringArray.getSize();
|
||||
if (length == 0) { return true; }
|
||||
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length);
|
||||
if (rawIndexes == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (stringArray.getValue(i, value)) { // returns true because i < length
|
||||
int32_t strLength = 0;
|
||||
rawIndexes[i] = strings.add(value.getString(strLength, errorCode), errorCode);
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
UnicodeString toLanguage(int encoded) {
|
||||
if (encoded == 0) {
|
||||
return UNICODE_STRING_SIMPLE("");
|
||||
}
|
||||
if (encoded == 1) {
|
||||
return UNICODE_STRING_SIMPLE("skip");
|
||||
}
|
||||
encoded &= 0x00ffffff;
|
||||
encoded %= 27*27*27;
|
||||
char lang[3];
|
||||
lang[0] = 'a' + ((encoded % 27) - 1);
|
||||
lang[1] = 'a' + (((encoded / 27 ) % 27) - 1);
|
||||
if (encoded / (27 * 27) == 0) {
|
||||
return UnicodeString(lang, 2, US_INV);
|
||||
}
|
||||
lang[2] = 'a' + ((encoded / (27 * 27)) - 1);
|
||||
return UnicodeString(lang, 3, US_INV);
|
||||
}
|
||||
UnicodeString toScript(int encoded) {
|
||||
if (encoded == 0) {
|
||||
return UNICODE_STRING_SIMPLE("");
|
||||
}
|
||||
if (encoded == 1) {
|
||||
return UNICODE_STRING_SIMPLE("script");
|
||||
}
|
||||
encoded = (encoded >> 24) & 0x000000ff;
|
||||
const char* script = uscript_getShortName(static_cast<UScriptCode>(encoded));
|
||||
if (script == nullptr) {
|
||||
return UNICODE_STRING_SIMPLE("");
|
||||
}
|
||||
U_ASSERT(uprv_strlen(script) == 4);
|
||||
return UnicodeString(script, 4, US_INV);
|
||||
}
|
||||
UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return UNICODE_STRING_SIMPLE("");
|
||||
}
|
||||
if (m49Array.getValue(index, value)) {
|
||||
return value.getUnicodeString(errorCode);
|
||||
}
|
||||
// "m49" does not include the index.
|
||||
errorCode = U_MISSING_RESOURCE_ERROR;
|
||||
return UNICODE_STRING_SIMPLE("");
|
||||
}
|
||||
|
||||
UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode) || encoded == 0 || encoded == 1) {
|
||||
return UNICODE_STRING_SIMPLE("");
|
||||
}
|
||||
encoded &= 0x00ffffff;
|
||||
encoded /= 27 * 27 * 27;
|
||||
encoded %= 27 * 27;
|
||||
if (encoded < 27) {
|
||||
// Selected M49 code index, find the code from "m49" resource.
|
||||
return m49IndexToCode(m49Array, value, encoded, errorCode);
|
||||
}
|
||||
char region[2];
|
||||
region[0] = 'A' + ((encoded % 27) - 1);
|
||||
region[1] = 'A' + (((encoded / 27) % 27) - 1);
|
||||
return UnicodeString(region, 2, US_INV);
|
||||
}
|
||||
|
||||
bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array,
|
||||
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
if (table.findValue(key, value)) {
|
||||
const int32_t* vectors = value.getIntVector(length, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
if (length == 0) { return true; }
|
||||
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3);
|
||||
if (rawIndexes == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < length; ++i) {
|
||||
rawIndexes[i*3] = strings.addByValue(toLanguage(vectors[i]), errorCode);
|
||||
rawIndexes[i*3+1] = strings.addByValue(toScript(vectors[i]), errorCode);
|
||||
rawIndexes[i*3+2] = strings.addByValue(
|
||||
toRegion(m49Array, value, vectors[i], errorCode), errorCode);
|
||||
if (U_FAILURE(errorCode)) { return false; }
|
||||
}
|
||||
length *= 3;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
namespace {
|
||||
|
||||
LikelySubtags *gLikelySubtags = nullptr;
|
||||
UVector *gMacroregions = nullptr;
|
||||
UInitOnce gInitOnce {};
|
||||
|
||||
UBool U_CALLCONV cleanup() {
|
||||
delete gLikelySubtags;
|
||||
gLikelySubtags = nullptr;
|
||||
delete gMacroregions;
|
||||
gMacroregions = nullptr;
|
||||
gInitOnce.reset();
|
||||
return true;
|
||||
}
|
||||
|
||||
constexpr const char16_t* MACROREGION_HARDCODE[] = {
|
||||
u"001~3",
|
||||
u"005",
|
||||
u"009",
|
||||
u"011",
|
||||
u"013~5",
|
||||
u"017~9",
|
||||
u"021",
|
||||
u"029",
|
||||
u"030",
|
||||
u"034~5",
|
||||
u"039",
|
||||
u"053~4",
|
||||
u"057",
|
||||
u"061",
|
||||
u"142~3",
|
||||
u"145",
|
||||
u"150~1",
|
||||
u"154~5",
|
||||
u"202",
|
||||
u"419",
|
||||
u"EU",
|
||||
u"EZ",
|
||||
u"QO",
|
||||
u"UN",
|
||||
};
|
||||
|
||||
constexpr char16_t RANGE_MARKER = 0x7E; /* '~' */
|
||||
void processMacroregionRange(const UnicodeString& regionName, UVector* newMacroRegions, UErrorCode& status) {
|
||||
if (U_FAILURE(status)) { return; }
|
||||
int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER);
|
||||
char16_t buf[6];
|
||||
regionName.extract(buf,6,status);
|
||||
if ( rangeMarkerLocation > 0 ) {
|
||||
char16_t endRange = regionName.charAt(rangeMarkerLocation+1);
|
||||
buf[rangeMarkerLocation] = 0;
|
||||
while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) {
|
||||
LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status);
|
||||
newMacroRegions->adoptElement(newRegion.orphan(),status);
|
||||
buf[rangeMarkerLocation-1]++;
|
||||
}
|
||||
} else {
|
||||
LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status);
|
||||
newMacroRegions->adoptElement(newRegion.orphan(),status);
|
||||
}
|
||||
}
|
||||
|
||||
#if U_DEBUG
|
||||
UVector* loadMacroregions(UErrorCode &status) {
|
||||
if (U_FAILURE(status)) { return nullptr; }
|
||||
LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);
|
||||
|
||||
LocalUResourceBundlePointer supplementalData(ures_openDirect(nullptr,"supplementalData",&status));
|
||||
LocalUResourceBundlePointer idValidity(ures_getByKey(supplementalData.getAlias(),"idValidity",nullptr,&status));
|
||||
LocalUResourceBundlePointer regionList(ures_getByKey(idValidity.getAlias(),"region",nullptr,&status));
|
||||
LocalUResourceBundlePointer regionMacro(ures_getByKey(regionList.getAlias(),"macroregion",nullptr,&status));
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
while (ures_hasNext(regionMacro.getAlias())) {
|
||||
UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status);
|
||||
processMacroregionRange(regionName, newMacroRegions.getAlias(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
return newMacroRegions.orphan();
|
||||
}
|
||||
#endif // U_DEBUG
|
||||
|
||||
UVector* getStaticMacroregions(UErrorCode &status) {
|
||||
if (U_FAILURE(status)) { return nullptr; }
|
||||
LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
for (const auto *region : MACROREGION_HARDCODE) {
|
||||
UnicodeString regionName(region);
|
||||
processMacroregionRange(regionName, newMacroRegions.getAlias(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
return newMacroRegions.orphan();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// One-time initializer for gLikelySubtags and gMacroregions.
// On success the cleanup callback is registered. On failure both globals are
// freed AND reset to nullptr, so that nothing can later observe a dangling
// pointer, and errorCode is set to U_MEMORY_ALLOCATION_ERROR (as before).
void U_CALLCONV LikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
    // This function is invoked only via umtx_initOnce().
    U_ASSERT(gLikelySubtags == nullptr);
    LikelySubtagsData data(errorCode);
    data.load(errorCode);
    if (U_FAILURE(errorCode)) { return; }
    gLikelySubtags = new LikelySubtags(data);
    gMacroregions = getStaticMacroregions(errorCode);
#if U_DEBUG
    // Cross-check the hard-coded list against the resource data.
    // Either vector may be nullptr after a failure; only compare real vectors.
    UVector *macroregionsFromData = loadMacroregions(errorCode);
    U_ASSERT(gMacroregions == nullptr || macroregionsFromData == nullptr ||
             (*gMacroregions) == (*macroregionsFromData));
    delete macroregionsFromData;
#endif
    if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) {
        delete gLikelySubtags;
        gLikelySubtags = nullptr;
        delete gMacroregions;
        gMacroregions = nullptr;
        // NOTE(review): this replaces whatever failure code was reported above;
        // kept unchanged for compatibility with existing callers.
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
}
|
||||
|
||||
// Returns the shared LikelySubtags instance, creating it on first use.
// Returns nullptr whenever errorCode indicates a failure, either on entry or
// after (previously or newly) failed initialization, so that a pointer the
// failed initializer already deleted is never handed out.
const LikelySubtags *LikelySubtags::getSingleton(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    return gLikelySubtags;
}
|
||||
|
||||
// Takes over the loaded data (bundle, strings, alias maps, LSR array, distance
// data) from `data` and precomputes trie states used by the lookups:
// the states after "*" and "**", the default LSR index for "***", and the
// state after each first letter 'a'..'z'.
LikelySubtags::LikelySubtags(LikelySubtagsData &data) :
        langInfoBundle(data.langInfoBundle),
        strings(data.strings.orphanCharStrings()),
        languageAliases(std::move(data.languageAliases)),
        regionAliases(std::move(data.regionAliases)),
        trie(data.trieBytes),
        lsrs(data.lsrs),
#if U_DEBUG
        lsrsLength(data.lsrsLength),
#endif  // U_DEBUG
        distanceData(std::move(data.distanceData)) {
    // These two are now closed/freed by ~LikelySubtags(), not by `data`.
    data.langInfoBundle = nullptr;
    data.lsrs = nullptr;

    // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**").
    UStringTrieResult result = trie.next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
    trieUndState = trie.getState64();
    result = trie.next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
    trieUndZzzzState = trie.getState64();
    result = trie.next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
    defaultLsrIndex = trie.getValue();
    trie.reset();

    for (char16_t c = u'a'; c <= u'z'; ++c) {
        result = trie.next(c);
        // Store 0 for letters without a usable state: the lookups test these
        // entries against 0, so none of them may be left unwritten.
        // (The member's declaration is not visible here; writing every entry is
        // correct whether or not it has a default initializer.)
        trieFirstLetterStates[c - u'a'] =
            (result == USTRINGTRIE_NO_VALUE) ? trie.getState64() : 0;
        trie.reset();
    }
}
|
||||
|
||||
// Frees what the constructor took over from LikelySubtagsData:
// the LSR array, the string storage, and the resource bundle.
LikelySubtags::~LikelySubtags() {
    delete[] lsrs;
    delete strings;
    ures_close(langInfoBundle);
}
|
||||
|
||||
// Maximizes the language/script/region of `locale`.
// A bogus locale sets U_ILLEGAL_ARGUMENT_ERROR. A locale whose name starts
// with "@x=" is returned as is (name as language, explicit flags).
// If maximization yields three empty subtags, the locale's own subtags are returned.
LSR LikelySubtags::makeMaximizedLsrFrom(const Locale &locale,
                                        bool returnInputIfUnmatch,
                                        UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    if (locale.isBogus()) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return {};
    }

    const char *name = locale.getName();
    const bool startsWithPrivateUse =  // name.startsWith("@x=")
        uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=';
    if (startsWithPrivateUse) {
        // Private use language tag x-subtag-subtag... which CLDR changes to
        // und-x-subtag-subtag...
        return LSR(name, "", "", LSR::EXPLICIT_LSR);
    }

    LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
                               locale.getVariant(), returnInputIfUnmatch, errorCode);

    const bool allSubtagsEmpty =
        *max.language == 0 && *max.script == 0 && *max.region == 0;
    if (allSubtagsEmpty) {
        // No match. The ICU API mandates:
        // If the provided ULocale instance is already in the maximal form, or
        // there is no data available for maximization, it will be
        // returned.
        return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR::EXPLICIT_LSR, errorCode);
    }
    return max;
}
|
||||
|
||||
namespace {
|
||||
|
||||
const char *getCanonical(const CharStringMap &aliases, const char *alias) {
|
||||
const char *canonical = aliases.get(alias);
|
||||
return canonical == nullptr ? alias : canonical;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Maximizes the given subtags, after special-casing pseudolocales and
// canonicalizing language and region through the alias maps.
LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
                                    const char *variant,
                                    bool returnInputIfUnmatch,
                                    UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }

    // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
    // They should match only themselves,
    // not other locales with what looks like the same language and script subtags.

    // Two-letter region XA / XB / XC.
    if (region[0] == 'X' && region[1] != 0 && region[2] == 0) {
        char pseudoPrefix = 0;
        switch (region[1]) {
        case 'A': pseudoPrefix = PSEUDO_ACCENTS_PREFIX; break;
        case 'B': pseudoPrefix = PSEUDO_BIDI_PREFIX; break;
        case 'C': pseudoPrefix = PSEUDO_CRACKED_PREFIX; break;
        default: break;  // normal locale
        }
        if (pseudoPrefix != 0) {
            if (returnInputIfUnmatch) {
                return LSR(language, script, region, LSR::EXPLICIT_LSR);
            }
            return LSR(pseudoPrefix, language, script, region,
                       LSR::EXPLICIT_LSR, errorCode);
        }
    }

    // Variant PSACCENT / PSBIDI / PSCRACK; an empty region is replaced
    // with the matching pseudo region and is then not flagged as explicit.
    if (variant[0] == 'P' && variant[1] == 'S') {
        const bool noRegion = (*region == 0);
        int32_t lsrFlags = noRegion ?
            LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
        char pseudoPrefix = 0;
        const char *pseudoRegion = nullptr;
        if (uprv_strcmp(variant, "PSACCENT") == 0) {
            pseudoPrefix = PSEUDO_ACCENTS_PREFIX;
            pseudoRegion = "XA";
        } else if (uprv_strcmp(variant, "PSBIDI") == 0) {
            pseudoPrefix = PSEUDO_BIDI_PREFIX;
            pseudoRegion = "XB";
        } else if (uprv_strcmp(variant, "PSCRACK") == 0) {
            pseudoPrefix = PSEUDO_CRACKED_PREFIX;
            pseudoRegion = "XC";
        }
        if (pseudoRegion != nullptr) {
            return LSR(pseudoPrefix, language, script,
                       noRegion ? pseudoRegion : region, lsrFlags, errorCode);
        }
        // else normal locale
    }

    const char *canonicalLanguage = getCanonical(languageAliases, language);
    // (We have no script mappings.)
    const char *canonicalRegion = getCanonical(regionAliases, region);
    return maximize(canonicalLanguage, script, canonicalRegion, returnInputIfUnmatch, errorCode);
}
|
||||
|
||||
// Convenience overload for NUL-terminated subtags: wraps each one in a
// StringPiece and forwards to the StringPiece overload.
LSR LikelySubtags::maximize(const char *language, const char *script, const char *region,
                            bool returnInputIfUnmatch,
                            UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    const StringPiece languagePiece(language, static_cast<int32_t>(uprv_strlen(language)));
    const StringPiece scriptPiece(script, static_cast<int32_t>(uprv_strlen(script)));
    const StringPiece regionPiece(region, static_cast<int32_t>(uprv_strlen(region)));
    return maximize(languagePiece, scriptPiece, regionPiece, returnInputIfUnmatch, errorCode);
}
|
||||
|
||||
// Returns true if `region` is one of the codes in gMacroregions.
// Returns false if errorCode is, or becomes, a failure.
bool LikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const {
    if (U_FAILURE(errorCode)) { return false; }
    // In Java, we use Region class. In C++, since Region is under i18n,
    // we read the same data used by Region into gMacroregions to avoid a dependency
    // from common to i18n/region.cpp
    umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode);
    if (U_FAILURE(errorCode)) { return false; }
    UnicodeString regionString(UnicodeString::fromUTF8(region));
    return gMacroregions->contains(static_cast<void *>(&regionString));
}
|
||||
|
||||
// Looks up language, script and region in the likely-subtags trie and returns
// the maximized LSR.
//
// "und", "Zzzz" and "ZZ" are treated like empty subtags. If all three subtags
// are non-empty the input is returned unchanged. Otherwise the trie is walked
// one subtag at a time, falling back to the "*" branch when a subtag does not
// match, and the matched LSR supplies every subtag that is not retained from
// the input.
//
// If returnInputIfUnmatch is true and nothing matched, an LSR with three empty
// subtags is returned so the caller can detect "no match".
LSR LikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region,
                            bool returnInputIfUnmatch,
                            UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    if (language.compare("und") == 0) {
        language = "";
    }
    if (script.compare("Zzzz") == 0) {
        script = "";
    }
    if (region.compare("ZZ") == 0) {
        region = "";
    }
    if (!script.empty() && !region.empty() && !language.empty()) {
        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);  // already maximized
    }
    bool retainLanguage = false;
    bool retainScript = false;
    bool retainRegion = false;

    BytesTrie iter(trie);
    uint64_t state = 0;
    int32_t value;
    // Small optimization: Array lookup for first language letter.
    // The length is tested first so that data() is never dereferenced
    // for an empty (possibly null) StringPiece.
    int32_t c0;
    if (language.length() >= 2 &&
            0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 &&
            (state = trieFirstLetterStates[c0]) != 0) {
        value = trieNext(iter.resetToState64(state), language, 1);
    } else {
        value = trieNext(iter, language, 0);
    }
    bool matchLanguage = (value >= 0);
    bool matchScript = false;
    if (value >= 0) {
        retainLanguage = !language.empty();
        state = iter.getState64();
    } else {
        retainLanguage = true;
        iter.resetToState64(trieUndState);  // "und" ("*")
        state = 0;
    }

    if (value >= 0 && !script.empty()) {
        matchScript = true;
    }
    if (value > 0) {
        // Intermediate or final value from just language.
        if (value == SKIP_SCRIPT) {
            value = 0;
        }
        retainScript = !script.empty();
    } else {
        value = trieNext(iter, script, 0);
        if (value >= 0) {
            retainScript = !script.empty();
            state = iter.getState64();
        } else {
            retainScript = true;
            if (state == 0) {
                iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
            } else {
                // The language matched but the script did not:
                // continue from the language with the "*" script.
                iter.resetToState64(state);
                value = trieNext(iter, "", 0);
                U_ASSERT(value >= 0);
                state = iter.getState64();
            }
        }
    }

    bool matchRegion = false;
    if (value > 0) {
        // Final value from just language or language+script.
        retainRegion = !region.empty();
    } else {
        value = trieNext(iter, region, 0);
        if (value >= 0) {
            if (!region.empty() && !isMacroregion(region, errorCode)) {
                retainRegion = true;
                matchRegion = true;
            }
        } else {
            retainRegion = true;
            if (state == 0) {
                value = defaultLsrIndex;
            } else {
                iter.resetToState64(state);
                value = trieNext(iter, "", 0);
                U_ASSERT(value > 0);
            }
        }
    }
    U_ASSERT(value < lsrsLength);
    const LSR &matched = lsrs[value];

    if (returnInputIfUnmatch &&
        (!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
      return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode);  // no matching.
    }
    if (language.empty()) {
        language = StringPiece("und");
    }

    if (!(retainLanguage || retainScript || retainRegion)) {
        // Quickly return a copy of the lookup-result LSR
        // without new allocation of the subtags.
        return LSR(matched.language, matched.script, matched.region, matched.flags);
    }
    if (!retainLanguage) {
        language = matched.language;
    }
    if (!retainScript) {
        script = matched.script;
    }
    if (!retainRegion) {
        region = matched.region;
    }
    int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
    // retainOldMask flags = LSR explicit-subtag flags
    return LSR(language, script, region, retainMask, errorCode);
}
|
||||
|
||||
// Compares lsr with other and returns an updated likelyInfo value.
// Result layout, as far as this function shows:
//   negative       languages differ
//   bit 0          set if lsr's differing script/region equals the likely one
//   bit 1          set if the lookup was done for equal language and script
//   bits 2 and up  index into lsrs of the likely LSR that was looked up
// A non-negative likelyInfo passed in is reused as a cache of the last lookup.
int32_t LikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
    // If likelyInfo >= 0:
    // likelyInfo bit 1 is set if the previous comparison with lsr
    // was for equal language and script.
    // Otherwise the scripts differed.
    if (uprv_strcmp(lsr.language, other.language) != 0) {
        return static_cast<int32_t>(0xfffffffc);  // negative, lsr not better than other
    }

    const bool scriptsDiffer = uprv_strcmp(lsr.script, other.script) != 0;
    if (scriptsDiffer) {
        int32_t likelyIndex;
        const bool haveCachedScriptLookup = likelyInfo >= 0 && (likelyInfo & 2) == 0;
        if (haveCachedScriptLookup) {
            likelyIndex = likelyInfo >> 2;
        } else {
            likelyIndex = getLikelyIndex(lsr.language, "");
            likelyInfo = likelyIndex << 2;
        }
        const LSR &likely = lsrs[likelyIndex];
        const bool isLikelyScript = uprv_strcmp(lsr.script, likely.script) == 0;
        return isLikelyScript ? (likelyInfo | 1) : (likelyInfo & ~1);
    }

    const bool regionsDiffer = uprv_strcmp(lsr.region, other.region) != 0;
    if (regionsDiffer) {
        int32_t likelyIndex;
        const bool haveCachedRegionLookup = likelyInfo >= 0 && (likelyInfo & 2) != 0;
        if (haveCachedRegionLookup) {
            likelyIndex = likelyInfo >> 2;
        } else {
            // NOTE(review): the second parameter of getLikelyIndex() is named
            // `script`, but the region is passed here - confirm this is intended.
            likelyIndex = getLikelyIndex(lsr.language, lsr.region);
            likelyInfo = (likelyIndex << 2) | 2;
        }
        const LSR &likely = lsrs[likelyIndex];
        const bool isLikelyRegion = uprv_strcmp(lsr.region, likely.region) == 0;
        return isLikelyRegion ? (likelyInfo | 1) : (likelyInfo & ~1);
    }

    return likelyInfo & ~1;  // lsr not better than other
}
|
||||
|
||||
// Subset of maximize().
|
||||
// Returns the index into lsrs of the likely LSR for the given language and
// script, looked up with an empty ("*") region. "und" and "Zzzz" are treated
// like empty subtags; a subtag that is not in the trie falls back to "*".
int32_t LikelySubtags::getLikelyIndex(const char *language, const char *script) const {
    if (uprv_strcmp(language, "und") == 0) {
        language = "";
    }
    if (uprv_strcmp(script, "Zzzz") == 0) {
        script = "";
    }

    BytesTrie iter(trie);
    uint64_t state;
    int32_t value;

    // --- language ---
    // Small optimization: Array lookup for first language letter.
    int32_t c0;
    if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
            language[1] != 0 &&  // language.length() >= 2
            (state = trieFirstLetterStates[c0]) != 0) {
        value = trieNext(iter.resetToState64(state), language, 1);
    } else {
        value = trieNext(iter, language, 0);
    }
    if (value < 0) {
        iter.resetToState64(trieUndState);  // "und" ("*")
        state = 0;
    } else {
        state = iter.getState64();
    }

    // --- script ---
    if (value > 0) {
        // Intermediate or final value from just language.
        if (value == SKIP_SCRIPT) {
            value = 0;
        }
    } else {
        value = trieNext(iter, script, 0);
        if (value >= 0) {
            state = iter.getState64();
        } else if (state == 0) {
            iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
        } else {
            // Known language, unknown script: use the language's "*" script.
            iter.resetToState64(state);
            value = trieNext(iter, "", 0);
            U_ASSERT(value >= 0);
            state = iter.getState64();
        }
    }

    // --- region (always "*") ---
    // A positive value here is already final (from just language or language+script).
    if (value <= 0) {
        value = trieNext(iter, "", 0);
        U_ASSERT(value > 0);
    }
    U_ASSERT(value < lsrsLength);
    return value;
}
|
||||
|
||||
// Feeds the NUL-terminated subtag s, starting at index i, into the trie.
// An empty remainder is fed as '*'. The last character of the subtag is fed
// with bit 0x80 set.
// Returns -1 for no match, 0 if the trie continues without a value,
// SKIP_SCRIPT for an intermediate value, or the final value.
int32_t LikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
    UStringTrieResult result;
    uint8_t c = s[i];
    if (c == 0) {
        result = iter.next(u'*');
    } else {
        for (;;) {
            c = uprv_invCharToAscii(c);
            // EBCDIC: If s[i] is not an invariant character,
            // then c is now 0 and will simply not match anything, which is harmless.
            const uint8_t following = s[++i];
            if (following == 0) {
                // last character of this subtag
                result = iter.next(c | 0x80);
                break;
            }
            if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
                return -1;
            }
            c = following;
        }
    }
    switch (result) {
    case USTRINGTRIE_FINAL_VALUE:
        return iter.getValue();
    case USTRINGTRIE_INTERMEDIATE_VALUE:
        U_ASSERT(iter.getValue() == SKIP_SCRIPT);
        return SKIP_SCRIPT;
    case USTRINGTRIE_NO_VALUE:
        return 0;
    case USTRINGTRIE_NO_MATCH:
        return -1;
    default:
        return -1;
    }
}
|
||||
// Feeds the subtag s, starting at index i, into the trie.
// If i equals the length of s, '*' is fed instead. The last character of the
// subtag is fed with bit 0x80 set.
// Returns -1 for no match, 0 if the trie continues without a value,
// SKIP_SCRIPT for an intermediate value, or the final value.
int32_t LikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) {
    UStringTrieResult result;
    if (s.length() == i) {
        result = iter.next(u'*');
    } else {
        uint8_t c = s.data()[i];
        for (;;) {
            c = uprv_invCharToAscii(c);
            // EBCDIC: If s[i] is not an invariant character,
            // then c is now 0 and will simply not match anything, which is harmless.
            const bool isLastChar = (i+1 == s.length());
            if (isLastChar) {
                // last character of this subtag
                result = iter.next(c | 0x80);
                break;
            }
            if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
                return -1;
            }
            c = s.data()[++i];
        }
    }
    switch (result) {
    case USTRINGTRIE_FINAL_VALUE:
        return iter.getValue();
    case USTRINGTRIE_INTERMEDIATE_VALUE:
        U_ASSERT(iter.getValue() == SKIP_SCRIPT);
        return SKIP_SCRIPT;
    case USTRINGTRIE_NO_VALUE:
        return 0;
    case USTRINGTRIE_NO_MATCH:
        return -1;
    default:
        return -1;
    }
}
|
||||
|
||||
// Returns the shortest combination of subtags that maximizes to the same LSR
// as the input. Candidates are tried in this order:
//   language; then language+region and language+script
//   (language+script first if favorScript); finally all three subtags.
// If the input cannot be maximized at all, it is returned unchanged.
LSR LikelySubtags::minimizeSubtags(StringPiece language, StringPiece script,
                                   StringPiece region,
                                   bool favorScript,
                                   UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    LSR max = maximize(language, script, region, true, errorCode);
    if (U_FAILURE(errorCode)) { return {}; }

    // If no match, return it.
    const bool nothingMatched =
        *max.language == 0 && *max.script == 0 && *max.region == 0;
    if (nothingMatched) {
        // No match. The ICU API mandates:
        // "If this Locale is already in the minimal form, or not valid, or
        // there is no data available for minimization, the Locale will be
        // unchanged."
        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
    }

    // try language
    LSR candidate = maximize(max.language, "", "", true, errorCode);
    if (U_FAILURE(errorCode)) { return {}; }
    if (candidate.isEquivalentTo(max)) {
        return LSR(max.language, "", "", LSR::DONT_CARE_FLAGS, errorCode);
    }

    if (!favorScript) {
        // favor Region
        // try language and region
        candidate = maximize(max.language, "", max.region, true, errorCode);
        if (U_FAILURE(errorCode)) { return {}; }
        if (candidate.isEquivalentTo(max)) {
            return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
        }
    }

    // try language and script
    candidate = maximize(max.language, max.script, "", true, errorCode);
    if (U_FAILURE(errorCode)) { return {}; }
    if (candidate.isEquivalentTo(max)) {
        return LSR(max.language, max.script, "", LSR::DONT_CARE_FLAGS, errorCode);
    }

    if (favorScript) {
        // try language and region
        candidate = maximize(max.language, "", max.region, true, errorCode);
        if (U_FAILURE(errorCode)) { return {}; }
        if (candidate.isEquivalentTo(max)) {
            return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
        }
    }

    // Nothing shorter is equivalent: keep all three maximized subtags.
    return LSR(max.language, max.script, max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
128
engine/thirdparty/icu4c/common/loclikelysubtags.h
vendored
Normal file
128
engine/thirdparty/icu4c/common/loclikelysubtags.h
vendored
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// loclikelysubtags.h
|
||||
// created: 2019may08 Markus W. Scherer
|
||||
|
||||
#ifndef __LOCLIKELYSUBTAGS_H__
|
||||
#define __LOCLIKELYSUBTAGS_H__
|
||||
|
||||
#include <utility>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "charstrmap.h"
|
||||
#include "lsr.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct LikelySubtagsData;
|
||||
|
||||
struct LocaleDistanceData {
|
||||
LocaleDistanceData() = default;
|
||||
LocaleDistanceData(LocaleDistanceData &&data);
|
||||
~LocaleDistanceData();
|
||||
|
||||
const uint8_t *distanceTrieBytes = nullptr;
|
||||
const uint8_t *regionToPartitions = nullptr;
|
||||
const char **partitions = nullptr;
|
||||
const LSR *paradigms = nullptr;
|
||||
int32_t paradigmsLength = 0;
|
||||
const int32_t *distances = nullptr;
|
||||
|
||||
private:
|
||||
LocaleDistanceData &operator=(const LocaleDistanceData &) = delete;
|
||||
};
|
||||
|
||||
class LikelySubtags final : public UMemory {
|
||||
public:
|
||||
~LikelySubtags();
|
||||
|
||||
static constexpr int32_t SKIP_SCRIPT = 1;
|
||||
|
||||
// VisibleForTesting
|
||||
static const LikelySubtags *getSingleton(UErrorCode &errorCode);
|
||||
|
||||
// VisibleForTesting
|
||||
LSR makeMaximizedLsrFrom(const Locale &locale,
|
||||
bool returnInputIfUnmatch,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Tests whether lsr is "more likely" than other.
|
||||
* For example, fr-Latn-FR is more likely than fr-Latn-CH because
|
||||
* FR is the default region for fr-Latn.
|
||||
*
|
||||
* The likelyInfo caches lookup information between calls.
|
||||
* The return value is an updated likelyInfo value,
|
||||
* with bit 0 set if lsr is "more likely".
|
||||
* The initial value of likelyInfo must be negative.
|
||||
*/
|
||||
int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
|
||||
|
||||
LSR minimizeSubtags(StringPiece language, StringPiece script, StringPiece region,
|
||||
bool favorScript,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
// visible for LocaleDistance
|
||||
const LocaleDistanceData &getDistanceData() const { return distanceData; }
|
||||
|
||||
private:
|
||||
LikelySubtags(LikelySubtagsData &data);
|
||||
LikelySubtags(const LikelySubtags &other) = delete;
|
||||
LikelySubtags &operator=(const LikelySubtags &other) = delete;
|
||||
|
||||
static void initLikelySubtags(UErrorCode &errorCode);
|
||||
|
||||
LSR makeMaximizedLsr(const char *language, const char *script, const char *region,
|
||||
const char *variant,
|
||||
bool returnInputIfUnmatch,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
|
||||
*/
|
||||
LSR maximize(const char *language, const char *script, const char *region,
|
||||
bool returnInputIfUnmatch,
|
||||
UErrorCode &errorCode) const;
|
||||
LSR maximize(StringPiece language, StringPiece script, StringPiece region,
|
||||
bool returnInputIfUnmatch,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
int32_t getLikelyIndex(const char *language, const char *script) const;
|
||||
bool isMacroregion(StringPiece& region, UErrorCode &errorCode) const;
|
||||
|
||||
static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
|
||||
static int32_t trieNext(BytesTrie &iter, StringPiece s, int32_t i);
|
||||
|
||||
UResourceBundle *langInfoBundle;
|
||||
// We could store the strings by value, except that if there were few enough strings,
|
||||
// moving the contents could copy it to a different array,
|
||||
// invalidating the pointers stored in the maps.
|
||||
CharString *strings;
|
||||
CharStringMap languageAliases;
|
||||
CharStringMap regionAliases;
|
||||
|
||||
// The trie maps each lang+script+region (encoded in ASCII) to an index into lsrs.
|
||||
// There is also a trie value for each intermediate lang and lang+script.
|
||||
// '*' is used instead of "und", "Zzzz"/"" and "ZZ"/"".
|
||||
BytesTrie trie;
|
||||
uint64_t trieUndState;
|
||||
uint64_t trieUndZzzzState;
|
||||
int32_t defaultLsrIndex;
|
||||
uint64_t trieFirstLetterStates[26];
|
||||
const LSR *lsrs;
|
||||
#if U_DEBUG
|
||||
int32_t lsrsLength;
|
||||
#endif
|
||||
|
||||
// distance/matcher data: see comment in LikelySubtagsData::load()
|
||||
LocaleDistanceData distanceData;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __LOCLIKELYSUBTAGS_H__
|
||||
1315
engine/thirdparty/icu4c/common/locmap.cpp
vendored
Normal file
1315
engine/thirdparty/icu4c/common/locmap.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
40
engine/thirdparty/icu4c/common/locmap.h
vendored
Normal file
40
engine/thirdparty/icu4c/common/locmap.h
vendored
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1996-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* File locmap.h : Locale Mapping Classes
|
||||
*
|
||||
*
|
||||
* Created by: Helena Shih
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
* Date Name Description
|
||||
* 3/11/97 aliu Added setId().
|
||||
* 4/20/99 Madhu Added T_convertToPosix()
|
||||
* 09/18/00 george Removed the memory leaks.
|
||||
* 08/23/01 george Convert to C
|
||||
*============================================================================
|
||||
*/
|
||||
|
||||
#ifndef LOCMAP_H
|
||||
#define LOCMAP_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
/*
 * Yields the low 10 bits (mask 0x03FF) of hostID as a uint16_t.
 * The argument and the whole expansion are parenthesized so that compound
 * arguments such as `a | b` are masked as a unit and the result composes
 * safely inside larger expressions.
 */
#define LANGUAGE_LCID(hostID) ((uint16_t)(0x03FF & (hostID)))
|
||||
|
||||
U_CAPI int32_t uprv_convertToPosix(uint32_t hostid, char* posixID, int32_t posixIDCapacity, UErrorCode* status);
|
||||
|
||||
/* Don't call these functions directly. Use uloc_getLCID instead. */
|
||||
U_CAPI uint32_t uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status); // Leverage platform conversion if possible
|
||||
U_CAPI uint32_t uprv_convertToLCID(const char* langID, const char* posixID, UErrorCode* status);
|
||||
|
||||
#endif /* LOCMAP_H */
|
||||
|
||||
226
engine/thirdparty/icu4c/common/locresdata.cpp
vendored
Normal file
226
engine/thirdparty/icu4c/common/locresdata.cpp
vendored
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
*   file name:  locresdata.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010feb25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Code for miscellaneous locale-related resource bundle data access,
|
||||
* separated out from other .cpp files
|
||||
* that then do not depend on resource bundle code and this data.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "charstr.h"
|
||||
#include "cstring.h"
|
||||
#include "ulocimp.h"
|
||||
#include "uresimp.h"
|
||||
|
||||
/*
|
||||
* Lookup a resource bundle table item with fallback on the table level.
|
||||
* Regular resource bundle lookups perform fallback to parent locale bundles
|
||||
* and eventually the root bundle, but only for top-level items.
|
||||
* This function takes the name of a top-level table and of an item in that table
|
||||
* and performs a lookup of both, falling back until a bundle contains a table
|
||||
* with this item.
|
||||
*
|
||||
* Note: Only the opening of entire bundles falls back through the default locale
|
||||
* before root. Once a bundle is open, item lookups do not go through the
|
||||
* default locale because that would result in a mix of languages that is
|
||||
* unpredictable to the programmer and most likely useless.
|
||||
*/
|
||||
/*
 * Parameters:
 *   path, locale   passed to ures_open() to open the starting bundle
 *   tableKey       name of the top-level table
 *   subTableKey    optional sub-table inside that table; may be nullptr
 *   itemKey        key of the string to fetch
 *   pLength        receives the string length from the resource lookup
 *   pErrorCode     in/out ICU error code; nothing is done if it already
 *                  indicates a failure
 * Returns the string, or nullptr when it could not be found.
 */
U_CAPI const char16_t * U_EXPORT2
uloc_getTableStringWithFallback(const char *path, const char *locale,
                              const char *tableKey, const char *subTableKey,
                              const char *itemKey,
                              int32_t *pLength,
                              UErrorCode *pErrorCode)
{
    if (U_FAILURE(*pErrorCode)) { return nullptr; }

    const char16_t *item=nullptr;

    /*
     * open the bundle for the current locale
     * this falls back through the locale's chain to root
     */
    UErrorCode errorCode=U_ZERO_ERROR;
    icu::LocalUResourceBundlePointer rb(ures_open(path, locale, &errorCode));

    if(U_FAILURE(errorCode)) {
        /* total failure, not even root could be opened */
        *pErrorCode=errorCode;
        return nullptr;
    } else if(errorCode==U_USING_DEFAULT_WARNING ||
                (errorCode==U_USING_FALLBACK_WARNING && *pErrorCode!=U_USING_DEFAULT_WARNING)
    ) {
        /* set the "strongest" error code (success->fallback->default->failure) */
        *pErrorCode=errorCode;
    }

    for(;;){
        /* Holds the top-level table and, when subTableKey is given, is re-used for the sub-table. */
        icu::StackUResourceBundle table;
        ures_getByKeyWithFallback(rb.getAlias(), tableKey, table.getAlias(), &errorCode);

        if (subTableKey != nullptr) {
            ures_getByKeyWithFallback(table.getAlias(), subTableKey, table.getAlias(), &errorCode);
        }
        if(U_SUCCESS(errorCode)){
            item = ures_getStringByKeyWithFallback(table.getAlias(), itemKey, pLength, &errorCode);
            if(U_FAILURE(errorCode)){
                const char* replacement = nullptr;
                *pErrorCode = errorCode; /*save the errorCode*/
                errorCode = U_ZERO_ERROR;
                /* may be a deprecated code */
                if(uprv_strcmp(tableKey, "Countries")==0){
                    replacement =  uloc_getCurrentCountryID(itemKey);
                }else if(uprv_strcmp(tableKey, "Languages")==0){
                    replacement =  uloc_getCurrentLanguageID(itemKey);
                }
                /*pointer comparison is ok since uloc_getCurrentCountryID & uloc_getCurrentLanguageID return the key itself if replacement is not found*/
                if(replacement!=nullptr && itemKey != replacement){
                    item = ures_getStringByKeyWithFallback(table.getAlias(), replacement, pLength, &errorCode);
                    if(U_SUCCESS(errorCode)){
                        *pErrorCode = errorCode;
                        break;
                    }
                }
            }else{
                /*
                 * *pErrorCode held a success or warning code when the loop was
                 * entered, so a failure stored in it now was recorded by an
                 * earlier pass that then moved on to an explicit "Fallback"
                 * bundle. The item has been found there: replace the stale
                 * failure so the caller does not discard a valid result.
                 * Warnings set earlier are left untouched.
                 */
                if(U_FAILURE(*pErrorCode)){
                    *pErrorCode = errorCode;
                }
                break;
            }
        }

        if(U_FAILURE(errorCode)){

            /* still can't figure out ?.. try the fallback mechanism */
            int32_t len = 0;
            const char16_t* fallbackLocale =  nullptr;
            *pErrorCode = errorCode;
            errorCode = U_ZERO_ERROR;

            fallbackLocale = ures_getStringByKeyWithFallback(table.getAlias(), "Fallback", &len, &errorCode);
            if(U_FAILURE(errorCode)){
               *pErrorCode = errorCode;
                break;
            }

            icu::CharString explicitFallbackName;
            explicitFallbackName.appendInvariantChars(fallbackLocale, len, errorCode);

            /* guard against recursive fallback */
            if (explicitFallbackName == locale) {
                *pErrorCode = U_INTERNAL_PROGRAM_ERROR;
                break;
            }
            rb.adoptInstead(ures_open(path, explicitFallbackName.data(), &errorCode));
            if(U_FAILURE(errorCode)){
                *pErrorCode = errorCode;
                break;
            }
            /* succeeded in opening the fallback bundle .. continue and try to fetch the item */
        }else{
            break;
        }
    }

    return item;
}
|
||||
|
||||
namespace {
|
||||
|
||||
ULayoutType
|
||||
_uloc_getOrientationHelper(const char* localeId,
|
||||
const char* key,
|
||||
UErrorCode& status)
|
||||
{
|
||||
ULayoutType result = ULOC_LAYOUT_UNKNOWN;
|
||||
|
||||
if (U_FAILURE(status)) { return result; }
|
||||
|
||||
icu::CharString localeBuffer = ulocimp_canonicalize(localeId, status);
|
||||
|
||||
if (U_FAILURE(status)) { return result; }
|
||||
|
||||
int32_t length = 0;
|
||||
const char16_t* const value =
|
||||
uloc_getTableStringWithFallback(
|
||||
nullptr,
|
||||
localeBuffer.data(),
|
||||
"layout",
|
||||
nullptr,
|
||||
key,
|
||||
&length,
|
||||
&status);
|
||||
|
||||
if (U_FAILURE(status)) { return result; }
|
||||
|
||||
if (length != 0) {
|
||||
switch(value[0])
|
||||
{
|
||||
case 0x0062: /* 'b' */
|
||||
result = ULOC_LAYOUT_BTT;
|
||||
break;
|
||||
case 0x006C: /* 'l' */
|
||||
result = ULOC_LAYOUT_LTR;
|
||||
break;
|
||||
case 0x0072: /* 'r' */
|
||||
result = ULOC_LAYOUT_RTL;
|
||||
break;
|
||||
case 0x0074: /* 't' */
|
||||
result = ULOC_LAYOUT_TTB;
|
||||
break;
|
||||
default:
|
||||
status = U_INTERNAL_PROGRAM_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Get the layout character orientation for the specified locale.
 *
 * @param localeId locale name
 * @param status Error status
 * @return an enum indicating the layout orientation for characters.
 */
U_CAPI ULayoutType U_EXPORT2
uloc_getCharacterOrientation(const char* localeId,
                             UErrorCode *status)
{
    UErrorCode &errorCode = *status;
    return _uloc_getOrientationHelper(localeId, "characters", errorCode);
}
|
||||
|
||||
/**
|
||||
* Get the layout line orientation for the specified locale.
|
||||
*
|
||||
* @param localeID locale name
|
||||
* @param status Error status
|
||||
* @return an enum indicating the layout orientation for lines.
|
||||
*/
|
||||
U_CAPI ULayoutType U_EXPORT2
|
||||
uloc_getLineOrientation(const char* localeId,
|
||||
UErrorCode *status)
|
||||
{
|
||||
return _uloc_getOrientationHelper(localeId, "lines", *status);
|
||||
}
|
||||
276
engine/thirdparty/icu4c/common/locutil.cpp
vendored
Normal file
276
engine/thirdparty/icu4c/common/locutil.cpp
vendored
Normal file
|
|
@ -0,0 +1,276 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2002-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_SERVICE || !UCONFIG_NO_TRANSLITERATION
|
||||
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/uenum.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustrfmt.h"
|
||||
#include "locutil.h"
|
||||
#include "charstr.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "uassert.h"
|
||||
#include "umutex.h"
|
||||
|
||||
// see LocaleUtility::getAvailableLocaleNames
|
||||
static icu::UInitOnce LocaleUtilityInitOnce {};
|
||||
static icu::Hashtable * LocaleUtility_cache = nullptr;
|
||||
|
||||
#define UNDERSCORE_CHAR ((char16_t)0x005f)
|
||||
#define AT_SIGN_CHAR ((char16_t)64)
|
||||
#define PERIOD_CHAR ((char16_t)46)
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
/**
|
||||
* Release all static memory held by Locale Utility.
|
||||
*/
|
||||
U_CDECL_BEGIN
|
||||
/* Cleanup callback: destroys the locale-name cache and clears the global pointer. Always returns true. */
static UBool U_CALLCONV service_cleanup() {
    /* Deleting a null pointer is a no-op, so no explicit check is needed. */
    delete LocaleUtility_cache;
    LocaleUtility_cache = nullptr;
    return true;
}
|
||||
|
||||
|
||||
/*
 * One-time initializer for LocaleUtility_cache (run through umtx_initOnce).
 * Registers service_cleanup and creates the top-level hashtable, whose values
 * are released with uhash_deleteHashtable. On failure the cache pointer is
 * left null and status reports the error.
 */
static void U_CALLCONV locale_utility_init(UErrorCode &status) {
    using namespace icu;
    U_ASSERT(LocaleUtility_cache == nullptr);
    ucln_common_registerCleanup(UCLN_COMMON_SERVICE, service_cleanup);

    Hashtable *created = new Hashtable(status);
    if (U_FAILURE(status)) {
        /* Construction reported an error: discard the table. */
        delete created;
        created = nullptr;
    } else if (created == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
    } else {
        created->setValueDeleter(uhash_deleteHashtable);
    }
    LocaleUtility_cache = created;
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Copies *id into result and fixes its case: ASCII letters before the first
 * '_' are lower-cased and ASCII letters from there on are upper-cased.
 * Nothing at or after the first '@' or '.' (whichever comes first) is
 * modified. A null id makes result bogus.
 *
 * @param id     locale ID to canonicalize; may be nullptr
 * @param result receives the canonicalized copy
 * @return result
 */
UnicodeString&
LocaleUtility::canonicalLocaleString(const UnicodeString* id, UnicodeString& result)
{
    if (id == nullptr) {
        result.setToBogus();
        return result;
    }

    // Fix case only (no other changes) up to the first '@' or '.' or
    // end of string, whichever comes first.  In 3.0 I changed this to
    // stop at first '@' or '.'.  It used to run out to the end of
    // string.  My fix makes the tests pass but is probably
    // structurally incorrect.  [alan 3.0]
    // (A full level 2 canonicalization via uloc_canonicalize was tried
    // instead and dropped because it made some tests fail.)

    // TODO: Doug, you might want to revise this...
    result = *id;

    // end = index of the first '@' or '.', or the string length if neither occurs.
    // A '.' must also count when there is no '@' at all (end < 0).
    int32_t end = result.indexOf(AT_SIGN_CHAR);
    const int32_t period = result.indexOf(PERIOD_CHAR);
    if (period >= 0 && (end < 0 || period < end)) {
        end = period;
    }
    if (end < 0) {
        end = result.length();
    }

    // The lower-cased part stops at the first '_', but never extends past
    // `end`: an underscore after '@' or '.' must not drag that text into
    // the case fix.
    int32_t languageEnd = result.indexOf(UNDERSCORE_CHAR);
    if (languageEnd < 0 || languageEnd > end) {
        languageEnd = end;
    }

    int32_t i = 0;
    for (; i < languageEnd; ++i) {
        char16_t c = result.charAt(i);
        if (c >= 0x0041 && c <= 0x005a) {  // 'A'..'Z' -> 'a'..'z'
            c += 0x20;
            result.setCharAt(i, c);
        }
    }
    for (; i < end; ++i) {
        char16_t c = result.charAt(i);
        if (c >= 0x0061 && c <= 0x007a) {  // 'a'..'z' -> 'A'..'Z'
            c -= 0x20;
            result.setCharAt(i, c);
        }
    }
    return result;
}
|
||||
|
||||
/**
 * Sets result to the Locale named by id; result becomes bogus when id is
 * bogus or when the conversion to char* fails.
 *
 * @return result
 */
Locale&
LocaleUtility::initLocaleFromName(const UnicodeString& id, Locale& result)
{
    if (id.isBogus()) {
        result.setToBogus();
        return result;
    }

    /*
     * We need to convert from a UnicodeString to char * in order to
     * create a Locale.
     *
     * Problem: Locale ID strings may contain '@' which is a variant
     * character and cannot be handled by invariant-character conversion.
     *
     * Hack: Since ICU code can handle locale IDs with multiple encodings
     * of '@' (at least for EBCDIC; it's not known to be a problem for
     * ASCII-based systems),
     * we use regular invariant-character conversion for everything else
     * and manually convert U+0040 into a compiler-char-constant '@'.
     * While this compilation-time constant may not match the runtime
     * encoding of '@', it should be one of the encodings which ICU
     * recognizes.
     *
     * There should be only at most one '@' in a locale ID.
     */
    CharString converted;
    UErrorCode status = U_ZERO_ERROR;
    int32_t segmentStart = 0;
    while (U_SUCCESS(status)) {
        const int32_t at = id.indexOf((char16_t)0x40, segmentStart);
        if (at < 0) {
            // no @ between segmentStart and the end of the string: convert the rest, done
            converted.appendInvariantChars(id.tempSubString(segmentStart), status);
            break;
        }
        // normal invariant-character conversion for the text before this @
        converted.appendInvariantChars(id.tempSubString(segmentStart, at - segmentStart), status);
        // manually "convert" U+0040 at id[at] into '@'
        converted.append('@', status);
        segmentStart = at + 1;
    }

    if (U_FAILURE(status)) {
        result.setToBogus();
    } else {
        result = Locale::createFromName(converted.data());
    }
    return result;
}
|
||||
|
||||
/**
 * Appends the locale's name to result, or makes result bogus when the locale
 * is bogus. Note that the name is appended: existing contents of result are
 * kept, not replaced.
 *
 * @return result
 */
UnicodeString&
LocaleUtility::initNameFromLocale(const Locale& locale, UnicodeString& result)
{
    if (locale.isBogus()) {
        result.setToBogus();
        return result;
    }
    const UnicodeString name(locale.getName(), -1, US_INV);
    result.append(name);
    return result;
}
|
||||
|
||||
/**
 * Returns the set of available locale names for a resource bundle path,
 * building and caching it on first use.
 *
 * @param bundleID path passed to ures_openAvailableLocales; an empty string
 *                 is passed on as nullptr
 * @return a table owned by the cache whose keys are the locale names, or
 *         nullptr if the cache or the table could not be created
 */
const Hashtable*
LocaleUtility::getAvailableLocaleNames(const UnicodeString& bundleID)
{
    // LocaleUtility_cache is a hash-of-hashes.  The top-level keys
    // are path strings ('bundleID') passed to
    // ures_openAvailableLocales.  The top-level values are
    // second-level hashes.  The second-level keys are result strings
    // from ures_openAvailableLocales.  The second-level values are
    // garbage ((void*)1 or other random pointer).

    UErrorCode status = U_ZERO_ERROR;
    umtx_initOnce(LocaleUtilityInitOnce, locale_utility_init, status);
    Hashtable *cache = LocaleUtility_cache;
    if (cache == nullptr) {
        // Catastrophic failure.
        return nullptr;
    }

    Hashtable* htp;
    umtx_lock(nullptr);
    htp = (Hashtable*) cache->get(bundleID);
    umtx_unlock(nullptr);
    if (htp != nullptr) {
        return htp;
    }

    htp = new Hashtable(status);
    if (htp == nullptr) {
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // Construction failed: never hand out (or leak) a half-built table.
        delete htp;
        return nullptr;
    }

    CharString cbundleID;
    cbundleID.appendInvariantChars(bundleID, status);
    const char* path = cbundleID.isEmpty() ? nullptr : cbundleID.data();
    icu::LocalUEnumerationPointer uenum(ures_openAvailableLocales(path, &status));
    for (;;) {
        const char16_t* id = uenum_unext(uenum.getAlias(), nullptr, &status);
        if (id == nullptr) {
            break;
        }
        // Only the keys matter; the value just has to be non-null.
        htp->put(UnicodeString(id), (void*)htp, status);
    }
    if (U_FAILURE(status)) {
        delete htp;
        return nullptr;
    }

    umtx_lock(nullptr);
    Hashtable *t = static_cast<Hashtable *>(cache->get(bundleID));
    if (t != nullptr) {
        // Another thread raced through this code, creating the cache entry first.
        // Discard ours and return theirs.
        umtx_unlock(nullptr);
        delete htp;
        htp = t;
    } else {
        cache->put(bundleID, (void*)htp, status);
        umtx_unlock(nullptr);
        if (U_FAILURE(status)) {
            // The cache adopts its values (it has a value deleter installed),
            // so after a failed put this pointer must not be used or deleted
            // here. NOTE(review): uhash is understood to release an adopted
            // value on a failed put -- confirm against uhash_put; returning
            // nullptr is safe either way.
            htp = nullptr;
        }
    }
    return htp;
}
|
||||
|
||||
bool
|
||||
LocaleUtility::isFallbackOf(const UnicodeString& root, const UnicodeString& child)
|
||||
{
|
||||
return child.indexOf(root) == 0 &&
|
||||
(child.length() == root.length() ||
|
||||
child.charAt(root.length()) == UNDERSCORE_CHAR);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* !UCONFIG_NO_SERVICE */
|
||||
#endif
|
||||
39
engine/thirdparty/icu4c/common/locutil.h
vendored
Normal file
39
engine/thirdparty/icu4c/common/locutil.h
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2002-2005, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
#ifndef LOCUTIL_H
|
||||
#define LOCUTIL_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "hash.h"
|
||||
|
||||
#if !UCONFIG_NO_SERVICE || !UCONFIG_NO_TRANSLITERATION
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// temporary utility functions, till I know where to find them
|
||||
// in header so tests can also access them
|
||||
|
||||
class U_COMMON_API LocaleUtility {
|
||||
public:
|
||||
static UnicodeString& canonicalLocaleString(const UnicodeString* id, UnicodeString& result);
|
||||
static Locale& initLocaleFromName(const UnicodeString& id, Locale& result);
|
||||
static UnicodeString& initNameFromLocale(const Locale& locale, UnicodeString& result);
|
||||
static const Hashtable* getAvailableLocaleNames(const UnicodeString& bundleID);
|
||||
static bool isFallbackOf(const UnicodeString& root, const UnicodeString& child);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
134
engine/thirdparty/icu4c/common/lsr.cpp
vendored
Normal file
134
engine/thirdparty/icu4c/common/lsr.cpp
vendored
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// lsr.cpp
|
||||
// created: 2019may08 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "lsr.h"
|
||||
#include "uinvchar.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
 * Builds an LSR whose language and script are owned copies with `prefix`
 * prepended to each, while the region pointer is aliased (not copied).
 * On failure language and script stay nullptr.
 */
LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
         UErrorCode &errorCode) :
        language(nullptr), script(nullptr), region(r),
        regionIndex(indexForRegion(region)), flags(f) {
    if (U_FAILURE(errorCode)) { return; }

    // Buffer layout: prefix + lang + NUL + prefix + scr (+ terminating NUL).
    CharString buffer;
    buffer.append(prefix, errorCode);
    buffer.append(lang, errorCode);
    buffer.append('\0', errorCode);
    const int32_t scriptStart = buffer.length();
    buffer.append(prefix, errorCode);
    buffer.append(scr, errorCode);

    owned = buffer.cloneData(errorCode);
    if (U_FAILURE(errorCode)) { return; }

    language = owned;
    script = owned + scriptStart;
}
|
||||
|
||||
/*
 * Builds an LSR that owns copies of all three subtags.
 *
 * The region index is computed from the owned, NUL-terminated copy of the
 * region rather than from r.data(): a StringPiece is a pointer plus a length,
 * so its data may be null or lack a terminating NUL, and indexForRegion()
 * reads its argument as a C string.
 * On failure the subtag pointers stay nullptr and regionIndex stays 0.
 */
LSR::LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
         UErrorCode &errorCode) :
        language(nullptr), script(nullptr), region(nullptr),
        regionIndex(0), flags(f) {
    if (U_SUCCESS(errorCode)) {
        // Buffer layout: lang + NUL + scr + NUL + r (+ terminating NUL).
        CharString data;
        data.append(lang, errorCode).append('\0', errorCode);
        int32_t scriptOffset = data.length();
        data.append(scr, errorCode).append('\0', errorCode);
        int32_t regionOffset = data.length();
        data.append(r, errorCode);
        owned = data.cloneData(errorCode);
        if (U_SUCCESS(errorCode)) {
            language = owned;
            script = owned + scriptOffset;
            region = owned + regionOffset;
            regionIndex = indexForRegion(region);
        }
    }
}
|
||||
|
||||
/*
 * Move constructor: copies every field. If the source owned a buffer, that
 * ownership moves here and the source's language/script are reset to empty
 * strings and its hash code to 0.
 */
LSR::LSR(LSR &&other) noexcept :
        language(other.language), script(other.script), region(other.region), owned(other.owned),
        regionIndex(other.regionIndex), flags(other.flags),
        hashCode(other.hashCode) {
    if (owned == nullptr) {
        // Everything was aliased; nothing to take over.
        return;
    }
    other.owned = nullptr;
    other.language = "";
    other.script = "";
    other.hashCode = 0;
}
|
||||
|
||||
void LSR::deleteOwned() {
|
||||
uprv_free(owned);
|
||||
}
|
||||
|
||||
/*
 * Move assignment: releases this object's owned buffer (if any), then takes
 * over all fields of other. If other owned a buffer, ownership moves here and
 * other's language/script are reset to empty strings and its hash code to 0.
 *
 * Self-assignment is a no-op. Without the guard the owned buffer would be
 * freed and then copied back from the same object. The buffer is released
 * directly rather than through an explicit destructor call so that members
 * are never assigned after the object's lifetime has formally ended.
 */
LSR &LSR::operator=(LSR &&other) noexcept {
    if (this == &other) {
        return *this;
    }
    if (owned != nullptr) {
        deleteOwned();
    }
    language = other.language;
    script = other.script;
    region = other.region;
    regionIndex = other.regionIndex;
    flags = other.flags;
    owned = other.owned;
    hashCode = other.hashCode;
    if (owned != nullptr) {
        other.language = other.script = "";
        other.owned = nullptr;
        other.hashCode = 0;
    }
    return *this;
}
|
||||
|
||||
/*
 * True when language, script and region match; flags are ignored.
 * Regions are compared by index, and by string only when both are ill-formed.
 */
UBool LSR::isEquivalentTo(const LSR &other) const {
    if (uprv_strcmp(language, other.language) != 0) { return false; }
    if (uprv_strcmp(script, other.script) != 0) { return false; }
    if (regionIndex != other.regionIndex) { return false; }
    if (regionIndex > 0) { return true; }
    // Compare regions if both are ill-formed (and their indexes are 0).
    return uprv_strcmp(region, other.region) == 0;
}
|
||||
|
||||
/* Equality: equivalent language/script/region (see isEquivalentTo) and equal flags. */
bool LSR::operator==(const LSR &other) const {
    if (!isEquivalentTo(other)) {
        return false;
    }
    return flags == other.flags;
}
|
||||
|
||||
/*
 * Maps a region code to a positive index, or 0 if it is ill-formed.
 * Exactly three digits ("419") map to 1..1000; exactly two letters ("DE")
 * map to 1001 and up. `region` is read as a NUL-terminated string.
 */
int32_t LSR::indexForRegion(const char *region) {
    const int32_t first = region[0];
    const int32_t digit0 = first - '0';

    if (0 <= digit0 && digit0 <= 9) {  // digits: "419"
        const int32_t digit1 = region[1] - '0';
        if (digit1 < 0 || 9 < digit1) { return 0; }
        const int32_t digit2 = region[2] - '0';
        if (digit2 < 0 || 9 < digit2 || region[3] != 0) { return 0; }
        return (10 * digit0 + digit1) * 10 + digit2 + 1;
    }

    // letters: "DE"
    const int32_t letter0 = uprv_upperOrdinal(first);
    if (letter0 < 0 || 25 < letter0) { return 0; }
    const int32_t letter1 = uprv_upperOrdinal(region[1]);
    if (letter1 < 0 || 25 < letter1 || region[2] != 0) { return 0; }
    return 26 * letter0 + letter1 + 1001;
}
|
||||
|
||||
/*
 * Computes and stores the hash code from language, script, region index and
 * flags, unless a non-zero hash code is already stored. Returns *this.
 */
LSR &LSR::setHashCode() {
    if (hashCode != 0) {
        return *this;
    }
    const int32_t languageLength = static_cast<int32_t>(uprv_strlen(language));
    const int32_t scriptLength = static_cast<int32_t>(uprv_strlen(script));

    // Fold the components together with multiplier 37 in unsigned arithmetic.
    uint32_t h = ustr_hashCharsN(language, languageLength);
    h = h * 37 + ustr_hashCharsN(script, scriptLength);
    h = h * 37 + regionIndex;
    h = h * 37 + flags;
    hashCode = h;
    return *this;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
85
engine/thirdparty/icu4c/common/lsr.h
vendored
Normal file
85
engine/thirdparty/icu4c/common/lsr.h
vendored
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// lsr.h
|
||||
// created: 2019may08 Markus W. Scherer
|
||||
|
||||
#ifndef __LSR_H__
|
||||
#define __LSR_H__
|
||||
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "cstring.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct LSR final : public UMemory {
|
||||
static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26;
|
||||
|
||||
static constexpr int32_t EXPLICIT_LSR = 7;
|
||||
static constexpr int32_t EXPLICIT_LANGUAGE = 4;
|
||||
static constexpr int32_t EXPLICIT_SCRIPT = 2;
|
||||
static constexpr int32_t EXPLICIT_REGION = 1;
|
||||
static constexpr int32_t IMPLICIT_LSR = 0;
|
||||
static constexpr int32_t DONT_CARE_FLAGS = 0;
|
||||
|
||||
const char *language;
|
||||
const char *script;
|
||||
const char *region;
|
||||
char *owned = nullptr;
|
||||
/** Index for region, 0 if ill-formed. @see indexForRegion */
|
||||
int32_t regionIndex = 0;
|
||||
int32_t flags = 0;
|
||||
/** Only set for LSRs that will be used in a hash table. */
|
||||
int32_t hashCode = 0;
|
||||
|
||||
LSR() : language("und"), script(""), region("") {}
|
||||
|
||||
/** Constructor which aliases all subtag pointers. */
|
||||
LSR(const char *lang, const char *scr, const char *r, int32_t f) :
|
||||
language(lang), script(scr), region(r),
|
||||
regionIndex(indexForRegion(region)), flags(f) {}
|
||||
/**
|
||||
* Constructor which prepends the prefix to the language and script,
|
||||
* copies those into owned memory, and aliases the region.
|
||||
*/
|
||||
LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
|
||||
UErrorCode &errorCode);
|
||||
LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
|
||||
UErrorCode &errorCode);
|
||||
LSR(LSR &&other) noexcept;
|
||||
LSR(const LSR &other) = delete;
|
||||
inline ~LSR() {
|
||||
// Pure inline code for almost all instances.
|
||||
if (owned != nullptr) {
|
||||
deleteOwned();
|
||||
}
|
||||
}
|
||||
|
||||
LSR &operator=(LSR &&other) noexcept;
|
||||
LSR &operator=(const LSR &other) = delete;
|
||||
|
||||
/**
|
||||
* Returns a positive index (>0) for a well-formed region code.
|
||||
* Do not rely on a particular region->index mapping; it may change.
|
||||
* Returns 0 for ill-formed strings.
|
||||
*/
|
||||
static int32_t indexForRegion(const char *region);
|
||||
|
||||
UBool isEquivalentTo(const LSR &other) const;
|
||||
bool operator==(const LSR &other) const;
|
||||
|
||||
inline bool operator!=(const LSR &other) const {
|
||||
return !operator==(other);
|
||||
}
|
||||
|
||||
LSR &setHashCode();
|
||||
|
||||
private:
|
||||
void deleteOwned();
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __LSR_H__
|
||||
856
engine/thirdparty/icu4c/common/lstmbe.cpp
vendored
Normal file
856
engine/thirdparty/icu4c/common/lstmbe.cpp
vendored
Normal file
|
|
@ -0,0 +1,856 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include <complex>
|
||||
#include <utility>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "lstmbe.h"
|
||||
#include "putilimp.h"
|
||||
#include "uassert.h"
|
||||
#include "ubrkimpl.h"
|
||||
#include "uresimp.h"
|
||||
#include "uvectr32.h"
|
||||
#include "uvector.h"
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Uncomment the following #define to debug.
|
||||
// #define LSTM_DEBUG 1
|
||||
// #define LSTM_VECTORIZER_DEBUG 1
|
||||
|
||||
/**
 * Read-only interface to a one-dimensional array of floats.
 */
class ReadArray1D {
public:
    virtual ~ReadArray1D();
    /** Number of elements. */
    virtual int32_t d1() const = 0;
    /** Element at index i. */
    virtual float get(int32_t i) const = 0;

#ifdef LSTM_DEBUG
    /** Debug aid: dumps all elements to stdout, four per line. */
    void print() const {
        printf("\n[");
        for (int32_t idx = 0; idx < d1(); ++idx) {
            printf("%0.8e ", get(idx));
            if (idx % 4 == 3) {
                printf("\n");
            }
        }
        printf("]\n");
    }
#endif
};

// Defined out of line.
ReadArray1D::~ReadArray1D() = default;
|
||||
|
||||
/**
 * Read-only interface to a two-dimensional array of floats.
 */
class ReadArray2D {
public:
    virtual ~ReadArray2D();
    /** Size of the first dimension. */
    virtual int32_t d1() const = 0;
    /** Size of the second dimension. */
    virtual int32_t d2() const = 0;
    /** Element at position (i, j). */
    virtual float get(int32_t i, int32_t j) const = 0;
};

// Defined out of line.
ReadArray2D::~ReadArray2D() = default;
|
||||
|
||||
/**
|
||||
* A class to index a float array as a 1D Array without owning the pointer or
|
||||
* copy the data.
|
||||
*/
|
||||
class ConstArray1D : public ReadArray1D {
|
||||
public:
|
||||
ConstArray1D() : data_(nullptr), d1_(0) {}
|
||||
|
||||
ConstArray1D(const float* data, int32_t d1) : data_(data), d1_(d1) {}
|
||||
|
||||
virtual ~ConstArray1D();
|
||||
|
||||
// Init the object, the object does not own the data nor copy.
|
||||
// It is designed to directly use data from memory mapped resources.
|
||||
void init(const int32_t* data, int32_t d1) {
|
||||
U_ASSERT(IEEE_754 == 1);
|
||||
data_ = reinterpret_cast<const float*>(data);
|
||||
d1_ = d1;
|
||||
}
|
||||
|
||||
// ReadArray1D methods.
|
||||
virtual int32_t d1() const override { return d1_; }
|
||||
virtual float get(int32_t i) const override {
|
||||
U_ASSERT(i < d1_);
|
||||
return data_[i];
|
||||
}
|
||||
|
||||
private:
|
||||
const float* data_;
|
||||
int32_t d1_;
|
||||
};
|
||||
|
||||
ConstArray1D::~ConstArray1D()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* A class to index a float array as a 2D Array without owning the pointer or
|
||||
* copy the data.
|
||||
*/
|
||||
class ConstArray2D : public ReadArray2D {
|
||||
public:
|
||||
ConstArray2D() : data_(nullptr), d1_(0), d2_(0) {}
|
||||
|
||||
ConstArray2D(const float* data, int32_t d1, int32_t d2)
|
||||
: data_(data), d1_(d1), d2_(d2) {}
|
||||
|
||||
virtual ~ConstArray2D();
|
||||
|
||||
// Init the object, the object does not own the data nor copy.
|
||||
// It is designed to directly use data from memory mapped resources.
|
||||
void init(const int32_t* data, int32_t d1, int32_t d2) {
|
||||
U_ASSERT(IEEE_754 == 1);
|
||||
data_ = reinterpret_cast<const float*>(data);
|
||||
d1_ = d1;
|
||||
d2_ = d2;
|
||||
}
|
||||
|
||||
// ReadArray2D methods.
|
||||
inline int32_t d1() const override { return d1_; }
|
||||
inline int32_t d2() const override { return d2_; }
|
||||
float get(int32_t i, int32_t j) const override {
|
||||
U_ASSERT(i < d1_);
|
||||
U_ASSERT(j < d2_);
|
||||
return data_[i * d2_ + j];
|
||||
}
|
||||
|
||||
// Expose the ith row as a ConstArray1D
|
||||
inline ConstArray1D row(int32_t i) const {
|
||||
U_ASSERT(i < d1_);
|
||||
return ConstArray1D(data_ + i * d2_, d2_);
|
||||
}
|
||||
|
||||
private:
|
||||
const float* data_;
|
||||
int32_t d1_;
|
||||
int32_t d2_;
|
||||
};
|
||||
|
||||
ConstArray2D::~ConstArray2D()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* A class to allocate data as a writable 1D array.
|
||||
* This is the main class implement matrix operation.
|
||||
*/
|
||||
class Array1D : public ReadArray1D {
|
||||
public:
|
||||
Array1D() : memory_(nullptr), data_(nullptr), d1_(0) {}
|
||||
Array1D(int32_t d1, UErrorCode &status)
|
||||
: memory_(uprv_malloc(d1 * sizeof(float))),
|
||||
data_((float*)memory_), d1_(d1) {
|
||||
if (U_SUCCESS(status)) {
|
||||
if (memory_ == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
clear();
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~Array1D();
|
||||
|
||||
// A special constructor which does not own the memory but writeable
|
||||
// as a slice of an array.
|
||||
Array1D(float* data, int32_t d1)
|
||||
: memory_(nullptr), data_(data), d1_(d1) {}
|
||||
|
||||
// ReadArray1D methods.
|
||||
virtual int32_t d1() const override { return d1_; }
|
||||
virtual float get(int32_t i) const override {
|
||||
U_ASSERT(i < d1_);
|
||||
return data_[i];
|
||||
}
|
||||
|
||||
// Return the index which point to the max data in the array.
|
||||
inline int32_t maxIndex() const {
|
||||
int32_t index = 0;
|
||||
float max = data_[0];
|
||||
for (int32_t i = 1; i < d1_; i++) {
|
||||
if (data_[i] > max) {
|
||||
max = data_[i];
|
||||
index = i;
|
||||
}
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
// Slice part of the array to a new one.
|
||||
inline Array1D slice(int32_t from, int32_t size) const {
|
||||
U_ASSERT(from >= 0);
|
||||
U_ASSERT(from < d1_);
|
||||
U_ASSERT(from + size <= d1_);
|
||||
return Array1D(data_ + from, size);
|
||||
}
|
||||
|
||||
// Add dot product of a 1D array and a 2D array into this one.
|
||||
inline Array1D& addDotProduct(const ReadArray1D& a, const ReadArray2D& b) {
|
||||
U_ASSERT(a.d1() == b.d1());
|
||||
U_ASSERT(b.d2() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
for (int32_t j = 0; j < a.d1(); j++) {
|
||||
data_[i] += a.get(j) * b.get(j, i);
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Hadamard Product the values of another array of the same size into this one.
|
||||
inline Array1D& hadamardProduct(const ReadArray1D& a) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] *= a.get(i);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Add the Hadamard Product of two arrays of the same size into this one.
|
||||
inline Array1D& addHadamardProduct(const ReadArray1D& a, const ReadArray1D& b) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
U_ASSERT(b.d1() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] += a.get(i) * b.get(i);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Add the values of another array of the same size into this one.
|
||||
inline Array1D& add(const ReadArray1D& a) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] += a.get(i);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Assign the values of another array of the same size into this one.
|
||||
inline Array1D& assign(const ReadArray1D& a) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] = a.get(i);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Apply tanh to all the elements in the array.
|
||||
inline Array1D& tanh() {
|
||||
return tanh(*this);
|
||||
}
|
||||
|
||||
// Apply tanh of a and store into this array.
|
||||
inline Array1D& tanh(const Array1D& a) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
for (int32_t i = 0; i < d1_; i++) {
|
||||
data_[i] = std::tanh(a.get(i));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Apply sigmoid to all the elements in the array.
|
||||
inline Array1D& sigmoid() {
|
||||
for (int32_t i = 0; i < d1_; i++) {
|
||||
data_[i] = 1.0f/(1.0f + expf(-data_[i]));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Array1D& clear() {
|
||||
uprv_memset(data_, 0, d1_ * sizeof(float));
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
void* memory_;
|
||||
float* data_;
|
||||
int32_t d1_;
|
||||
};
|
||||
|
||||
Array1D::~Array1D()
|
||||
{
|
||||
uprv_free(memory_);
|
||||
}
|
||||
|
||||
class Array2D : public ReadArray2D {
|
||||
public:
|
||||
Array2D() : memory_(nullptr), data_(nullptr), d1_(0), d2_(0) {}
|
||||
Array2D(int32_t d1, int32_t d2, UErrorCode &status)
|
||||
: memory_(uprv_malloc(d1 * d2 * sizeof(float))),
|
||||
data_((float*)memory_), d1_(d1), d2_(d2) {
|
||||
if (U_SUCCESS(status)) {
|
||||
if (memory_ == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
clear();
|
||||
}
|
||||
}
|
||||
virtual ~Array2D();
|
||||
|
||||
// ReadArray2D methods.
|
||||
virtual int32_t d1() const override { return d1_; }
|
||||
virtual int32_t d2() const override { return d2_; }
|
||||
virtual float get(int32_t i, int32_t j) const override {
|
||||
U_ASSERT(i < d1_);
|
||||
U_ASSERT(j < d2_);
|
||||
return data_[i * d2_ + j];
|
||||
}
|
||||
|
||||
inline Array1D row(int32_t i) const {
|
||||
U_ASSERT(i < d1_);
|
||||
return Array1D(data_ + i * d2_, d2_);
|
||||
}
|
||||
|
||||
inline Array2D& clear() {
|
||||
uprv_memset(data_, 0, d1_ * d2_ * sizeof(float));
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
void* memory_;
|
||||
float* data_;
|
||||
int32_t d1_;
|
||||
int32_t d2_;
|
||||
};
|
||||
|
||||
Array2D::~Array2D()
|
||||
{
|
||||
uprv_free(memory_);
|
||||
}
|
||||
|
||||
// Output classes of the model, in the order of the four output scores
// (the argmax index is cast directly to this type).
enum LSTMClass {
    BEGIN = 0,
    INSIDE = 1,
    END = 2,
    SINGLE = 3
};
|
||||
|
||||
// Unit of text that is mapped to one embedding row.
enum EmbeddingType {
    UNKNOWN = 0,
    CODE_POINTS = 1,
    GRAPHEME_CLUSTER = 2
};
|
||||
|
||||
struct LSTMData : public UMemory {
|
||||
LSTMData(UResourceBundle* rb, UErrorCode &status);
|
||||
~LSTMData();
|
||||
UHashtable* fDict;
|
||||
EmbeddingType fType;
|
||||
const char16_t* fName;
|
||||
ConstArray2D fEmbedding;
|
||||
ConstArray2D fForwardW;
|
||||
ConstArray2D fForwardU;
|
||||
ConstArray1D fForwardB;
|
||||
ConstArray2D fBackwardW;
|
||||
ConstArray2D fBackwardU;
|
||||
ConstArray1D fBackwardB;
|
||||
ConstArray2D fOutputW;
|
||||
ConstArray1D fOutputB;
|
||||
|
||||
private:
|
||||
UResourceBundle* fBundle;
|
||||
};
|
||||
|
||||
/**
 * Loads the model description from `rb` and takes ownership of `rb`
 * (it is closed in the destructor even when loading fails).
 *
 * Reads "embeddings", "hunits", "type", "model", "data" and "dict" from the
 * bundle, fills the dictionary hash table, and points the matrix views at
 * consecutive sections of the "data" int vector.
 *
 * @param rb     Resource bundle holding the model.
 * @param status Set to U_UNSUPPORTED_ERROR when floats are not IEEE 754,
 *               to U_INVALID_FORMAT_ERROR when the data vector is too short
 *               for the declared dimensions (or a dimension is negative),
 *               or to whatever error a resource lookup reports. On failure
 *               the matrix views may be left empty.
 */
LSTMData::LSTMData(UResourceBundle* rb, UErrorCode &status)
    : fDict(nullptr), fType(UNKNOWN), fName(nullptr),
      fBundle(rb)
{
    if (U_FAILURE(status)) {
        return;
    }
    if (IEEE_754 != 1) {
        // The weights are read by reinterpreting int32_t words as floats.
        status = U_UNSUPPORTED_ERROR;
        return;
    }
    LocalUResourceBundlePointer embeddings_res(
        ures_getByKey(rb, "embeddings", nullptr, &status));
    int32_t embedding_size = ures_getInt(embeddings_res.getAlias(), &status);
    LocalUResourceBundlePointer hunits_res(
        ures_getByKey(rb, "hunits", nullptr, &status));
    if (U_FAILURE(status)) return;
    int32_t hunits = ures_getInt(hunits_res.getAlias(), &status);
    const char16_t* type = ures_getStringByKey(rb, "type", nullptr, &status);
    if (U_FAILURE(status)) return;
    // Any other type string leaves fType as UNKNOWN.
    if (u_strCompare(type, -1, u"codepoints", -1, false) == 0) {
        fType = CODE_POINTS;
    } else if (u_strCompare(type, -1, u"graphclust", -1, false) == 0) {
        fType = GRAPHEME_CLUSTER;
    }
    fName = ures_getStringByKey(rb, "model", nullptr, &status);
    LocalUResourceBundlePointer dataRes(ures_getByKey(rb, "data", nullptr, &status));
    if (U_FAILURE(status)) return;
    int32_t data_len = 0;
    const int32_t* data = ures_getIntVector(dataRes.getAlias(), &data_len, &status);
    fDict = uhash_open(uhash_hashUChars, uhash_compareUChars, nullptr, &status);

    StackUResourceBundle stackTempBundle;
    ResourceDataValue value;
    ures_getValueWithFallback(rb, "dict", stackTempBundle.getAlias(), value, status);
    ResourceArray stringArray = value.getArray(status);
    int32_t num_index = stringArray.getSize();
    if (U_FAILURE(status)) { return; }

    // put dict into hash: the i-th string maps to index i
    int32_t stringLength;
    for (int32_t idx = 0; idx < num_index; idx++) {
        stringArray.getValue(idx, value);
        const char16_t* str = value.getString(stringLength, status);
        uhash_putiAllowZero(fDict, (void*)str, idx, &status);
        if (U_FAILURE(status)) return;
#ifdef LSTM_VECTORIZER_DEBUG
        printf("Assign [");
        while (*str != 0x0000) {
            printf("U+%04x ", *str);
            str++;
        }
        // Report the index that was actually stored above.
        printf("] map to %d\n", idx);
#endif
    }

    if (embedding_size < 0 || hunits < 0) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }

    // Section sizes, in the order the sections appear in the data vector.
    // The embedding has one extra row, used for text not in the dictionary.
    int32_t mat1_size = (num_index + 1) * embedding_size;
    int32_t mat2_size = embedding_size * 4 * hunits;
    int32_t mat3_size = hunits * 4 * hunits;
    int32_t mat4_size = 4 * hunits;
    int32_t mat5_size = mat2_size;
    int32_t mat6_size = mat3_size;
    int32_t mat7_size = mat4_size;
    int32_t mat8_size = 2 * hunits * 4;
    int32_t mat9_size = 4;
    const int32_t expected_len = mat1_size + mat2_size + mat3_size + mat4_size + mat5_size +
        mat6_size + mat7_size + mat8_size + mat9_size;
    U_ASSERT(data_len == expected_len);
    // U_ASSERT is compiled out of release builds; without this check a
    // truncated data vector would make the views below extend past its end.
    if (data_len < expected_len) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }

    fEmbedding.init(data, (num_index + 1), embedding_size);
    data += mat1_size;
    fForwardW.init(data, embedding_size, 4 * hunits);
    data += mat2_size;
    fForwardU.init(data, hunits, 4 * hunits);
    data += mat3_size;
    fForwardB.init(data, 4 * hunits);
    data += mat4_size;
    fBackwardW.init(data, embedding_size, 4 * hunits);
    data += mat5_size;
    fBackwardU.init(data, hunits, 4 * hunits);
    data += mat6_size;
    fBackwardB.init(data, 4 * hunits);
    data += mat7_size;
    fOutputW.init(data, 2 * hunits, 4);
    data += mat8_size;
    fOutputB.init(data, 4);
}
|
||||
|
||||
// Releases the dictionary hash table and the resource bundle handed to the
// constructor.
LSTMData::~LSTMData()
{
    uhash_close(fDict);
    ures_close(fBundle);
}
|
||||
|
||||
class Vectorizer : public UMemory {
|
||||
public:
|
||||
Vectorizer(UHashtable* dict) : fDict(dict) {}
|
||||
virtual ~Vectorizer();
|
||||
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
|
||||
UVector32 &offsets, UVector32 &indices,
|
||||
UErrorCode &status) const = 0;
|
||||
protected:
|
||||
int32_t stringToIndex(const char16_t* str) const {
|
||||
UBool found = false;
|
||||
int32_t ret = uhash_getiAndFound(fDict, (const void*)str, &found);
|
||||
if (!found) {
|
||||
ret = fDict->count;
|
||||
}
|
||||
#ifdef LSTM_VECTORIZER_DEBUG
|
||||
printf("[");
|
||||
while (*str != 0x0000) {
|
||||
printf("U+%04x ", *str);
|
||||
str++;
|
||||
}
|
||||
printf("] map to %d\n", ret);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
private:
|
||||
UHashtable* fDict;
|
||||
};
|
||||
|
||||
Vectorizer::~Vectorizer()
|
||||
{
|
||||
}
|
||||
|
||||
class CodePointsVectorizer : public Vectorizer {
|
||||
public:
|
||||
CodePointsVectorizer(UHashtable* dict) : Vectorizer(dict) {}
|
||||
virtual ~CodePointsVectorizer();
|
||||
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
|
||||
UVector32 &offsets, UVector32 &indices,
|
||||
UErrorCode &status) const override;
|
||||
};
|
||||
|
||||
CodePointsVectorizer::~CodePointsVectorizer()
|
||||
{
|
||||
}
|
||||
|
||||
/**
 * Walks the text from startPos up to endPos one code point at a time,
 * appending each code point's native offset to `offsets` and its dictionary
 * index to `indices`.
 * Returns without adding anything if either vector cannot reserve
 * endPos - startPos slots.
 */
void CodePointsVectorizer::vectorize(
    UText *text, int32_t startPos, int32_t endPos,
    UVector32 &offsets, UVector32 &indices, UErrorCode &status) const
{
    const int32_t span = endPos - startPos;
    if (!offsets.ensureCapacity(span, status) ||
            !indices.ensureCapacity(span, status)) {
        return;
    }
    if (U_FAILURE(status)) return;

    utext_setNativeIndex(text, startPos);
    // One code unit followed by a NUL terminator for the dictionary lookup.
    char16_t unit[2] = {0, 0};
    while (U_SUCCESS(status)) {
        const int32_t position = static_cast<int32_t>(utext_getNativeIndex(text));
        if (position >= endPos) {
            break;
        }
        // Since the LSTMBreakEngine currently only accepts chars in the BMP,
        // we can ignore the possibility of hitting a supplementary code
        // point.
        unit[0] = static_cast<char16_t>(utext_next32(text));
        U_ASSERT(!U_IS_SURROGATE(unit[0]));
        offsets.addElement(position, status);
        indices.addElement(stringToIndex(unit), status);
    }
}
|
||||
|
||||
class GraphemeClusterVectorizer : public Vectorizer {
|
||||
public:
|
||||
GraphemeClusterVectorizer(UHashtable* dict)
|
||||
: Vectorizer(dict)
|
||||
{
|
||||
}
|
||||
virtual ~GraphemeClusterVectorizer();
|
||||
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
|
||||
UVector32 &offsets, UVector32 &indices,
|
||||
UErrorCode &status) const override;
|
||||
};
|
||||
|
||||
GraphemeClusterVectorizer::~GraphemeClusterVectorizer()
|
||||
{
|
||||
}
|
||||
|
||||
constexpr int32_t MAX_GRAPHEME_CLSTER_LENGTH = 10;
|
||||
|
||||
void GraphemeClusterVectorizer::vectorize(
|
||||
UText *text, int32_t startPos, int32_t endPos,
|
||||
UVector32 &offsets, UVector32 &indices, UErrorCode &status) const
|
||||
{
|
||||
if (U_FAILURE(status)) return;
|
||||
if (!offsets.ensureCapacity(endPos - startPos, status) ||
|
||||
!indices.ensureCapacity(endPos - startPos, status)) {
|
||||
return;
|
||||
}
|
||||
if (U_FAILURE(status)) return;
|
||||
LocalPointer<BreakIterator> graphemeIter(BreakIterator::createCharacterInstance(Locale(), status));
|
||||
if (U_FAILURE(status)) return;
|
||||
graphemeIter->setText(text, status);
|
||||
if (U_FAILURE(status)) return;
|
||||
|
||||
if (startPos != 0) {
|
||||
graphemeIter->preceding(startPos);
|
||||
}
|
||||
int32_t last = startPos;
|
||||
int32_t current = startPos;
|
||||
char16_t str[MAX_GRAPHEME_CLSTER_LENGTH];
|
||||
while ((current = graphemeIter->next()) != BreakIterator::DONE) {
|
||||
if (current >= endPos) {
|
||||
break;
|
||||
}
|
||||
if (current > startPos) {
|
||||
utext_extract(text, last, current, str, MAX_GRAPHEME_CLSTER_LENGTH, &status);
|
||||
if (U_FAILURE(status)) return;
|
||||
offsets.addElement(last, status);
|
||||
indices.addElement(stringToIndex(str), status);
|
||||
if (U_FAILURE(status)) return;
|
||||
}
|
||||
last = current;
|
||||
}
|
||||
if (U_FAILURE(status) || last >= endPos) {
|
||||
return;
|
||||
}
|
||||
utext_extract(text, last, endPos, str, MAX_GRAPHEME_CLSTER_LENGTH, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
offsets.addElement(last, status);
|
||||
indices.addElement(stringToIndex(str), status);
|
||||
}
|
||||
}
|
||||
|
||||
// Computing LSTM as stated in
|
||||
// https://en.wikipedia.org/wiki/Long_short-term_memory#LSTM_with_a_forget_gate
|
||||
// ifco is temp array allocate outside which does not need to be
|
||||
// input/output value but could avoid unnecessary memory alloc/free if passing
|
||||
// in.
|
||||
void compute(
|
||||
int32_t hunits,
|
||||
const ReadArray2D& W, const ReadArray2D& U, const ReadArray1D& b,
|
||||
const ReadArray1D& x, Array1D& h, Array1D& c,
|
||||
Array1D& ifco)
|
||||
{
|
||||
// ifco = x * W + h * U + b
|
||||
ifco.assign(b)
|
||||
.addDotProduct(x, W)
|
||||
.addDotProduct(h, U);
|
||||
|
||||
ifco.slice(0*hunits, hunits).sigmoid(); // i: sigmod
|
||||
ifco.slice(1*hunits, hunits).sigmoid(); // f: sigmoid
|
||||
ifco.slice(2*hunits, hunits).tanh(); // c_: tanh
|
||||
ifco.slice(3*hunits, hunits).sigmoid(); // o: sigmod
|
||||
|
||||
c.hadamardProduct(ifco.slice(hunits, hunits))
|
||||
.addHadamardProduct(ifco.slice(0, hunits), ifco.slice(2*hunits, hunits));
|
||||
|
||||
h.tanh(c)
|
||||
.hadamardProduct(ifco.slice(3*hunits, hunits));
|
||||
}
|
||||
|
||||
// Minimum word size
|
||||
static const int32_t MIN_WORD = 2;
|
||||
|
||||
// Minimum number of characters for two words
|
||||
static const int32_t MIN_WORD_SPAN = MIN_WORD * 2;
|
||||
|
||||
int32_t
|
||||
LSTMBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
int32_t beginFoundBreakSize = foundBreaks.size();
|
||||
utext_setNativeIndex(text, startPos);
|
||||
utext_moveIndex32(text, MIN_WORD_SPAN);
|
||||
if (utext_getNativeIndex(text) >= endPos) {
|
||||
return 0; // Not enough characters for two words
|
||||
}
|
||||
utext_setNativeIndex(text, startPos);
|
||||
|
||||
UVector32 offsets(status);
|
||||
UVector32 indices(status);
|
||||
if (U_FAILURE(status)) return 0;
|
||||
fVectorizer->vectorize(text, startPos, endPos, offsets, indices, status);
|
||||
if (U_FAILURE(status)) return 0;
|
||||
int32_t* offsetsBuf = offsets.getBuffer();
|
||||
int32_t* indicesBuf = indices.getBuffer();
|
||||
|
||||
int32_t input_seq_len = indices.size();
|
||||
int32_t hunits = fData->fForwardU.d1();
|
||||
|
||||
// ----- Begin of all the Array memory allocation needed for this function
|
||||
// Allocate temp array used inside compute()
|
||||
Array1D ifco(4 * hunits, status);
|
||||
|
||||
Array1D c(hunits, status);
|
||||
Array1D logp(4, status);
|
||||
|
||||
// TODO: limit size of hBackward. If input_seq_len is too big, we could
|
||||
// run out of memory.
|
||||
// Backward LSTM
|
||||
Array2D hBackward(input_seq_len, hunits, status);
|
||||
|
||||
// Allocate fbRow and slice the internal array in two.
|
||||
Array1D fbRow(2 * hunits, status);
|
||||
|
||||
// ----- End of all the Array memory allocation needed for this function
|
||||
if (U_FAILURE(status)) return 0;
|
||||
|
||||
// To save the needed memory usage, the following is different from the
|
||||
// Python or ICU4X implementation. We first perform the Backward LSTM
|
||||
// and then merge the iteration of the forward LSTM and the output layer
|
||||
// together because we only neetdto remember the h[t-1] for Forward LSTM.
|
||||
for (int32_t i = input_seq_len - 1; i >= 0; i--) {
|
||||
Array1D hRow = hBackward.row(i);
|
||||
if (i != input_seq_len - 1) {
|
||||
hRow.assign(hBackward.row(i+1));
|
||||
}
|
||||
#ifdef LSTM_DEBUG
|
||||
printf("hRow %d\n", i);
|
||||
hRow.print();
|
||||
printf("indicesBuf[%d] = %d\n", i, indicesBuf[i]);
|
||||
printf("fData->fEmbedding.row(indicesBuf[%d]):\n", i);
|
||||
fData->fEmbedding.row(indicesBuf[i]).print();
|
||||
#endif // LSTM_DEBUG
|
||||
compute(hunits,
|
||||
fData->fBackwardW, fData->fBackwardU, fData->fBackwardB,
|
||||
fData->fEmbedding.row(indicesBuf[i]),
|
||||
hRow, c, ifco);
|
||||
}
|
||||
|
||||
|
||||
Array1D forwardRow = fbRow.slice(0, hunits); // point to first half of data in fbRow.
|
||||
Array1D backwardRow = fbRow.slice(hunits, hunits); // point to second half of data n fbRow.
|
||||
|
||||
// The following iteration merge the forward LSTM and the output layer
|
||||
// together.
|
||||
c.clear(); // reuse c since it is the same size.
|
||||
for (int32_t i = 0; i < input_seq_len; i++) {
|
||||
#ifdef LSTM_DEBUG
|
||||
printf("forwardRow %d\n", i);
|
||||
forwardRow.print();
|
||||
#endif // LSTM_DEBUG
|
||||
// Forward LSTM
|
||||
// Calculate the result into forwardRow, which point to the data in the first half
|
||||
// of fbRow.
|
||||
compute(hunits,
|
||||
fData->fForwardW, fData->fForwardU, fData->fForwardB,
|
||||
fData->fEmbedding.row(indicesBuf[i]),
|
||||
forwardRow, c, ifco);
|
||||
|
||||
// assign the data from hBackward.row(i) to second half of fbRowa.
|
||||
backwardRow.assign(hBackward.row(i));
|
||||
|
||||
logp.assign(fData->fOutputB).addDotProduct(fbRow, fData->fOutputW);
|
||||
#ifdef LSTM_DEBUG
|
||||
printf("backwardRow %d\n", i);
|
||||
backwardRow.print();
|
||||
printf("logp %d\n", i);
|
||||
logp.print();
|
||||
#endif // LSTM_DEBUG
|
||||
|
||||
// current = argmax(logp)
|
||||
LSTMClass current = (LSTMClass)logp.maxIndex();
|
||||
// BIES logic.
|
||||
if (current == BEGIN || current == SINGLE) {
|
||||
if (i != 0) {
|
||||
foundBreaks.addElement(offsetsBuf[i], status);
|
||||
if (U_FAILURE(status)) return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
return foundBreaks.size() - beginFoundBreakSize;
|
||||
}
|
||||
|
||||
/**
 * Creates the vectorizer that matches the embedding type of `data`.
 *
 * @param data   Model data; its fType selects the vectorizer and its fDict
 *               is shared with (not owned by) the vectorizer.
 * @param status Error code. Set to U_MEMORY_ALLOCATION_ERROR if the
 *               vectorizer cannot be allocated.
 * @return A new vectorizer owned by the caller, or nullptr on failure.
 */
Vectorizer* createVectorizer(const LSTMData* data, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    Vectorizer* vectorizer = nullptr;
    switch (data->fType) {
        case CODE_POINTS:
            vectorizer = new CodePointsVectorizer(data->fDict);
            break;
        case GRAPHEME_CLUSTER:
            vectorizer = new GraphemeClusterVectorizer(data->fDict);
            break;
        default:
            // Any other embedding type is treated as unreachable.
            UPRV_UNREACHABLE_EXIT;
    }
    // Report allocation failure instead of handing back nullptr with a
    // successful status, which the engine would later dereference.
    if (vectorizer == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return vectorizer;
}
|
||||
|
||||
/**
 * Builds an engine over `data` for the characters in `set`.
 * On success the engine owns `data` (deleted in the destructor); on failure
 * it drops the pointer and the caller remains responsible for it.
 */
LSTMBreakEngine::LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status)
    : DictionaryBreakEngine(), fData(data), fVectorizer(createVectorizer(fData, status))
{
    if (U_SUCCESS(status)) {
        setCharacters(set);
        return;
    }
    // If failure, we should not delete fData in the destructor because the
    // caller will do so.
    fData = nullptr;
}
|
||||
|
||||
// Deletes the model data and the vectorizer. fData is nullptr here if
// construction failed.
LSTMBreakEngine::~LSTMBreakEngine()
{
    delete fData;
    delete fVectorizer;
}
|
||||
|
||||
const char16_t* LSTMBreakEngine::name() const {
|
||||
return fData->fName;
|
||||
}
|
||||
|
||||
/**
 * Looks up the string stored under the script's short name in the "lstm"
 * table of the break-iterator root bundle.
 *
 * @param script Script whose entry is wanted.
 * @param status Error code from opening the bundle or from the lookups.
 * @return The string found.
 */
UnicodeString defaultLSTM(UScriptCode script, UErrorCode& status) {
    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    // The same bundle object is passed as parent and as fill-in, so it is
    // re-pointed at the "lstm" table.
    bundle = ures_getByKeyWithFallback(bundle, "lstm", bundle, &status);
    const char *scriptKey = uscript_getShortName(script);
    UnicodeString modelName = ures_getUnicodeStringByKey(bundle, scriptKey, &status);
    ures_close(bundle);
    return modelName;
}
|
||||
|
||||
/**
 * Loads the default LSTM model data for a script.
 *
 * @param script Only Khmer, Lao, Myanmar and Thai are considered; any other
 *               script yields nullptr without touching status.
 * @param status Error code from the resource lookups.
 * @return Newly created LSTMData, or nullptr if the script is not handled or
 *         a lookup fails before the data object is created.
 */
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(UScriptCode script, UErrorCode& status)
{
    if (script != USCRIPT_KHMER && script != USCRIPT_LAO && script != USCRIPT_MYANMAR && script != USCRIPT_THAI) {
        return nullptr;
    }
    UnicodeString name = defaultLSTM(script, status);
    if (U_FAILURE(status)) return nullptr;
    CharString namebuf;
    // Append first, then search: written as one chained expression, the
    // lastIndexOf() argument is only guaranteed to run after the append
    // under C++17 evaluation-order rules.
    namebuf.appendInvariantChars(name, status);
    // Cut the name at its last '.' to get the bundle name.
    namebuf.truncate(namebuf.lastIndexOf('.'));

    LocalUResourceBundlePointer rb(
        ures_openDirect(U_ICUDATA_BRKITR, namebuf.data(), &status));
    if (U_FAILURE(status)) return nullptr;

    // Ownership of the bundle passes to the LSTMData.
    return CreateLSTMData(rb.orphan(), status);
}
|
||||
|
||||
/**
 * Creates LSTMData from a resource bundle and takes ownership of the bundle.
 *
 * @param rb     Bundle holding the model; closed by the returned object's
 *               destructor, or here if the object cannot be allocated.
 * @param status Error code from loading; set to U_MEMORY_ALLOCATION_ERROR if
 *               the object cannot be allocated.
 * @return The new object (possibly only partly loaded when status reports a
 *         failure), or nullptr if allocation failed.
 */
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(UResourceBundle* rb, UErrorCode& status)
{
    LSTMData* data = new LSTMData(rb, status);
    if (data == nullptr) {
        // No LSTMData exists to close the bundle later, so close it now and
        // report the failure rather than returning nullptr silently.
        ures_close(rb);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    return data;
}
|
||||
|
||||
/**
 * Creates an LSTM break engine for Thai or Myanmar text.
 *
 * For any other script `data` is deleted and nullptr is returned.
 * NOTE(review): on the later failure paths `data` is NOT deleted here (the
 * engine constructor drops it on failure) — confirm that callers handle the
 * two nullptr cases differently.
 *
 * @return The new engine, which then owns `data`, or nullptr.
 */
U_CAPI const LanguageBreakEngine* U_EXPORT2
CreateLSTMBreakEngine(UScriptCode script, const LSTMData* data, UErrorCode& status)
{
    const char16_t* pattern = nullptr;
    if (script == USCRIPT_THAI) {
        pattern = u"[[:Thai:]&[:LineBreak=SA:]]";
    } else if (script == USCRIPT_MYANMAR) {
        pattern = u"[[:Mymr:]&[:LineBreak=SA:]]";
    } else {
        delete data;
        return nullptr;
    }
    UnicodeString unicodeSetString(pattern);
    UnicodeSet unicodeSet;
    unicodeSet.applyPattern(unicodeSetString, status);
    const LanguageBreakEngine* engine = new LSTMBreakEngine(data, unicodeSet, status);
    if (engine == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        delete engine;
        return nullptr;
    }
    return engine;
}
|
||||
|
||||
// Destroys LSTMData obtained from CreateLSTMData() or
// CreateLSTMDataForScript().
U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data)
{
    delete data;
}
|
||||
|
||||
// Returns the model name stored in the data; `data` must not be nullptr.
U_CAPI const char16_t* U_EXPORT2 LSTMDataName(const LSTMData* data)
{
    return data->fName;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
88
engine/thirdparty/icu4c/common/lstmbe.h
vendored
Normal file
88
engine/thirdparty/icu4c/common/lstmbe.h
vendored
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#ifndef LSTMBE_H
|
||||
#define LSTMBE_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "dictbe.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class Vectorizer;
|
||||
struct LSTMData;
|
||||
|
||||
/*******************************************************************
|
||||
* LSTMBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>LSTMBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* LSTM to determine language-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a LSTMBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class LSTMBreakEngine : public DictionaryBreakEngine {
|
||||
public:
|
||||
/**
|
||||
* <p>Constructor.</p>
|
||||
*/
|
||||
LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~LSTMBreakEngine();
|
||||
|
||||
virtual const char16_t* name() const;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange(UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
private:
|
||||
const LSTMData* fData;
|
||||
const Vectorizer* fVectorizer;
|
||||
};
|
||||
|
||||
/**
 * Creates a break engine for `script` backed by the LSTM model `data`.
 * NOTE(review): ownership of `data` and of the returned engine is decided in
 * lstmbe.cpp - confirm there.
 */
U_CAPI const LanguageBreakEngine* U_EXPORT2
CreateLSTMBreakEngine(UScriptCode script, const LSTMData* data, UErrorCode& status);

/** Creates LSTM model data from the resource bundle `rb`. */
U_CAPI const LSTMData* U_EXPORT2
CreateLSTMData(UResourceBundle* rb, UErrorCode& status);

/** Creates LSTM model data for `script`. */
U_CAPI const LSTMData* U_EXPORT2
CreateLSTMDataForScript(UScriptCode script, UErrorCode& status);

/** Releases model data. NOTE(review): presumably pairs with the CreateLSTMData* functions - confirm. */
U_CAPI void U_EXPORT2
DeleteLSTMData(const LSTMData* data);

/** Returns the name associated with the model data. */
U_CAPI const char16_t* U_EXPORT2
LSTMDataName(const LSTMData* data);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif /* LSTMBE_H */
|
||||
65
engine/thirdparty/icu4c/common/messageimpl.h
vendored
Normal file
65
engine/thirdparty/icu4c/common/messageimpl.h
vendored
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: messageimpl.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2011apr04
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __MESSAGEIMPL_H__
|
||||
#define __MESSAGEIMPL_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING
|
||||
|
||||
#include "unicode/messagepattern.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Helper functions for use of MessagePattern.
|
||||
* In Java, these are package-private methods in MessagePattern itself.
|
||||
* In C++, they are declared here and implemented in messagepattern.cpp.
|
||||
*/
|
||||
class U_COMMON_API MessageImpl {
|
||||
public:
|
||||
/**
|
||||
* @return true if getApostropheMode()==UMSGPAT_APOS_DOUBLE_REQUIRED
|
||||
*/
|
||||
static UBool jdkAposMode(const MessagePattern &msgPattern) {
|
||||
return msgPattern.getApostropheMode()==UMSGPAT_APOS_DOUBLE_REQUIRED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends the s[start, limit[ substring to sb, but with only half of the apostrophes
|
||||
* according to JDK pattern behavior.
|
||||
*/
|
||||
static void appendReducedApostrophes(const UnicodeString &s, int32_t start, int32_t limit,
|
||||
UnicodeString &sb);
|
||||
|
||||
/**
|
||||
* Appends the sub-message to the result string.
|
||||
* Omits SKIP_SYNTAX and appends whole arguments using appendReducedApostrophes().
|
||||
*/
|
||||
static UnicodeString &appendSubMessageWithoutSkipSyntax(const MessagePattern &msgPattern,
|
||||
int32_t msgStart,
|
||||
UnicodeString &result);
|
||||
|
||||
private:
|
||||
MessageImpl() = delete; // no constructor: all static methods
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_FORMATTING
|
||||
|
||||
#endif // __MESSAGEIMPL_H__
|
||||
1232
engine/thirdparty/icu4c/common/messagepattern.cpp
vendored
Normal file
1232
engine/thirdparty/icu4c/common/messagepattern.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
270
engine/thirdparty/icu4c/common/mlbe.cpp
vendored
Normal file
270
engine/thirdparty/icu4c/common/mlbe.cpp
vendored
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
// © 2022 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "cmemory.h"
|
||||
#include "mlbe.h"
|
||||
#include "uassert.h"
|
||||
#include "ubrkimpl.h"
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "uresimp.h"
|
||||
#include "util.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Index of the first unigram (UW), bigram (BW) and trigram (TW) table inside fModel.
enum class ModelIndex {
    kUWStart = 0,
    kBWStart = 6,
    kTWStart = 9
};
|
||||
|
||||
// Copies both character sets, zeroes the score baseline and, unless the caller
// already reported an error, loads the model tables.
MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
                             const UnicodeSet &closePunctuationSet, UErrorCode &status)
    : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
      fClosePunctuationSet(closePunctuationSet),
      fNegativeSum(0) {
    if (U_SUCCESS(status)) {
        loadMLModel(status);
    }
}

// Nothing to release explicitly; members clean up after themselves.
MlBreakEngine::~MlBreakEngine() {
}
|
||||
|
||||
// Finds phrase breaks in inString (the normalized text of [rangeStart, rangeEnd))
// and pushes them onto foundBreaks as native indexes of inText.
//
// @param inText      A UText representing the text
// @param rangeStart  The start of the range of the characters
// @param rangeEnd    The end of the range of the characters
// @param foundBreaks Output vector receiving the break positions
// @param inString    The normalized string of text ranging from rangeStart to rangeEnd
// @param inputMap    When valid, maps a code point index of inString to a native
//                    index of inText; otherwise rangeStart is simply added
// @param status      Information on any errors encountered.
// @return The number of breaks pushed onto foundBreaks by this call
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
                                     UVector32 &foundBreaks, const UnicodeString &inString,
                                     const LocalPointer<UVector32> &inputMap,
                                     UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    if (rangeStart >= rangeEnd) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // countChar32() walks the whole string, so compute it once and reuse it.
    const int32_t codePointLength = inString.countChar32();

    UVector32 boundary(codePointLength + 1, status);
    if (U_FAILURE(status)) {
        return 0;
    }
    int32_t numBreaks = 0;
    // The ML algorithm groups six char and evaluates whether the 4th char is a breakpoint.
    // In each iteration, it evaluates the 4th char and then moves forward one char like a sliding
    // window. Initially, the first six values in the indexList are [-1, -1, 0, 1, 2, 3]. After
    // moving forward, finally the last six values in the indexList are
    // [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra "-1".
    const int32_t indexSize = codePointLength + 4;
    int32_t *indexList = static_cast<int32_t *>(uprv_malloc(indexSize * sizeof(int32_t)));
    if (indexList == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    int32_t numCodeUnits = initIndexList(inString, indexList, status);

    // Add a break for the start.
    boundary.addElement(0, status);
    numBreaks++;

    // No early return between the allocation above and uprv_free() below: the loop
    // condition already skips all work on failure, so indexList is always released.
    for (int32_t idx = 0; idx + 1 < codePointLength && U_SUCCESS(status); idx++) {
        numBreaks =
            evaluateBreakpoint(inString, indexList, idx, numCodeUnits, numBreaks, boundary, status);
        if (idx + 4 < codePointLength) {
            // Slide the window: record where the next code point starts.
            indexList[idx + 6] = numCodeUnits;
            numCodeUnits += U16_LENGTH(inString.char32At(indexList[idx + 6]));
        }
    }
    uprv_free(indexList);

    if (U_FAILURE(status)) return 0;

    // Add a break for the end if there is not one there already.
    if (boundary.lastElementi() != codePointLength) {
        boundary.addElement(codePointLength, status);
        numBreaks++;
    }

    int32_t prevCPPos = -1;
    int32_t prevUTextPos = -1;
    int32_t correctedNumBreaks = 0;
    for (int32_t i = 0; i < numBreaks; i++) {
        int32_t cpPos = boundary.elementAti(i);
        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        U_ASSERT(cpPos > prevCPPos);
        U_ASSERT(utextPos >= prevUTextPos);

        if (utextPos > prevUTextPos) {
            // A break at rangeStart itself is only kept after close punctuation.
            if (utextPos != rangeStart ||
                (utextPos > 0 &&
                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
                foundBreaks.push(utextPos, status);
                correctedNumBreaks++;
            }
        } else {
            // Normalization expanded the input text, the dictionary found a boundary
            // within the expansion, giving two boundaries with the same index in the
            // original text. Ignore the second. See ticket #12918.
            // NOTE(review): numBreaks is also this loop's bound, so each duplicate
            // shortens the iteration by one entry - confirm that is intended.
            --numBreaks;
        }
        prevCPPos = cpPos;
        prevUTextPos = utextPos;
    }
    (void)prevCPPos;  // suppress compiler warnings about unused variable

    UChar32 nextChar = utext_char32At(inText, rangeEnd);
    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
        // In phrase breaking, there has to be a breakpoint between Cj character and
        // the number/open punctuation.
        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
        // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
            foundBreaks.popi();
            correctedNumBreaks--;
        }
    }

    return correctedNumBreaks;
}
|
||||
|
||||
// Scores the six-slot window of indexList that starts at startIdx. When the score
// is positive, startIdx + 1 is appended to boundary and numBreaks is incremented.
//
// @param inString     The string being segmented.
// @param indexList    Code unit offsets of inString's code points; -1 marks padding.
// @param startIdx     First slot of the window inside indexList.
// @param numCodeUnits Code unit offset used as the end of an n-gram whose following
//                     slot is still -1.
// @param numBreaks    The number of breakpoints accumulated so far.
// @param boundary     Receives the index of each breakpoint.
// @param status       Information on any errors encountered.
// @return The updated number of breakpoints.
int32_t MlBreakEngine::evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList,
                                          int32_t startIdx, int32_t numCodeUnits, int32_t numBreaks,
                                          UVector32 &boundary, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return numBreaks;
    }

    // True when the slot holds a real code unit offset rather than the -1 padding.
    auto hasChar = [indexList](int32_t slot) { return indexList[slot] != -1; };

    // Score stored in fModel[table] for the substring that begins at slot `first`
    // and stops where slot `next` begins (or at numCodeUnits if `next` is padding).
    auto ngramScore = [&](int32_t table, int32_t first, int32_t next) -> int32_t {
        const int32_t from = indexList[first];
        const int32_t to = hasChar(next) ? indexList[next] : numCodeUnits;
        return fModel[table].geti(inString.tempSubString(from, to - from));
    };

    const int32_t uwBase = static_cast<int32_t>(ModelIndex::kUWStart);
    const int32_t bwBase = static_cast<int32_t>(ModelIndex::kBWStart);
    const int32_t twBase = static_cast<int32_t>(ModelIndex::kTWStart);

    int32_t total = fNegativeSum;

    // UW1 ~ UW6: single code points at window slots 0..5.
    for (int32_t n = 0; n < 6; ++n) {
        const int32_t slot = startIdx + n;
        if (hasChar(slot)) {
            total += ngramScore(uwBase + n, slot, slot + 1);
        }
    }

    // BW1 ~ BW3: pairs starting at window slots 1..3.
    for (int32_t n = 0; n < 3; ++n) {
        const int32_t slot = startIdx + n + 1;
        if (hasChar(slot) && hasChar(slot + 1)) {
            total += ngramScore(bwBase + n, slot, slot + 2);
        }
    }

    // TW1 ~ TW4: triples starting at window slots 0..3.
    for (int32_t n = 0; n < 4; ++n) {
        const int32_t slot = startIdx + n;
        if (hasChar(slot) && hasChar(slot + 1) && hasChar(slot + 2)) {
            total += ngramScore(twBase + n, slot, slot + 3);
        }
    }

    if (total > 0) {
        boundary.addElement(startIdx + 1, status);
        ++numBreaks;
    }
    return numBreaks;
}
|
||||
|
||||
// Fills indexList (countChar32() + 4 entries) with -1 and then stores the code unit
// offsets of up to the first four code points of inString in slots 2..5.
//
// @param inString  The string being segmented.
// @param indexList Buffer of at least countChar32() + 4 entries.
// @param status    Information on any errors encountered.
// @return The code unit offset just past the code points that were recorded
//         (0 on failure or for an empty string).
int32_t MlBreakEngine::initIndexList(const UnicodeString &inString, int32_t *indexList,
                                     UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    const int32_t length = inString.countChar32();
    // Set all (length+4) items inside indexList to -1 presuming -1 is 4 bytes of 0xff.
    uprv_memset(indexList, 0xff, (length + 4) * sizeof(int32_t));

    // Slots 0 and 1 stay -1; slots 2..5 take the starts of the first four code points.
    const int32_t recorded = (length < 4) ? length : 4;
    int32_t offset = 0;
    for (int32_t n = 0; n < recorded; ++n) {
        indexList[n + 2] = offset;
        offset += U16_LENGTH(inString.char32At(offset));
    }
    return offset;
}
|
||||
|
||||
// Opens the "jaml" break-iterator resource and fills the thirteen fModel tables.
//
// @param error Information on any errors encountered.
void MlBreakEngine::loadMLModel(UErrorCode &error) {
    // BudouX's model consists of thirteen categories, each of which is made up of pairs of the
    // feature and its score. As integrating it into jaml.txt, we define thirteen kinds of key and
    // value to represent the feature and the corresponding score respectively.

    if (U_FAILURE(error)) return;

    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
    UResourceBundle *rb = rbp.getAlias();
    if (U_FAILURE(error)) return;

    // {key resource, value resource} for each table, in fModel order.
    static const char *const kTables[][2] = {
        {"UW1Keys", "UW1Values"}, {"UW2Keys", "UW2Values"}, {"UW3Keys", "UW3Values"},
        {"UW4Keys", "UW4Values"}, {"UW5Keys", "UW5Values"}, {"UW6Keys", "UW6Values"},
        {"BW1Keys", "BW1Values"}, {"BW2Keys", "BW2Values"}, {"BW3Keys", "BW3Values"},
        {"TW1Keys", "TW1Values"}, {"TW2Keys", "TW2Values"}, {"TW3Keys", "TW3Values"},
        {"TW4Keys", "TW4Values"},
    };
    static_assert(sizeof(kTables) / sizeof(kTables[0]) == 13,
                  "one resource pair per fModel table");

    int32_t index = 0;
    for (const auto &names : kTables) {
        initKeyValue(rb, names[0], names[1], fModel[index++], error);
    }
    // initKeyValue() accumulated the negated sum of every score; keep half of it.
    fNegativeSum /= 2;
}
|
||||
|
||||
// Loads one feature table: the strings under keyName become hash keys and the
// integers under valueName (same positions) become their scores. Every score
// stored is also subtracted from fNegativeSum.
//
// @param rb        The resource bundle of the model file.
// @param keyName   Name of the string-array resource holding the features.
// @param valueName Name of the int-vector resource holding the scores.
// @param model     The hashtable receiving the feature/score pairs.
// @param error     Information on any errors encountered; set to
//                  U_INVALID_FORMAT_ERROR when there are more keys than values.
void MlBreakEngine::initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
                                 Hashtable &model, UErrorCode &error) {
    int32_t keySize = 0;
    int32_t valueSize = 0;
    int32_t stringLength = 0;
    UnicodeString key;
    StackUResourceBundle stackTempBundle;
    ResourceDataValue modelKey;

    // get modelValues
    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, valueName, nullptr, &error));
    const int32_t *value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
    if (U_FAILURE(error)) return;

    // get modelKeys
    ures_getValueWithFallback(rb, keyName, stackTempBundle.getAlias(), modelKey, error);
    ResourceArray stringArray = modelKey.getArray(error);
    keySize = stringArray.getSize();
    if (U_FAILURE(error)) return;

    // Each key needs a score at the same position; refuse data that would make
    // the loop below read past the end of the value vector.
    if (keySize > valueSize) {
        error = U_INVALID_FORMAT_ERROR;
        return;
    }

    for (int32_t idx = 0; idx < keySize; idx++) {
        stringArray.getValue(idx, modelKey);
        key = UnicodeString(modelKey.getString(stringLength, error));
        if (U_SUCCESS(error)) {
            fNegativeSum -= value[idx];
            model.puti(key, value[idx], error);
        }
    }
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
116
engine/thirdparty/icu4c/common/mlbe.h
vendored
Normal file
116
engine/thirdparty/icu4c/common/mlbe.h
vendored
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
// © 2022 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#ifndef MLBREAKENGINE_H
|
||||
#define MLBREAKENGINE_H
|
||||
|
||||
#include "hash.h"
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* A machine learning break engine for the phrase breaking in Japanese.
|
||||
*/
|
||||
class MlBreakEngine : public UMemory {
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and
|
||||
* alphabet.
|
||||
* @param closePunctuationSet An UnicodeSet with close punctuation.
|
||||
* @param status Information on any errors encountered.
|
||||
*/
|
||||
MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
|
||||
const UnicodeSet &closePunctuationSet, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Virtual destructor.
|
||||
*/
|
||||
virtual ~MlBreakEngine();
|
||||
|
||||
public:
|
||||
/**
|
||||
* Divide up a range of characters handled by this break engine.
|
||||
*
|
||||
* @param inText A UText representing the text
|
||||
* @param rangeStart The start of the range of the characters
|
||||
* @param rangeEnd The end of the range of the characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param inString The normalized string of text ranging from rangeStart to rangeEnd
|
||||
* @param inputMap The vector storing the native index of inText
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
|
||||
UVector32 &foundBreaks, const UnicodeString &inString,
|
||||
const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Load the machine learning's model file.
|
||||
*
|
||||
* @param error Information on any errors encountered.
|
||||
*/
|
||||
void loadMLModel(UErrorCode &error);
|
||||
|
||||
/**
|
||||
* In the machine learning's model file, specify the name of the key and value to load the
|
||||
* corresponding feature and its score.
|
||||
*
|
||||
* @param rb A ResouceBundle corresponding to the model file.
|
||||
* @param keyName The kay name in the model file.
|
||||
* @param valueName The value name in the model file.
|
||||
* @param model A hashtable to store the pairs of the feature and its score.
|
||||
* @param error Information on any errors encountered.
|
||||
*/
|
||||
void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
|
||||
Hashtable &model, UErrorCode &error);
|
||||
|
||||
/**
|
||||
* Initialize the index list from the input string.
|
||||
*
|
||||
* @param inString A input string to be segmented.
|
||||
* @param indexList A code unit index list of inString.
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of code units of the first four characters in inString.
|
||||
*/
|
||||
int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
|
||||
UErrorCode &status) const;
|
||||
|
||||
/**
|
||||
* Evaluate whether the index is a potential breakpoint.
|
||||
*
|
||||
* @param inString A input string to be segmented.
|
||||
* @param indexList A code unit index list of the inString.
|
||||
* @param startIdx The start index of the indexList.
|
||||
* @param numCodeUnits The current code unit boundary of the indexList.
|
||||
* @param numBreaks The accumulated number of breakpoints.
|
||||
* @param boundary A vector including the index of the breakpoint.
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breakpoints
|
||||
*/
|
||||
int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
|
||||
int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
|
||||
UErrorCode &status) const;
|
||||
|
||||
void printUnicodeString(const UnicodeString &s) const;
|
||||
|
||||
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
||||
UnicodeSet fClosePunctuationSet;
|
||||
Hashtable fModel[13]; // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
|
||||
int32_t fNegativeSum;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* MLBREAKENGINE_H */
|
||||
#endif
|
||||
25
engine/thirdparty/icu4c/common/msvcres.h
vendored
Normal file
25
engine/thirdparty/icu4c/common/msvcres.h
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
//{{NO_DEPENDENCIES}}
|
||||
// Copyright (c) 2003-2010 International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//
|
||||
// Used by common.rc and other .rc files.
|
||||
//Do not edit with Microsoft Developer Studio because it will modify this
|
||||
//header the wrong way. This is here to prevent Visual Studio .NET from
|
||||
//unnecessarily building the resource files when it's not needed.
|
||||
//
|
||||
|
||||
/*
|
||||
These are defined before unicode/uversion.h in order to prevent
|
||||
STLPort's broken stddef.h from being used when rc.exe parses this file.
|
||||
*/
|
||||
#define _STLP_OUTERMOST_HEADER_ID 0
|
||||
#define _STLP_WINCE 1
|
||||
|
||||
#include "unicode/uversion.h"
|
||||
|
||||
#define ICU_WEBSITE "https://icu.unicode.org/"
|
||||
#define ICU_COMPANY "The ICU Project"
|
||||
#define ICU_PRODUCT_PREFIX "ICU"
|
||||
#define ICU_PRODUCT "International Components for Unicode"
|
||||
77
engine/thirdparty/icu4c/common/mutex.h
vendored
Normal file
77
engine/thirdparty/icu4c/common/mutex.h
vendored
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
//----------------------------------------------------------------------------
|
||||
// File: mutex.h
|
||||
//
|
||||
// Lightweight C++ wrapper for umtx_ C mutex functions
|
||||
//
|
||||
// Author: Alan Liu 1/31/97
|
||||
// History:
|
||||
// 06/04/97 helena Updated setImplementation as per feedback from 5/21 drop.
|
||||
// 04/07/1999 srl refocused as a thin wrapper
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
#ifndef MUTEX_H
|
||||
#define MUTEX_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "umutex.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Mutex is a helper class for convenient locking and unlocking of a UMutex.
|
||||
*
|
||||
* Creating a local scope Mutex will lock a UMutex, holding the lock until the Mutex
|
||||
* goes out of scope.
|
||||
*
|
||||
* If no UMutex is specified, the ICU global mutex is implied.
|
||||
*
|
||||
* For example:
|
||||
*
|
||||
* static UMutex myMutex;
|
||||
*
|
||||
* void Function(int arg1, int arg2)
|
||||
* {
|
||||
* static Object* foo; // Shared read-write object
|
||||
* Mutex mutex(&myMutex); // or no args for the global lock
|
||||
* foo->Method();
|
||||
* // When 'mutex' goes out of scope and gets destroyed here, the lock is released
|
||||
* }
|
||||
*
|
||||
* Note: Do NOT use the form 'Mutex mutex();' as that merely forward-declares a function
|
||||
* returning a Mutex. This is a common mistake which silently slips through the
|
||||
* compiler!!
|
||||
*/
|
||||
|
||||
class U_COMMON_API Mutex : public UMemory {
public:
    // Locks the given UMutex (nullptr is handed to umtx_lock() as is) ...
    Mutex(UMutex *mutex = nullptr) : fMutex(mutex) {
        umtx_lock(fMutex);
    }
    // ... and unlocks the same one when this object is destroyed.
    ~Mutex() {
        umtx_unlock(fMutex);
    }

    Mutex(const Mutex &other) = delete;             // forbid copying of this class
    Mutex &operator=(const Mutex &other) = delete;  // forbid assigning of this class
    void *operator new(size_t s) = delete;          // forbid heap allocation. Locals only.

private:
    UMutex *fMutex;  // the mutex locked by the constructor
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // MUTEX_H
|
||||
//eof
|
||||
1154
engine/thirdparty/icu4c/common/norm2_nfc_data.h
vendored
Normal file
1154
engine/thirdparty/icu4c/common/norm2_nfc_data.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
406
engine/thirdparty/icu4c/common/norm2allmodes.h
vendored
Normal file
406
engine/thirdparty/icu4c/common/norm2allmodes.h
vendored
Normal file
|
|
@ -0,0 +1,406 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* norm2allmodes.h
|
||||
*
|
||||
* created on: 2014sep07
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __NORM2ALLMODES_H__
|
||||
#define __NORM2ALLMODES_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "cpputils.h"
|
||||
#include "normalizer2impl.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Intermediate class:
|
||||
// Has Normalizer2Impl and does boilerplate argument checking and setup.
|
||||
class Normalizer2WithImpl : public Normalizer2 {
|
||||
public:
|
||||
Normalizer2WithImpl(const Normalizer2Impl &ni) : impl(ni) {}
|
||||
virtual ~Normalizer2WithImpl();
|
||||
|
||||
// normalize
|
||||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
dest.setToBogus();
|
||||
return dest;
|
||||
}
|
||||
const char16_t *sArray=src.getBuffer();
|
||||
if(&dest==&src || sArray==nullptr) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
dest.setToBogus();
|
||||
return dest;
|
||||
}
|
||||
dest.remove();
|
||||
ReorderingBuffer buffer(impl, dest);
|
||||
if(buffer.init(src.length(), errorCode)) {
|
||||
normalize(sArray, sArray+src.length(), buffer, errorCode);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
virtual void
|
||||
normalize(const char16_t *src, const char16_t *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
||||
|
||||
// normalize and append
|
||||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override {
|
||||
return normalizeSecondAndAppend(first, second, true, errorCode);
|
||||
}
|
||||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override {
|
||||
return normalizeSecondAndAppend(first, second, false, errorCode);
|
||||
}
|
||||
UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UBool doNormalize,
|
||||
UErrorCode &errorCode) const {
|
||||
uprv_checkCanGetBuffer(first, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return first;
|
||||
}
|
||||
const char16_t *secondArray=second.getBuffer();
|
||||
if(&first==&second || secondArray==nullptr) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return first;
|
||||
}
|
||||
int32_t firstLength=first.length();
|
||||
UnicodeString safeMiddle;
|
||||
{
|
||||
ReorderingBuffer buffer(impl, first);
|
||||
if(buffer.init(firstLength+second.length(), errorCode)) {
|
||||
normalizeAndAppend(secondArray, secondArray+second.length(), doNormalize,
|
||||
safeMiddle, buffer, errorCode);
|
||||
}
|
||||
} // The ReorderingBuffer destructor finalizes the first string.
|
||||
if(U_FAILURE(errorCode)) {
|
||||
// Restore the modified suffix of the first string.
|
||||
first.replace(firstLength-safeMiddle.length(), 0x7fffffff, safeMiddle);
|
||||
}
|
||||
return first;
|
||||
}
|
||||
virtual void
|
||||
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
||||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const override {
|
||||
char16_t buffer[4];
|
||||
int32_t length;
|
||||
const char16_t *d=impl.getDecomposition(c, buffer, length);
|
||||
if(d==nullptr) {
|
||||
return false;
|
||||
}
|
||||
if(d==buffer) {
|
||||
decomposition.setTo(buffer, length); // copy the string (Jamos from Hangul syllable c)
|
||||
} else {
|
||||
decomposition.setTo(false, d, length); // read-only alias
|
||||
}
|
||||
return true;
|
||||
}
|
||||
virtual UBool
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override {
|
||||
char16_t buffer[30];
|
||||
int32_t length;
|
||||
const char16_t *d=impl.getRawDecomposition(c, buffer, length);
|
||||
if(d==nullptr) {
|
||||
return false;
|
||||
}
|
||||
if(d==buffer) {
|
||||
decomposition.setTo(buffer, length); // copy the string (algorithmic decomposition)
|
||||
} else {
|
||||
decomposition.setTo(false, d, length); // read-only alias
|
||||
}
|
||||
return true;
|
||||
}
|
||||
virtual UChar32
|
||||
composePair(UChar32 a, UChar32 b) const override {
|
||||
return impl.composePair(a, b);
|
||||
}
|
||||
|
||||
virtual uint8_t
|
||||
getCombiningClass(UChar32 c) const override {
|
||||
return impl.getCC(impl.getNorm16(c));
|
||||
}
|
||||
|
||||
// quick checks
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return false;
|
||||
}
|
||||
const char16_t *sArray=s.getBuffer();
|
||||
if(sArray==nullptr) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return false;
|
||||
}
|
||||
const char16_t *sLimit=sArray+s.length();
|
||||
return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO;
|
||||
}
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
const char16_t *sArray=s.getBuffer();
|
||||
if(sArray==nullptr) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
return (int32_t)(spanQuickCheckYes(sArray, sArray+s.length(), errorCode)-sArray);
|
||||
}
|
||||
virtual const char16_t *
|
||||
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const = 0;
|
||||
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32) const {
|
||||
return UNORM_YES;
|
||||
}
|
||||
|
||||
const Normalizer2Impl &impl;
|
||||
};
|
||||
|
||||
class DecomposeNormalizer2 : public Normalizer2WithImpl {
|
||||
public:
|
||||
DecomposeNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
|
||||
virtual ~DecomposeNormalizer2();
|
||||
|
||||
private:
|
||||
virtual void
|
||||
normalize(const char16_t *src, const char16_t *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
impl.decompose(src, limit, &buffer, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
||||
virtual void
|
||||
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
|
||||
impl.decomposeUTF8(options, s, s + src.length(), &sink, edits, errorCode);
|
||||
sink.Flush();
|
||||
}
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return false;
|
||||
}
|
||||
const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());
|
||||
const uint8_t *sLimit = s + sp.length();
|
||||
return sLimit == impl.decomposeUTF8(0, s, sLimit, nullptr, nullptr, errorCode);
|
||||
}
|
||||
|
||||
virtual const char16_t *
|
||||
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override {
|
||||
return impl.decompose(src, limit, nullptr, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
|
||||
return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const override {
|
||||
return impl.hasDecompBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
||||
return impl.hasDecompBoundaryAfter(c);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const override {
|
||||
return impl.isDecompInert(c);
|
||||
}
|
||||
};
|
||||
|
||||
class ComposeNormalizer2 : public Normalizer2WithImpl {
|
||||
public:
|
||||
ComposeNormalizer2(const Normalizer2Impl &ni, UBool fcc) :
|
||||
Normalizer2WithImpl(ni), onlyContiguous(fcc) {}
|
||||
virtual ~ComposeNormalizer2();
|
||||
|
||||
private:
|
||||
virtual void
|
||||
normalize(const char16_t *src, const char16_t *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
impl.compose(src, limit, onlyContiguous, true, buffer, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
||||
|
||||
void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
|
||||
impl.composeUTF8(options, onlyContiguous, s, s + src.length(),
|
||||
&sink, edits, errorCode);
|
||||
sink.Flush();
|
||||
}
|
||||
|
||||
virtual void
|
||||
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, safeMiddle, buffer, errorCode);
|
||||
}
|
||||
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return false;
|
||||
}
|
||||
const char16_t *sArray=s.getBuffer();
|
||||
if(sArray==nullptr) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return false;
|
||||
}
|
||||
UnicodeString temp;
|
||||
ReorderingBuffer buffer(impl, temp);
|
||||
if(!buffer.init(5, errorCode)) { // small destCapacity for substring normalization
|
||||
return false;
|
||||
}
|
||||
return impl.compose(sArray, sArray+s.length(), onlyContiguous, false, buffer, errorCode);
|
||||
}
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return false;
|
||||
}
|
||||
const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());
|
||||
return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
const char16_t *sArray=s.getBuffer();
|
||||
if(sArray==nullptr) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
UNormalizationCheckResult qcResult=UNORM_YES;
|
||||
impl.composeQuickCheck(sArray, sArray+s.length(), onlyContiguous, &qcResult);
|
||||
return qcResult;
|
||||
}
|
||||
virtual const char16_t *
|
||||
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &) const override {
|
||||
return impl.composeQuickCheck(src, limit, onlyContiguous, nullptr);
|
||||
}
|
||||
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
|
||||
return impl.getCompQuickCheck(impl.getNorm16(c));
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const override {
|
||||
return impl.hasCompBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const override {
|
||||
return impl.isCompInert(c, onlyContiguous);
|
||||
}
|
||||
|
||||
const UBool onlyContiguous;
|
||||
};
|
||||
|
||||
class FCDNormalizer2 : public Normalizer2WithImpl {
|
||||
public:
|
||||
FCDNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
|
||||
virtual ~FCDNormalizer2();
|
||||
|
||||
private:
|
||||
virtual void
|
||||
normalize(const char16_t *src, const char16_t *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
impl.makeFCD(src, limit, &buffer, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
||||
virtual void
|
||||
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
|
||||
}
|
||||
virtual const char16_t *
|
||||
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override {
|
||||
return impl.makeFCD(src, limit, nullptr, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const override {
|
||||
return impl.hasFCDBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
||||
return impl.hasFCDBoundaryAfter(c);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const override {
|
||||
return impl.isFCDInert(c);
|
||||
}
|
||||
};
|
||||
|
||||
struct Norm2AllModes : public UMemory {
|
||||
Norm2AllModes(Normalizer2Impl *i)
|
||||
: impl(i), comp(*i, false), decomp(*i), fcd(*i), fcc(*i, true) {}
|
||||
~Norm2AllModes();
|
||||
|
||||
static Norm2AllModes *createInstance(Normalizer2Impl *impl, UErrorCode &errorCode);
|
||||
static Norm2AllModes *createNFCInstance(UErrorCode &errorCode);
|
||||
static Norm2AllModes *createInstance(const char *packageName,
|
||||
const char *name,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
static const Norm2AllModes *getNFCInstance(UErrorCode &errorCode);
|
||||
static const Norm2AllModes *getNFKCInstance(UErrorCode &errorCode);
|
||||
static const Norm2AllModes *getNFKC_CFInstance(UErrorCode &errorCode);
|
||||
static const Norm2AllModes *getNFKC_SCFInstance(UErrorCode &errorCode);
|
||||
|
||||
Normalizer2Impl *impl;
|
||||
ComposeNormalizer2 comp;
|
||||
DecomposeNormalizer2 decomp;
|
||||
FCDNormalizer2 fcd;
|
||||
ComposeNormalizer2 fcc;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
#endif // __NORM2ALLMODES_H__
|
||||
572
engine/thirdparty/icu4c/common/normalizer2.cpp
vendored
Normal file
572
engine/thirdparty/icu4c/common/normalizer2.cpp
vendored
Normal file
|
|
@ -0,0 +1,572 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: normalizer2.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009nov22
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cstring.h"
|
||||
#include "mutex.h"
|
||||
#include "norm2allmodes.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_cmn.h"
|
||||
|
||||
using icu::Normalizer2Impl;
|
||||
|
||||
#if NORM2_HARDCODE_NFC_DATA
|
||||
// NFC/NFD data machine-generated by gennorm2 --csource
|
||||
#define INCLUDED_FROM_NORMALIZER2_CPP
|
||||
#include "norm2_nfc_data.h"
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Public API dispatch via Normalizer2 subclasses -------------------------- ***
|
||||
|
||||
// Out-of-line destructor; nothing to release here.
Normalizer2::~Normalizer2() {
}
|
||||
|
||||
void
|
||||
Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if (edits != nullptr) {
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
UnicodeString src16 = UnicodeString::fromUTF8(src);
|
||||
normalize(src16, errorCode).toUTF8(sink);
|
||||
}
|
||||
|
||||
/** Default implementation: always returns false and leaves the output string untouched. */
UBool
Normalizer2::getRawDecomposition(UChar32 /*c*/, UnicodeString & /*decomposition*/) const {
    return false;
}
|
||||
|
||||
/** Default implementation: always returns U_SENTINEL, whatever the two code points are. */
UChar32
Normalizer2::composePair(UChar32 /*a*/, UChar32 /*b*/) const {
    return U_SENTINEL;
}
|
||||
|
||||
uint8_t
|
||||
Normalizer2::getCombiningClass(UChar32 /*c*/) const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Default implementation: converts s to UTF-16 and delegates to isNormalized().
 * Returns false without converting when errorCode already indicates a failure.
 */
UBool
Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return false;
    }
    return isNormalized(UnicodeString::fromUTF8(s), errorCode);
}
|
||||
|
||||
// Normalizer2 implementation for the old UNORM_NONE.
|
||||
class NoopNormalizer2 : public Normalizer2 {
|
||||
virtual ~NoopNormalizer2();
|
||||
|
||||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const override {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&dest!=&src) {
|
||||
dest=src;
|
||||
} else {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
virtual void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if (edits != nullptr) {
|
||||
if ((options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
edits->addUnchanged(src.length());
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
sink.Append(src.data(), src.length());
|
||||
}
|
||||
sink.Flush();
|
||||
}
|
||||
}
|
||||
|
||||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&first!=&second) {
|
||||
first.append(second);
|
||||
} else {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
return first;
|
||||
}
|
||||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&first!=&second) {
|
||||
first.append(second);
|
||||
} else {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
return first;
|
||||
}
|
||||
virtual UBool
|
||||
getDecomposition(UChar32, UnicodeString &) const override {
|
||||
return false;
|
||||
}
|
||||
// No need to override the default getRawDecomposition().
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
|
||||
return U_SUCCESS(errorCode);
|
||||
}
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
|
||||
return U_SUCCESS(errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &, UErrorCode &) const override {
|
||||
return UNORM_YES;
|
||||
}
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override {
|
||||
return s.length();
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32) const override { return true; }
|
||||
virtual UBool hasBoundaryAfter(UChar32) const override { return true; }
|
||||
virtual UBool isInert(UChar32) const override { return true; }
|
||||
};
|
||||
|
||||
NoopNormalizer2::~NoopNormalizer2() {}
|
||||
|
||||
Normalizer2WithImpl::~Normalizer2WithImpl() {}
|
||||
|
||||
DecomposeNormalizer2::~DecomposeNormalizer2() {}
|
||||
|
||||
ComposeNormalizer2::~ComposeNormalizer2() {}
|
||||
|
||||
FCDNormalizer2::~FCDNormalizer2() {}
|
||||
|
||||
// instance cache ---------------------------------------------------------- ***
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV uprv_normalizer2_cleanup();
|
||||
U_CDECL_END
|
||||
|
||||
static Normalizer2 *noopSingleton;
|
||||
static icu::UInitOnce noopInitOnce {};
|
||||
|
||||
/**
 * Init-once callback: allocates the NoopNormalizer2 singleton and registers
 * the cleanup function. Sets U_MEMORY_ALLOCATION_ERROR if allocation fails,
 * in which case the cleanup function is not registered.
 */
static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    noopSingleton = new NoopNormalizer2;
    if (noopSingleton != nullptr) {
        ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
    } else {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
}
|
||||
|
||||
/**
 * Returns the no-op singleton, creating it on first use via umtx_initOnce().
 * Returns nullptr if errorCode already indicates a failure on entry.
 */
const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
        return noopSingleton;
    }
    return nullptr;
}
|
||||
|
||||
/**
 * Returns the address of the impl member of norm2.
 * The pointer is downcast without a runtime check, so norm2 must really be a
 * Normalizer2WithImpl.
 */
const Normalizer2Impl *
Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
    const Normalizer2WithImpl *withImpl = static_cast<const Normalizer2WithImpl *>(norm2);
    return &withImpl->impl;
}
|
||||
|
||||
/** Deletes the Normalizer2Impl held in the impl member. */
Norm2AllModes::~Norm2AllModes() {
    delete impl;
}
|
||||
|
||||
/**
 * Wraps impl in a newly allocated Norm2AllModes.
 * On every failure path (errorCode already failed, or allocation failed, which
 * sets U_MEMORY_ALLOCATION_ERROR) impl is deleted here and nullptr is returned.
 */
Norm2AllModes *
Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        Norm2AllModes *created = new Norm2AllModes(impl);
        if (created != nullptr) {
            return created;
        }
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
    // Shared failure exit: the impl passed in is released before returning.
    delete impl;
    return nullptr;
}
|
||||
|
||||
#if NORM2_HARDCODE_NFC_DATA
|
||||
/**
 * Allocates a Normalizer2Impl, initializes it from the norm2_nfc_data_* tables
 * and wraps it via createInstance().
 * Returns nullptr on a prior failure or when the impl cannot be allocated
 * (the latter sets U_MEMORY_ALLOCATION_ERROR).
 */
Norm2AllModes *
Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    Normalizer2Impl *nfcImpl = new Normalizer2Impl;
    if (nfcImpl == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    nfcImpl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
                  norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
    return createInstance(nfcImpl, errorCode);
}
|
||||
|
||||
static Norm2AllModes *nfcSingleton;
|
||||
|
||||
static icu::UInitOnce nfcInitOnce {};
|
||||
|
||||
/**
 * Init-once callback: stores the result of createNFCInstance() in nfcSingleton
 * and then registers the cleanup function. Registration happens regardless of
 * whether creation succeeded.
 */
static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
    nfcSingleton = Norm2AllModes::createNFCInstance(errorCode);
    ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
}
|
||||
|
||||
/**
 * Returns the NFC singleton, creating it on first use via umtx_initOnce().
 * Returns nullptr if errorCode already indicates a failure on entry.
 */
const Norm2AllModes *
Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
        return nfcSingleton;
    }
    return nullptr;
}
|
||||
|
||||
/** Returns the comp member of the NFC Norm2AllModes, or nullptr if that is unavailable. */
const Normalizer2 *
Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes = Norm2AllModes::getNFCInstance(errorCode);
    if (nfcModes == nullptr) {
        return nullptr;
    }
    return &nfcModes->comp;
}
|
||||
|
||||
/** Returns the decomp member of the NFC Norm2AllModes, or nullptr if that is unavailable. */
const Normalizer2 *
Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes = Norm2AllModes::getNFCInstance(errorCode);
    if (nfcModes == nullptr) {
        return nullptr;
    }
    return &nfcModes->decomp;
}
|
||||
|
||||
/** Returns the fcd member of the NFC Norm2AllModes, or nullptr if that is unavailable. */
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes = Norm2AllModes::getNFCInstance(errorCode);
    if (nfcModes == nullptr) {
        return nullptr;
    }
    return &nfcModes->fcd;
}
|
||||
|
||||
/** Returns the fcc member of the NFC Norm2AllModes, or nullptr if that is unavailable. */
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes = Norm2AllModes::getNFCInstance(errorCode);
    if (nfcModes == nullptr) {
        return nullptr;
    }
    return &nfcModes->fcc;
}
|
||||
|
||||
/** Returns the impl pointer of the NFC Norm2AllModes, or nullptr if that is unavailable. */
const Normalizer2Impl *
Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes = Norm2AllModes::getNFCInstance(errorCode);
    if (nfcModes == nullptr) {
        return nullptr;
    }
    return nfcModes->impl;
}
|
||||
#endif // NORM2_HARDCODE_NFC_DATA
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/**
 * Cleanup callback registered with ucln_common_registerCleanup().
 * Deletes the singletons owned by this file, clears their pointers and resets
 * the matching init-once objects. Always returns true.
 */
static UBool U_CALLCONV uprv_normalizer2_cleanup() {
    // No-op normalizer.
    delete noopSingleton;
    noopSingleton = nullptr;
    noopInitOnce.reset();

#if NORM2_HARDCODE_NFC_DATA
    // NFC modes built from the hardcoded data.
    delete nfcSingleton;
    nfcSingleton = nullptr;
    nfcInitOnce.reset();
#endif

    return true;
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/** C API: returns Normalizer2::getNFCInstance() as an opaque UNormalizer2 pointer. */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFCInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfc = Normalizer2::getNFCInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfc);
}
|
||||
|
||||
/** C API: returns Normalizer2::getNFDInstance() as an opaque UNormalizer2 pointer. */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfd = Normalizer2::getNFDInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfd);
}
|
||||
|
||||
/** C API: deletes the Normalizer2 object behind the opaque pointer. */
U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 *norm2) {
    Normalizer2 *n2 = reinterpret_cast<Normalizer2 *>(norm2);
    delete n2;
}
|
||||
|
||||
/**
 * C API: normalizes src[0..length) into dest[0..capacity).
 *
 * Arguments are rejected with U_ILLEGAL_ARGUMENT_ERROR when src is null with a
 * non-zero length, length is below -1, dest is null with a non-zero capacity,
 * capacity is negative, or src and dest are the same non-null pointer.
 * length may be -1; the limit pointer passed on is then nullptr.
 *
 * @return the result of extracting the destination string into dest, or 0 on
 *         a prior failure or an illegal argument.
 */
U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
                 const char16_t *src, int32_t length,
                 char16_t *dest, int32_t capacity,
                 UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    const UBool badSrc = (src == nullptr) ? (length != 0) : (length < -1);
    const UBool badDest = (dest == nullptr) ? (capacity != 0) : (capacity < 0);
    const UBool aliased = (src == dest) && (src != nullptr);
    if (badSrc || badDest || aliased) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(dest, 0, capacity);
    // length==0: Nothing to do, and n2wi->normalize(nullptr, nullptr, buffer, ...) would crash.
    if (length != 0) {
        const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
        const Normalizer2WithImpl *n2wi = dynamic_cast<const Normalizer2WithImpl *>(n2);
        if (n2wi == nullptr) {
            // Not impl-based: go through the public UnicodeString API.
            UnicodeString srcString(length < 0, src, length);
            n2->normalize(srcString, destString, *pErrorCode);
        } else {
            // Avoid duplicate argument checking and support NUL-terminated src.
            ReorderingBuffer buffer(n2wi->impl, destString);
            if (buffer.init(length, *pErrorCode)) {
                const char16_t *srcLimit = (length >= 0) ? src + length : nullptr;
                n2wi->normalize(src, srcLimit, buffer, *pErrorCode);
            }
        }
    }
    return destString.extract(dest, capacity, *pErrorCode);
}
|
||||
|
||||
/**
 * Shared implementation of unorm2_normalizeSecondAndAppend() (doNormalize=true)
 * and unorm2_append() (doNormalize=false).
 *
 * Arguments are rejected with U_ILLEGAL_ARGUMENT_ERROR when second is null with
 * a non-zero length, secondLength is below -1, first is null with a non-zero
 * capacity or length, firstCapacity is negative, firstLength is below -1, or
 * first and second are the same non-null pointer.
 *
 * @return the result of extracting the combined string into first, or 0 on a
 *         prior failure or an illegal argument.
 */
static int32_t
normalizeSecondAndAppend(const UNormalizer2 *norm2,
                         char16_t *first, int32_t firstLength, int32_t firstCapacity,
                         const char16_t *second, int32_t secondLength,
                         UBool doNormalize,
                         UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    const UBool badSecond = (second == nullptr) ? (secondLength != 0) : (secondLength < -1);
    const UBool badFirst = (first == nullptr) ?
        (firstCapacity != 0 || firstLength != 0) :
        (firstCapacity < 0 || firstLength < -1);
    const UBool aliased = (first == second) && (first != nullptr);
    if (badSecond || badFirst || aliased) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString firstString(first, firstLength, firstCapacity);
    firstLength = firstString.length();  // In case it was -1.
    // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(nullptr, nullptr, buffer, ...) would crash.
    if (secondLength != 0) {
        const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
        const Normalizer2WithImpl *n2wi = dynamic_cast<const Normalizer2WithImpl *>(n2);
        if (n2wi == nullptr) {
            // Not impl-based: go through the public UnicodeString API.
            UnicodeString secondString(secondLength < 0, second, secondLength);
            if (doNormalize) {
                n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
            } else {
                n2->append(firstString, secondString, *pErrorCode);
            }
        } else {
            // Avoid duplicate argument checking and support NUL-terminated src.
            UnicodeString safeMiddle;
            {
                ReorderingBuffer buffer(n2wi->impl, firstString);
                if (buffer.init(firstLength + secondLength + 1, *pErrorCode)) {  // destCapacity>=-1
                    const char16_t *secondLimit =
                        (secondLength >= 0) ? second + secondLength : nullptr;
                    n2wi->normalizeAndAppend(second, secondLimit,
                                             doNormalize, safeMiddle, buffer, *pErrorCode);
                }
            }  // The ReorderingBuffer destructor finalizes firstString.
            const UBool mustRestore =
                U_FAILURE(*pErrorCode) || firstString.length() > firstCapacity;
            // Restore the modified suffix of the first string.
            // This does not restore first[] array contents between firstLength and firstCapacity.
            // (That might be uninitialized memory, as far as we know.)
            if (mustRestore && first != nullptr) {  /* don't dereference nullptr */
                safeMiddle.extract(0, 0x7fffffff, first + firstLength - safeMiddle.length());
                if (firstLength < firstCapacity) {
                    first[firstLength] = 0;  // NUL-terminate in case it was originally.
                }
            }
        }
    }
    return firstString.extract(first, firstCapacity, *pErrorCode);
}
|
||||
|
||||
/** C API: forwards to the shared helper with doNormalize=true. */
U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
                                char16_t *first, int32_t firstLength, int32_t firstCapacity,
                                const char16_t *second, int32_t secondLength,
                                UErrorCode *pErrorCode) {
    const UBool doNormalize = true;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
|
||||
|
||||
/** C API: forwards to the shared helper with doNormalize=false. */
U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
              char16_t *first, int32_t firstLength, int32_t firstCapacity,
              const char16_t *second, int32_t secondLength,
              UErrorCode *pErrorCode) {
    const UBool doNormalize = false;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
|
||||
|
||||
/**
 * C API wrapper around Normalizer2::getDecomposition().
 *
 * A null decomposition buffer with a non-zero capacity, or a negative capacity,
 * sets U_ILLEGAL_ARGUMENT_ERROR.
 *
 * @return 0 on a prior failure or an illegal argument, -1 when
 *         getDecomposition() returns false, otherwise the result of extracting
 *         the decomposition into the caller's buffer.
 */
U_CAPI int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 *norm2,
                        UChar32 c, char16_t *decomposition, int32_t capacity,
                        UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    const UBool badBuffer = (decomposition == nullptr) ? (capacity != 0) : (capacity < 0);
    if (badBuffer) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(decomposition, 0, capacity);
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    if (!n2->getDecomposition(c, destString)) {
        return -1;
    }
    return destString.extract(decomposition, capacity, *pErrorCode);
}
|
||||
|
||||
/**
 * C API wrapper around Normalizer2::getRawDecomposition().
 *
 * A null decomposition buffer with a non-zero capacity, or a negative capacity,
 * sets U_ILLEGAL_ARGUMENT_ERROR.
 *
 * @return 0 on a prior failure or an illegal argument, -1 when
 *         getRawDecomposition() returns false, otherwise the result of
 *         extracting the decomposition into the caller's buffer.
 */
U_CAPI int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 *norm2,
                           UChar32 c, char16_t *decomposition, int32_t capacity,
                           UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    const UBool badBuffer = (decomposition == nullptr) ? (capacity != 0) : (capacity < 0);
    if (badBuffer) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(decomposition, 0, capacity);
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    if (!n2->getRawDecomposition(c, destString)) {
        return -1;
    }
    return destString.extract(decomposition, capacity, *pErrorCode);
}
|
||||
|
||||
/** C API: forwards to Normalizer2::composePair() on the object behind norm2. */
U_CAPI UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->composePair(a, b);
}
|
||||
|
||||
/** C API: forwards to Normalizer2::getCombiningClass() on the object behind norm2. */
U_CAPI uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->getCombiningClass(c);
}
|
||||
|
||||
/**
 * C API wrapper around Normalizer2::isNormalized().
 *
 * A null s with a non-zero length, or a length below -1, sets
 * U_ILLEGAL_ARGUMENT_ERROR. Returns false on a prior failure or an illegal
 * argument.
 */
U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
                    const char16_t *s, int32_t length,
                    UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return false;
    }
    const UBool nullWithLength = (s == nullptr) && (length != 0);
    if (nullWithLength || length < -1) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return false;
    }
    UnicodeString sString(length < 0, s, length);
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->isNormalized(sString, *pErrorCode);
}
|
||||
|
||||
/**
 * C API wrapper around Normalizer2::quickCheck().
 *
 * A null s with a non-zero length, or a length below -1, sets
 * U_ILLEGAL_ARGUMENT_ERROR. Returns UNORM_NO on a prior failure or an illegal
 * argument.
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
                  const char16_t *s, int32_t length,
                  UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return UNORM_NO;
    }
    const UBool nullWithLength = (s == nullptr) && (length != 0);
    if (nullWithLength || length < -1) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UNORM_NO;
    }
    UnicodeString sString(length < 0, s, length);
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->quickCheck(sString, *pErrorCode);
}
|
||||
|
||||
/**
 * C API wrapper around Normalizer2::spanQuickCheckYes().
 *
 * A null s with a non-zero length, or a length below -1, sets
 * U_ILLEGAL_ARGUMENT_ERROR. Returns 0 on a prior failure or an illegal
 * argument.
 */
U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
                         const char16_t *s, int32_t length,
                         UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    const UBool nullWithLength = (s == nullptr) && (length != 0);
    if (nullWithLength || length < -1) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString sString(length < 0, s, length);
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->spanQuickCheckYes(sString, *pErrorCode);
}
|
||||
|
||||
/** C API: forwards to Normalizer2::hasBoundaryBefore() on the object behind norm2. */
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->hasBoundaryBefore(c);
}
|
||||
|
||||
/** C API: forwards to Normalizer2::hasBoundaryAfter() on the object behind norm2. */
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->hasBoundaryAfter(c);
}
|
||||
|
||||
/** C API: forwards to Normalizer2::isInert() on the object behind norm2. */
U_CAPI UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2 = reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->isInert(c);
}
|
||||
|
||||
// Some properties APIs ---------------------------------------------------- ***
|
||||
|
||||
/**
 * Returns the combining class of c as reported by the NFD Normalizer2
 * instance, or 0 when that instance cannot be obtained.
 */
U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const Normalizer2 *nfd = Normalizer2::getNFDInstance(errorCode);
    if (U_FAILURE(errorCode)) {
        return 0;
    }
    return nfd->getCombiningClass(c);
}
|
||||
|
||||
/**
 * Returns getFCD16(c) from the NFC Normalizer2Impl, or 0 when that impl
 * cannot be obtained.
 */
U_CFUNC uint16_t
unorm_getFCD16(UChar32 c) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(errorCode);
    if (U_FAILURE(errorCode)) {
        return 0;
    }
    return nfcImpl->getFCD16(c);
}
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
2812
engine/thirdparty/icu4c/common/normalizer2impl.cpp
vendored
Normal file
2812
engine/thirdparty/icu4c/common/normalizer2impl.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
988
engine/thirdparty/icu4c/common/normalizer2impl.h
vendored
Normal file
988
engine/thirdparty/icu4c/common/normalizer2impl.h
vendored
Normal file
|
|
@ -0,0 +1,988 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: normalizer2impl.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009nov22
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __NORMALIZER2IMPL_H__
|
||||
#define __NORMALIZER2IMPL_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/utf.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "mutex.h"
|
||||
#include "udataswp.h"
|
||||
#include "uset_imp.h"
|
||||
|
||||
// When the nfc.nrm data is *not* hardcoded into the common library
|
||||
// (with this constant set to 0),
|
||||
// then it needs to be built into the data package:
|
||||
// Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT
|
||||
#define NORM2_HARDCODE_NFC_DATA 1
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct CanonIterData;
|
||||
|
||||
class ByteSink;
|
||||
class Edits;
|
||||
class InitCanonIterData;
|
||||
class LcccContext;
|
||||
|
||||
class U_COMMON_API Hangul {
|
||||
public:
|
||||
/* Korean Hangul and Jamo constants */
|
||||
enum {
|
||||
JAMO_L_BASE=0x1100, /* "lead" jamo */
|
||||
JAMO_L_END=0x1112,
|
||||
JAMO_V_BASE=0x1161, /* "vowel" jamo */
|
||||
JAMO_V_END=0x1175,
|
||||
JAMO_T_BASE=0x11a7, /* "trail" jamo */
|
||||
JAMO_T_END=0x11c2,
|
||||
|
||||
HANGUL_BASE=0xac00,
|
||||
HANGUL_END=0xd7a3,
|
||||
|
||||
JAMO_L_COUNT=19,
|
||||
JAMO_V_COUNT=21,
|
||||
JAMO_T_COUNT=28,
|
||||
|
||||
JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,
|
||||
|
||||
HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
|
||||
HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
|
||||
};
|
||||
|
||||
static inline UBool isHangul(UChar32 c) {
|
||||
return HANGUL_BASE<=c && c<HANGUL_LIMIT;
|
||||
}
|
||||
static inline UBool
|
||||
isHangulLV(UChar32 c) {
|
||||
c-=HANGUL_BASE;
|
||||
return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
|
||||
}
|
||||
static inline UBool isJamoL(UChar32 c) {
|
||||
return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
|
||||
}
|
||||
static inline UBool isJamoV(UChar32 c) {
|
||||
return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
|
||||
}
|
||||
static inline UBool isJamoT(UChar32 c) {
|
||||
int32_t t=c-JAMO_T_BASE;
|
||||
return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself
|
||||
}
|
||||
static UBool isJamo(UChar32 c) {
|
||||
return JAMO_L_BASE<=c && c<=JAMO_T_END &&
|
||||
(c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decomposes c, which must be a Hangul syllable, into buffer
|
||||
* and returns the length of the decomposition (2 or 3).
|
||||
*/
|
||||
static inline int32_t decompose(UChar32 c, char16_t buffer[3]) {
|
||||
c-=HANGUL_BASE;
|
||||
UChar32 c2=c%JAMO_T_COUNT;
|
||||
c/=JAMO_T_COUNT;
|
||||
buffer[0]=(char16_t)(JAMO_L_BASE+c/JAMO_V_COUNT);
|
||||
buffer[1]=(char16_t)(JAMO_V_BASE+c%JAMO_V_COUNT);
|
||||
if(c2==0) {
|
||||
return 2;
|
||||
} else {
|
||||
buffer[2]=(char16_t)(JAMO_T_BASE+c2);
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decomposes c, which must be a Hangul syllable, into buffer.
|
||||
* This is the raw, not recursive, decomposition. Its length is always 2.
|
||||
*/
|
||||
static inline void getRawDecomposition(UChar32 c, char16_t buffer[2]) {
|
||||
UChar32 orig=c;
|
||||
c-=HANGUL_BASE;
|
||||
UChar32 c2=c%JAMO_T_COUNT;
|
||||
if(c2==0) {
|
||||
c/=JAMO_T_COUNT;
|
||||
buffer[0]=(char16_t)(JAMO_L_BASE+c/JAMO_V_COUNT);
|
||||
buffer[1]=(char16_t)(JAMO_V_BASE+c%JAMO_V_COUNT);
|
||||
} else {
|
||||
buffer[0]=(char16_t)(orig-c2); // LV syllable
|
||||
buffer[1]=(char16_t)(JAMO_T_BASE+c2);
|
||||
}
|
||||
}
|
||||
private:
|
||||
Hangul() = delete; // no instantiation
|
||||
};
|
||||
|
||||
class Normalizer2Impl;
|
||||
|
||||
/**
 * Output buffer that appends UTF-16 text to a UnicodeString.
 * It tracks the written range [start, limit[, the capacity that is left,
 * and the combining class of the last appended character (lastCC).
 * A character whose combining class is lower than lastCC is passed to insert()
 * rather than written at the end.
 */
class U_COMMON_API ReorderingBuffer : public UMemory {
public:
    /** Constructs only; init() should be called. */
    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
        impl(ni), str(dest),
        start(nullptr), reorderStart(nullptr), limit(nullptr),
        remainingCapacity(0), lastCC(0) {}
    /** Constructs, removes the string contents, and initializes for a small initial capacity. */
    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
    /** Releases the string buffer with the current length, unless start is still null. */
    ~ReorderingBuffer() {
        if (start == nullptr) {
            return;
        }
        str.releaseBuffer(static_cast<int32_t>(limit - start));
    }
    UBool init(int32_t destCapacity, UErrorCode &errorCode);

    /** True if nothing has been written between start and limit. */
    UBool isEmpty() const { return limit == start; }
    /** Number of code units in [start, limit[. */
    int32_t length() const { return static_cast<int32_t>(limit - start); }
    char16_t *getStart() { return start; }
    char16_t *getLimit() { return limit; }
    uint8_t getLastCC() const { return lastCC; }

    UBool equals(const char16_t *otherStart, const char16_t *otherLimit) const;
    UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;

    /** Appends code point c with combining class cc, dispatching on BMP vs. supplementary. */
    UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
        if (c <= 0xffff) {
            return appendBMP(static_cast<char16_t>(c), cc, errorCode);
        }
        return appendSupplementary(c, cc, errorCode);
    }
    UBool append(const char16_t *s, int32_t length, UBool isNFD,
                 uint8_t leadCC, uint8_t trailCC,
                 UErrorCode &errorCode);
    /**
     * Appends the single code unit c with combining class cc.
     * @return false if the buffer was full and resize() failed
     */
    UBool appendBMP(char16_t c, uint8_t cc, UErrorCode &errorCode) {
        if (remainingCapacity == 0 && !resize(1, errorCode)) {
            return false;
        }
        const bool inOrder = (lastCC <= cc) || (cc == 0);
        if (!inOrder) {
            // Lower combining class than the previous character.
            insert(c, cc);
        } else {
            *limit = c;
            ++limit;
            lastCC = cc;
            if (cc <= 1) {
                reorderStart = limit;
            }
        }
        remainingCapacity -= 1;
        return true;
    }
    UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
    UBool appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode);
    void remove();
    void removeSuffix(int32_t suffixLength);
    /**
     * Moves limit and reorderStart to newLimit, adjusts the remaining capacity
     * by the difference, and resets lastCC to 0.
     */
    void setReorderingLimit(char16_t *newLimit) {
        const int32_t reclaimed = static_cast<int32_t>(limit - newLimit);
        remainingCapacity += reclaimed;
        limit = newLimit;
        reorderStart = newLimit;
        lastCC = 0;
    }
    /** Sets s to the text in [reorderStart, limit[. */
    void copyReorderableSuffixTo(UnicodeString &s) const {
        const int32_t suffixLength = static_cast<int32_t>(limit - reorderStart);
        s.setTo(ConstChar16Ptr(reorderStart), suffixLength);
    }
private:
    /*
     * TODO: Revisit whether it makes sense to track reorderStart.
     * It is set to after the last known character with cc<=1,
     * which stops previousCC() before it reads that character and looks up its cc.
     * previousCC() is normally only called from insert().
     * In other words, reorderStart speeds up the insertion of a combining mark
     * into a multi-combining mark sequence where it does not belong at the end.
     * This might not be worth the trouble.
     * On the other hand, it's not a huge amount of trouble.
     *
     * We probably need it for UNORM_SIMPLE_APPEND.
     */

    UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
    void insert(UChar32 c, uint8_t cc);
    /** Writes c at p as one code unit (BMP) or as a lead/trail surrogate pair. */
    static void writeCodePoint(char16_t *p, UChar32 c) {
        if (c > 0xffff) {
            p[0] = U16_LEAD(c);
            p[1] = U16_TRAIL(c);
            return;
        }
        *p = static_cast<char16_t>(c);
    }
    UBool resize(int32_t appendLength, UErrorCode &errorCode);

    const Normalizer2Impl &impl;
    UnicodeString &str;
    char16_t *start, *reorderStart, *limit;
    int32_t remainingCapacity;
    uint8_t lastCC;

    // private backward iterator
    void setIterator() { codePointStart = limit; }
    void skipPrevious();  // Requires start<codePointStart.
    uint8_t previousCC();  // Returns 0 if there is no previous character.

    char16_t *codePointStart, *codePointLimit;
};
|
||||
|
||||
/**
|
||||
* Low-level implementation of the Unicode Normalization Algorithm.
|
||||
* For the data structure and details see the documentation at the end of
|
||||
* this normalizer2impl.h and in the design doc at
|
||||
* https://icu.unicode.org/design/normalization/custom
|
||||
*/
|
||||
class U_COMMON_API Normalizer2Impl : public UObject {
|
||||
public:
|
||||
/** Nulls normTrie and fCanonIterData; the other fields are not set here. */
Normalizer2Impl()
    : normTrie(nullptr),
      fCanonIterData(nullptr) {}
|
||||
virtual ~Normalizer2Impl();
|
||||
|
||||
void init(const int32_t *inIndexes, const UCPTrie *inTrie,
|
||||
const uint16_t *inExtraData, const uint8_t *inSmallFCD);
|
||||
|
||||
void addLcccChars(UnicodeSet &set) const;
|
||||
void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
|
||||
void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
|
||||
|
||||
// low-level properties ------------------------------------------------ ***
|
||||
|
||||
UBool ensureCanonIterData(UErrorCode &errorCode) const;
|
||||
|
||||
// The trie stores values for lead surrogate code *units*.
|
||||
// Surrogate code *points* are inert.
|
||||
uint16_t getNorm16(UChar32 c) const {
|
||||
return U_IS_LEAD(c) ?
|
||||
static_cast<uint16_t>(INERT) :
|
||||
UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);
|
||||
}
|
||||
/** Returns the trie value for c as stored, without the lead-surrogate special case of getNorm16(). */
uint16_t getRawNorm16(UChar32 c) const {
    return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);
}
|
||||
|
||||
/**
 * Maps a norm16 value to a composition quick check result:
 * UNORM_YES below minNoNo or from MIN_YES_YES_WITH_CC up,
 * UNORM_MAYBE from minMaybeYes up, otherwise UNORM_NO.
 */
UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
    if (MIN_YES_YES_WITH_CC <= norm16 || norm16 < minNoNo) {
        return UNORM_YES;
    }
    if (norm16 < minMaybeYes) {
        return UNORM_NO;
    }
    return UNORM_MAYBE;
}
|
||||
/** True if norm16 is in [limitNoNo, minMaybeYes[. */
UBool isAlgorithmicNoNo(uint16_t norm16) const {
    if (norm16 < limitNoNo) {
        return false;
    }
    return norm16 < minMaybeYes;
}
|
||||
/** True if norm16 is in [minNoNo, minMaybeYes[. */
UBool isCompNo(uint16_t norm16) const {
    if (norm16 < minNoNo) {
        return false;
    }
    return norm16 < minMaybeYes;
}
|
||||
/** True if norm16 is outside [minYesNo, minMaybeYes[. */
UBool isDecompYes(uint16_t norm16) const {
    if (norm16 < minYesNo) {
        return true;
    }
    return minMaybeYes <= norm16;
}
|
||||
|
||||
uint8_t getCC(uint16_t norm16) const {
|
||||
if(norm16>=MIN_NORMAL_MAYBE_YES) {
|
||||
return getCCFromNormalYesOrMaybe(norm16);
|
||||
}
|
||||
if(norm16<minNoNo || limitNoNo<=norm16) {
|
||||
return 0;
|
||||
}
|
||||
return getCCFromNoNo(norm16);
|
||||
}
|
||||
static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
|
||||
return (uint8_t)(norm16 >> OFFSET_SHIFT);
|
||||
}
|
||||
static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
|
||||
return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
|
||||
}
|
||||
/**
 * Code point variant of getCCFromYesOrMaybe().
 * Code points below minCompNoMaybeCP return 0 without a trie lookup.
 */
uint8_t getCCFromYesOrMaybeCP(UChar32 c) const {
    if (c >= minCompNoMaybeCP) {
        return getCCFromYesOrMaybe(getNorm16(c));
    }
    return 0;
}
|
||||
|
||||
/**
|
||||
* Returns the FCD data for code point c.
|
||||
* @param c A Unicode code point.
|
||||
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
|
||||
*/
|
||||
uint16_t getFCD16(UChar32 c) const {
|
||||
if(c<minDecompNoCP) {
|
||||
return 0;
|
||||
} else if(c<=0xffff) {
|
||||
if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
|
||||
}
|
||||
return getFCD16FromNormData(c);
|
||||
}
|
||||
/**
|
||||
* Returns the FCD data for the next code point (post-increment).
|
||||
* Might skip only a lead surrogate rather than the whole surrogate pair if none of
|
||||
* the supplementary code points associated with the lead surrogate have non-zero FCD data.
|
||||
* @param s A valid pointer into a string. Requires s!=limit.
|
||||
* @param limit The end of the string, or NULL.
|
||||
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
|
||||
*/
|
||||
uint16_t nextFCD16(const char16_t *&s, const char16_t *limit) const {
|
||||
UChar32 c=*s++;
|
||||
if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
return 0;
|
||||
}
|
||||
char16_t c2;
|
||||
if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) {
|
||||
c=U16_GET_SUPPLEMENTARY(c, c2);
|
||||
++s;
|
||||
}
|
||||
return getFCD16FromNormData(c);
|
||||
}
|
||||
/**
|
||||
* Returns the FCD data for the previous code point (pre-decrement).
|
||||
* @param start The start of the string.
|
||||
* @param s A valid pointer into a string. Requires start<s.
|
||||
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
|
||||
*/
|
||||
uint16_t previousFCD16(const char16_t *start, const char16_t *&s) const {
|
||||
UChar32 c=*--s;
|
||||
if(c<minDecompNoCP) {
|
||||
return 0;
|
||||
}
|
||||
if(!U16_IS_TRAIL(c)) {
|
||||
if(!singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
char16_t c2;
|
||||
if(start<s && U16_IS_LEAD(c2=*(s-1))) {
|
||||
c=U16_GET_SUPPLEMENTARY(c2, c);
|
||||
--s;
|
||||
}
|
||||
}
|
||||
return getFCD16FromNormData(c);
|
||||
}
|
||||
|
||||
/** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
|
||||
UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
|
||||
// 0<=lead<=0xffff
|
||||
uint8_t bits=smallFCD[lead>>8];
|
||||
if(bits==0) { return false; }
|
||||
return (UBool)((bits>>((lead>>5)&7))&1);
|
||||
}
|
||||
/** Returns the FCD value from the regular normalization data. */
|
||||
uint16_t getFCD16FromNormData(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* Gets the decomposition for one code point.
|
||||
* @param c code point
|
||||
* @param buffer out-only buffer for algorithmic decompositions
|
||||
* @param length out-only, takes the length of the decomposition, if any
|
||||
* @return pointer to the decomposition, or NULL if none
|
||||
*/
|
||||
const char16_t *getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const;
|
||||
|
||||
/**
|
||||
* Gets the raw decomposition for one code point.
|
||||
* @param c code point
|
||||
* @param buffer out-only buffer for algorithmic decompositions
|
||||
* @param length out-only, takes the length of the decomposition, if any
|
||||
* @return pointer to the decomposition, or NULL if none
|
||||
*/
|
||||
const char16_t *getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const;
|
||||
|
||||
UChar32 composePair(UChar32 a, UChar32 b) const;
|
||||
|
||||
UBool isCanonSegmentStarter(UChar32 c) const;
|
||||
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
|
||||
|
||||
enum {
    // Fixed norm16 values, in ascending order.
    INERT=1,  // offset=0 hasCompBoundaryAfter=true
    JAMO_L=2,  // offset=1 hasCompBoundaryAfter=false
    MIN_NORMAL_MAYBE_YES=0xfc00,
    JAMO_VT=0xfe00,
    MIN_YES_YES_WITH_CC=0xfe02,

    // norm16 bit 0 is comp-boundary-after.
    HAS_COMP_BOUNDARY_AFTER=1,
    // Right shift applied to a norm16 value to drop bit 0.
    OFFSET_SHIFT=1,

    // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
    // tccc (0, 1, >1) for quick FCC boundary-after tests.
    DELTA_TCCC_0=0,
    DELTA_TCCC_1=2,
    DELTA_TCCC_GT_1=4,
    DELTA_TCCC_MASK=6,
    // Right shift applied to a norm16 value to get the stored delta.
    DELTA_SHIFT=3,

    MAX_DELTA=0x40
};
|
||||
|
||||
enum {
    // Positions in the indexes[] array. The values are implicit (0, 1, 2, ...),
    // so the order of the enumerators must not change.

    // Byte offsets from the start of the data, after the generic header.
    IX_NORM_TRIE_OFFSET,   // 0
    IX_EXTRA_DATA_OFFSET,  // 1
    IX_SMALL_FCD_OFFSET,   // 2
    IX_RESERVED3_OFFSET,   // 3
    IX_RESERVED4_OFFSET,   // 4
    IX_RESERVED5_OFFSET,   // 5
    IX_RESERVED6_OFFSET,   // 6
    IX_TOTAL_SIZE,         // 7

    // Code point thresholds for quick check codes.
    IX_MIN_DECOMP_NO_CP,      // 8
    IX_MIN_COMP_NO_MAYBE_CP,  // 9

    // Norm16 value thresholds for quick check combinations and types of extra data.

    /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
    IX_MIN_YES_NO,  // 10
    /** Mappings are comp-normalized. */
    IX_MIN_NO_NO,  // 11
    IX_LIMIT_NO_NO,  // 12
    IX_MIN_MAYBE_YES,  // 13

    /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
    IX_MIN_YES_NO_MAPPINGS_ONLY,  // 14
    /** Mappings are not comp-normalized but have a comp boundary before. */
    IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,  // 15
    /** Mappings do not have a comp boundary before. */
    IX_MIN_NO_NO_COMP_NO_MAYBE_CC,  // 16
    /** Mappings to the empty string. */
    IX_MIN_NO_NO_EMPTY,  // 17

    IX_MIN_LCCC_CP,  // 18
    IX_RESERVED19,   // 19
    IX_COUNT         // 20: number of index slots
};
|
||||
|
||||
enum {
    // Bits of the first unit of a mapping (the unit that getMapping() points to).
    MAPPING_HAS_CCC_LCCC_WORD=0x80,  // when set, getCCFromNoNo() reads the unit before the mapping
    MAPPING_HAS_RAW_MAPPING=0x40,
    // unused bit 0x20,
    MAPPING_LENGTH_MASK=0x1f  // low bits: mapping length
};
|
||||
|
||||
enum {
    // Constants for the entries of a compositions list.
    // NOTE(review): the entry layout itself is not visible in this header;
    // see the code that reads these lists (combine()) before changing any value.
    COMP_1_LAST_TUPLE=0x8000,
    COMP_1_TRIPLE=1,
    COMP_1_TRAIL_LIMIT=0x3400,
    COMP_1_TRAIL_MASK=0x7ffe,
    COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
    COMP_2_TRAIL_SHIFT=6,
    COMP_2_TRAIL_MASK=0xffc0
};
|
||||
|
||||
// higher-level functionality ------------------------------------------ ***
|
||||
|
||||
// NFD without an NFD Normalizer2 instance.
|
||||
UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest,
|
||||
UErrorCode &errorCode) const;
|
||||
/**
|
||||
* Decomposes [src, limit[ and writes the result to dest.
|
||||
* limit can be NULL if src is NUL-terminated.
|
||||
* destLengthEstimate is the initial dest buffer capacity and can be -1.
|
||||
*/
|
||||
void decompose(const char16_t *src, const char16_t *limit,
|
||||
UnicodeString &dest, int32_t destLengthEstimate,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
const char16_t *decompose(const char16_t *src, const char16_t *limit,
|
||||
ReorderingBuffer *buffer, UErrorCode &errorCode) const;
|
||||
void decomposeAndAppend(const char16_t *src, const char16_t *limit,
|
||||
UBool doDecompose,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/** sink==nullptr: isNormalized()/spanQuickCheckYes() */
|
||||
const uint8_t *decomposeUTF8(uint32_t options,
|
||||
const uint8_t *src, const uint8_t *limit,
|
||||
ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;
|
||||
|
||||
UBool compose(const char16_t *src, const char16_t *limit,
|
||||
UBool onlyContiguous,
|
||||
UBool doCompose,
|
||||
ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
const char16_t *composeQuickCheck(const char16_t *src, const char16_t *limit,
|
||||
UBool onlyContiguous,
|
||||
UNormalizationCheckResult *pQCResult) const;
|
||||
void composeAndAppend(const char16_t *src, const char16_t *limit,
|
||||
UBool doCompose,
|
||||
UBool onlyContiguous,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/** sink==nullptr: isNormalized() */
|
||||
UBool composeUTF8(uint32_t options, UBool onlyContiguous,
|
||||
const uint8_t *src, const uint8_t *limit,
|
||||
ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;
|
||||
|
||||
const char16_t *makeFCD(const char16_t *src, const char16_t *limit,
|
||||
ReorderingBuffer *buffer, UErrorCode &errorCode) const;
|
||||
void makeFCDAndAppend(const char16_t *src, const char16_t *limit,
|
||||
UBool doMakeFCD,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
UBool hasDecompBoundaryBefore(UChar32 c) const;
|
||||
UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
|
||||
UBool hasDecompBoundaryAfter(UChar32 c) const;
|
||||
UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
|
||||
/** True if the norm16 value of c passes isDecompYesAndZeroCC(). */
UBool isDecompInert(UChar32 c) const {
    const uint16_t norm16 = getNorm16(c);
    return isDecompYesAndZeroCC(norm16);
}
|
||||
|
||||
/**
 * True if c is below minCompNoMaybeCP (no lookup needed)
 * or its norm16 value passes norm16HasCompBoundaryBefore().
 */
UBool hasCompBoundaryBefore(UChar32 c) const {
    if (c < minCompNoMaybeCP) {
        return true;
    }
    return norm16HasCompBoundaryBefore(getNorm16(c));
}
|
||||
/** Code point variant of norm16HasCompBoundaryAfter(). */
UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
    const uint16_t norm16 = getNorm16(c);
    return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
}
|
||||
/**
 * True if c passes isCompYesAndZeroCC() and has the comp-boundary-after bit set.
 * With onlyContiguous, it must additionally be inert or have a first mapping unit <= 0x1ff.
 */
UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
    const uint16_t norm16 = getNorm16(c);
    if (!isCompYesAndZeroCC(norm16)) {
        return false;
    }
    if ((norm16 & HAS_COMP_BOUNDARY_AFTER) == 0) {
        return false;
    }
    if (!onlyContiguous || isInert(norm16)) {
        return true;
    }
    return *getMapping(norm16) <= 0x1ff;
}
|
||||
|
||||
/** Same test as hasDecompBoundaryBefore(). */
UBool hasFCDBoundaryBefore(UChar32 c) const {
    return hasDecompBoundaryBefore(c);
}
|
||||
/** Same test as hasDecompBoundaryAfter(). */
UBool hasFCDBoundaryAfter(UChar32 c) const {
    return hasDecompBoundaryAfter(c);
}
|
||||
/** True if the FCD value of c is 0 or 1. */
UBool isFCDInert(UChar32 c) const {
    const uint16_t fcd16 = getFCD16(c);
    return fcd16 <= 1;
}
|
||||
private:
|
||||
friend class InitCanonIterData;
|
||||
friend class LcccContext;
|
||||
|
||||
/** True if norm16 is in [minMaybeYes, JAMO_VT]. */
UBool isMaybe(uint16_t norm16) const {
    if (norm16 < minMaybeYes) {
        return false;
    }
    return norm16 <= JAMO_VT;
}
|
||||
/** True if norm16 is at or above minMaybeYes. */
UBool isMaybeOrNonZeroCC(uint16_t norm16) const {
    return minMaybeYes <= norm16;
}
|
||||
/** True if norm16 is exactly the INERT value. */
static UBool isInert(uint16_t norm16) {
    return INERT == norm16;
}
|
||||
/** True if norm16 is exactly the JAMO_L value. */
static UBool isJamoL(uint16_t norm16) {
    return JAMO_L == norm16;
}
|
||||
/** True if norm16 is exactly the JAMO_VT value. */
static UBool isJamoVT(uint16_t norm16) {
    return JAMO_VT == norm16;
}
|
||||
/** Returns minYesNoMappingsOnly with the comp-boundary-after bit set. */
uint16_t hangulLVT() const {
    return minYesNoMappingsOnly | HAS_COMP_BOUNDARY_AFTER;
}
|
||||
/** True if norm16 is exactly minYesNo. */
UBool isHangulLV(uint16_t norm16) const {
    return minYesNo == norm16;
}
|
||||
/** True if norm16 is exactly the value returned by hangulLVT(). */
UBool isHangulLVT(uint16_t norm16) const {
    const uint16_t lvt = hangulLVT();
    return norm16 == lvt;
}
|
||||
/** True if norm16 is below minNoNo. */
UBool isCompYesAndZeroCC(uint16_t norm16) const {
    return minNoNo > norm16;
}
|
||||
// UBool isCompYes(uint16_t norm16) const {
|
||||
// return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
|
||||
// }
|
||||
// UBool isCompYesOrMaybe(uint16_t norm16) const {
|
||||
// return norm16<minNoNo || minMaybeYes<=norm16;
|
||||
// }
|
||||
// UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
|
||||
// return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
|
||||
// }
|
||||
/**
 * True if norm16 is below minYesNo, equals JAMO_VT,
 * or lies in [minMaybeYes, MIN_NORMAL_MAYBE_YES].
 */
UBool isDecompYesAndZeroCC(uint16_t norm16) const {
    if (norm16 < minYesNo) {
        return true;
    }
    if (norm16 == JAMO_VT) {
        return true;
    }
    return minMaybeYes <= norm16 && norm16 <= MIN_NORMAL_MAYBE_YES;
}
|
||||
/**
|
||||
* A little faster and simpler than isDecompYesAndZeroCC() but does not include
|
||||
* the MaybeYes which combine-forward and have ccc=0.
|
||||
* (Standard Unicode 10 normalization does not have such characters.)
|
||||
*/
|
||||
UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
|
||||
return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
|
||||
}
|
||||
/** True if norm16 is at or above limitNoNo. */
UBool isDecompNoAlgorithmic(uint16_t norm16) const {
    return limitNoNo <= norm16;
}
|
||||
|
||||
// For use with isCompYes().
|
||||
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
|
||||
// static uint8_t getCCFromYes(uint16_t norm16) {
|
||||
// return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
|
||||
// }
|
||||
uint8_t getCCFromNoNo(uint16_t norm16) const {
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
|
||||
return (uint8_t)*(mapping-1);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
// requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
|
||||
uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
|
||||
if(norm16<=minYesNo) {
|
||||
return 0; // yesYes and Hangul LV have ccc=tccc=0
|
||||
} else {
|
||||
// For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
|
||||
return (uint8_t)(*getMapping(norm16)>>8); // tccc from yesNo
|
||||
}
|
||||
}
|
||||
uint8_t getPreviousTrailCC(const char16_t *start, const char16_t *p) const;
|
||||
uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;
|
||||
|
||||
// Requires algorithmic-NoNo.
|
||||
UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
|
||||
return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
|
||||
}
|
||||
/** Returns the delta stored in norm16: norm16>>DELTA_SHIFT, minus centerNoNoDelta. */
UChar32 getAlgorithmicDelta(uint16_t norm16) const {
    const int32_t storedDelta = norm16 >> DELTA_SHIFT;
    return storedDelta - centerNoNoDelta;
}
|
||||
|
||||
// Requires minYesNo<norm16<limitNoNo.
|
||||
const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); }
|
||||
const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
|
||||
if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
|
||||
return nullptr;
|
||||
} else if(norm16<minMaybeYes) {
|
||||
return getMapping(norm16); // for yesYes; if Jamo L: harmless empty list
|
||||
} else {
|
||||
return maybeYesCompositions+norm16-minMaybeYes;
|
||||
}
|
||||
}
|
||||
const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
|
||||
// A composite has both mapping & compositions list.
|
||||
const uint16_t *list=getMapping(norm16);
|
||||
return list+ // mapping pointer
|
||||
1+ // +1 to skip the first unit with the mapping length
|
||||
(*list&MAPPING_LENGTH_MASK); // + mapping length
|
||||
}
|
||||
const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const {
|
||||
// minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
|
||||
return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT);
|
||||
}
|
||||
/**
|
||||
* @param c code point must have compositions
|
||||
* @return compositions list pointer
|
||||
*/
|
||||
const uint16_t *getCompositionsList(uint16_t norm16) const {
|
||||
return isDecompYes(norm16) ?
|
||||
getCompositionsListForDecompYes(norm16) :
|
||||
getCompositionsListForComposite(norm16);
|
||||
}
|
||||
|
||||
const char16_t *copyLowPrefixFromNulTerminated(const char16_t *src,
|
||||
UChar32 minNeedDataCP,
|
||||
ReorderingBuffer *buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/** Stop condition passed to the UTF-8 decomposeShort() overload. */
enum StopAt {
    STOP_AT_LIMIT,
    STOP_AT_DECOMP_BOUNDARY,
    STOP_AT_COMP_BOUNDARY
};
|
||||
|
||||
const char16_t *decomposeShort(const char16_t *src, const char16_t *limit,
|
||||
UBool stopAtCompBoundary, UBool onlyContiguous,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
UBool decompose(UChar32 c, uint16_t norm16,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
|
||||
const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
|
||||
StopAt stopAt, UBool onlyContiguous,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
|
||||
static int32_t combine(const uint16_t *list, UChar32 trail);
|
||||
void addComposites(const uint16_t *list, UnicodeSet &set) const;
|
||||
void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
|
||||
UBool onlyContiguous) const;
|
||||
|
||||
/** Like hasCompBoundaryBefore(c), for callers that already have the norm16 value of c. */
UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
    if (c < minCompNoMaybeCP) {
        return true;
    }
    return norm16HasCompBoundaryBefore(norm16);
}
|
||||
/** True if norm16 is below minNoNoCompNoMaybeCC or passes isAlgorithmicNoNo(). */
UBool norm16HasCompBoundaryBefore(uint16_t norm16) const {
    if (norm16 < minNoNoCompNoMaybeCC) {
        return true;
    }
    return isAlgorithmicNoNo(norm16);
}
|
||||
UBool hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const;
|
||||
UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
|
||||
UBool hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
|
||||
UBool onlyContiguous) const;
|
||||
UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
|
||||
UBool onlyContiguous) const;
|
||||
/**
 * True if norm16 has the comp-boundary-after bit set and, when onlyContiguous
 * is requested, also passes isTrailCC01ForCompBoundaryAfter().
 */
UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
    if ((norm16 & HAS_COMP_BOUNDARY_AFTER) == 0) {
        return false;
    }
    if (!onlyContiguous) {
        return true;
    }
    return isTrailCC01ForCompBoundaryAfter(norm16);
}
|
||||
/** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
|
||||
UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
|
||||
return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
|
||||
(norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff);
|
||||
}
|
||||
|
||||
const char16_t *findPreviousCompBoundary(const char16_t *start, const char16_t *p, UBool onlyContiguous) const;
|
||||
const char16_t *findNextCompBoundary(const char16_t *p, const char16_t *limit, UBool onlyContiguous) const;
|
||||
|
||||
const char16_t *findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const;
|
||||
const char16_t *findNextFCDBoundary(const char16_t *p, const char16_t *limit) const;
|
||||
|
||||
void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
|
||||
CanonIterData &newData, UErrorCode &errorCode) const;
|
||||
|
||||
int32_t getCanonValue(UChar32 c) const;
|
||||
const UnicodeSet &getCanonStartSet(int32_t n) const;
|
||||
|
||||
// UVersionInfo dataVersion;
|
||||
|
||||
// BMP code point thresholds for quick check loops looking at single UTF-16 code units.
|
||||
char16_t minDecompNoCP;
|
||||
char16_t minCompNoMaybeCP;
|
||||
char16_t minLcccCP;
|
||||
|
||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||
uint16_t minYesNo;
|
||||
uint16_t minYesNoMappingsOnly;
|
||||
uint16_t minNoNo;
|
||||
uint16_t minNoNoCompBoundaryBefore;
|
||||
uint16_t minNoNoCompNoMaybeCC;
|
||||
uint16_t minNoNoEmpty;
|
||||
uint16_t limitNoNo;
|
||||
uint16_t centerNoNoDelta;
|
||||
uint16_t minMaybeYes;
|
||||
|
||||
const UCPTrie *normTrie;
|
||||
const uint16_t *maybeYesCompositions;
|
||||
const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
|
||||
const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
|
||||
|
||||
UInitOnce fCanonIterDataInitOnce {};
|
||||
CanonIterData *fCanonIterData;
|
||||
};
|
||||
|
||||
// bits in canonIterData
|
||||
#define CANON_NOT_SEGMENT_STARTER 0x80000000
|
||||
#define CANON_HAS_COMPOSITIONS 0x40000000
|
||||
#define CANON_HAS_SET 0x200000
|
||||
#define CANON_VALUE_MASK 0x1fffff
|
||||
|
||||
/**
|
||||
* ICU-internal shortcut for quick access to standard Unicode normalization.
|
||||
*/
|
||||
class U_COMMON_API Normalizer2Factory {
|
||||
public:
|
||||
static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
|
||||
|
||||
static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
|
||||
|
||||
static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
|
||||
static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
|
||||
static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
|
||||
|
||||
// Get the Impl instance of the Normalizer2.
|
||||
// Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
|
||||
static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
|
||||
private:
|
||||
Normalizer2Factory() = delete; // No instantiation.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm2_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Get the NF*_QC property for a code point, for u_getIntPropertyValue().
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UNormalizationCheckResult
|
||||
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
|
||||
|
||||
/**
|
||||
* Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC uint16_t
|
||||
unorm_getFCD16(UChar32 c);
|
||||
|
||||
/**
|
||||
* Format of Normalizer2 .nrm data files.
|
||||
* Format version 4.0.
|
||||
*
|
||||
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
|
||||
* ICU ships with data files for standard Unicode Normalization Forms
|
||||
* NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm),
|
||||
* NFKC_Casefold (nfkc_cf.nrm) and NFKC_Simple_Casefold (nfkc_scf.nrm).
|
||||
* Custom (application-specific) data can be built into additional .nrm files
|
||||
* with the gennorm2 build tool.
|
||||
* ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
|
||||
*
|
||||
* Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
|
||||
* cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
|
||||
*
|
||||
* A .nrm file begins with a standard ICU data file header
|
||||
* (DataHeader, see ucmndata.h and unicode/udata.h).
|
||||
* The UDataInfo.dataVersion field usually contains the Unicode version
|
||||
* for which the data was generated.
|
||||
*
|
||||
* After the header, the file contains the following parts.
|
||||
* Constants are defined as enum values of the Normalizer2Impl class.
|
||||
*
|
||||
* Many details of the data structures are described in the design doc
|
||||
* which is at https://icu.unicode.org/design/normalization/custom
|
||||
*
|
||||
* int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
|
||||
*
|
||||
* The first eight indexes are byte offsets in ascending order.
|
||||
* Each byte offset marks the start of the next part in the data file,
|
||||
* and the end of the previous one.
|
||||
* When two consecutive byte offsets are the same, then the corresponding part is empty.
|
||||
* Byte offsets are offsets from after the header,
|
||||
* that is, from the beginning of the indexes[].
|
||||
* Each part starts at an offset with proper alignment for its data.
|
||||
* If necessary, the previous part may include padding bytes to achieve this alignment.
|
||||
*
|
||||
* minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
|
||||
* with a decomposition mapping, that is, with NF*D_QC=No.
|
||||
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
|
||||
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
|
||||
* minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
|
||||
* is the lowest code point with lccc!=0.
|
||||
*
|
||||
* The next eight indexes are thresholds of 16-bit trie values for ranges of
|
||||
* values indicating multiple normalization properties.
|
||||
* They are listed here in threshold order, not in the order they are stored in the indexes.
|
||||
* minYesNo=indexes[IX_MIN_YES_NO];
|
||||
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
|
||||
* minNoNo=indexes[IX_MIN_NO_NO];
|
||||
* minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
|
||||
* minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
|
||||
* minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
|
||||
* limitNoNo=indexes[IX_LIMIT_NO_NO];
|
||||
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
|
||||
* See the normTrie description below and the design doc for details.
|
||||
*
|
||||
* UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie
|
||||
*
|
||||
* The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
|
||||
* Rather than using independent bits in the value (which would require more than 16 bits),
|
||||
* information is extracted primarily via range checks.
|
||||
* Except, format version 3 uses bit 0 for hasCompBoundaryAfter().
|
||||
* For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
|
||||
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
|
||||
* which means it has a two-way (round-trip) decomposition mapping.
|
||||
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
|
||||
* pointing to mappings, compositions lists, or both.
|
||||
* Value norm16==INERT (0 in versions 1 & 2, 1 in version 3)
|
||||
* means that the character is normalization-inert, that is,
|
||||
* it does not have a mapping, does not participate in composition, has a zero
|
||||
* canonical combining class, and forms a boundary where text before it and after it
|
||||
* can be normalized independently.
|
||||
* For details about how multiple properties are encoded in 16-bit values
|
||||
* see the design doc.
|
||||
* Note that the encoding cannot express all combinations of the properties involved;
|
||||
* it only supports those combinations that are allowed by
|
||||
* the Unicode Normalization algorithms. Details are in the design doc as well.
|
||||
* The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
|
||||
*
|
||||
* The trie has a value for each lead surrogate code unit representing the "worst case"
|
||||
* properties of the 1024 supplementary characters whose UTF-16 form starts with
|
||||
* the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
|
||||
* then their lead surrogate code unit has the trie value INERT.
|
||||
* When the lead surrogate unit's value exceeds the quick check minimum during processing,
|
||||
* the properties for the full supplementary code point need to be looked up.
|
||||
*
|
||||
* uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
|
||||
* uint16_t extraData[];
|
||||
*
|
||||
* There is only one byte offset for the end of these two arrays.
|
||||
* The split between them is given by the constant and variable mentioned above.
|
||||
* In version 3, the difference must be shifted right by OFFSET_SHIFT.
|
||||
*
|
||||
* The maybeYesCompositions array contains compositions lists for characters that
|
||||
* combine both forward (as starters in composition pairs)
|
||||
* and backward (as trailing characters in composition pairs).
|
||||
* Such characters do not occur in Unicode 5.2 but are allowed by
|
||||
* the Unicode Normalization algorithms.
|
||||
* If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES
|
||||
* and the maybeYesCompositions array is empty.
|
||||
* If there are such characters, then minMaybeYes is subtracted from their norm16 values
|
||||
* to get the index into this array.
|
||||
*
|
||||
* The extraData array contains compositions lists for "YesYes" characters,
|
||||
* followed by mappings and optional compositions lists for "YesNo" characters,
|
||||
* followed by only mappings for "NoNo" characters.
|
||||
* (Referring to pairs of NFC/NFD quick check values.)
|
||||
* The norm16 values of those characters are directly indexes into the extraData array.
|
||||
* In version 3, the norm16 values must be shifted right by OFFSET_SHIFT
|
||||
* for accessing extraData.
|
||||
*
|
||||
* The data structures for compositions lists and mappings are described in the design doc.
|
||||
*
|
||||
* uint8_t smallFCD[0x100]; -- new in format version 2
|
||||
*
|
||||
* This is a bit set to help speed up FCD value lookups in the absence of a full
|
||||
* UTrie2 or other large data structure with the full FCD value mapping.
|
||||
*
|
||||
* Each smallFCD bit is set if any of the corresponding 32 BMP code points
|
||||
* has a non-zero FCD value (lccc!=0 or tccc!=0).
|
||||
* Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
|
||||
* A bit for 32 lead surrogates is set if any of the 32k corresponding
|
||||
* _supplementary_ code points has a non-zero FCD value.
|
||||
*
|
||||
* This bit set is most useful for the large blocks of CJK characters with FCD=0.
|
||||
*
|
||||
* Changes from format version 1 to format version 2 ---------------------------
|
||||
*
|
||||
* - Addition of data for raw (not recursively decomposed) mappings.
|
||||
* + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
|
||||
* the mapping is to an empty string or when the character combines-forward.
|
||||
* This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
|
||||
* is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
|
||||
* + For details see the design doc.
|
||||
* - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
|
||||
* distinct ranges (combines-forward vs. not)
|
||||
* so that a range check can be used to find out if there is a compositions list.
|
||||
* This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
|
||||
* It is needed for the new (in ICU 49) composePair(), not for other normalization.
|
||||
* - Addition of the smallFCD[] bit set.
|
||||
*
|
||||
* Changes from format version 2 to format version 3 (ICU 60) ------------------
|
||||
*
|
||||
* - norm16 bit 0 indicates hasCompBoundaryAfter(),
|
||||
* except that for contiguous composition (FCC) the tccc must be checked as well.
|
||||
* Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
|
||||
* Thresholds like minNoNo are tested before shifting.
|
||||
*
|
||||
* - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
|
||||
* to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
|
||||
* See DELTA_TCCC_MASK etc.
|
||||
* This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
|
||||
* minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
|
||||
*
|
||||
* - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
|
||||
* and ASCII characters are mapped algorithmically only to other ASCII characters.
|
||||
* This helps with hasCompBoundaryBefore() and compose() fast paths.
|
||||
* It is never necessary any more to loop for algorithmic mappings.
|
||||
*
|
||||
* - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
|
||||
* indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
|
||||
* and separation of the noNo extraData into distinct ranges.
|
||||
* With this, the noNo norm16 value indicates whether the mapping is
|
||||
* compose-normalized, not normalized but hasCompBoundaryBefore(),
|
||||
* not even that, or maps to an empty string.
|
||||
* hasCompBoundaryBefore() can be determined solely from the norm16 value.
|
||||
*
|
||||
* - The norm16 value for Hangul LVT is now different from that for Hangul LV,
|
||||
* so that hasCompBoundaryAfter() need not check for the syllable type.
|
||||
* For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
|
||||
* For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
|
||||
* The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
|
||||
* to simplify some code.
|
||||
*
|
||||
* - The extraData firstUnit bit 5 is no longer necessary
|
||||
* (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
|
||||
* is reserved again, and always set to 0.
|
||||
*
|
||||
* - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
|
||||
* This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
|
||||
* U+00AD Soft Hyphen maps to an empty string,
|
||||
* which is artificially assigned "worst case" values lccc=1 and tccc=255.
|
||||
*
|
||||
* - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
|
||||
*
|
||||
* Changes from format version 3 to format version 4 (ICU 63) ------------------
|
||||
*
|
||||
* Switched from UTrie2 to UCPTrie/CodePointTrie.
|
||||
*
|
||||
* The new trie no longer stores different values for surrogate code *units* vs.
|
||||
* surrogate code *points*.
|
||||
* Lead surrogates still have values for optimized UTF-16 string processing.
|
||||
* When looking up code point properties, the code now checks for lead surrogates and
|
||||
* treats them as inert.
|
||||
*
|
||||
* gennorm2 now has to reject mappings for surrogate code points.
|
||||
* UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its
|
||||
* custom normalization data file.
|
||||
*/
|
||||
|
||||
#endif /* !UCONFIG_NO_NORMALIZATION */
|
||||
#endif /* __NORMALIZER2IMPL_H__ */
|
||||
529
engine/thirdparty/icu4c/common/normlzr.cpp
vendored
Normal file
529
engine/thirdparty/icu4c/common/normlzr.cpp
vendored
Normal file
|
|
@ -0,0 +1,529 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*************************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1996-2012, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/uchriter.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "cmemory.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "uprops.h" // for uniset_getUnicode32Instance()
|
||||
|
||||
#if defined(move32)
|
||||
// System can define move32 intrinsics, but the char iters define move32 method
|
||||
// using same undef trick in headers, so undef here to re-enable the method.
|
||||
#undef move32
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
// Constructors and other boilerplate
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
// Creates a Normalizer that iterates over a newly allocated
// StringCharacterIterator built from `str`. The iterator is stored in `text`,
// which the destructor deletes.
Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode)
        : UObject(),
          fFilteredNorm2(nullptr),
          fNorm2(nullptr),
          fUMode(mode),
          fOptions(0),
          text(new StringCharacterIterator(str)),
          currentIndex(0),
          nextIndex(0),
          buffer(),
          bufferPos(0) {
    // fNorm2 is still null at this point; init() selects it from fUMode/fOptions.
    init();
}
|
||||
|
||||
// Creates a Normalizer that iterates over a newly allocated
// UCharCharacterIterator built from the (str, length) pair. The iterator is
// stored in `text`, which the destructor deletes.
Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode)
        : UObject(),
          fFilteredNorm2(nullptr),
          fNorm2(nullptr),
          fUMode(mode),
          fOptions(0),
          text(new UCharCharacterIterator(str, length)),
          currentIndex(0),
          nextIndex(0),
          buffer(),
          bufferPos(0) {
    // fNorm2 is still null at this point; init() selects it from fUMode/fOptions.
    init();
}
|
||||
|
||||
// Creates a Normalizer that iterates over a clone of `iter`; the caller's
// iterator itself is not stored. The clone is kept in `text`, which the
// destructor deletes.
Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode)
        : UObject(),
          fFilteredNorm2(nullptr),
          fNorm2(nullptr),
          fUMode(mode),
          fOptions(0),
          text(iter.clone()),
          currentIndex(0),
          nextIndex(0),
          buffer(),
          bufferPos(0) {
    // fNorm2 is still null at this point; init() selects it from fUMode/fOptions.
    init();
}
|
||||
|
||||
// Copy constructor.
// Copies the mode, options, indexes and normalized buffer from `copy`, and
// clones its text iterator so the two objects never share `text`.
// The normalizer pointers are deliberately not copied: they start out null and
// init() rebuilds them from the copied fUMode/fOptions, so each object owns its
// own fFilteredNorm2 (which the destructor deletes).
Normalizer::Normalizer(const Normalizer &copy) :
    UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions),
    text(copy.text->clone()),
    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
    buffer(copy.buffer), bufferPos(copy.bufferPos)
{
    init();
}
|
||||
|
||||
void
|
||||
Normalizer::init() {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
|
||||
if(fOptions&UNORM_UNICODE_3_2) {
|
||||
delete fFilteredNorm2;
|
||||
fNorm2=fFilteredNorm2=
|
||||
new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
// Releases the two heap objects this instance holds: the optional filtered
// normalizer created by init() and the text iterator.
Normalizer::~Normalizer() {
    delete fFilteredNorm2;  // may be null; delete on null is a no-op
    delete text;
}
|
||||
|
||||
// Returns a heap-allocated copy of this object, made with the copy constructor.
Normalizer *
Normalizer::clone() const {
    Normalizer *duplicate=new Normalizer(*this);
    return duplicate;
}
|
||||
|
||||
/**
|
||||
* Generates a hash code for this iterator.
|
||||
*/
|
||||
// Sums the hash codes of the text iterator and the buffer together with the
// mode, the options and the three position fields.
int32_t Normalizer::hashCode() const {
    int32_t hash=text->hashCode();
    hash+=fUMode;
    hash+=fOptions;
    hash+=buffer.hashCode();
    hash+=bufferPos;
    hash+=currentIndex;
    hash+=nextIndex;
    return hash;
}
|
||||
|
||||
bool Normalizer::operator==(const Normalizer& that) const
|
||||
{
|
||||
return
|
||||
this==&that ||
|
||||
(fUMode==that.fUMode &&
|
||||
fOptions==that.fOptions &&
|
||||
*text==*that.text &&
|
||||
buffer==that.buffer &&
|
||||
bufferPos==that.bufferPos &&
|
||||
nextIndex==that.nextIndex);
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
// Static utility methods
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
void U_EXPORT2
|
||||
Normalizer::normalize(const UnicodeString& source,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UnicodeString& result,
|
||||
UErrorCode &status) {
|
||||
if(source.isBogus() || U_FAILURE(status)) {
|
||||
result.setToBogus();
|
||||
if(U_SUCCESS(status)) {
|
||||
status=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
} else {
|
||||
UnicodeString localDest;
|
||||
UnicodeString *dest;
|
||||
|
||||
if(&source!=&result) {
|
||||
dest=&result;
|
||||
} else {
|
||||
// the source and result strings are the same object, use a temporary one
|
||||
dest=&localDest;
|
||||
}
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
|
||||
normalize(source, *dest, status);
|
||||
} else {
|
||||
n2->normalize(source, *dest, status);
|
||||
}
|
||||
}
|
||||
if(dest==&localDest && U_SUCCESS(status)) {
|
||||
result=*dest;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void U_EXPORT2
|
||||
Normalizer::compose(const UnicodeString& source,
|
||||
UBool compat, int32_t options,
|
||||
UnicodeString& result,
|
||||
UErrorCode &status) {
|
||||
normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
|
||||
}
|
||||
|
||||
void U_EXPORT2
|
||||
Normalizer::decompose(const UnicodeString& source,
|
||||
UBool compat, int32_t options,
|
||||
UnicodeString& result,
|
||||
UErrorCode &status) {
|
||||
normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
|
||||
}
|
||||
|
||||
// Runs quickCheck() on `source` with the normalizer selected by `mode`,
// wrapped in a Unicode 3.2 filter when `options` has UNORM_UNICODE_3_2 set.
// Returns UNORM_MAYBE when the normalizer cannot be obtained.
UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       UNormalizationMode mode, int32_t options,
                       UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return UNORM_MAYBE;
    }
    if(options&UNORM_UNICODE_3_2) {
        FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
        return filtered.quickCheck(source, status);
    }
    return n2->quickCheck(source, status);
}
|
||||
|
||||
// Runs isNormalized() on `source` with the normalizer selected by `mode`,
// wrapped in a Unicode 3.2 filter when `options` has UNORM_UNICODE_3_2 set.
// Returns false when the normalizer cannot be obtained.
UBool
Normalizer::isNormalized(const UnicodeString& source,
                         UNormalizationMode mode, int32_t options,
                         UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return false;
    }
    if(options&UNORM_UNICODE_3_2) {
        FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
        return filtered.isNormalized(source, status);
    }
    return n2->isNormalized(source, status);
}
|
||||
|
||||
// Copies `left` into the destination and then appends `right` through the
// normalizer's append(), using the normalizer selected by `mode` (wrapped in a
// Unicode 3.2 filter when `options` has UNORM_UNICODE_3_2 set).
//
// A bogus input or an incoming failure makes `result` bogus; if `errorCode`
// was still a success it becomes U_ILLEGAL_ARGUMENT_ERROR.
// `right` and `result` may be the same object: the output is then built in a
// temporary and assigned to `result` only on success.
// Always returns `result`.
UnicodeString & U_EXPORT2
Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
                        UnicodeString &result,
                        UNormalizationMode mode, int32_t options,
                        UErrorCode &errorCode) {
    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
        result.setToBogus();
        if(U_SUCCESS(errorCode)) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        }
        return result;
    }

    UnicodeString scratch;
    const bool rightIsResult=(&right==&result);
    UnicodeString &target=rightIsResult ? scratch : result;

    target=left;
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
    if(U_SUCCESS(errorCode)) {
        if(options&UNORM_UNICODE_3_2) {
            FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(errorCode));
            filtered.append(target, right, errorCode);
        } else {
            n2->append(target, right, errorCode);
        }
    }
    if(rightIsResult && U_SUCCESS(errorCode)) {
        result=target;
    }
    return result;
}
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
// Iteration API
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return the current character in the normalized text.
|
||||
*/
|
||||
UChar32 Normalizer::current() {
    // Refill the buffer only when it is exhausted; DONE if nothing more can be produced.
    if(bufferPos>=buffer.length() && !nextNormalize()) {
        return DONE;
    }
    return buffer.char32At(bufferPos);
}
|
||||
|
||||
/**
|
||||
* Return the next character in the normalized text and advance
|
||||
* the iteration position by one. If the end
|
||||
* of the text has already been reached, {@link #DONE} is returned.
|
||||
*/
|
||||
UChar32 Normalizer::next() {
    // Refill the buffer only when it is exhausted; DONE if nothing more can be produced.
    if(bufferPos>=buffer.length() && !nextNormalize()) {
        return DONE;
    }
    const UChar32 cp=buffer.char32At(bufferPos);
    // Step over one or two UTF-16 code units, depending on the code point.
    bufferPos+=U16_LENGTH(cp);
    return cp;
}
|
||||
|
||||
/**
|
||||
* Return the previous character in the normalized text and decrement
|
||||
* the iteration position by one. If the beginning
|
||||
* of the text has already been reached, {@link #DONE} is returned.
|
||||
*/
|
||||
UChar32 Normalizer::previous() {
    // At the start of the buffer, try to normalize the preceding segment;
    // DONE if there is none.
    if(bufferPos<=0 && !previousNormalize()) {
        return DONE;
    }
    const UChar32 cp=buffer.char32At(bufferPos-1);
    // Step back over one or two UTF-16 code units, depending on the code point.
    bufferPos-=U16_LENGTH(cp);
    return cp;
}
|
||||
|
||||
void Normalizer::reset() {
|
||||
currentIndex=nextIndex=text->setToStart();
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
void
|
||||
Normalizer::setIndexOnly(int32_t index) {
|
||||
text->setIndex(index); // pins index
|
||||
currentIndex=nextIndex=text->getIndex();
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the first character in the normalized text. This resets
|
||||
* the <tt>Normalizer's</tt> position to the beginning of the text.
|
||||
*/
|
||||
UChar32 Normalizer::first() {
    // Rewind, then read forward one normalized character.
    reset();
    const UChar32 firstChar=next();
    return firstChar;
}
|
||||
|
||||
/**
|
||||
* Return the last character in the normalized text. This resets
|
||||
* the <tt>Normalizer's</tt> position to be just before the
|
||||
* input text corresponding to that normalized character.
|
||||
*/
|
||||
UChar32 Normalizer::last() {
    // Jump to the end of the input, drop buffered output, then read backward
    // one normalized character.
    const int32_t end=text->setToEnd();
    nextIndex=end;
    currentIndex=end;
    clearBuffer();
    return previous();
}
|
||||
|
||||
/**
|
||||
* Retrieve the current iteration position in the input text that is
|
||||
* being normalized. This method is useful in applications such as
|
||||
* searching, where you need to be able to determine the position in
|
||||
* the input text that corresponds to a given normalized output character.
|
||||
* <p>
|
||||
* <b>Note:</b> This method returns a position in the <em>input</em>, while
|
||||
* {@link #next} and {@link #previous} iterate through characters in the
|
||||
* <em>output</em>. This means that there is not necessarily a one-to-one
|
||||
* correspondence between characters returned by <tt>next</tt> and
|
||||
* <tt>previous</tt> and the indices passed to and returned from
|
||||
* <tt>setIndex</tt> and {@link #getIndex}.
|
||||
*
|
||||
*/
|
||||
int32_t Normalizer::getIndex() const {
    // While buffered output remains unread, report currentIndex;
    // once the buffer is consumed, report nextIndex.
    return bufferPos<buffer.length() ? currentIndex : nextIndex;
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the start of the input text. This is the begin index
|
||||
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
|
||||
* over which this <tt>Normalizer</tt> is iterating
|
||||
*/
|
||||
int32_t Normalizer::startIndex() const {
    // Delegates to the underlying text iterator.
    const int32_t begin=text->startIndex();
    return begin;
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text. This is the end index
|
||||
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
|
||||
* over which this <tt>Normalizer</tt> is iterating
|
||||
*/
|
||||
int32_t Normalizer::endIndex() const {
    // Delegates to the underlying text iterator.
    const int32_t limit=text->endIndex();
    return limit;
}
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
// Property access methods
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
void
|
||||
Normalizer::setMode(UNormalizationMode newMode)
|
||||
{
|
||||
fUMode = newMode;
|
||||
init();
|
||||
}
|
||||
|
||||
// Returns the normalization mode currently stored in fUMode.
UNormalizationMode
Normalizer::getUMode() const {
    return fUMode;
}
|
||||
|
||||
void
|
||||
Normalizer::setOption(int32_t option,
|
||||
UBool value)
|
||||
{
|
||||
if (value) {
|
||||
fOptions |= option;
|
||||
} else {
|
||||
fOptions &= (~option);
|
||||
}
|
||||
init();
|
||||
}
|
||||
|
||||
// Returns true when at least one bit of `option` is set in fOptions.
UBool
Normalizer::getOption(int32_t option) const {
    const int32_t masked=fOptions&option;
    return masked!=0;
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
||||
* The iteration position is set to the beginning of the input text.
|
||||
*/
|
||||
void
|
||||
Normalizer::setText(const UnicodeString& newText,
|
||||
UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
CharacterIterator *newIter = new StringCharacterIterator(newText);
|
||||
if (newIter == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
delete text;
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
||||
* The iteration position is set to the beginning of the string.
|
||||
*/
|
||||
void
|
||||
Normalizer::setText(const CharacterIterator& newText,
|
||||
UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
CharacterIterator *newIter = newText.clone();
|
||||
if (newIter == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
delete text;
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
void
|
||||
Normalizer::setText(ConstChar16Ptr newText,
|
||||
int32_t length,
|
||||
UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
|
||||
if (newIter == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
delete text;
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies the text under iteration into the UnicodeString referred to by "result".
|
||||
* @param result Receives a copy of the text under iteration.
|
||||
*/
|
||||
void
|
||||
Normalizer::getText(UnicodeString& result)
|
||||
{
|
||||
text->getText(result);
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
// Private utility methods
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
// Empties the normalized-output buffer and resets the read position to 0.
void Normalizer::clearBuffer() {
    bufferPos=0;
    buffer.remove();
}
|
||||
|
||||
// Normalizes the next input segment into `buffer`.
// The segment starts at nextIndex and extends up to (not including) the next
// code point for which fNorm2 reports a boundary before it.
// Afterwards currentIndex marks the segment start and nextIndex its end.
// Returns false at end of input, on a normalization error, or when the
// normalized segment is empty.
UBool
Normalizer::nextNormalize() {
    clearBuffer();
    currentIndex=nextIndex;
    text->setIndex(nextIndex);
    if(!text->hasNext()) {
        return false;
    }
    // Always take at least one code point so that iteration makes progress.
    UnicodeString segment(text->next32PostInc());
    while(text->hasNext()) {
        const UChar32 cp=text->next32PostInc();
        if(fNorm2->hasBoundaryBefore(cp)) {
            // cp belongs to the following segment: un-read it and stop.
            text->move32(-1, CharacterIterator::kCurrent);
            break;
        }
        segment.append(cp);
    }
    nextIndex=text->getIndex();

    UErrorCode errorCode=U_ZERO_ERROR;
    fNorm2->normalize(segment, buffer, errorCode);
    if(U_FAILURE(errorCode)) {
        return false;
    }
    return !buffer.isEmpty();
}
|
||||
|
||||
// Normalizes the input segment that ends at currentIndex into `buffer`.
// Walks backward, prepending code points, until it has included one for which
// fNorm2 reports a boundary before it (or reaches the start of the input).
// Afterwards nextIndex marks the segment end, currentIndex its start, and
// bufferPos sits at the end of the buffer.
// Returns false at start of input, on a normalization error, or when the
// normalized segment is empty.
UBool
Normalizer::previousNormalize() {
    clearBuffer();
    nextIndex=currentIndex;
    text->setIndex(currentIndex);
    if(!text->hasPrevious()) {
        return false;
    }
    UnicodeString segment;
    bool reachedBoundary=false;
    while(!reachedBoundary && text->hasPrevious()) {
        const UChar32 cp=text->previous32();
        segment.insert(0, cp);
        reachedBoundary=fNorm2->hasBoundaryBefore(cp);
    }
    currentIndex=text->getIndex();

    UErrorCode errorCode=U_ZERO_ERROR;
    fNorm2->normalize(segment, buffer, errorCode);
    bufferPos=buffer.length();
    if(U_FAILURE(errorCode)) {
        return false;
    }
    return !buffer.isEmpty();
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
23
engine/thirdparty/icu4c/common/parsepos.cpp
vendored
Normal file
23
engine/thirdparty/icu4c/common/parsepos.cpp
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2003-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/parsepos.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ParsePosition)
|
||||
|
||||
// Out-of-line destructor with an empty body: nothing is released here.
ParsePosition::~ParsePosition() {}
|
||||
|
||||
// Returns a heap-allocated copy of this object, made with the copy constructor.
ParsePosition *
ParsePosition::clone() const {
    ParsePosition *duplicate=new ParsePosition(*this);
    return duplicate;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
230
engine/thirdparty/icu4c/common/patternprops.cpp
vendored
Normal file
230
engine/thirdparty/icu4c/common/patternprops.cpp
vendored
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: patternprops.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2011mar13
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "patternprops.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* One byte per Latin-1 character.
|
||||
* Bit 0 is set if either Pattern property is true,
|
||||
* bit 1 if Pattern_Syntax is true,
|
||||
* bit 2 if Pattern_White_Space is true.
|
||||
* That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
|
||||
*/
|
||||
// Layout: 16 entries per row, so the k-th row (counting from 0) holds the
// values for code points 0xk0..0xkF. Only the values 0, 3 and 5 occur below.
static const uint8_t latin1[256]={
|
||||
// WS: 9..D
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// WS: 20 Syntax: 21..2F
|
||||
5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
// Syntax: 3A..40
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
|
||||
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// Syntax: 5B..5E
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
|
||||
// Syntax: 60
|
||||
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// Syntax: 7B..7E
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
|
||||
// WS: 85
|
||||
0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// Syntax: A1..A7, A9, AB, AC, AE
|
||||
0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
|
||||
// Syntax: B0, B1, B6, BB, BF
|
||||
3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// Syntax: D7
|
||||
0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// Syntax: F7
|
||||
0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
/*
 * One byte per 32 characters from U+2000..U+303F indexing into
 * a small table of 32-bit data words.
 * The first two data words are all-zeros and all-ones.
 */
static const uint8_t index2000[130]={
    2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
    0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
    1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
    1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
    1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
    0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
    0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
    1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
    0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
    8, 9  // 3000..303F
};
|
||||
|
||||
/*
 * One 32-bit integer per 32 characters. Ranges of all-false and all-true
 * are mapped to the first two values, other ranges map to appropriate bit patterns.
 */
static const uint32_t syntax2000[]={
    0,
    0xffffffff,
    0xffff0000,  // 2: 2010..201F
    0x7fff00ff,  // 3: 2020..2027, 2030..203E
    0x7feffffe,  // 4: 2041..2053, 2055..205E
    0xffff0000,  // 5: 2190..219F
    0x003fffff,  // 6: 2760..2775
    0xfff00000,  // 7: 2794..279F
    0xffffff0e,  // 8: 3001..3003, 3008..301F
    0x00010001   // 9: 3020, 3030
};
|
||||
|
||||
/*
 * Same as syntax2000, but with additional bits set for the
 * Pattern_White_Space characters 200E 200F 2028 2029.
 */
static const uint32_t syntaxOrWhiteSpace2000[]={
    0,
    0xffffffff,
    0xffffc000,  // 2: 200E..201F
    0x7fff03ff,  // 3: 2020..2029, 2030..203E
    0x7feffffe,  // 4: 2041..2053, 2055..205E
    0xffff0000,  // 5: 2190..219F
    0x003fffff,  // 6: 2760..2775
    0xfff00000,  // 7: 2794..279F
    0xffffff0e,  // 8: 3001..3003, 3008..301F
    0x00010001   // 9: 3020, 3030
};
|
||||
|
||||
/**
 * Tests whether c is a Pattern_Syntax code point, using only the static
 * tables in this file.
 *
 * @param c the code point to test; negative values yield false.
 * @return true if c is classified as Pattern_Syntax.
 */
UBool
PatternProps::isSyntax(UChar32 c) {
    if(c<0) {
        return false;
    } else if(c<=0xff) {
        // Bit 1 of the latin1 entry is the Pattern_Syntax flag.
        return (UBool)((latin1[c]>>1)&1);
    } else if(c<0x2010) {
        return false;
    } else if(c<=0x3030) {
        // index2000 has one entry per 32 code points and selects a 32-bit mask;
        // the low 5 bits of c select the bit within that mask.
        uint32_t bits=syntax2000[index2000[(c-0x2000)>>5]];
        return (UBool)((bits>>(c&0x1f))&1);
    } else if(0xfd3e<=c && c<=0xfe46) {
        // Within FD3E..FE46 only FD3E, FD3F, FE45 and FE46 are accepted.
        return c<=0xfd3f || 0xfe45<=c;
    } else {
        return false;
    }
}
|
||||
|
||||
/**
 * Tests whether c is a Pattern_Syntax or a Pattern_White_Space code point,
 * using only the static tables in this file.
 *
 * @param c the code point to test; negative values yield false.
 * @return true if c has either property.
 */
UBool
PatternProps::isSyntaxOrWhiteSpace(UChar32 c) {
    if(c<0) {
        return false;
    } else if(c<=0xff) {
        // Bit 0 of the latin1 entry is set when either property is true.
        return (UBool)(latin1[c]&1);
    } else if(c<0x200e) {
        return false;
    } else if(c<=0x3030) {
        // Same lookup as isSyntax(), but against the mask table that also
        // carries the white space bits for 200E, 200F, 2028 and 2029.
        uint32_t bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
        return (UBool)((bits>>(c&0x1f))&1);
    } else if(0xfd3e<=c && c<=0xfe46) {
        // Within FD3E..FE46 only FD3E, FD3F, FE45 and FE46 are accepted.
        return c<=0xfd3f || 0xfe45<=c;
    } else {
        return false;
    }
}
|
||||
|
||||
/**
 * Tests whether c is a Pattern_White_Space code point.
 *
 * @param c the code point to test; negative values yield false.
 * @return true if c is classified as Pattern_White_Space.
 */
UBool
PatternProps::isWhiteSpace(UChar32 c) {
    if(c<0) {
        return false;
    } else if(c<=0xff) {
        // Bit 2 of the latin1 entry is the Pattern_White_Space flag.
        return (UBool)((latin1[c]>>2)&1);
    } else if(0x200e<=c && c<=0x2029) {
        // Within 200E..2029 only 200E, 200F, 2028 and 2029 are accepted.
        return c<=0x200f || 0x2028<=c;
    } else {
        return false;
    }
}
|
||||
|
||||
const char16_t *
|
||||
PatternProps::skipWhiteSpace(const char16_t *s, int32_t length) {
|
||||
while(length>0 && isWhiteSpace(*s)) {
|
||||
++s;
|
||||
--length;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
int32_t
|
||||
PatternProps::skipWhiteSpace(const UnicodeString& s, int32_t start) {
|
||||
int32_t i = start;
|
||||
int32_t length = s.length();
|
||||
while(i<length && isWhiteSpace(s.charAt(i))) {
|
||||
++i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
const char16_t *
|
||||
PatternProps::trimWhiteSpace(const char16_t *s, int32_t &length) {
|
||||
if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) {
|
||||
return s;
|
||||
}
|
||||
int32_t start=0;
|
||||
int32_t limit=length;
|
||||
while(start<limit && isWhiteSpace(s[start])) {
|
||||
++start;
|
||||
}
|
||||
if(start<limit) {
|
||||
// There is non-white space at start; we will not move limit below that,
|
||||
// so we need not test start<limit in the loop.
|
||||
while(isWhiteSpace(s[limit-1])) {
|
||||
--limit;
|
||||
}
|
||||
}
|
||||
length=limit-start;
|
||||
return s+start;
|
||||
}
|
||||
|
||||
/**
 * Tests whether a buffer is a non-empty "pattern identifier".
 *
 * @param s      start of the text.
 * @param length number of code units at s.
 * @return false if length <= 0 or any code unit is Pattern_Syntax or
 *         Pattern_White_Space; true otherwise.
 */
UBool
PatternProps::isIdentifier(const char16_t *s, int32_t length) {
    if(length<=0) {
        return false;
    }
    const char16_t *limit=s+length;
    // length>0 is guaranteed here, so the first unit can be read unconditionally.
    do {
        if(isSyntaxOrWhiteSpace(*s++)) {
            return false;
        }
    } while(s<limit);
    return true;
}
|
||||
|
||||
const char16_t *
|
||||
PatternProps::skipIdentifier(const char16_t *s, int32_t length) {
|
||||
while(length>0 && !isSyntaxOrWhiteSpace(*s)) {
|
||||
++s;
|
||||
--length;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
98
engine/thirdparty/icu4c/common/patternprops.h
vendored
Normal file
98
engine/thirdparty/icu4c/common/patternprops.h
vendored
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: patternprops.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2011mar13
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __PATTERNPROPS_H__
|
||||
#define __PATTERNPROPS_H__
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
|
||||
* Hardcodes these properties, does not load data, does not depend on other ICU classes.
|
||||
* <p>
|
||||
* Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
|
||||
* and both properties only include BMP code points (no supplementary ones).
|
||||
* Pattern_Syntax includes some unassigned code points.
|
||||
* <p>
|
||||
* [:Pattern_White_Space:] =
|
||||
* [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
|
||||
* <p>
|
||||
* [:Pattern_Syntax:] =
|
||||
* [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
|
||||
* \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
|
||||
* \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
|
||||
* \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
|
||||
* \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
|
||||
* @author mscherer
|
||||
*/
|
||||
class U_COMMON_API PatternProps {
|
||||
public:
|
||||
/**
|
||||
* @return true if c is a Pattern_Syntax code point.
|
||||
*/
|
||||
static UBool isSyntax(UChar32 c);
|
||||
|
||||
/**
|
||||
* @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
|
||||
*/
|
||||
static UBool isSyntaxOrWhiteSpace(UChar32 c);
|
||||
|
||||
/**
|
||||
* @return true if c is a Pattern_White_Space character.
|
||||
*/
|
||||
static UBool isWhiteSpace(UChar32 c);
|
||||
|
||||
/**
|
||||
* Skips over Pattern_White_Space starting at s.
|
||||
* @return The smallest pointer at or after s with a non-white space character.
|
||||
*/
|
||||
static const char16_t *skipWhiteSpace(const char16_t *s, int32_t length);
|
||||
|
||||
/**
|
||||
* Skips over Pattern_White_Space starting at index start in s.
|
||||
* @return The smallest index at or after start with a non-white space character.
|
||||
*/
|
||||
static int32_t skipWhiteSpace(const UnicodeString &s, int32_t start);
|
||||
|
||||
/**
|
||||
* @return s except with leading and trailing Pattern_White_Space removed and length adjusted.
|
||||
*/
|
||||
static const char16_t *trimWhiteSpace(const char16_t *s, int32_t &length);
|
||||
|
||||
/**
|
||||
* Tests whether the string contains a "pattern identifier", that is,
|
||||
* whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
|
||||
* @return true if there are no Pattern_White_Space or Pattern_Syntax characters in s.
|
||||
*/
|
||||
static UBool isIdentifier(const char16_t *s, int32_t length);
|
||||
|
||||
/**
|
||||
* Skips over a "pattern identifier" starting at index s.
|
||||
* @return The smallest pointer at or after s with
|
||||
* a Pattern_White_Space or Pattern_Syntax character.
|
||||
*/
|
||||
static const char16_t *skipIdentifier(const char16_t *s, int32_t length);
|
||||
|
||||
private:
|
||||
PatternProps() = delete; // no constructor: all static methods
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __PATTERNPROPS_H__
|
||||
44
engine/thirdparty/icu4c/common/pluralmap.cpp
vendored
Normal file
44
engine/thirdparty/icu4c/common/pluralmap.cpp
vendored
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
* Copyright (C) 2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*/
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "charstr.h"
|
||||
#include "cstring.h"
|
||||
#include "pluralmap.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Plural category names, indexed by position. The lookup functions in this
// file convert between these indexes and PluralMapBase::Category values,
// so the order here must stay in step with that enum.
static const char * const gPluralForms[] = {
        "other", "zero", "one", "two", "few", "many"};
|
||||
|
||||
/**
 * Converts a category name to its Category value by exact string comparison
 * against gPluralForms.
 *
 * @param pluralForm NUL-terminated category name; may be nullptr.
 * @return the matching Category, or NONE if pluralForm is nullptr or does
 *         not match any entry.
 */
PluralMapBase::Category
PluralMapBase::toCategory(const char *pluralForm) {
    // Comparing against a null pointer would be undefined behavior; treat it
    // like any other unrecognized name.
    if (pluralForm == nullptr) {
        return NONE;
    }
    for (int32_t i = 0; i < UPRV_LENGTHOF(gPluralForms); ++i) {
        if (uprv_strcmp(pluralForm, gPluralForms[i]) == 0) {
            return static_cast<Category>(i);
        }
    }
    return NONE;
}
|
||||
|
||||
/**
 * Converts a category name held in a UnicodeString to its Category value.
 * The name is first converted with CharString::appendInvariantChars() and
 * then looked up through the const char * overload.
 *
 * @param pluralForm the category name.
 * @return the matching Category, or NONE if the conversion reports a failure
 *         or the name is not recognized.
 */
PluralMapBase::Category
PluralMapBase::toCategory(const UnicodeString &pluralForm) {
    CharString cCategory;
    UErrorCode status = U_ZERO_ERROR;
    cCategory.appendInvariantChars(pluralForm, status);
    return U_FAILURE(status) ? NONE : toCategory(cCategory.data());
}
|
||||
|
||||
/**
 * Converts a Category value to its name.
 *
 * @param c the category.
 * @return the entry of gPluralForms at index c, or nullptr when c is outside
 *         the table (negative, or >= the number of names).
 */
const char *PluralMapBase::getCategoryName(Category c) {
    int32_t index = c;
    return (index < 0 || index >= UPRV_LENGTHOF(gPluralForms)) ?
            nullptr : gPluralForms[index];
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
292
engine/thirdparty/icu4c/common/pluralmap.h
vendored
Normal file
292
engine/thirdparty/icu4c/common/pluralmap.h
vendored
Normal file
|
|
@ -0,0 +1,292 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
*
|
||||
* File pluralmap.h - PluralMap class that maps plural categories to values.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef __PLURAL_MAP_H__
|
||||
#define __PLURAL_MAP_H__
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UnicodeString;
|
||||
|
||||
/**
 * Non-template base of PluralMap: declares the plural Category enum and the
 * static conversions between category names and enum values.
 */
class U_COMMON_API PluralMapBase : public UMemory {
public:
    /**
     * The names of all the plural categories. NONE is not an actual plural
     * category, but rather represents the absence of a plural category.
     */
    enum Category {
        NONE = -1,
        OTHER,
        ZERO,
        ONE,
        TWO,
        FEW,
        MANY,
        CATEGORY_COUNT
    };

    /**
     * Converts a category name such as "zero", "one", "two", "few", "many"
     * or "other" to a category enum. Returns NONE for an unrecognized
     * category name.
     */
    static Category toCategory(const char *categoryName);

    /**
     * Converts a category name such as "zero", "one", "two", "few", "many"
     * or "other" to a category enum. Returns NONE for an unrecognized
     * category name.
     */
    static Category toCategory(const UnicodeString &categoryName);

    /**
     * Converts a category to a name.
     * Passing NONE or CATEGORY_COUNT for category returns nullptr.
     */
    static const char *getCategoryName(Category category);
};
|
||||
|
||||
/**
|
||||
* A Map of plural categories to values. It maintains ownership of the
|
||||
* values.
|
||||
*
|
||||
* Type T is the value type. T must provide the following:
|
||||
* 1) Default constructor
|
||||
* 2) Copy constructor
|
||||
* 3) Assignment operator
|
||||
* 4) Must extend UMemory
|
||||
*/
|
||||
template<typename T>
|
||||
class PluralMap : public PluralMapBase {
|
||||
public:
|
||||
/**
|
||||
* Other category is maps to a copy of the default value.
|
||||
*/
|
||||
PluralMap() : fOtherVariant() {
|
||||
initializeNew();
|
||||
}
|
||||
|
||||
/**
|
||||
* Other category is mapped to otherVariant.
|
||||
*/
|
||||
PluralMap(const T &otherVariant) : fOtherVariant(otherVariant) {
|
||||
initializeNew();
|
||||
}
|
||||
|
||||
PluralMap(const PluralMap<T> &other) : fOtherVariant(other.fOtherVariant) {
|
||||
fVariants[0] = &fOtherVariant;
|
||||
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
|
||||
fVariants[i] = other.fVariants[i] ?
|
||||
new T(*other.fVariants[i]) : nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
PluralMap<T> &operator=(const PluralMap<T> &other) {
|
||||
if (this == &other) {
|
||||
return *this;
|
||||
}
|
||||
for (int32_t i = 0; i < UPRV_LENGTHOF(fVariants); ++i) {
|
||||
if (fVariants[i] != nullptr && other.fVariants[i] != nullptr) {
|
||||
*fVariants[i] = *other.fVariants[i];
|
||||
} else if (fVariants[i] != nullptr) {
|
||||
delete fVariants[i];
|
||||
fVariants[i] = nullptr;
|
||||
} else if (other.fVariants[i] != nullptr) {
|
||||
fVariants[i] = new T(*other.fVariants[i]);
|
||||
} else {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
~PluralMap() {
|
||||
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
|
||||
delete fVariants[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all mappings and makes 'other' point to the default value.
|
||||
*/
|
||||
void clear() {
|
||||
*fVariants[0] = T();
|
||||
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
|
||||
delete fVariants[i];
|
||||
fVariants[i] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterates through the mappings in this instance, set index to NONE
|
||||
* prior to using. Call next repeatedly to get the values until it
|
||||
* returns nullptr. Each time next returns, caller may pass index
|
||||
* to getCategoryName() to get the name of the plural category.
|
||||
* When this function returns nullptr, index is CATEGORY_COUNT
|
||||
*/
|
||||
const T *next(Category &index) const {
|
||||
int32_t idx = index;
|
||||
++idx;
|
||||
for (; idx < UPRV_LENGTHOF(fVariants); ++idx) {
|
||||
if (fVariants[idx] != nullptr) {
|
||||
index = static_cast<Category>(idx);
|
||||
return fVariants[idx];
|
||||
}
|
||||
}
|
||||
index = static_cast<Category>(idx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* non const version of next.
|
||||
*/
|
||||
T *nextMutable(Category &index) {
|
||||
const T *result = next(index);
|
||||
return const_cast<T *>(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the 'other' variant.
|
||||
* Same as calling get(OTHER).
|
||||
*/
|
||||
const T &getOther() const {
|
||||
return get(OTHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value associated with a category.
|
||||
* If no value found, or v is NONE or CATEGORY_COUNT, falls
|
||||
* back to returning the value for the 'other' category.
|
||||
*/
|
||||
const T &get(Category v) const {
|
||||
int32_t index = v;
|
||||
if (index < 0 || index >= UPRV_LENGTHOF(fVariants) || fVariants[index] == nullptr) {
|
||||
return *fVariants[0];
|
||||
}
|
||||
return *fVariants[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience routine to get the value by category name. Otherwise
|
||||
* works just like get(Category).
|
||||
*/
|
||||
const T &get(const char *category) const {
|
||||
return get(toCategory(category));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience routine to get the value by category name as a
|
||||
* UnicodeString. Otherwise works just like get(category).
|
||||
*/
|
||||
const T &get(const UnicodeString &category) const {
|
||||
return get(toCategory(category));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a pointer to the value associated with a category
|
||||
* that caller can safely modify. If the value was defaulting to the 'other'
|
||||
* variant because no explicit value was stored, this method creates a
|
||||
* new value using the default constructor at the returned pointer.
|
||||
*
|
||||
* @param category the category with the value to change.
|
||||
* @param status error returned here if index is NONE or CATEGORY_COUNT
|
||||
* or memory could not be allocated, or any other error happens.
|
||||
*/
|
||||
T *getMutable(
|
||||
Category category,
|
||||
UErrorCode &status) {
|
||||
return getMutable(category, nullptr, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience routine to get a mutable pointer to a value by category name.
|
||||
* Otherwise works just like getMutable(Category, UErrorCode &).
|
||||
* reports an error if the category name is invalid.
|
||||
*/
|
||||
T *getMutable(
|
||||
const char *category,
|
||||
UErrorCode &status) {
|
||||
return getMutable(toCategory(category), nullptr, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Just like getMutable(Category, UErrorCode &) but copies defaultValue to
|
||||
* returned pointer if it was defaulting to the 'other' variant
|
||||
* because no explicit value was stored.
|
||||
*/
|
||||
T *getMutableWithDefault(
|
||||
Category category,
|
||||
const T &defaultValue,
|
||||
UErrorCode &status) {
|
||||
return getMutable(category, &defaultValue, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this object equals rhs.
|
||||
*/
|
||||
UBool equals(
|
||||
const PluralMap<T> &rhs,
|
||||
UBool (*eqFunc)(const T &, const T &)) const {
|
||||
for (int32_t i = 0; i < UPRV_LENGTHOF(fVariants); ++i) {
|
||||
if (fVariants[i] == rhs.fVariants[i]) {
|
||||
continue;
|
||||
}
|
||||
if (fVariants[i] == nullptr || rhs.fVariants[i] == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (!eqFunc(*fVariants[i], *rhs.fVariants[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
T fOtherVariant;
|
||||
T* fVariants[6];
|
||||
|
||||
T *getMutable(
|
||||
Category category,
|
||||
const T *defaultValue,
|
||||
UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
int32_t index = category;
|
||||
if (index < 0 || index >= UPRV_LENGTHOF(fVariants)) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
if (fVariants[index] == nullptr) {
|
||||
fVariants[index] = defaultValue == nullptr ?
|
||||
new T() : new T(*defaultValue);
|
||||
}
|
||||
if (!fVariants[index]) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
return fVariants[index];
|
||||
}
|
||||
|
||||
void initializeNew() {
|
||||
fVariants[0] = &fOtherVariant;
|
||||
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
|
||||
fVariants[i] = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
334
engine/thirdparty/icu4c/common/propname.cpp
vendored
Normal file
334
engine/thirdparty/icu4c/common/propname.cpp
vendored
Normal file
|
|
@ -0,0 +1,334 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
* Created: October 30 2002
|
||||
* Since: ICU 2.4
|
||||
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "propname.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "umutex.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uinvchar.h"
|
||||
|
||||
#define INCLUDED_FROM_PROPNAME_CPP
|
||||
#include "propname_data.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/**
|
||||
* Get the next non-ignorable ASCII character from a property name
|
||||
* and lowercases it.
|
||||
* @return ((advance count for the name)<<8)|character
|
||||
*/
|
||||
static inline int32_t
|
||||
getASCIIPropertyNameChar(const char *name) {
|
||||
int32_t i;
|
||||
char c;
|
||||
|
||||
/* Ignore delimiters '-', '_', and ASCII White_Space */
|
||||
for(i=0;
|
||||
(c=name[i++])==0x2d || c==0x5f ||
|
||||
c==0x20 || (0x09<=c && c<=0x0d);
|
||||
) {}
|
||||
|
||||
if(c!=0) {
|
||||
return (i<<8)|(uint8_t)uprv_asciitolower((char)c);
|
||||
} else {
|
||||
return i<<8;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the next non-ignorable EBCDIC character from a property name
|
||||
* and lowercases it.
|
||||
* @return ((advance count for the name)<<8)|character
|
||||
*/
|
||||
static inline int32_t
|
||||
getEBCDICPropertyNameChar(const char *name) {
|
||||
int32_t i;
|
||||
char c;
|
||||
|
||||
/* Ignore delimiters '-', '_', and EBCDIC White_Space */
|
||||
for(i=0;
|
||||
(c=name[i++])==0x60 || c==0x6d ||
|
||||
c==0x40 || c==0x05 || c==0x15 || c==0x25 || c==0x0b || c==0x0c || c==0x0d;
|
||||
) {}
|
||||
|
||||
if(c!=0) {
|
||||
return (i<<8)|(uint8_t)uprv_ebcdictolower((char)c);
|
||||
} else {
|
||||
return i<<8;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unicode property names and property value names are compared "loosely".
|
||||
*
|
||||
* UCD.html 4.0.1 says:
|
||||
* For all property names, property value names, and for property values for
|
||||
* Enumerated, Binary, or Catalog properties, use the following
|
||||
* loose matching rule:
|
||||
*
|
||||
* LM3. Ignore case, whitespace, underscore ('_'), and hyphens.
|
||||
*
|
||||
* This function does just that, for (char *) name strings.
|
||||
* It is almost identical to ucnv_compareNames() but also ignores
|
||||
* C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uprv_compareASCIIPropertyNames(const char *name1, const char *name2) {
|
||||
int32_t rc, r1, r2;
|
||||
|
||||
for(;;) {
|
||||
r1=getASCIIPropertyNameChar(name1);
|
||||
r2=getASCIIPropertyNameChar(name2);
|
||||
|
||||
/* If we reach the ends of both strings then they match */
|
||||
if(((r1|r2)&0xff)==0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Compare the lowercased characters */
|
||||
if(r1!=r2) {
|
||||
rc=(r1&0xff)-(r2&0xff);
|
||||
if(rc!=0) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
name1+=r1>>8;
|
||||
name2+=r2>>8;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * EBCDIC counterpart of uprv_compareASCIIPropertyNames(): compares two
 * property names loosely, ignoring the delimiters and white space skipped by
 * getEBCDICPropertyNameChar() and comparing lowercased characters.
 *
 * @param name1 first NUL-terminated name (EBCDIC).
 * @param name2 second NUL-terminated name (EBCDIC).
 * @return 0 if the names match loosely; otherwise the difference between
 *         the first pair of differing lowercased characters (name1 minus name2).
 */
U_CAPI int32_t U_EXPORT2
uprv_compareEBCDICPropertyNames(const char *name1, const char *name2) {
    int32_t rc, r1, r2;

    for(;;) {
        r1=getEBCDICPropertyNameChar(name1);
        r2=getEBCDICPropertyNameChar(name2);

        /* If we reach the ends of both strings then they match */
        if(((r1|r2)&0xff)==0) {
            return 0;
        }

        /* Compare the lowercased characters */
        if(r1!=r2) {
            /* r1 and r2 may differ only in the advance count; compare the low bytes. */
            rc=(r1&0xff)-(r2&0xff);
            if(rc!=0) {
                return rc;
            }
        }

        name1+=r1>>8;
        name2+=r2>>8;
    }
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
int32_t PropNameData::findProperty(int32_t property) {
|
||||
int32_t i=1; // valueMaps index, initially after numRanges
|
||||
for(int32_t numRanges=valueMaps[0]; numRanges>0; --numRanges) {
|
||||
// Read and skip the start and limit of this range.
|
||||
int32_t start=valueMaps[i];
|
||||
int32_t limit=valueMaps[i+1];
|
||||
i+=2;
|
||||
if(property<start) {
|
||||
break;
|
||||
}
|
||||
if(property<limit) {
|
||||
return i+(property-start)*2;
|
||||
}
|
||||
i+=(limit-start)*2; // Skip all entries for this range.
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t PropNameData::findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value) {
|
||||
if(valueMapIndex==0) {
|
||||
return 0; // The property does not have named values.
|
||||
}
|
||||
++valueMapIndex; // Skip the BytesTrie offset.
|
||||
int32_t numRanges=valueMaps[valueMapIndex++];
|
||||
if(numRanges<0x10) {
|
||||
// Ranges of values.
|
||||
for(; numRanges>0; --numRanges) {
|
||||
// Read and skip the start and limit of this range.
|
||||
int32_t start=valueMaps[valueMapIndex];
|
||||
int32_t limit=valueMaps[valueMapIndex+1];
|
||||
valueMapIndex+=2;
|
||||
if(value<start) {
|
||||
break;
|
||||
}
|
||||
if(value<limit) {
|
||||
return valueMaps[valueMapIndex+value-start];
|
||||
}
|
||||
valueMapIndex+=limit-start; // Skip all entries for this range.
|
||||
}
|
||||
} else {
|
||||
// List of values.
|
||||
int32_t valuesStart=valueMapIndex;
|
||||
int32_t nameGroupOffsetsStart=valueMapIndex+numRanges-0x10;
|
||||
do {
|
||||
int32_t v=valueMaps[valueMapIndex];
|
||||
if(value<v) {
|
||||
break;
|
||||
}
|
||||
if(value==v) {
|
||||
return valueMaps[nameGroupOffsetsStart+valueMapIndex-valuesStart];
|
||||
}
|
||||
} while(++valueMapIndex<nameGroupOffsetsStart);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char *PropNameData::getName(const char *nameGroup, int32_t nameIndex) {
|
||||
int32_t numNames=*nameGroup++;
|
||||
if(nameIndex<0 || numNames<=nameIndex) {
|
||||
return nullptr;
|
||||
}
|
||||
// Skip nameIndex names.
|
||||
for(; nameIndex>0; --nameIndex) {
|
||||
nameGroup=uprv_strchr(nameGroup, 0)+1;
|
||||
}
|
||||
if(*nameGroup==0) {
|
||||
return nullptr; // no name (Property[Value]Aliases.txt has "n/a")
|
||||
}
|
||||
return nameGroup;
|
||||
}
|
||||
|
||||
/**
 * Walks a BytesTrie with the loosely-normalized form of name.
 * Each character is converted with uprv_invCharToLowercaseAscii(); the
 * delimiters '-', '_' and ASCII white space are skipped.
 *
 * @param trie the trie to walk; its state is advanced by this call, so the
 *             caller can read the value afterwards.
 * @param name NUL-terminated name, or nullptr.
 * @return true if the walk ends on a trie node that has a value; false if
 *         name is nullptr, the trie has no continuation, or no value is
 *         reached.
 */
UBool PropNameData::containsName(BytesTrie &trie, const char *name) {
    if(name==nullptr) {
        return false;
    }
    UStringTrieResult result=USTRINGTRIE_NO_VALUE;
    char c;
    while((c=*name++)!=0) {
        c=uprv_invCharToLowercaseAscii(c);
        // Ignore delimiters '-', '_', and ASCII White_Space.
        if(c==0x2d || c==0x5f || c==0x20 || (0x09<=c && c<=0x0d)) {
            continue;
        }
        // The previous step reported that no further byte can follow.
        if(!USTRINGTRIE_HAS_NEXT(result)) {
            return false;
        }
        result=trie.next((uint8_t)c);
    }
    return USTRINGTRIE_HAS_VALUE(result);
}
|
||||
|
||||
const char *PropNameData::getPropertyName(int32_t property, int32_t nameChoice) {
|
||||
int32_t valueMapIndex=findProperty(property);
|
||||
if(valueMapIndex==0) {
|
||||
return nullptr; // Not a known property.
|
||||
}
|
||||
return getName(nameGroups+valueMaps[valueMapIndex], nameChoice);
|
||||
}
|
||||
|
||||
const char *PropNameData::getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice) {
|
||||
int32_t valueMapIndex=findProperty(property);
|
||||
if(valueMapIndex==0) {
|
||||
return nullptr; // Not a known property.
|
||||
}
|
||||
int32_t nameGroupOffset=findPropertyValueNameGroup(valueMaps[valueMapIndex+1], value);
|
||||
if(nameGroupOffset==0) {
|
||||
return nullptr;
|
||||
}
|
||||
return getName(nameGroups+nameGroupOffset, nameChoice);
|
||||
}
|
||||
|
||||
int32_t PropNameData::getPropertyOrValueEnum(int32_t bytesTrieOffset, const char *alias) {
|
||||
BytesTrie trie(bytesTries+bytesTrieOffset);
|
||||
if(containsName(trie, alias)) {
|
||||
return trie.getValue();
|
||||
} else {
|
||||
return UCHAR_INVALID_CODE;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t PropNameData::getPropertyEnum(const char *alias) {
|
||||
return getPropertyOrValueEnum(0, alias);
|
||||
}
|
||||
|
||||
int32_t PropNameData::getPropertyValueEnum(int32_t property, const char *alias) {
|
||||
int32_t valueMapIndex=findProperty(property);
|
||||
if(valueMapIndex==0) {
|
||||
return UCHAR_INVALID_CODE; // Not a known property.
|
||||
}
|
||||
valueMapIndex=valueMaps[valueMapIndex+1];
|
||||
if(valueMapIndex==0) {
|
||||
return UCHAR_INVALID_CODE; // The property does not have named values.
|
||||
}
|
||||
// valueMapIndex is the start of the property's valueMap,
|
||||
// where the first word is the BytesTrie offset.
|
||||
return getPropertyOrValueEnum(valueMaps[valueMapIndex], alias);
|
||||
}
|
||||
U_NAMESPACE_END
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Public API implementation
|
||||
|
||||
/**
 * C API wrapper around PropNameData::getPropertyName().
 *
 * @param property   the property.
 * @param nameChoice which name of the property to return.
 * @return the name, or nullptr if PropNameData::getPropertyName() finds none.
 */
U_CAPI const char* U_EXPORT2
u_getPropertyName(UProperty property,
                  UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
    // The nameChoice is really an integer with a couple of named constants.
    // Unicode allows for names other than short and long ones.
    // If present, these will be returned for U_LONG_PROPERTY_NAME + i, where i=1, 2,...
    U_NAMESPACE_USE
    return PropNameData::getPropertyName(property, nameChoice);
}
|
||||
|
||||
/**
 * C API wrapper around PropNameData::getPropertyEnum().
 *
 * @param alias the property name to look up.
 * @return the lookup result cast to UProperty (UCHAR_INVALID_CODE when the
 *         alias is not found).
 */
U_CAPI UProperty U_EXPORT2
u_getPropertyEnum(const char* alias) {
    U_NAMESPACE_USE
    return (UProperty)PropNameData::getPropertyEnum(alias);
}
|
||||
|
||||
/**
 * C API wrapper around PropNameData::getPropertyValueName().
 *
 * @param property   the property.
 * @param value      the property value.
 * @param nameChoice which name of the value to return.
 * @return the name, or nullptr if PropNameData::getPropertyValueName() finds none.
 */
U_CAPI const char* U_EXPORT2
u_getPropertyValueName(UProperty property,
                       int32_t value,
                       UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
    // The nameChoice is really an integer with a couple of named constants.
    // Unicode allows for names other than short and long ones.
    // If present, these will be returned for U_LONG_PROPERTY_NAME + i, where i=1, 2,...
    U_NAMESPACE_USE
    return PropNameData::getPropertyValueName(property, value, nameChoice);
}
|
||||
|
||||
/**
 * C API wrapper around PropNameData::getPropertyValueEnum().
 *
 * @param property the property whose value names are searched.
 * @param alias    the value name to look up.
 * @return the lookup result (UCHAR_INVALID_CODE when nothing is found).
 */
U_CAPI int32_t U_EXPORT2
u_getPropertyValueEnum(UProperty property,
                       const char* alias) {
    U_NAMESPACE_USE
    return PropNameData::getPropertyValueEnum(property, alias);
}
|
||||
|
||||
/**
 * Returns the long property value name of a script code, as reported by
 * u_getPropertyValueName() for UCHAR_SCRIPT.
 *
 * @param scriptCode the script code.
 * @return the name, or whatever u_getPropertyValueName() returns when it
 *         has none.
 */
U_CAPI const char*  U_EXPORT2
uscript_getName(UScriptCode scriptCode){
    return u_getPropertyValueName(UCHAR_SCRIPT, scriptCode,
                                  U_LONG_PROPERTY_NAME);
}
|
||||
|
||||
/**
 * Returns the short property value name of a script code, as reported by
 * u_getPropertyValueName() for UCHAR_SCRIPT.
 *
 * @param scriptCode the script code.
 * @return the name, or whatever u_getPropertyValueName() returns when it
 *         has none.
 */
U_CAPI const char*  U_EXPORT2
uscript_getShortName(UScriptCode scriptCode){
    return u_getPropertyValueName(UCHAR_SCRIPT, scriptCode,
                                  U_SHORT_PROPERTY_NAME);
}
|
||||
212
engine/thirdparty/icu4c/common/propname.h
vendored
Normal file
212
engine/thirdparty/icu4c/common/propname.h
vendored
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
* Created: October 30 2002
|
||||
* Since: ICU 2.4
|
||||
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef PROPNAME_H
|
||||
#define PROPNAME_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "udataswp.h"
|
||||
#include "uprops.h"
|
||||
|
||||
/*
|
||||
* This header defines the in-memory layout of the property names data
|
||||
* structure representing the UCD data files PropertyAliases.txt and
|
||||
* PropertyValueAliases.txt. It is used by:
|
||||
* propname.cpp - reads data
|
||||
* genpname - creates data
|
||||
*/
|
||||
|
||||
/* low-level char * property name comparison -------------------------------- */
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/**
|
||||
* \var uprv_comparePropertyNames
|
||||
* Unicode property names and property value names are compared "loosely".
|
||||
*
|
||||
* UCD.html 4.0.1 says:
|
||||
* For all property names, property value names, and for property values for
|
||||
* Enumerated, Binary, or Catalog properties, use the following
|
||||
* loose matching rule:
|
||||
*
|
||||
* LM3. Ignore case, whitespace, underscore ('_'), and hyphens.
|
||||
*
|
||||
* This function does just that, for (char *) name strings.
|
||||
* It is almost identical to ucnv_compareNames() but also ignores
|
||||
* C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uprv_compareASCIIPropertyNames(const char *name1, const char *name2);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uprv_compareEBCDICPropertyNames(const char *name1, const char *name2);
|
||||
|
||||
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
|
||||
# define uprv_comparePropertyNames uprv_compareASCIIPropertyNames
|
||||
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
|
||||
# define uprv_comparePropertyNames uprv_compareEBCDICPropertyNames
|
||||
#else
|
||||
# error U_CHARSET_FAMILY is not valid
|
||||
#endif
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
/* UDataMemory structure and signatures ------------------------------------- */
|
||||
|
||||
#define PNAME_DATA_NAME "pnames"
|
||||
#define PNAME_DATA_TYPE "icu"
|
||||
|
||||
/* Fields in UDataInfo: */
|
||||
|
||||
/* PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler */
|
||||
#define PNAME_SIG_0 ((uint8_t)0x70) /* p */
|
||||
#define PNAME_SIG_1 ((uint8_t)0x6E) /* n */
|
||||
#define PNAME_SIG_2 ((uint8_t)0x61) /* a */
|
||||
#define PNAME_SIG_3 ((uint8_t)0x6D) /* m */
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class PropNameData {
|
||||
public:
|
||||
enum {
|
||||
// Byte offsets from the start of the data, after the generic header.
|
||||
IX_VALUE_MAPS_OFFSET,
|
||||
IX_BYTE_TRIES_OFFSET,
|
||||
IX_NAME_GROUPS_OFFSET,
|
||||
IX_RESERVED3_OFFSET,
|
||||
IX_RESERVED4_OFFSET,
|
||||
IX_TOTAL_SIZE,
|
||||
|
||||
// Other values.
|
||||
IX_MAX_NAME_LENGTH,
|
||||
IX_RESERVED7,
|
||||
IX_COUNT
|
||||
};
|
||||
|
||||
static const char *getPropertyName(int32_t property, int32_t nameChoice);
|
||||
static const char *getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice);
|
||||
|
||||
static int32_t getPropertyEnum(const char *alias);
|
||||
static int32_t getPropertyValueEnum(int32_t property, const char *alias);
|
||||
|
||||
private:
|
||||
static int32_t findProperty(int32_t property);
|
||||
static int32_t findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value);
|
||||
static const char *getName(const char *nameGroup, int32_t nameIndex);
|
||||
static UBool containsName(BytesTrie &trie, const char *name);
|
||||
|
||||
static int32_t getPropertyOrValueEnum(int32_t bytesTrieOffset, const char *alias);
|
||||
|
||||
static const int32_t indexes[];
|
||||
static const int32_t valueMaps[];
|
||||
static const uint8_t bytesTries[];
|
||||
static const char nameGroups[];
|
||||
};
|
||||
|
||||
/*
|
||||
* pnames.icu formatVersion 2
|
||||
*
|
||||
* formatVersion 2 is new in ICU 4.8.
|
||||
* In ICU 4.8, the pnames.icu data file is used only in ICU4J.
|
||||
* ICU4C 4.8 has the same data structures hardcoded in source/common/propname_data.h.
|
||||
*
|
||||
* For documentation of pnames.icu formatVersion 1 see ICU4C 4.6 (2010-dec-01)
|
||||
* or earlier versions of this header file (source/common/propname.h).
|
||||
*
|
||||
* The pnames.icu begins with the standard ICU DataHeader/UDataInfo.
|
||||
* After that:
|
||||
*
|
||||
* int32_t indexes[8];
|
||||
*
|
||||
* (See the PropNameData::IX_... constants.)
|
||||
*
|
||||
* The first 6 indexes are byte offsets from the beginning of the data
|
||||
* (beginning of indexes[]) to following structures.
|
||||
* The length of each structure is the difference between its offset
|
||||
* and the next one.
|
||||
* All offsets are filled in: Where there is no data between two offsets,
|
||||
* those two offsets are the same.
|
||||
* The last offset (indexes[PropNameData::IX_TOTAL_SIZE]) indicates the
|
||||
* total number of bytes in the file. (Not counting the standard headers.)
|
||||
*
|
||||
* The seventh index (indexes[PropNameData::IX_MAX_NAME_LENGTH]) has the
|
||||
* maximum length of any Unicode property (or property value) alias.
|
||||
* (Without normalization, that is, including underscores etc.)
|
||||
*
|
||||
* int32_t valueMaps[];
|
||||
*
|
||||
* The valueMaps[] begins with a map from UProperty enums to properties,
|
||||
* followed by the per-property value maps from property values to names,
|
||||
* for those properties that have named values.
|
||||
* (Binary & enumerated, plus General_Category_Mask.)
|
||||
*
|
||||
* valueMaps[0] contains the number of UProperty enum ranges.
|
||||
* For each range:
|
||||
* int32_t start, limit -- first and last+1 UProperty enum of a dense range
|
||||
* Followed by (limit-start) pairs of
|
||||
* int32_t nameGroupOffset;
|
||||
* Offset into nameGroups[] for the property's names/aliases.
|
||||
* int32_t valueMapIndex;
|
||||
* Offset of the property's value map in the valueMaps[] array.
|
||||
* If the valueMapIndex is 0, then the property does not have named values.
|
||||
*
|
||||
* For each property's value map:
|
||||
* int32_t bytesTrieOffset; -- Offset into bytesTries[] for name->value mapping.
|
||||
* int32_t numRanges;
|
||||
* If numRanges is in the range 1..15, then that many ranges of values follow.
|
||||
* Per range:
|
||||
* int32_t start, limit -- first and last+1 property value of a range
|
||||
* Followed by (limit-start) entries of
|
||||
* int32_t nameGroupOffset;
|
||||
* Offset into nameGroups[] for the property value's names/aliases.
|
||||
* If the nameGroupOffset is 0, then this is not a named value for this property.
|
||||
* (That is, the ranges need not be dense.)
|
||||
* If numRanges is >=0x10, then (numRanges-0x10) sorted values
|
||||
* and then (numRanges-0x10) corresponding nameGroupOffsets follow.
|
||||
* Values are sorted as signed integers.
|
||||
* In this case, the set of values is dense; no nameGroupOffset will be 0.
|
||||
*
|
||||
* For both properties and property values, ranges are sorted by their start/limit values.
|
||||
*
|
||||
* uint8_t bytesTries[];
|
||||
*
|
||||
* This is a sequence of BytesTrie structures, byte-serialized tries for
|
||||
* mapping from names/aliases to values.
|
||||
* The first one maps from property names/aliases to UProperty enum constants.
|
||||
* The following ones are indexed by property value map bytesTrieOffsets
|
||||
* for mapping each property's names/aliases to their property values.
|
||||
*
|
||||
* char nameGroups[];
|
||||
*
|
||||
* This is a sequence of property name groups.
|
||||
* Each group is a list of names/aliases (invariant-character strings) for
|
||||
* one property or property value, in the order of UPropertyNameChoice.
|
||||
* The first byte of each group is the number of names in the group.
|
||||
* It is followed by that many NUL-terminated strings.
|
||||
* The first string is for the short name; if there is no short name,
|
||||
* then the first string is empty.
|
||||
* The second string is the long name. Further strings are additional aliases.
|
||||
*
|
||||
* The first name group is for a property rather than a property value,
|
||||
* so that a nameGroupOffset of 0 can be used to indicate "no value"
|
||||
* in a property's sparse value ranges.
|
||||
*/
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
2034
engine/thirdparty/icu4c/common/propname_data.h
vendored
Normal file
2034
engine/thirdparty/icu4c/common/propname_data.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
529
engine/thirdparty/icu4c/common/propsvec.cpp
vendored
Normal file
529
engine/thirdparty/icu4c/common/propsvec.cpp
vendored
Normal file
|
|
@ -0,0 +1,529 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: propsvec.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2002feb22
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Store bits (Unicode character properties) in bit set vectors.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "cmemory.h"
|
||||
#include "utrie.h"
|
||||
#include "utrie2.h"
|
||||
#include "uarrsort.h"
|
||||
#include "propsvec.h"
|
||||
#include "uassert.h"
|
||||
|
||||
struct UPropsVectors {
|
||||
uint32_t *v;
|
||||
int32_t columns; /* number of columns, plus two for start & limit values */
|
||||
int32_t maxRows;
|
||||
int32_t rows;
|
||||
int32_t prevRow; /* search optimization: remember last row seen */
|
||||
UBool isCompacted;
|
||||
};
|
||||
|
||||
#define UPVEC_INITIAL_ROWS (1<<12)
|
||||
#define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
|
||||
#define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
|
||||
|
||||
/*
 * Create a properties-vectors builder with `columns` value cells per row.
 *
 * The new object starts with one row covering [0..0x110000[ and one row for
 * each special pseudo code point UPVEC_FIRST_SPECIAL_CP..UPVEC_MAX_CP;
 * all value cells are 0.
 *
 * @param columns    Number of value integers (uint32_t) per row, at least 1.
 *                   Values so large that the initial allocation size cannot be
 *                   computed in a signed 32-bit int are rejected.
 * @param pErrorCode In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for a
 *                   bad column count and U_MEMORY_ALLOCATION_ERROR if an
 *                   allocation fails.
 * @return the new object, or nullptr on failure or if *pErrorCode was
 *         already a failure on entry.
 */
U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns, UErrorCode *pErrorCode) {
    UPropsVectors *pv;
    uint32_t *v, *row;
    uint32_t cp;

    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    /*
     * The upper bound keeps both (columns+2) and
     * UPVEC_INITIAL_ROWS*(columns+2)*4 within the signed 32-bit range;
     * beyond it the size arithmetic below used to overflow.
     */
    if(columns<1 || columns>(0x7fffffff/(UPVEC_INITIAL_ROWS*4))-2) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    columns+=2; /* count range start and limit columns */

    pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
    v=(uint32_t *)uprv_malloc((size_t)UPVEC_INITIAL_ROWS*columns*4);
    if(pv==nullptr || v==nullptr) {
        /* one of the two may have succeeded; release both */
        uprv_free(pv);
        uprv_free(v);
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    uprv_memset(pv, 0, sizeof(UPropsVectors));
    pv->v=v;
    pv->columns=columns;
    pv->maxRows=UPVEC_INITIAL_ROWS;
    /* one all-Unicode row plus one row per special pseudo code point */
    pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);

    /* set the all-Unicode row and the special-value rows */
    row=pv->v;
    uprv_memset(row, 0, (size_t)pv->rows*columns*4);
    row[0]=0;
    row[1]=0x110000;
    row+=columns;
    for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
        row[0]=cp;
        row[1]=cp+1;
        row+=columns;
    }
    return pv;
}
|
||||
|
||||
/*
 * Release a UPropsVectors object together with its row storage.
 * Passing nullptr is allowed and does nothing.
 */
U_CAPI void U_EXPORT2
upvec_close(UPropsVectors *pv) {
    if(pv==nullptr) {
        return;
    }
    uprv_free(pv->v);
    uprv_free(pv);
}
|
||||
|
||||
static uint32_t *
|
||||
_findRow(UPropsVectors *pv, UChar32 rangeStart) {
|
||||
uint32_t *row;
|
||||
int32_t columns, i, start, limit, prevRow;
|
||||
|
||||
columns=pv->columns;
|
||||
limit=pv->rows;
|
||||
prevRow=pv->prevRow;
|
||||
|
||||
/* check the vicinity of the last-seen row (start searching with an unrolled loop) */
|
||||
row=pv->v+prevRow*columns;
|
||||
if(rangeStart>=(UChar32)row[0]) {
|
||||
if(rangeStart<(UChar32)row[1]) {
|
||||
/* same row as last seen */
|
||||
return row;
|
||||
} else if(rangeStart<(UChar32)(row+=columns)[1]) {
|
||||
/* next row after the last one */
|
||||
pv->prevRow=prevRow+1;
|
||||
return row;
|
||||
} else if(rangeStart<(UChar32)(row+=columns)[1]) {
|
||||
/* second row after the last one */
|
||||
pv->prevRow=prevRow+2;
|
||||
return row;
|
||||
} else if((rangeStart-(UChar32)row[1])<10) {
|
||||
/* we are close, continue looping */
|
||||
prevRow+=2;
|
||||
do {
|
||||
++prevRow;
|
||||
row+=columns;
|
||||
} while(rangeStart>=(UChar32)row[1]);
|
||||
pv->prevRow=prevRow;
|
||||
return row;
|
||||
}
|
||||
} else if(rangeStart<(UChar32)pv->v[1]) {
|
||||
/* the very first row */
|
||||
pv->prevRow=0;
|
||||
return pv->v;
|
||||
}
|
||||
|
||||
/* do a binary search for the start of the range */
|
||||
start=0;
|
||||
while(start<limit-1) {
|
||||
i=(start+limit)/2;
|
||||
row=pv->v+i*columns;
|
||||
if(rangeStart<(UChar32)row[0]) {
|
||||
limit=i;
|
||||
} else if(rangeStart<(UChar32)row[1]) {
|
||||
pv->prevRow=i;
|
||||
return row;
|
||||
} else {
|
||||
start=i;
|
||||
}
|
||||
}
|
||||
|
||||
/* must be found because all ranges together always cover all of Unicode */
|
||||
pv->prevRow=start;
|
||||
return pv->v+start*columns;
|
||||
}
|
||||
|
||||
/*
 * In all rows for code points [start..end], replace the bits selected by
 * `mask` in value column `column` with the corresponding bits of `value`.
 *
 * The first and/or last affected row is split when it only partially
 * overlaps [start..end] and its current masked value differs from the new
 * one; the row array is reallocated when the split rows do not fit.
 *
 * @param pv         The builder; must not be nullptr.
 * @param start      First code point of the range, 0 or greater.
 * @param end        Last code point of the range (inclusive),
 *                   start<=end<=UPVEC_MAX_CP.
 * @param column     Value column index, 0..(number of value columns - 1).
 * @param value      New bits; ANDed with mask before use.
 * @param mask       Bits of the column to replace.
 * @param pErrorCode In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for bad
 *                   arguments, U_NO_WRITE_PERMISSION after upvec_compact(),
 *                   U_MEMORY_ALLOCATION_ERROR if growing fails, and
 *                   U_INTERNAL_PROGRAM_ERROR if UPVEC_MAX_ROWS is exhausted.
 */
U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors *pv,
               UChar32 start, UChar32 end,
               int32_t column,
               uint32_t value, uint32_t mask,
               UErrorCode *pErrorCode) {
    uint32_t *firstRow, *lastRow;
    int32_t columns;
    UChar32 limit;
    UBool splitFirstRow, splitLastRow;

    /* argument checking */
    if(U_FAILURE(*pErrorCode)) {
        return;
    }
    if( pv==nullptr ||
        start<0 || start>end || end>UPVEC_MAX_CP ||
        column<0 || column>=(pv->columns-2)
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(pv->isCompacted) {
        *pErrorCode=U_NO_WRITE_PERMISSION;
        return;
    }
    limit=end+1;

    /* initialize */
    columns=pv->columns;
    column+=2; /* skip range start and limit columns */
    value&=mask;

    /* find the rows whose ranges overlap with the input range */

    /* find the first and last rows, always successful */
    firstRow=_findRow(pv, start);
    lastRow=_findRow(pv, end);

    /*
     * Rows need to be split if they partially overlap with the
     * input range (only possible for the first and last rows)
     * and if their value differs from the input value.
     */
    splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
    splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));

    /* split first/last rows if necessary */
    if(splitFirstRow || splitLastRow) {
        int32_t count, rows;

        rows=pv->rows;
        if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
            uint32_t *newVectors;
            int32_t newMaxRows;

            if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
                newMaxRows=UPVEC_MEDIUM_ROWS;
            } else if(pv->maxRows<UPVEC_MAX_ROWS) {
                newMaxRows=UPVEC_MAX_ROWS;
            } else {
                /* Implementation bug, or UPVEC_MAX_ROWS too low. */
                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                return;
            }
            /* compute the byte size in size_t, like the copy below, to avoid int overflow */
            newVectors=(uint32_t *)uprv_malloc((size_t)newMaxRows*columns*4);
            if(newVectors==nullptr) {
                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4);
            /* rebase the row pointers onto the new array before the old one is freed */
            firstRow=newVectors+(firstRow-pv->v);
            lastRow=newVectors+(lastRow-pv->v);
            uprv_free(pv->v);
            pv->v=newVectors;
            pv->maxRows=newMaxRows;
        }

        /* count the number of row cells to move after the last row, and move them */
        count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
        if(count>0) {
            uprv_memmove(
                lastRow+(1+splitFirstRow+splitLastRow)*columns,
                lastRow+columns,
                (size_t)count*4);
        }
        pv->rows=rows+splitFirstRow+splitLastRow;

        /* split the first row, and move the firstRow pointer to the second part */
        if(splitFirstRow) {
            /* copy all affected rows up one and move the lastRow pointer */
            count = (int32_t)((lastRow-firstRow)+columns);
            uprv_memmove(firstRow+columns, firstRow, (size_t)count*4);
            lastRow+=columns;

            /* split the range and move the firstRow pointer */
            firstRow[1]=firstRow[columns]=(uint32_t)start;
            firstRow+=columns;
        }

        /* split the last row */
        if(splitLastRow) {
            /* copy the last row data */
            uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4);

            /* split the range; lastRow stays on the first part, which ends at limit */
            lastRow[1]=lastRow[columns]=(uint32_t)limit;
        }
    }

    /* set the "row last seen" to the last row for the range */
    pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);

    /* set the input value in all remaining rows */
    firstRow+=column;
    lastRow+=column;
    mask=~mask;
    for(;;) {
        *firstRow=(*firstRow&mask)|value;
        if(firstRow==lastRow) {
            break;
        }
        firstRow+=columns;
    }
}
|
||||
|
||||
/*
 * Return the value stored in value column `column` of the row that contains
 * code point c.
 *
 * Returns 0 if the vectors have been compacted, if c is outside
 * 0..UPVEC_MAX_CP, or if column is not a valid value column index.
 */
U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
    if(pv->isCompacted) {
        return 0;
    }
    if(c<0 || c>UPVEC_MAX_CP) {
        return 0;
    }
    if(column<0 || column>=(pv->columns-2)) {
        return 0;
    }
    /* _findRow() updates pv->prevRow, so it needs a non-const pointer */
    const uint32_t *found=_findRow((UPropsVectors *)pv, c);
    return found[2+column];
}
|
||||
|
||||
/*
 * Return a pointer to the value cells of row rowIndex and optionally report
 * the row's code point range.
 *
 * @param pv          The builder.
 * @param rowIndex    Row index, 0..(number of rows - 1).
 * @param pRangeStart If not nullptr, receives the first code point of the row.
 * @param pRangeEnd   If not nullptr, receives the last code point of the row
 *                    (inclusive, that is, the stored limit minus 1).
 * @return pointer to the first value cell of the row, or nullptr if pv is
 *         nullptr, if the vectors have been compacted, or if rowIndex is out
 *         of range.
 */
U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
             UChar32 *pRangeStart, UChar32 *pRangeEnd) {
    uint32_t *row;
    int32_t columns;

    /* a null pv is an illegal argument, reported as nullptr like the other ones */
    if(pv==nullptr || pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
        return nullptr;
    }

    columns=pv->columns;
    row=pv->v+rowIndex*columns;
    if(pRangeStart!=nullptr) {
        *pRangeStart=(UChar32)row[0];
    }
    if(pRangeEnd!=nullptr) {
        *pRangeEnd=(UChar32)row[1]-1;
    }
    /* skip the start and limit cells */
    return row+2;
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
upvec_compareRows(const void *context, const void *l, const void *r) {
|
||||
const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
|
||||
const UPropsVectors *pv=(const UPropsVectors *)context;
|
||||
int32_t i, count, columns;
|
||||
|
||||
count=columns=pv->columns; /* includes start/limit columns */
|
||||
|
||||
/* start comparing after start/limit but wrap around to them */
|
||||
i=2;
|
||||
do {
|
||||
if(left[i]!=right[i]) {
|
||||
return left[i]<right[i] ? -1 : 1;
|
||||
}
|
||||
if(++i==columns) {
|
||||
i=0;
|
||||
}
|
||||
} while(--count>0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Compact the vectors: sort the rows by content, keep only unique value
 * vectors at the start of the array, and report every row to the handler.
 *
 * The handler is called in this order:
 *  1. once per row for a special pseudo code point
 *     (start==end>=UPVEC_FIRST_SPECIAL_CP),
 *  2. once with start==end==UPVEC_START_REAL_VALUES_CP, where rowIndex is the
 *     length of the compacted array,
 *  3. once per row of real values.
 * The rowIndex passed to the handler is the offset of the row's value vector
 * in the compacted array.
 *
 * After a successful call the object is marked as compacted and pv->rows is
 * the number of unique value vectors. Calling it again is a no-op.
 *
 * @param pv         The builder; must not be nullptr.
 * @param handler    Callback; must not be nullptr.
 * @param context    Passed through to the handler unchanged.
 * @param pErrorCode In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if pv
 *                   or handler is nullptr; failures from sorting or from the
 *                   handler stop the compaction.
 */
U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
    uint32_t *row;
    int32_t i, columns, valueColumns, rows, count;
    UChar32 start, limit;

    /* argument checking */
    if(U_FAILURE(*pErrorCode)) {
        return;
    }
    if(pv==nullptr || handler==nullptr) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(pv->isCompacted) {
        return;
    }

    /* Set the flag now: Sorting and compacting destroys the builder data structure. */
    pv->isCompacted=true;

    rows=pv->rows;
    columns=pv->columns;
    U_ASSERT(columns>=3); /* upvec_open asserts this */
    valueColumns=columns-2; /* not counting start & limit */

    /* sort the properties vectors to find unique vector values */
    uprv_sortArray(pv->v, rows, columns*4,
                   upvec_compareRows, pv, false, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /*
     * Find and set the special values.
     * This has to do almost the same work as the compaction below,
     * to find the indexes where the special-value rows will move.
     */
    row=pv->v;
    count=-valueColumns; /* negative until the first row has been counted */
    for(i=0; i<rows; ++i) {
        start=(UChar32)row[0];

        /* count a new values vector if it is different from the current one */
        /* (row-valueColumns is where the previous row's value cells begin) */
        if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
            count+=valueColumns;
        }

        if(start>=UPVEC_FIRST_SPECIAL_CP) {
            handler(context, start, start, count, row+2, valueColumns, pErrorCode);
            if(U_FAILURE(*pErrorCode)) {
                return;
            }
        }

        row+=columns;
    }

    /* count is at the beginning of the last vector, add valueColumns to include that last vector */
    count+=valueColumns;

    /* Call the handler once more to signal the start of delivering real values. */
    handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
            count, row-valueColumns, valueColumns, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /*
     * Move vector contents up to a contiguous array with only unique
     * vector values, and call the handler function for each vector.
     *
     * This destroys the Properties Vector structure and replaces it
     * with an array of just vector values.
     */
    row=pv->v;
    count=-valueColumns;
    for(i=0; i<rows; ++i) {
        /* fetch these first before memmove() may overwrite them */
        start=(UChar32)row[0];
        limit=(UChar32)row[1];

        /* add a new values vector if it is different from the current one */
        if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
            count+=valueColumns;
            uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4);
        }

        if(start<UPVEC_FIRST_SPECIAL_CP) {
            handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
            if(U_FAILURE(*pErrorCode)) {
                return;
            }
        }

        row+=columns;
    }

    /* count is at the beginning of the last vector, add one to include that last vector */
    pv->rows=count/valueColumns+1;
}
|
||||
|
||||
/*
 * Give read access to the compacted vectors array.
 *
 * Returns nullptr (and leaves the output parameters untouched) if
 * upvec_compact() has not been called. Otherwise stores the number of rows
 * and of value columns through pRows / pColumns (each may be nullptr) and
 * returns the array, which remains owned by pv.
 */
U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
    if(!pv->isCompacted) {
        return nullptr;
    }

    const bool wantsRows=(pRows!=nullptr);
    const bool wantsColumns=(pColumns!=nullptr);
    if(wantsRows) {
        *pRows=pv->rows;
    }
    if(wantsColumns) {
        /* do not count the start & limit columns */
        *pColumns=pv->columns-2;
    }
    return pv->v;
}
|
||||
|
||||
/*
 * Return a newly allocated copy of the compacted vectors array.
 *
 * @param pv         The compacted object; must not be nullptr.
 * @param pRows      If not nullptr, receives the number of rows.
 * @param pColumns   If not nullptr, receives the number of value columns.
 * @param pErrorCode In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if pv
 *                   is nullptr or has not been compacted, and to
 *                   U_MEMORY_ALLOCATION_ERROR if the copy cannot be allocated.
 * @return the copy, allocated with uprv_malloc(), or nullptr on failure.
 */
U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors *pv,
                 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
    uint32_t *clonedArray;
    int32_t byteLength;

    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    /* a null pv is reported like the other illegal arguments instead of being dereferenced */
    if(pv==nullptr || !pv->isCompacted) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    /* after compaction each row consists of the value columns only */
    byteLength=pv->rows*(pv->columns-2)*4;
    clonedArray=(uint32_t *)uprv_malloc(byteLength);
    if(clonedArray==nullptr) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    uprv_memcpy(clonedArray, pv->v, byteLength);
    if(pRows!=nullptr) {
        *pRows=pv->rows;
    }
    if(pColumns!=nullptr) {
        *pColumns=pv->columns-2;
    }
    return clonedArray;
}
|
||||
|
||||
/*
 * Compact pv with upvec_compactToUTrie2Handler() as the handler, then freeze
 * the trie that the handler built with 16-bit values.
 *
 * Returns the frozen trie. If *pErrorCode is a failure after freezing, the
 * trie is closed and nullptr is returned instead.
 */
U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
    UPVecToUTrie2Context handlerState={ nullptr, 0, 0, 0 };

    upvec_compact(pv, upvec_compactToUTrie2Handler, &handlerState, pErrorCode);
    utrie2_freeze(handlerState.trie, UTRIE2_16_VALUE_BITS, pErrorCode);

    if(U_FAILURE(*pErrorCode)) {
        utrie2_close(handlerState.trie);
        return nullptr;
    }
    return handlerState.trie;
}
|
||||
|
||||
/*
|
||||
* TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
|
||||
* some 16-bit field and builds and returns a UTrie2.
|
||||
*/
|
||||
|
||||
/*
 * upvec_compact() handler that builds a UTrie2 mapping code points to row
 * indexes. context must point to a UPVecToUTrie2Context.
 *
 * - Real code point ranges are stored in the trie with rowIndex as the value.
 * - UPVEC_INITIAL_VALUE_CP / UPVEC_ERROR_VALUE_CP record rowIndex as the
 *   trie's initial / error value.
 * - UPVEC_START_REAL_VALUES_CP records rowIndex as maxValue and opens the
 *   trie, or sets U_INDEX_OUTOFBOUNDS_ERROR if rowIndex exceeds 0xffff.
 * - Any other special code point is ignored.
 * The row and columns arguments are unused.
 */
U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void *context,
                             UChar32 start, UChar32 end,
                             int32_t rowIndex, uint32_t *row, int32_t columns,
                             UErrorCode *pErrorCode) {
    (void)row;
    (void)columns;
    UPVecToUTrie2Context *state=(UPVecToUTrie2Context *)context;

    if(start<UPVEC_FIRST_SPECIAL_CP) {
        /* a range of real code points */
        utrie2_setRange32(state->trie, start, end, (uint32_t)rowIndex, true, pErrorCode);
        return;
    }

    if(start==UPVEC_INITIAL_VALUE_CP) {
        state->initialValue=rowIndex;
    } else if(start==UPVEC_ERROR_VALUE_CP) {
        state->errorValue=rowIndex;
    } else if(start==UPVEC_START_REAL_VALUES_CP) {
        state->maxValue=rowIndex;
        if(rowIndex>0xffff) {
            /* too many rows for a 16-bit trie */
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        } else {
            state->trie=utrie2_open(state->initialValue,
                                    state->errorValue, pErrorCode);
        }
    }
}
|
||||
178
engine/thirdparty/icu4c/common/propsvec.h
vendored
Normal file
178
engine/thirdparty/icu4c/common/propsvec.h
vendored
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: propsvec.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2002feb22
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Store bits (Unicode character properties) in bit set vectors.
|
||||
*/
|
||||
|
||||
#ifndef __UPROPSVEC_H__
|
||||
#define __UPROPSVEC_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "utrie.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/**
|
||||
* Unicode Properties Vectors associated with code point ranges.
|
||||
*
|
||||
* Rows of uint32_t integers in a contiguous array store
|
||||
* the range limits and the properties vectors.
|
||||
*
|
||||
* Logically, each row has a certain number of uint32_t values,
|
||||
* which is set via the upvec_open() "columns" parameter.
|
||||
*
|
||||
* Internally, two additional columns are stored.
|
||||
* In each internal row,
|
||||
* row[0] contains the start code point and
|
||||
* row[1] contains the limit code point,
|
||||
* which is the start of the next range.
|
||||
*
|
||||
* Initially, there is only one "normal" row for
|
||||
* range [0..0x110000[ with values 0.
|
||||
* There are additional rows for special purposes, see UPVEC_FIRST_SPECIAL_CP.
|
||||
*
|
||||
* It would be possible to store only one range boundary per row,
|
||||
* but self-contained rows allow to later sort them by contents.
|
||||
*/
|
||||
struct UPropsVectors;
|
||||
typedef struct UPropsVectors UPropsVectors;
|
||||
|
||||
/*
|
||||
* Special pseudo code points for storing the initialValue and the errorValue,
|
||||
* which are used to initialize a UTrie2 or similar.
|
||||
*/
|
||||
#define UPVEC_FIRST_SPECIAL_CP 0x110000
|
||||
#define UPVEC_INITIAL_VALUE_CP 0x110000
|
||||
#define UPVEC_ERROR_VALUE_CP 0x110001
|
||||
#define UPVEC_MAX_CP 0x110001
|
||||
|
||||
/*
|
||||
* Special pseudo code point used in upvec_compact() signalling the end of
|
||||
* delivering special values and the beginning of delivering real ones.
|
||||
* Stable value, unlike UPVEC_MAX_CP which might grow over time.
|
||||
*/
|
||||
#define UPVEC_START_REAL_VALUES_CP 0x200000
|
||||
|
||||
/*
|
||||
* Open a UPropsVectors object.
|
||||
* @param columns Number of value integers (uint32_t) per row.
|
||||
*/
|
||||
U_CAPI UPropsVectors * U_EXPORT2
|
||||
upvec_open(int32_t columns, UErrorCode *pErrorCode);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
upvec_close(UPropsVectors *pv);
|
||||
|
||||
/*
|
||||
* In rows for code points [start..end], select the column,
|
||||
* reset the mask bits and set the value bits (ANDed with the mask).
|
||||
*
|
||||
* Will set U_NO_WRITE_PERMISSION if called after upvec_compact().
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
upvec_setValue(UPropsVectors *pv,
|
||||
UChar32 start, UChar32 end,
|
||||
int32_t column,
|
||||
uint32_t value, uint32_t mask,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
|
||||
* Logically const but must not be used on the same pv concurrently!
|
||||
* Always returns 0 if called after upvec_compact().
|
||||
*/
|
||||
U_CAPI uint32_t U_EXPORT2
|
||||
upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column);
|
||||
|
||||
/*
|
||||
* pRangeStart and pRangeEnd can be NULL.
|
||||
* @return NULL if rowIndex out of range and for illegal arguments,
|
||||
* or if called after upvec_compact()
|
||||
*/
|
||||
U_CAPI uint32_t * U_EXPORT2
|
||||
upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
|
||||
UChar32 *pRangeStart, UChar32 *pRangeEnd);
|
||||
|
||||
/*
|
||||
* Compact the vectors:
|
||||
* - modify the memory
|
||||
* - keep only unique vectors
|
||||
* - store them contiguously from the beginning of the memory
|
||||
* - for each (non-unique) row, call the handler function
|
||||
*
|
||||
* The handler's rowIndex is the index of the row in the compacted
|
||||
* memory block.
|
||||
* (Therefore, it starts at 0 and increases in increments of the columns value.)
|
||||
*
|
||||
* In a first phase, only special values are delivered (each exactly once),
|
||||
* with start==end both equalling a special pseudo code point.
|
||||
* Then the handler is called once more with start==end==UPVEC_START_REAL_VALUES_CP
|
||||
* where rowIndex is the length of the compacted array,
|
||||
* and the row is arbitrary (but not NULL).
|
||||
* Then, in the second phase, the handler is called for each row of real values.
|
||||
*/
|
||||
typedef void U_CALLCONV
|
||||
UPVecCompactHandler(void *context,
|
||||
UChar32 start, UChar32 end,
|
||||
int32_t rowIndex, uint32_t *row, int32_t columns,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode);
|
||||
|
||||
/*
|
||||
* Get the vectors array after calling upvec_compact().
|
||||
* The caller must not modify nor release the returned array.
|
||||
* Returns NULL if called before upvec_compact().
|
||||
*/
|
||||
U_CAPI const uint32_t * U_EXPORT2
|
||||
upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns);
|
||||
|
||||
/*
|
||||
* Get a clone of the vectors array after calling upvec_compact().
|
||||
* The caller owns the returned array and must uprv_free() it.
|
||||
* Returns NULL if called before upvec_compact().
|
||||
*/
|
||||
U_CAPI uint32_t * U_EXPORT2
|
||||
upvec_cloneArray(const UPropsVectors *pv,
|
||||
int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode);
|
||||
|
||||
/*
|
||||
* Call upvec_compact(), create a 16-bit UTrie2 with indexes into the compacted
|
||||
* vectors array, and freeze the trie.
|
||||
*/
|
||||
U_CAPI UTrie2 * U_EXPORT2
|
||||
upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode);
|
||||
|
||||
struct UPVecToUTrie2Context {
|
||||
UTrie2 *trie;
|
||||
int32_t initialValue;
|
||||
int32_t errorValue;
|
||||
int32_t maxValue;
|
||||
};
|
||||
typedef struct UPVecToUTrie2Context UPVecToUTrie2Context;
|
||||
|
||||
/* context=UPVecToUTrie2Context, creates the trie and stores the rowIndex values */
|
||||
U_CAPI void U_CALLCONV
|
||||
upvec_compactToUTrie2Handler(void *context,
|
||||
UChar32 start, UChar32 end,
|
||||
int32_t rowIndex, uint32_t *row, int32_t columns,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
#endif
|
||||
590
engine/thirdparty/icu4c/common/punycode.cpp
vendored
Normal file
590
engine/thirdparty/icu4c/common/punycode.cpp
vendored
Normal file
|
|
@ -0,0 +1,590 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: punycode.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2002jan31
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
|
||||
/* This ICU code derived from: */
|
||||
/*
|
||||
punycode.c 0.4.0 (2001-Nov-17-Sat)
|
||||
http://www.cs.berkeley.edu/~amc/idn/
|
||||
Adam M. Costello
|
||||
http://www.nicemice.net/amc/
|
||||
|
||||
Disclaimer and license
|
||||
|
||||
Regarding this entire document or any portion of it (including
|
||||
the pseudocode and C code), the author makes no guarantees and
|
||||
is not responsible for any damage resulting from its use. The
|
||||
author grants irrevocable permission to anyone to use, modify,
|
||||
and distribute it in any way that does not diminish the rights
|
||||
of anyone else to use, modify, and distribute it, provided that
|
||||
redistributed derivative works do not contain misleading author or
|
||||
version information. Derivative works need not be licensed under
|
||||
similar terms.
|
||||
*/
|
||||
/*
|
||||
* ICU modifications:
|
||||
* - ICU data types and coding conventions
|
||||
* - ICU string buffer handling with implicit source lengths
|
||||
* and destination preflighting
|
||||
* - UTF-16 handling
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_IDNA
|
||||
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "punycode.h"
|
||||
#include "uassert.h"
|
||||
|
||||
|
||||
/* Punycode ----------------------------------------------------------------- */
|
||||
|
||||
/* Punycode parameters for Bootstring */
|
||||
#define BASE 36
|
||||
#define TMIN 1
|
||||
#define TMAX 26
|
||||
#define SKEW 38
|
||||
#define DAMP 700
|
||||
#define INITIAL_BIAS 72
|
||||
#define INITIAL_N 0x80
|
||||
|
||||
/* "Basic" Unicode/ASCII code points */
|
||||
#define _HYPHEN 0X2d
|
||||
#define DELIMITER _HYPHEN
|
||||
|
||||
#define _ZERO_ 0X30
|
||||
#define _NINE 0x39
|
||||
|
||||
#define _SMALL_A 0X61
|
||||
#define _SMALL_Z 0X7a
|
||||
|
||||
#define _CAPITAL_A 0X41
|
||||
#define _CAPITAL_Z 0X5a
|
||||
|
||||
#define IS_BASIC(c) ((c)<0x80)
|
||||
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
|
||||
|
||||
/**
|
||||
* digitToBasic() returns the basic code point whose value
|
||||
* (when used for representing integers) is d, which must be in the
|
||||
* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
|
||||
* nonzero, in which case the uppercase form is used.
|
||||
*/
|
||||
static inline char
|
||||
digitToBasic(int32_t digit, UBool uppercase) {
|
||||
/* 0..25 map to ASCII a..z or A..Z */
|
||||
/* 26..35 map to ASCII 0..9 */
|
||||
if(digit<26) {
|
||||
if(uppercase) {
|
||||
return (char)(_CAPITAL_A+digit);
|
||||
} else {
|
||||
return (char)(_SMALL_A+digit);
|
||||
}
|
||||
} else {
|
||||
return (char)((_ZERO_-26)+digit);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Converts a basic code point into the digit value it represents.
 *
 * @param cp the code point to decode
 * @return 0..25 for A..Z and for a..z, 26..35 for 0..9,
 *         or a negative value if cp does not represent a digit.
 *         (Code points in the gaps between the three ranges yield a
 *         negative difference rather than exactly -1.)
 */
static int32_t decodeDigit(int32_t cp) {
    if(cp<u'0' || cp>u'z') {
        return -1;              // outside of '0'..'z' altogether
    }
    if(cp<=u'9') {
        return cp-u'0'+26;      // 0..9 -> 26..35
    }
    if(cp<=u'Z') {
        return cp-u'A';         // A..Z -> 0..25; ':'..'@' -> negative
    }
    return cp-u'a';             // a..z -> 0..25; '['..'`' -> negative
}
|
||||
|
||||
static inline char
|
||||
asciiCaseMap(char b, UBool uppercase) {
|
||||
if(uppercase) {
|
||||
if(_SMALL_A<=b && b<=_SMALL_Z) {
|
||||
b-=(_SMALL_A-_CAPITAL_A);
|
||||
}
|
||||
} else {
|
||||
if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
|
||||
b+=(_SMALL_A-_CAPITAL_A);
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
/* Punycode-specific Bootstring code ---------------------------------------- */
|
||||
|
||||
/*
|
||||
* The following code omits the {parts} of the pseudo-algorithm in the spec
|
||||
* that are not used with the Punycode parameter set.
|
||||
*/
|
||||
|
||||
/* Bias adaptation function. */
|
||||
static int32_t
|
||||
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
|
||||
int32_t count;
|
||||
|
||||
if(firstTime) {
|
||||
delta/=DAMP;
|
||||
} else {
|
||||
delta/=2;
|
||||
}
|
||||
|
||||
delta+=delta/length;
|
||||
for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
|
||||
delta/=(BASE-TMIN);
|
||||
}
|
||||
|
||||
return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
|
||||
}
|
||||
|
||||
namespace {

// ICU-13727: The algorithm is quadratic in the input length, and
// well-formed strings are at most 59 characters long,
// so inputs longer than these limits are rejected up front.

// Maximum number of UTF-16 code units accepted by the encoder.
constexpr int32_t ENCODE_MAX_CODE_UNITS{1000};

// Maximum number of Punycode characters accepted by the decoder.
constexpr int32_t DECODE_MAX_CHARS{2000};

}  // namespace
|
||||
|
||||
// encode
|
||||
U_CAPI int32_t
|
||||
u_strToPunycode(const char16_t *src, int32_t srcLength,
|
||||
char16_t *dest, int32_t destCapacity,
|
||||
const UBool *caseFlags,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
||||
int32_t cpBuffer[ENCODE_MAX_CODE_UNITS];
|
||||
int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
|
||||
char16_t c, c2;
|
||||
|
||||
/* argument checking */
|
||||
if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(src==nullptr || srcLength<-1 || (dest==nullptr && destCapacity!=0)) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if (srcLength>ENCODE_MAX_CODE_UNITS) {
|
||||
*pErrorCode=U_INPUT_TOO_LONG_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle the basic code points and
|
||||
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
|
||||
*/
|
||||
srcCPCount=destLength=0;
|
||||
if(srcLength==-1) {
|
||||
/* NUL-terminated input */
|
||||
for(j=0; /* no condition */; ++j) {
|
||||
if((c=src[j])==0) {
|
||||
break;
|
||||
}
|
||||
if(j>=ENCODE_MAX_CODE_UNITS) {
|
||||
*pErrorCode=U_INPUT_TOO_LONG_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if(IS_BASIC(c)) {
|
||||
cpBuffer[srcCPCount++]=0;
|
||||
if(destLength<destCapacity) {
|
||||
dest[destLength]=
|
||||
caseFlags!=nullptr ?
|
||||
asciiCaseMap((char)c, caseFlags[j]) :
|
||||
(char)c;
|
||||
}
|
||||
++destLength;
|
||||
} else {
|
||||
n=(caseFlags!=nullptr && caseFlags[j])<<31L;
|
||||
if(U16_IS_SINGLE(c)) {
|
||||
n|=c;
|
||||
} else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
|
||||
++j;
|
||||
n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
|
||||
} else {
|
||||
/* error: unmatched surrogate */
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
cpBuffer[srcCPCount++]=n;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* length-specified input */
|
||||
for(j=0; j<srcLength; ++j) {
|
||||
c=src[j];
|
||||
if(IS_BASIC(c)) {
|
||||
cpBuffer[srcCPCount++]=0;
|
||||
if(destLength<destCapacity) {
|
||||
dest[destLength]=
|
||||
caseFlags!=nullptr ?
|
||||
asciiCaseMap((char)c, caseFlags[j]) :
|
||||
(char)c;
|
||||
}
|
||||
++destLength;
|
||||
} else {
|
||||
n=(caseFlags!=nullptr && caseFlags[j])<<31L;
|
||||
if(U16_IS_SINGLE(c)) {
|
||||
n|=c;
|
||||
} else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
|
||||
++j;
|
||||
n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
|
||||
} else {
|
||||
/* error: unmatched surrogate */
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
cpBuffer[srcCPCount++]=n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Finish the basic string - if it is not empty - with a delimiter. */
|
||||
basicLength=destLength;
|
||||
if(basicLength>0) {
|
||||
if(destLength<destCapacity) {
|
||||
dest[destLength]=DELIMITER;
|
||||
}
|
||||
++destLength;
|
||||
}
|
||||
|
||||
/*
|
||||
* handledCPCount is the number of code points that have been handled
|
||||
* basicLength is the number of basic code points
|
||||
* destLength is the number of chars that have been output
|
||||
*/
|
||||
|
||||
/* Initialize the state: */
|
||||
n=INITIAL_N;
|
||||
delta=0;
|
||||
bias=INITIAL_BIAS;
|
||||
|
||||
/* Main encoding loop: */
|
||||
for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
|
||||
/*
|
||||
* All non-basic code points < n have been handled already.
|
||||
* Find the next larger one:
|
||||
*/
|
||||
for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
|
||||
q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
|
||||
if(n<=q && q<m) {
|
||||
m=q;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase delta enough to advance the decoder's
|
||||
* <n,i> state to <m,0>, but guard against overflow:
|
||||
*/
|
||||
if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
|
||||
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
|
||||
return 0;
|
||||
}
|
||||
delta+=(m-n)*(handledCPCount+1);
|
||||
n=m;
|
||||
|
||||
/* Encode a sequence of same code points n */
|
||||
for(j=0; j<srcCPCount; ++j) {
|
||||
q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
|
||||
if(q<n) {
|
||||
++delta;
|
||||
} else if(q==n) {
|
||||
/* Represent delta as a generalized variable-length integer: */
|
||||
for(q=delta, k=BASE; /* no condition */; k+=BASE) {
|
||||
|
||||
/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
|
||||
|
||||
t=k-bias;
|
||||
if(t<TMIN) {
|
||||
t=TMIN;
|
||||
} else if(t>TMAX) {
|
||||
t=TMAX;
|
||||
}
|
||||
*/
|
||||
|
||||
t=k-bias;
|
||||
if(t<TMIN) {
|
||||
t=TMIN;
|
||||
} else if(k>=(bias+TMAX)) {
|
||||
t=TMAX;
|
||||
}
|
||||
|
||||
if(q<t) {
|
||||
break;
|
||||
}
|
||||
|
||||
if(destLength<destCapacity) {
|
||||
dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
|
||||
}
|
||||
++destLength;
|
||||
q=(q-t)/(BASE-t);
|
||||
}
|
||||
|
||||
if(destLength<destCapacity) {
|
||||
dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
|
||||
}
|
||||
++destLength;
|
||||
bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
|
||||
delta=0;
|
||||
++handledCPCount;
|
||||
}
|
||||
}
|
||||
|
||||
++delta;
|
||||
++n;
|
||||
}
|
||||
|
||||
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
|
||||
}
|
||||
|
||||
// decode
|
||||
U_CAPI int32_t
|
||||
u_strFromPunycode(const char16_t *src, int32_t srcLength,
|
||||
char16_t *dest, int32_t destCapacity,
|
||||
UBool *caseFlags,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
|
||||
destCPCount, firstSupplementaryIndex, cpLength;
|
||||
char16_t b;
|
||||
|
||||
/* argument checking */
|
||||
if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(src==nullptr || srcLength<-1 || (dest==nullptr && destCapacity!=0)) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(srcLength==-1) {
|
||||
srcLength=u_strlen(src);
|
||||
}
|
||||
if (srcLength>DECODE_MAX_CHARS) {
|
||||
*pErrorCode=U_INPUT_TOO_LONG_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle the basic code points:
|
||||
* Let basicLength be the number of input code points
|
||||
* before the last delimiter, or 0 if there is none,
|
||||
* then copy the first basicLength code points to the output.
|
||||
*
|
||||
* The two following loops iterate backward.
|
||||
*/
|
||||
for(j=srcLength; j>0;) {
|
||||
if(src[--j]==DELIMITER) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
destLength=basicLength=destCPCount=j;
|
||||
U_ASSERT(destLength>=0);
|
||||
|
||||
while(j>0) {
|
||||
b=src[--j];
|
||||
if(!IS_BASIC(b)) {
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(j<destCapacity) {
|
||||
dest[j]=(char16_t)b;
|
||||
|
||||
if(caseFlags!=nullptr) {
|
||||
caseFlags[j]=IS_BASIC_UPPERCASE(b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Initialize the state: */
|
||||
n=INITIAL_N;
|
||||
i=0;
|
||||
bias=INITIAL_BIAS;
|
||||
firstSupplementaryIndex=1000000000;
|
||||
|
||||
/*
|
||||
* Main decoding loop:
|
||||
* Start just after the last delimiter if any
|
||||
* basic code points were copied; start at the beginning otherwise.
|
||||
*/
|
||||
for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
|
||||
/*
|
||||
* in is the index of the next character to be consumed, and
|
||||
* destCPCount is the number of code points in the output array.
|
||||
*
|
||||
* Decode a generalized variable-length integer into delta,
|
||||
* which gets added to i. The overflow checking is easier
|
||||
* if we increase i as we go, then subtract off its starting
|
||||
* value at the end to obtain delta.
|
||||
*/
|
||||
for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
|
||||
if(in>=srcLength) {
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
|
||||
digit=decodeDigit(src[in++]);
|
||||
if(digit<0) {
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
if(digit>(0x7fffffff-i)/w) {
|
||||
/* integer overflow */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
|
||||
i+=digit*w;
|
||||
/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
|
||||
t=k-bias;
|
||||
if(t<TMIN) {
|
||||
t=TMIN;
|
||||
} else if(t>TMAX) {
|
||||
t=TMAX;
|
||||
}
|
||||
*/
|
||||
t=k-bias;
|
||||
if(t<TMIN) {
|
||||
t=TMIN;
|
||||
} else if(k>=(bias+TMAX)) {
|
||||
t=TMAX;
|
||||
}
|
||||
if(digit<t) {
|
||||
break;
|
||||
}
|
||||
|
||||
if(w>0x7fffffff/(BASE-t)) {
|
||||
/* integer overflow */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
w*=BASE-t;
|
||||
}
|
||||
|
||||
/*
|
||||
* Modification from sample code:
|
||||
* Increments destCPCount here,
|
||||
* where needed instead of in for() loop tail.
|
||||
*/
|
||||
++destCPCount;
|
||||
bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
|
||||
|
||||
/*
|
||||
* i was supposed to wrap around from (incremented) destCPCount to 0,
|
||||
* incrementing n each time, so we'll fix that now:
|
||||
*/
|
||||
if(i/destCPCount>(0x7fffffff-n)) {
|
||||
/* integer overflow */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
|
||||
n+=i/destCPCount;
|
||||
i%=destCPCount;
|
||||
/* not needed for Punycode: */
|
||||
/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
|
||||
|
||||
if(n>0x10ffff || U_IS_SURROGATE(n)) {
|
||||
/* Unicode code point overflow */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Insert n at position i of the output: */
|
||||
cpLength=U16_LENGTH(n);
|
||||
if(dest!=nullptr && ((destLength+cpLength)<=destCapacity)) {
|
||||
int32_t codeUnitIndex;
|
||||
|
||||
/*
|
||||
* Handle indexes when supplementary code points are present.
|
||||
*
|
||||
* In almost all cases, there will be only BMP code points before i
|
||||
* and even in the entire string.
|
||||
* This is handled with the same efficiency as with UTF-32.
|
||||
*
|
||||
* Only the rare cases with supplementary code points are handled
|
||||
* more slowly - but not too bad since this is an insertion anyway.
|
||||
*/
|
||||
if(i<=firstSupplementaryIndex) {
|
||||
codeUnitIndex=i;
|
||||
if(cpLength>1) {
|
||||
firstSupplementaryIndex=codeUnitIndex;
|
||||
} else {
|
||||
++firstSupplementaryIndex;
|
||||
}
|
||||
} else {
|
||||
codeUnitIndex=firstSupplementaryIndex;
|
||||
U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
|
||||
}
|
||||
|
||||
/* use the char16_t index codeUnitIndex instead of the code point index i */
|
||||
if(codeUnitIndex<destLength) {
|
||||
uprv_memmove(dest+codeUnitIndex+cpLength,
|
||||
dest+codeUnitIndex,
|
||||
(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
|
||||
if(caseFlags!=nullptr) {
|
||||
uprv_memmove(caseFlags+codeUnitIndex+cpLength,
|
||||
caseFlags+codeUnitIndex,
|
||||
destLength-codeUnitIndex);
|
||||
}
|
||||
}
|
||||
if(cpLength==1) {
|
||||
/* BMP, insert one code unit */
|
||||
dest[codeUnitIndex]=(char16_t)n;
|
||||
} else {
|
||||
/* supplementary character, insert two code units */
|
||||
dest[codeUnitIndex]=U16_LEAD(n);
|
||||
dest[codeUnitIndex+1]=U16_TRAIL(n);
|
||||
}
|
||||
if(caseFlags!=nullptr) {
|
||||
/* Case of last character determines uppercase flag: */
|
||||
caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
|
||||
if(cpLength==2) {
|
||||
caseFlags[codeUnitIndex+1]=false;
|
||||
}
|
||||
}
|
||||
}
|
||||
destLength+=cpLength;
|
||||
U_ASSERT(destLength>=0);
|
||||
++i;
|
||||
}
|
||||
|
||||
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
|
||||
}
|
||||
|
||||
/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
|
||||
|
||||
#endif /* #if !UCONFIG_NO_IDNA */
|
||||
120
engine/thirdparty/icu4c/common/punycode.h
vendored
Normal file
120
engine/thirdparty/icu4c/common/punycode.h
vendored
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: punycode.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2002jan31
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
/* This ICU code derived from: */
|
||||
/*
|
||||
punycode.c 0.4.0 (2001-Nov-17-Sat)
|
||||
http://www.cs.berkeley.edu/~amc/idn/
|
||||
Adam M. Costello
|
||||
http://www.nicemice.net/amc/
|
||||
*/
|
||||
|
||||
#ifndef __PUNYCODE_H__
|
||||
#define __PUNYCODE_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_IDNA
|
||||
|
||||
/**
|
||||
* u_strToPunycode() converts Unicode to Punycode.
|
||||
*
|
||||
* The input string must not contain single, unpaired surrogates.
|
||||
* The output will be represented as an array of ASCII code points.
|
||||
*
|
||||
* The output string is NUL-terminated according to normal ICU
|
||||
* string output rules.
|
||||
*
|
||||
* @param src Input Unicode string.
|
||||
* This function handles a limited amount of code points
|
||||
* (the limit is >=64).
|
||||
* U_INPUT_TOO_LONG_ERROR is set if the limit is exceeded.
|
||||
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
|
||||
* @param dest Output Punycode array.
|
||||
* @param destCapacity Size of dest.
|
||||
* @param caseFlags Vector of boolean values, one per input UChar,
|
||||
* indicating that the corresponding character is to be
|
||||
* marked for the decoder optionally
|
||||
* uppercasing (true) or lowercasing (false)
|
||||
* the character.
|
||||
* ASCII characters are output directly in the case as marked.
|
||||
* Flags corresponding to trail surrogates are ignored.
|
||||
* If caseFlags==NULL then input characters are not
|
||||
* case-mapped.
|
||||
* @param pErrorCode ICU in/out error code parameter.
|
||||
* U_INVALID_CHAR_FOUND if src contains
|
||||
* unmatched single surrogates.
|
||||
* U_INPUT_TOO_LONG_ERROR if src contains
|
||||
* too many code points.
|
||||
* @return Number of ASCII characters in puny.
|
||||
*
|
||||
* @see u_strFromPunycode
|
||||
*/
|
||||
U_CAPI int32_t
|
||||
u_strToPunycode(const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UBool *caseFlags,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* u_strFromPunycode() converts Punycode to Unicode.
|
||||
* The Unicode string will be at most as long (in UChars)
|
||||
* as the Punycode string (in chars).
|
||||
*
|
||||
* @param src Input Punycode string.
|
||||
* @param srcLength Length of puny, or -1 if NUL-terminated
|
||||
* @param dest Output Unicode string buffer.
|
||||
* @param destCapacity Size of dest in number of UChars,
|
||||
* and of caseFlags in numbers of UBools.
|
||||
* @param caseFlags Output array for case flags as
|
||||
* defined by the Punycode string.
|
||||
* The caller should uppercase (true) or lowercase (false)
|
||||
* the corresponding character in dest.
|
||||
* For supplementary characters, only the lead surrogate
|
||||
* is marked, and false is stored for the trail surrogate.
|
||||
* This is redundant and not necessary for ASCII characters
|
||||
* because they are already in the case indicated.
|
||||
* Can be NULL if the case flags are not needed.
|
||||
* @param pErrorCode ICU in/out error code parameter.
|
||||
* U_INVALID_CHAR_FOUND if a non-ASCII character
|
||||
* precedes the last delimiter ('-'),
|
||||
* or if an invalid character (not a-zA-Z0-9) is found
|
||||
* after the last delimiter.
|
||||
* U_ILLEGAL_CHAR_FOUND if the delta sequence is ill-formed.
|
||||
* @return Number of UChars written to dest.
|
||||
*
|
||||
* @see u_strToPunycode
|
||||
*/
|
||||
U_CAPI int32_t
|
||||
u_strFromPunycode(const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
UBool *caseFlags,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif /* #if !UCONFIG_NO_IDNA */
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
||||
2505
engine/thirdparty/icu4c/common/putil.cpp
vendored
Normal file
2505
engine/thirdparty/icu4c/common/putil.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
615
engine/thirdparty/icu4c/common/putilimp.h
vendored
Normal file
615
engine/thirdparty/icu4c/common/putilimp.h
vendored
Normal file
|
|
@ -0,0 +1,615 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*
|
||||
* FILE NAME : putilimp.h
|
||||
*
|
||||
* Date Name Description
|
||||
* 10/17/04 grhoten Move internal functions from putil.h to this file.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef PUTILIMP_H
|
||||
#define PUTILIMP_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
|
||||
/**
|
||||
* \def U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
|
||||
* Nearly all CPUs and compilers implement a right-shift of a signed integer
|
||||
* as an Arithmetic Shift Right which copies the sign bit (the Most Significant Bit (MSB))
|
||||
* into the vacated bits (sign extension).
|
||||
* For example, (int32_t)0xfff5fff3>>4 becomes 0xffff5fff and -1>>1=-1.
|
||||
*
|
||||
* This can be useful for storing a signed value in the upper bits
|
||||
* and another bit field in the lower bits.
|
||||
* The signed value can be retrieved by simple right-shifting.
|
||||
*
|
||||
* This is consistent with the Java language.
|
||||
*
|
||||
* However, the C standard allows compilers to implement a right-shift of a signed integer
|
||||
* as a Logical Shift Right which copies a 0 into the vacated bits.
|
||||
* For example, (int32_t)0xfff5fff3>>4 becomes 0x0fff5fff and -1>>1=0x7fffffff.
|
||||
*
|
||||
* Code that depends on the natural behavior should be guarded with this macro,
|
||||
* with an alternate path for unusual platforms.
|
||||
* @internal
|
||||
*/
|
||||
#ifdef U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
|
||||
/* Use the predefined value. */
|
||||
#else
|
||||
/*
|
||||
* Nearly all CPUs & compilers implement a right-shift of a signed integer
|
||||
* as an Arithmetic Shift Right (with sign extension).
|
||||
*/
|
||||
# define U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1
|
||||
#endif
|
||||
|
||||
/** Define this to 1 if your platform supports IEEE 754 floating point,
|
||||
to 0 if it does not. */
|
||||
#ifndef IEEE_754
|
||||
# define IEEE_754 1
|
||||
#endif
|
||||
|
||||
/**
|
||||
* uintptr_t is an optional part of the standard definitions in stdint.h.
|
||||
* The opengroup.org documentation for stdint.h says
|
||||
* "On XSI-conformant systems, the intptr_t and uintptr_t types are required;
|
||||
* otherwise, they are optional."
|
||||
* We assume that when uintptr_t is defined, UINTPTR_MAX is defined as well.
|
||||
*
|
||||
* Do not use ptrdiff_t since it is signed. size_t is unsigned.
|
||||
*/
|
||||
/* TODO: This check fails on some z environments. Filed a ticket #9357 for this. */
|
||||
#if !defined(__intptr_t_defined) && !defined(UINTPTR_MAX) && (U_PLATFORM != U_PF_OS390)
|
||||
typedef size_t uintptr_t;
|
||||
#endif
|
||||
|
||||
/*===========================================================================*/
|
||||
/** @{ Information about POSIX support */
|
||||
/*===========================================================================*/
|
||||
|
||||
#ifdef U_HAVE_NL_LANGINFO_CODESET
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API || U_PLATFORM == U_PF_ANDROID || U_PLATFORM == U_PF_QNX
|
||||
# define U_HAVE_NL_LANGINFO_CODESET 0
|
||||
#else
|
||||
# define U_HAVE_NL_LANGINFO_CODESET 1
|
||||
#endif
|
||||
|
||||
#ifdef U_NL_LANGINFO_CODESET
|
||||
/* Use the predefined value. */
|
||||
#elif !U_HAVE_NL_LANGINFO_CODESET
|
||||
# define U_NL_LANGINFO_CODESET -1
|
||||
#elif U_PLATFORM == U_PF_OS400
|
||||
/* not defined */
|
||||
#else
|
||||
# define U_NL_LANGINFO_CODESET CODESET
|
||||
#endif
|
||||
|
||||
#if defined(U_TZSET) || defined(U_HAVE_TZSET)
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API
|
||||
// UWP doesn't support tzset or environment variables for tz
|
||||
#if U_PLATFORM_HAS_WINUWP_API == 0
|
||||
# define U_TZSET _tzset
|
||||
#endif
|
||||
#elif U_PLATFORM == U_PF_OS400
|
||||
/* not defined */
|
||||
#else
|
||||
# define U_TZSET tzset
|
||||
#endif
|
||||
|
||||
#if defined(U_TIMEZONE) || defined(U_HAVE_TIMEZONE)
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM == U_PF_ANDROID
|
||||
# define U_TIMEZONE timezone
|
||||
#elif defined(__UCLIBC__)
|
||||
// uClibc does not have __timezone or _timezone.
|
||||
#elif defined(_NEWLIB_VERSION)
|
||||
# define U_TIMEZONE _timezone
|
||||
#elif defined(__GLIBC__)
|
||||
// glibc
|
||||
# define U_TIMEZONE __timezone
|
||||
#elif U_PLATFORM_IS_LINUX_BASED
|
||||
// not defined
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API
|
||||
# define U_TIMEZONE _timezone
|
||||
#elif U_PLATFORM == U_PF_BSD && !defined(__NetBSD__)
|
||||
/* not defined */
|
||||
#elif U_PLATFORM == U_PF_OS400
|
||||
/* not defined */
|
||||
#elif U_PLATFORM == U_PF_IPHONE
|
||||
/* not defined */
|
||||
#else
|
||||
# define U_TIMEZONE timezone
|
||||
#endif
|
||||
|
||||
#if defined(U_TZNAME) || defined(U_HAVE_TZNAME)
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API
|
||||
/* not usable on all windows platforms */
|
||||
#if U_PLATFORM_HAS_WINUWP_API == 0
|
||||
# define U_TZNAME _tzname
|
||||
#endif
|
||||
#elif U_PLATFORM == U_PF_OS400
|
||||
/* not defined */
|
||||
#else
|
||||
# define U_TZNAME tzname
|
||||
#endif
|
||||
|
||||
#ifdef U_HAVE_MMAP
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API
|
||||
# define U_HAVE_MMAP 0
|
||||
#else
|
||||
# define U_HAVE_MMAP 1
|
||||
#endif
|
||||
|
||||
#ifdef U_HAVE_POPEN
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API
|
||||
# define U_HAVE_POPEN 0
|
||||
#elif U_PLATFORM == U_PF_OS400
|
||||
# define U_HAVE_POPEN 0
|
||||
#else
|
||||
# define U_HAVE_POPEN 1
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def U_HAVE_DIRENT_H
|
||||
* Defines whether dirent.h is available.
|
||||
* @internal
|
||||
*/
|
||||
#ifdef U_HAVE_DIRENT_H
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API
|
||||
# define U_HAVE_DIRENT_H 0
|
||||
#else
|
||||
# define U_HAVE_DIRENT_H 1
|
||||
#endif
|
||||
|
||||
/** @} */
|
||||
|
||||
/*===========================================================================*/
|
||||
/** @{ Programs used by ICU code */
|
||||
/*===========================================================================*/
|
||||
|
||||
/**
|
||||
* \def U_MAKE_IS_NMAKE
|
||||
* Defines whether the "make" program is Windows nmake.
|
||||
*/
|
||||
#ifdef U_MAKE_IS_NMAKE
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM == U_PF_WINDOWS
|
||||
# define U_MAKE_IS_NMAKE 1
|
||||
#else
|
||||
# define U_MAKE_IS_NMAKE 0
|
||||
#endif
|
||||
|
||||
/** @} */
|
||||
|
||||
/*==========================================================================*/
|
||||
/* Platform utilities */
|
||||
/*==========================================================================*/
|
||||
|
||||
/**
|
||||
* Platform utilities isolate the platform dependencies of the
|
||||
* library. For each platform which this code is ported to, these
|
||||
* functions may have to be re-implemented.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Floating point utility to determine if a double is Not a Number (NaN).
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 uprv_isNaN(double d);
|
||||
/**
|
||||
* Floating point utility to determine if a double has an infinite value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 uprv_isInfinite(double d);
|
||||
/**
|
||||
* Floating point utility to determine if a double has a positive infinite value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 uprv_isPositiveInfinity(double d);
|
||||
/**
|
||||
* Floating point utility to determine if a double has a negative infinite value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 uprv_isNegativeInfinity(double d);
|
||||
/**
|
||||
* Floating point utility that returns a Not a Number (NaN) value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_getNaN(void);
|
||||
/**
|
||||
* Floating point utility that returns an infinite value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_getInfinity(void);
|
||||
|
||||
/**
|
||||
* Floating point utility to truncate a double.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_trunc(double d);
|
||||
/**
|
||||
* Floating point utility to calculate the floor of a double.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_floor(double d);
|
||||
/**
|
||||
* Floating point utility to calculate the ceiling of a double.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_ceil(double d);
|
||||
/**
|
||||
* Floating point utility to calculate the absolute value of a double.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_fabs(double d);
|
||||
/**
|
||||
* Floating point utility to calculate the fractional and integer parts of a double.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_modf(double d, double* pinteger);
|
||||
/**
|
||||
* Floating point utility to calculate the remainder of a double divided by another double.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_fmod(double d, double y);
|
||||
/**
|
||||
* Floating point utility to calculate d to the power of exponent (d^exponent).
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_pow(double d, double exponent);
|
||||
/**
|
||||
* Floating point utility to calculate 10 to the power of exponent (10^exponent).
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_pow10(int32_t exponent);
|
||||
/**
|
||||
* Floating point utility to calculate the maximum value of two doubles.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_fmax(double d, double y);
|
||||
/**
|
||||
* Floating point utility to calculate the minimum value of two doubles.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_fmin(double d, double y);
|
||||
/**
|
||||
* Private utility to calculate the maximum value of two integers.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2 uprv_max(int32_t d, int32_t y);
|
||||
/**
|
||||
* Private utility to calculate the minimum value of two integers.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2 uprv_min(int32_t d, int32_t y);
|
||||
|
||||
/*
 * uprv_isNegative(number): sign test done by reading a single byte of the
 * argument's storage as a signed char and checking whether it is < 0.
 * The byte examined is the first one when U_IS_BIG_ENDIAN is set and the
 * last one (offset sizeof(number)-1) otherwise.
 * The argument must be an lvalue, because its address is taken.
 */
#if U_IS_BIG_ENDIAN
|
||||
# define uprv_isNegative(number) (*((signed char *)&(number))<0)
|
||||
#else
|
||||
# define uprv_isNegative(number) (*((signed char *)&(number)+sizeof(number)-1)<0)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Return the largest positive number that can be represented by an integer
|
||||
* type of arbitrary bit length.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_maxMantissa(void);
|
||||
|
||||
/**
|
||||
* Floating point utility to calculate the logarithm of a double.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_log(double d);
|
||||
|
||||
/**
|
||||
* Does common notion of rounding e.g. uprv_floor(x + 0.5);
|
||||
* @param x the double number
|
||||
* @return the rounded double
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI double U_EXPORT2 uprv_round(double x);
|
||||
|
||||
/**
|
||||
* Adds the signed integers a and b, storing the result in res.
|
||||
* Checks for signed integer overflow.
|
||||
* Similar to the GCC/Clang extension __builtin_add_overflow
|
||||
*
|
||||
* @param a The first operand.
|
||||
* @param b The second operand.
|
||||
* @param res a + b
|
||||
* @return true if overflow occurred; false if no overflow occurred.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 uprv_add32_overflow(int32_t a, int32_t b, int32_t* res);
|
||||
|
||||
/**
|
||||
* Multiplies the signed integers a and b, storing the result in res.
|
||||
* Checks for signed integer overflow.
|
||||
* Similar to the GCC/Clang extension __builtin_mul_overflow
|
||||
*
|
||||
* @param a The first multiplicand.
|
||||
* @param b The second multiplicand.
|
||||
* @param res a * b
|
||||
* @return true if overflow occurred; false if no overflow occurred.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 uprv_mul32_overflow(int32_t a, int32_t b, int32_t* res);
|
||||
|
||||
#if 0
|
||||
/**
|
||||
* Returns the number of digits after the decimal point in a double number x.
|
||||
*
|
||||
* @param x the double number
|
||||
* @return the number of digits after the decimal point in a double number x.
|
||||
* @internal
|
||||
*/
|
||||
/*U_CAPI int32_t U_EXPORT2 uprv_digitsAfterDecimal(double x);*/
|
||||
#endif
|
||||
|
||||
#if !U_CHARSET_IS_UTF8
|
||||
/**
|
||||
* Please use ucnv_getDefaultName() instead.
|
||||
* Return the default codepage for this platform and locale.
|
||||
* This function can call setlocale() on Unix platforms. Please read the
|
||||
* platform documentation on setlocale() before calling this function.
|
||||
* @return the default codepage for this platform
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2 uprv_getDefaultCodepage(void);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Please use uloc_getDefault() instead.
|
||||
* Return the default locale ID string by querying the system, or
|
||||
* zero if one cannot be found.
|
||||
* This function can call setlocale() on Unix platforms. Please read the
|
||||
* platform documentation on setlocale() before calling this function.
|
||||
* @return the default locale ID string
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2 uprv_getDefaultLocaleID(void);
|
||||
|
||||
/**
|
||||
* Time zone utilities
|
||||
*
|
||||
* Wrappers for C runtime library functions relating to timezones.
|
||||
* The uprv_tzset() function (similar to tzset) uses the current setting
|
||||
* of the environment variable TZ to assign values to three global
|
||||
* variables: daylight, timezone, and tzname. These variables have the
|
||||
* following meanings, and are declared in <time.h>.
|
||||
*
|
||||
* daylight Nonzero if daylight-saving-time zone (DST) is specified
|
||||
* in TZ; otherwise, 0. Default value is 1.
|
||||
* timezone Difference in seconds between coordinated universal
|
||||
* time and local time. E.g., -28,800 for PST (GMT-8hrs)
|
||||
* tzname(0) Three-letter time-zone name derived from TZ environment
|
||||
* variable. E.g., "PST".
|
||||
* tzname(1) Three-letter DST zone name derived from TZ environment
|
||||
* variable. E.g., "PDT". If DST zone is omitted from TZ,
|
||||
* tzname(1) is an empty string.
|
||||
*
|
||||
* Notes: For example, to set the TZ environment variable to correspond
|
||||
* to the current time zone in Germany, you can use one of the
|
||||
* following statements:
|
||||
*
|
||||
* set TZ=GST1GDT
|
||||
* set TZ=GST+1GDT
|
||||
*
|
||||
* If the TZ value is not set, uprv_tzset() attempts to use the time zone
|
||||
* information specified by the operating system. Under Windows NT
|
||||
* and Windows 95, this information is specified in the Control Panel's
|
||||
* Date/Time application.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 uprv_tzset(void);
|
||||
|
||||
/**
|
||||
* Difference in seconds between coordinated universal
|
||||
* time and local time. E.g., -28,800 for PST (GMT-8hrs)
|
||||
* @return the difference in seconds between coordinated universal time and local time.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2 uprv_timezone(void);
|
||||
|
||||
/**
|
||||
* tzname(0) Three-letter time-zone name derived from TZ environment
|
||||
* variable. E.g., "PST".
|
||||
* tzname(1) Three-letter DST zone name derived from TZ environment
|
||||
* variable. E.g., "PDT". If DST zone is omitted from TZ,
|
||||
* tzname(1) is an empty string.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2 uprv_tzname(int n);
|
||||
|
||||
/**
|
||||
* Reset the global tzname cache.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void uprv_tzname_clear_cache(void);
|
||||
|
||||
/**
|
||||
* Get UTC (GMT) time measured in milliseconds since 0:00 on 1/1/1970.
|
||||
* This function is affected by 'faketime' and should be the bottleneck for all user-visible ICU time functions.
|
||||
* @return the UTC time measured in milliseconds
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UDate U_EXPORT2 uprv_getUTCtime(void);
|
||||
|
||||
/**
|
||||
* Get UTC (GMT) time measured in milliseconds since 0:00 on 1/1/1970.
|
||||
* This function is not affected by 'faketime', so it should only be used by low level test functions- not by anything that
|
||||
* exposes time to the end user.
|
||||
* @return the UTC time measured in milliseconds
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UDate U_EXPORT2 uprv_getRawUTCtime(void);
|
||||
|
||||
/**
|
||||
* Determine whether a pathname is absolute or not, as defined by the platform.
|
||||
* @param path Pathname to test
|
||||
* @return true if the path is absolute
|
||||
* @internal (ICU 3.0)
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 uprv_pathIsAbsolute(const char *path);
|
||||
|
||||
/**
|
||||
* Use U_MAX_PTR instead of this function.
|
||||
* @param base pointer to test
|
||||
* @return the largest possible pointer greater than the base
|
||||
* @internal (ICU 3.8)
|
||||
*/
|
||||
U_CAPI void * U_EXPORT2 uprv_maximumPtr(void *base);
|
||||
|
||||
/**
|
||||
* Maximum value of a (void*) - use to indicate the limit of an 'infinite' buffer.
|
||||
* In fact, buffer sizes must not exceed 2GB so that the difference between
|
||||
* the buffer limit and the buffer start can be expressed in an int32_t.
|
||||
*
|
||||
* The definition of U_MAX_PTR must fulfill the following conditions:
|
||||
* - return the largest possible pointer greater than base
|
||||
* - return a valid pointer according to the machine architecture (AS/400, 64-bit, etc.)
|
||||
* - avoid wrapping around at high addresses
|
||||
* - make sure that the returned pointer is not farther from base than 0x7fffffff bytes
|
||||
*
|
||||
* @param base The beginning of a buffer to find the maximum offset from
|
||||
* @internal
|
||||
*/
|
||||
/* Only supply a definition when the build has not already provided U_MAX_PTR. */
#ifndef U_MAX_PTR
|
||||
# if U_PLATFORM == U_PF_OS390 && !defined(_LP64)
|
||||
/* We have 31-bit pointers. */
|
||||
# define U_MAX_PTR(base) ((void *)0x7fffffff)
|
||||
# elif U_PLATFORM == U_PF_OS400
|
||||
/* Delegate to the out-of-line helper uprv_maximumPtr() on this platform. */
# define U_MAX_PTR(base) uprv_maximumPtr((void *)base)
|
||||
# elif 0
|
||||
/*
|
||||
* For platforms where pointers are scalar values (which is normal, but unlike i5/OS)
|
||||
* but that do not define uintptr_t.
|
||||
*
|
||||
* However, this does not work on modern compilers:
|
||||
* The C++ standard does not define pointer overflow, and allows compilers to
|
||||
* assume that p+u>p for any pointer p and any integer u>0.
|
||||
* Thus, modern compilers optimize away the ">" comparison.
|
||||
* (See ICU tickets #7187 and #8096.)
|
||||
*/
|
||||
# define U_MAX_PTR(base) \
|
||||
((void *)(((char *)(base)+0x7fffffffu) > (char *)(base) \
|
||||
? ((char *)(base)+0x7fffffffu) \
|
||||
: (char *)-1))
|
||||
# else
|
||||
/* Default version. C++ standard compliant for scalar pointers. */
|
||||
/*
 * The arithmetic is done on uintptr_t: the result is base+0x7fffffff,
 * or (uintptr_t)-1 when that addition wraps around.
 */
# define U_MAX_PTR(base) \
|
||||
((void *)(((uintptr_t)(base)+0x7fffffffu) > (uintptr_t)(base) \
|
||||
? ((uintptr_t)(base)+0x7fffffffu) \
|
||||
: (uintptr_t)-1))
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
/**
|
||||
* Pin a buffer capacity such that doing pointer arithmetic
|
||||
* on the destination pointer and capacity cannot overflow.
|
||||
*
|
||||
* The pinned capacity must fulfill the following conditions (for positive capacities):
|
||||
* - dest + capacity is a valid pointer according to the machine architecture (AS/400, 64-bit, etc.)
|
||||
* - (dest + capacity) >= dest
|
||||
* - The size (in bytes) of T[capacity] does not exceed 0x7fffffff
|
||||
*
|
||||
* @param dest the destination buffer pointer.
|
||||
* @param capacity the requested buffer capacity, in units of type T.
|
||||
* @return the pinned capacity.
|
||||
* @internal
|
||||
*/
|
||||
template <typename T>
|
||||
inline int32_t pinCapacity(T *dest, int32_t capacity) {
|
||||
if (capacity <= 0) { return capacity; }
|
||||
|
||||
uintptr_t destInt = (uintptr_t)dest;
|
||||
uintptr_t maxInt;
|
||||
|
||||
# if U_PLATFORM == U_PF_OS390 && !defined(_LP64)
|
||||
// We have 31-bit pointers.
|
||||
maxInt = 0x7fffffff;
|
||||
# elif U_PLATFORM == U_PF_OS400
|
||||
maxInt = (uintptr_t)uprv_maximumPtr((void *)dest);
|
||||
# else
|
||||
maxInt = destInt + 0x7fffffffu;
|
||||
if (maxInt < destInt) {
|
||||
// Less than 2GB to the end of the address space.
|
||||
// Pin to that to prevent address overflow.
|
||||
maxInt = (uintptr_t)-1;
|
||||
}
|
||||
# endif
|
||||
|
||||
uintptr_t maxBytes = maxInt - destInt; // max. 2GB
|
||||
int32_t maxCapacity = (int32_t)(maxBytes / sizeof(T));
|
||||
return capacity <= maxCapacity ? capacity : maxCapacity;
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
/* Dynamic Library Functions */
|
||||
|
||||
/* Function type taking no arguments and returning nothing; uprv_dlsym_func() returns a pointer to it. */
typedef void (UVoidFunction)(void);
|
||||
|
||||
#if U_ENABLE_DYLOAD
|
||||
/**
|
||||
* Load a library
|
||||
* @internal (ICU 4.4)
|
||||
*/
|
||||
U_CAPI void * U_EXPORT2 uprv_dl_open(const char *libName, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Close a library
|
||||
* @internal (ICU 4.4)
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 uprv_dl_close( void *lib, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Extract a symbol from a library (function)
|
||||
* @internal (ICU 4.8)
|
||||
*/
|
||||
U_CAPI UVoidFunction* U_EXPORT2 uprv_dlsym_func( void *lib, const char *symbolName, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Extract a symbol from a library (data)
|
||||
* Not implemented, no clients.
|
||||
* @internal
|
||||
*/
|
||||
/* U_CAPI void * U_EXPORT2 uprv_dlsym_data( void *lib, const char *symbolName, UErrorCode *status); */
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Define malloc and related functions
|
||||
* @internal
|
||||
*/
|
||||
/*
 * uprv_default_malloc / uprv_default_realloc / uprv_default_free expand to
 * the _C_TS_* functions when U_PLATFORM is U_PF_OS400, and to the plain C
 * malloc / realloc / free everywhere else.
 */
#if U_PLATFORM == U_PF_OS400
|
||||
# define uprv_default_malloc(x) _C_TS_malloc(x)
|
||||
# define uprv_default_realloc(x,y) _C_TS_realloc(x,y)
|
||||
# define uprv_default_free(x) _C_TS_free(x)
|
||||
/* also _C_TS_calloc(x) */
|
||||
#else
|
||||
/* C defaults */
|
||||
# define uprv_default_malloc(x) malloc(x)
|
||||
# define uprv_default_realloc(x,y) realloc(x,y)
|
||||
# define uprv_default_free(x) free(x)
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
1303
engine/thirdparty/icu4c/common/rbbi.cpp
vendored
Normal file
1303
engine/thirdparty/icu4c/common/rbbi.cpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
698
engine/thirdparty/icu4c/common/rbbi_cache.cpp
vendored
Normal file
698
engine/thirdparty/icu4c/common/rbbi_cache.cpp
vendored
Normal file
|
|
@ -0,0 +1,698 @@
|
|||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// file: rbbi_cache.cpp
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/rbbi.h"
|
||||
|
||||
#include "rbbi_cache.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "cmemory.h"
|
||||
#include "rbbidata.h"
|
||||
#include "rbbirb.h"
|
||||
#include "uassert.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* DictionaryCache implementation
|
||||
*/
|
||||
|
||||
// Build an empty dictionary cache attached to the break iterator `bi`.
// `status` is forwarded to the constructor of the fBreaks vector.
// The cache starts with no iteration position (-1), an empty [fStart, fLimit)
// range and both rule status indexes set to zero.
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status)
        : fBI(bi),
          fBreaks(status),
          fPositionInCache(-1),
          fStart(0),
          fLimit(0),
          fFirstRuleStatusIndex(0),
          fOtherRuleStatusIndex(0) {}

// Nothing to release explicitly; members clean up after themselves.
RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {}
|
||||
|
||||
// Empty the cache: drop all stored boundaries, collapse the covered range
// to [0, 0), zero both rule status indexes and invalidate the iteration
// position.
void RuleBasedBreakIterator::DictionaryCache::reset() {
    fBreaks.removeAllElements();
    fOtherRuleStatusIndex = 0;
    fFirstRuleStatusIndex = 0;
    fLimit = 0;
    fStart = 0;
    fPositionInCache = -1;
}
|
||||
|
||||
// Look up the cached dictionary boundary that follows fromPos.
//
// @param fromPos      text position to search from.
// @param result       receives the following boundary on success.
// @param statusIndex  receives fOtherRuleStatusIndex on success.
// @return true if a boundary was stored in *result. false if fromPos is
//         outside [fStart, fLimit) or if fromPos was the last cached boundary;
//         in both cases the iteration position is invalidated (-1) and the
//         output parameters are left untouched.
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    const bool withinCache = (fromPos >= fStart) && (fromPos < fLimit);
    if (!withinCache) {
        fPositionInCache = -1;
        return false;
    }

    // Sequential iteration: the cache position already sits on fromPos,
    // so the answer is simply the next element.
    const bool onCurrentEntry = fPositionInCache >= 0
            && fPositionInCache < fBreaks.size()
            && fBreaks.elementAti(fPositionInCache) == fromPos;
    if (onCurrentEntry) {
        ++fPositionInCache;
        if (fPositionInCache >= fBreaks.size()) {
            // fromPos was the final entry; nothing follows it in this cache.
            fPositionInCache = -1;
            return false;
        }
        const int32_t nextBoundary = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(nextBoundary > fromPos);
        *result = nextBoundary;
        *statusIndex = fOtherRuleStatusIndex;
        return true;
    }

    // Random access: scan from the front for the first boundary beyond fromPos.
    fPositionInCache = 0;
    while (fPositionInCache < fBreaks.size()) {
        const int32_t candidate = fBreaks.elementAti(fPositionInCache);
        if (candidate > fromPos) {
            *result = candidate;
            *statusIndex = fOtherRuleStatusIndex;
            return true;
        }
        ++fPositionInCache;
    }
    // fromPos < fLimit was checked above, so the scan is expected to always
    // find an entry; running off the end is treated as unreachable.
    UPRV_UNREACHABLE_EXIT;
}
|
||||
|
||||
|
||||
// Look up the cached dictionary boundary that precedes fromPos.
//
// @param fromPos      text position to search from.
// @param result       receives the preceding boundary on success.
// @param statusIndex  receives fFirstRuleStatusIndex when the boundary found
//                     equals fStart, otherwise fOtherRuleStatusIndex.
// @return true if a boundary was stored in *result. false if fromPos is
//         outside (fStart, fLimit] or the cache position was already on the
//         first entry; the iteration position is then invalidated (-1).
UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    const bool withinCache = (fromPos > fStart) && (fromPos <= fLimit);
    if (!withinCache) {
        fPositionInCache = -1;
        return false;
    }

    if (fromPos == fLimit) {
        // Starting from the very end of the cached range: position on the last entry.
        fPositionInCache = fBreaks.size() - 1;
        if (fPositionInCache >= 0) {
            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
        }
    }

    // Sequential iteration: the cache position already sits on fromPos,
    // so the answer is the element just before it.
    if (fPositionInCache > 0
            && fPositionInCache < fBreaks.size()
            && fBreaks.elementAti(fPositionInCache) == fromPos) {
        --fPositionInCache;
        const int32_t prevBoundary = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(prevBoundary < fromPos);
        *result = prevBoundary;
        if (prevBoundary == fStart) {
            *statusIndex = fFirstRuleStatusIndex;
        } else {
            *statusIndex = fOtherRuleStatusIndex;
        }
        return true;
    }

    if (fPositionInCache == 0) {
        // Already on the first entry; there is nothing before it.
        fPositionInCache = -1;
        return false;
    }

    // Random access: scan from the back for the last boundary before fromPos.
    fPositionInCache = fBreaks.size() - 1;
    while (fPositionInCache >= 0) {
        const int32_t candidate = fBreaks.elementAti(fPositionInCache);
        if (candidate < fromPos) {
            *result = candidate;
            if (candidate == fStart) {
                *statusIndex = fFirstRuleStatusIndex;
            } else {
                *statusIndex = fOtherRuleStatusIndex;
            }
            return true;
        }
        --fPositionInCache;
    }
    // fromPos > fStart was checked above, so the scan is expected to always
    // find an entry; running off the front is treated as unreachable.
    UPRV_UNREACHABLE_EXIT;
}
|
||||
|
||||
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
|
||||
int32_t firstRuleStatus, int32_t otherRuleStatus) {
|
||||
if ((endPos - startPos) <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
reset();
|
||||
fFirstRuleStatusIndex = firstRuleStatus;
|
||||
fOtherRuleStatusIndex = otherRuleStatus;
|
||||
|
||||
int32_t rangeStart = startPos;
|
||||
int32_t rangeEnd = endPos;
|
||||
|
||||
uint16_t category;
|
||||
int32_t current;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t foundBreakCount = 0;
|
||||
UText *text = &fBI->fText;
|
||||
|
||||
// Loop through the text, looking for ranges of dictionary characters.
|
||||
// For each span, find the appropriate break engine, and ask it to find
|
||||
// any breaks within the span.
|
||||
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
UChar32 c = utext_current32(text);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
|
||||
|
||||
while(U_SUCCESS(status)) {
|
||||
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
|
||||
&& (category < dictStart)) {
|
||||
utext_next32(text); // TODO: cleaner loop structure.
|
||||
c = utext_current32(text);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
}
|
||||
if (current >= rangeEnd) {
|
||||
break;
|
||||
}
|
||||
|
||||
// We now have a dictionary character. Get the appropriate language object
|
||||
// to deal with it.
|
||||
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
|
||||
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
|
||||
|
||||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != nullptr) {
|
||||
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
c = utext_current32(text);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
}
|
||||
|
||||
// If we found breaks, ensure that the first and last entries are
|
||||
// the original starting and ending position. And initialize the
|
||||
// cache iteration position to the first entry.
|
||||
|
||||
// printf("foundBreakCount = %d\n", foundBreakCount);
|
||||
if (foundBreakCount > 0) {
|
||||
U_ASSERT(foundBreakCount == fBreaks.size());
|
||||
if (startPos < fBreaks.elementAti(0)) {
|
||||
// The dictionary did not place a boundary at the start of the segment of text.
|
||||
// Add one now. This should not commonly happen, but it would be easy for interactions
|
||||
// of the rules for dictionary segments and the break engine implementations to
|
||||
// inadvertently cause it. Cover it here, just in case.
|
||||
fBreaks.insertElementAt(startPos, 0, status);
|
||||
}
|
||||
if (endPos > fBreaks.peeki()) {
|
||||
fBreaks.push(endPos, status);
|
||||
}
|
||||
fPositionInCache = 0;
|
||||
// Note: Dictionary matching may extend beyond the original limit.
|
||||
fStart = fBreaks.elementAti(0);
|
||||
fLimit = fBreaks.peeki();
|
||||
} else {
|
||||
// there were no language-based breaks, even though the segment contained
|
||||
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
|
||||
// for this range will fail, and the calling code will fall back to the rule based boundaries.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* BreakCache implementation
|
||||
*/
|
||||
|
||||
// Build a break cache attached to the break iterator `bi`.
// `status` is forwarded to the constructor of fSideBuffer; the cache itself
// is put into its initial state by reset().
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status)
        : fBI(bi),
          fSideBuffer(status) {
    reset();
}


// Nothing to release explicitly; members clean up after themselves.
RuleBasedBreakIterator::BreakCache::~BreakCache() {}
|
||||
|
||||
|
||||
// Re-initialize the cache so that it holds exactly one boundary.
//
// @param pos         the text position of that single boundary.
// @param ruleStatus  its rule status index (stored truncated to uint16_t).
//
// Start, end and current buffer indexes all point at slot 0 afterwards.
void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
    fBoundaries[0] = pos;
    fStatuses[0] = static_cast<uint16_t>(ruleStatus);
    fTextIdx = pos;
    fBufIdx = 0;
    fEndBufIdx = 0;
    fStartBufIdx = 0;
}
|
||||
|
||||
|
||||
// Publish the cache's current position to the owning break iterator:
// copies the text index and the matching rule status index into fBI and
// clears its fDone flag.
//
// @return the current text index of the cache.
int32_t RuleBasedBreakIterator::BreakCache::current() {
    const int32_t here = fTextIdx;
    fBI->fPosition = here;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    fBI->fDone = false;
    return here;
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
|
||||
// startPos is in the cache. Do a next() from that position.
|
||||
// TODO: an awkward set of interactions with bi->fDone
|
||||
// seek() does not clear it; it can't because of interactions with populateNear().
|
||||
// next() does not clear it in the fast-path case, where everything matters. Maybe it should.
|
||||
// So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
|
||||
fBI->fDone = false;
|
||||
next();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
|
||||
if (startPos == fTextIdx) {
|
||||
previous(status);
|
||||
} else {
|
||||
// seek() leaves the BreakCache positioned at the preceding boundary
|
||||
// if the requested position is between two boundaries.
|
||||
// current() pushes the BreakCache position out to the BreakIterator itself.
|
||||
U_ASSERT(startPos > fTextIdx);
|
||||
current();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Out-of-line code for BreakCache::next().
|
||||
* Cache does not already contain the boundary
|
||||
*/
|
||||
void RuleBasedBreakIterator::BreakCache::nextOL() {
|
||||
fBI->fDone = !populateFollowing();
|
||||
fBI->fPosition = fTextIdx;
|
||||
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
|
||||
}
|
||||
|
||||
|
||||
// Step the cache (and the owning break iterator) back by one boundary.
//
// @param status  error code; nothing is done if it already indicates failure.
//
// If the cache position is at the start of the buffer, the cache is extended
// backwards first. fBI->fDone is set when the buffer index did not move.
void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    const int32_t bufIdxBefore = fBufIdx;
    if (fBufIdx != fStartBufIdx) {
        // Cache already holds the next boundary
        fBufIdx = modChunkSize(fBufIdx - 1);
        fTextIdx = fBoundaries[fBufIdx];
    } else {
        // At start of cache. Prepend to it.
        populatePreceding(status);
    }
    fBI->fDone = (fBufIdx == bufIdxBefore);
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
}
|
||||
|
||||
|
||||
// Position the cache at `pos`, if `pos` lies within the cached range.
//
// @param pos  the text position to seek to.
// @return false, leaving the cache untouched, if pos is before the first or
//         after the last cached boundary. Otherwise true, with the cache
//         positioned on pos if it is a cached boundary, or else on the
//         nearest cached boundary preceding pos.
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
    const int32_t firstBoundary = fBoundaries[fStartBufIdx];
    const int32_t lastBoundary = fBoundaries[fEndBufIdx];
    if (pos < firstBoundary || pos > lastBoundary) {
        return false;
    }
    if (pos == firstBoundary) {
        // Common case: seek(0), from BreakIterator::first()
        fBufIdx = fStartBufIdx;
        fTextIdx = firstBoundary;
        return true;
    }
    if (pos == lastBoundary) {
        fBufIdx = fEndBufIdx;
        fTextIdx = lastBoundary;
        return true;
    }

    // Binary search over the circular buffer for the first boundary > pos.
    // When the live region wraps (lo > hi), CACHE_SIZE is added before halving
    // so that the midpoint falls inside the live region.
    int32_t lo = fStartBufIdx;
    int32_t hi = fEndBufIdx;
    while (lo != hi) {
        const int32_t mid = modChunkSize((lo + hi + (lo>hi ? CACHE_SIZE : 0)) / 2);
        if (fBoundaries[mid] > pos) {
            hi = mid;
        } else {
            lo = modChunkSize(mid + 1);
        }
    }
    U_ASSERT(fBoundaries[hi] > pos);

    // Step back one slot to the boundary at or before pos.
    fBufIdx = modChunkSize(hi - 1);
    fTextIdx = fBoundaries[fBufIdx];
    U_ASSERT(fTextIdx <= pos);
    return true;
}
|
||||
|
||||
|
||||
// Add boundaries to the cache near the specified position.
//
// @param position  the text position of interest. It need not be a boundary,
//                  but must be within the range of the text and on a code
//                  point boundary. It is expected to lie outside the range
//                  currently covered by the cache.
// @param status    error code; false is returned at once if it already
//                  indicates failure.
// @return true once the cache has been positioned.
//
// If position is a break boundary, the iteration position is left on it.
// Otherwise the iteration position is left on the preceding boundary, and
// both the preceding and the following boundaries are in the cache.
// Additional boundaries, either preceding or following, may be added to the
// cache as a side effect.
//
// If position is not near already cached positions, the existing cache is
// cleared, a near-by boundary is found and new cache contents begin there.
UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

    // Threshold for a text position to be considered near to existing cache contents.
    // TODO: See issue ICU-22024 "perf tuning of Cache needed."
    //       This value is subject to change. See the ticket for more details.
    static constexpr int32_t CACHE_NEAR = 15;

    int32_t seedBoundary = -1;   // boundary to restart the cache from, when it is not kept
    int32_t seedStatusIdx = 0;   // rule status index belonging to seedBoundary
    bool keepCache = false;

    if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
        // Requested position is near the existing cache. Retain it.
        keepCache = true;
    } else if (position <= CACHE_NEAR) {
        // Requested position is near the start of the text. Fill cache from start,
        // skipping the need to find a safe point.
        seedBoundary = 0;
    } else {
        // Requested position is not near the existing cache.
        // Find a safe point to refill the cache from.
        const int32_t safePos = fBI->handleSafePrevious(position);

        if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (safePos - CACHE_NEAR)) {
            // The requested position is beyond the end of the existing cache, but the
            // reverse rules produced a position near or before the cached region.
            // Retain the existing cache, and fill from the end of it.
            keepCache = true;
        } else if (safePos < CACHE_NEAR) {
            // The safe reverse rules moved us to near the start of text.
            // Take that (index 0) as the backup boundary, avoiding the complication
            // (in the following block) of moving forward from the safe point to a
            // known boundary.
            //
            // Retain the cache if it begins not too far from the requested position.
            seedBoundary = 0;
            keepCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
        } else {
            // The safe reverse rules produced a position that is neither near the
            // existing cache, nor near the start of text.
            // Advance to the boundary following.
            // There is a complication: the safe reverse rules identify pairs of code
            // points that are safe. If advancing from the safe point moves forwards by
            // less than two code points, we need to advance one more time to ensure
            // that the boundary is good, including a correct rules status value.
            fBI->fPosition = safePos;
            seedBoundary = fBI->handleNext();
            if (seedBoundary != UBRK_DONE && seedBoundary <= safePos + 4) {
                // +4 is a quick test for possibly having advanced only one codepoint.
                // Four being the length of the longest potential code point,
                // a supplementary in UTF-8.
                utext_setNativeIndex(&fBI->fText, seedBoundary);
                if (safePos == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    seedBoundary = fBI->handleNext();  // Safe rules identify safe pairs.
                }
            }
            if (seedBoundary == UBRK_DONE) {
                // Note (Andy Heninger): I don't think this condition can occur, but it's
                // hard to prove that it can't. We ran off the end of the string looking
                // for a boundary following a safe point; choose the end of the string
                // as that boundary.
                seedBoundary = utext_nativeLength(&fBI->fText);
            }
            seedStatusIdx = fBI->fRuleStatusIndex;
        }
    }

    if (!keepCache) {
        U_ASSERT(seedBoundary != -1);
        // Reset cache to hold seedBoundary as a single starting point.
        reset(seedBoundary, seedStatusIdx);
    }

    // Fill in boundaries between existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
        // The last position in the cache precedes the requested position.
        // Add following position(s) to the cache.
        do {
            if (!populateFollowing()) {
                UPRV_UNREACHABLE_EXIT;
            }
        } while (fBoundaries[fEndBufIdx] < position);

        // Set iterator position to the end of the buffer.
        fBufIdx = fEndBufIdx;
        // Required because populateFollowing may add extra boundaries.
        fTextIdx = fBoundaries[fBufIdx];
        // Move backwards to a position at or preceding the requested pos.
        while (fTextIdx > position) {
            previous(status);
        }
        return true;
    }

    if (fBoundaries[fStartBufIdx] > position) {
        // The first position in the cache is beyond the requested position.
        // Back up more until we get a boundary <= the requested position.
        // NOTE(review): the return value of populatePreceding() is ignored here; if it
        // can return false without extending the cache, this loop would not
        // terminate - confirm against populatePreceding().
        do {
            populatePreceding(status);
        } while (fBoundaries[fStartBufIdx] > position);

        // Set iterator position to the start of the buffer.
        fBufIdx = fStartBufIdx;
        // Required because populatePreceding may add extra boundaries.
        fTextIdx = fBoundaries[fBufIdx];
        // Move forwards to a position at or following the requested pos.
        while (fTextIdx < position) {
            next();
        }
        if (fTextIdx > position) {
            // If position is not itself a boundary, the next() loop above will overshoot.
            // Back up one, leaving cache position at the boundary preceding the
            // requested position.
            previous(status);
        }
        return true;
    }

    U_ASSERT(fTextIdx == position);
    return true;
}
|
||||
|
||||
|
||||
|
||||
// Extend the cache forwards by at least one boundary past its current end.
//
// The boundary comes from the dictionary cache when it has one; otherwise
// from the rule-based handleNext(), with the resulting segment handed to the
// dictionary cache for subdivision when it contains dictionary characters.
//
// @return false if handleNext() reported UBRK_DONE (nothing was added),
//         true otherwise.
UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
    const int32_t fromPosition = fBoundaries[fEndBufIdx];
    const int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
    int32_t pos = 0;
    int32_t ruleStatusIdx = 0;

    // First choice: a boundary already sitting in the dictionary cache.
    if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
        return true;
    }

    // Otherwise ask the rules for the next boundary.
    fBI->fPosition = fromPosition;
    pos = fBI->handleNext();
    if (pos == UBRK_DONE) {
        return false;
    }
    ruleStatusIdx = fBI->fRuleStatusIndex;

    const bool segmentHasDictionaryChars = fBI->fDictionaryCharCount > 0;
    if (segmentHasDictionaryChars) {
        // The text segment obtained from the rules includes dictionary characters.
        // Subdivide it, with subdivided results going into the dictionary cache.
        fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
        if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
            addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
            return true;
            // TODO: may want to move a sizable chunk of dictionary cache to break cache
            //       at this point. But be careful with interactions with populateNear().
        }
    }

    // Either the rule based segment held no dictionary characters, or it did
    // but the dictionary segmenter didn't handle them (we did not return above).
    // Add the segment's end point to the cache.
    addFollowing(pos, ruleStatusIdx, UpdateCachePosition);

    // Add several more non-dictionary boundaries now, to optimize straightforward
    // iteration (subsequent calls to BreakIterator::next() will take the fast path,
    // getting cached results).
    for (int remaining = 6; remaining > 0; --remaining) {
        pos = fBI->handleNext();
        if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
            break;
        }
        addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
    }

    return true;
}
|
||||
|
||||
|
||||
// Add one or more boundaries in front of the first boundary currently in the cache,
// leaving the cache position on the boundary immediately preceding that old first entry.
// Returns false if status is already a failure, if the cache already starts at
// text position 0, or if no boundary could be added.
UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }

    const int32_t firstCached = fBoundaries[fStartBufIdx];
    if (firstCached == 0) {
        return false;
    }

    int32_t boundary = 0;
    int32_t boundaryStatusIdx = 0;

    // Fast path: the dictionary cache already knows the boundary preceding firstCached.
    if (fBI->fDictionaryCache->preceding(firstCached, &boundary, &boundaryStatusIdx)) {
        addPreceding(boundary, boundaryStatusIdx, UpdateCachePosition);
        return true;
    }

    // Step 1: locate some boundary that lies before the first already-cached boundary.
    int32_t safePos = firstCached;
    do {
        safePos -= 30;
        safePos = (safePos <= 0) ? 0 : fBI->handleSafePrevious(safePos);

        if (safePos == UBRK_DONE || safePos == 0) {
            boundary = 0;
            boundaryStatusIdx = 0;
        } else {
            // Advance to the boundary following the safe position.
            // Complication: the safe reverse rules identify *pairs* of code points that
            // are safe. If advancing from the safe point moved forward by less than two
            // code points, advance once more so that the boundary - including its rule
            // status value - is known to be good.
            fBI->fPosition = safePos;
            boundary = fBI->handleNext();
            if (boundary <= safePos + 4) {
                // +4 is a cheap pre-test for "possibly advanced only one code point":
                // four is the length of the longest code point, a supplementary in UTF-8.
                utext_setNativeIndex(&fBI->fText, boundary);
                if (safePos == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // Only a single code point was consumed. Go again.
                    boundary = fBI->handleNext();
                }
            }
            boundaryStatusIdx = fBI->fRuleStatusIndex;
        }
    } while (boundary >= firstCached);

    // Step 2: walk forward from that boundary up to firstCached, collecting boundaries
    // in the side buffer as (position, statusIdx) pairs. They cannot go directly into the
    // circular buffer because their final slots are not known yet.
    auto stash = [&](int32_t pos, int32_t statusIdx) {
        fSideBuffer.addElement(pos, status);
        fSideBuffer.addElement(statusIdx, status);
    };

    fSideBuffer.removeAllElements();
    stash(boundary, boundaryStatusIdx);

    for (;;) {
        int32_t segmentStart = fBI->fPosition = boundary;
        const int32_t segmentStartStatusIdx = boundaryStatusIdx;
        boundary = fBI->handleNext();
        boundaryStatusIdx = fBI->fRuleStatusIndex;
        if (boundary == UBRK_DONE) {
            break;
        }

        UBool handledByDictionary = false;
        if (fBI->fDictionaryCharCount != 0) {
            // The rule-based segment contains dictionary characters. Subdivide it;
            // the pieces are stored in the dictionary cache and then read back here.
            const int32_t dictSegmentEnd = boundary;
            fBI->fDictionaryCache->populateDictionary(segmentStart, dictSegmentEnd,
                                                      segmentStartStatusIdx, boundaryStatusIdx);
            while (fBI->fDictionaryCache->following(segmentStart, &boundary, &boundaryStatusIdx)) {
                handledByDictionary = true;
                U_ASSERT(boundary > segmentStart);
                if (boundary >= firstCached) {
                    break;
                }
                U_ASSERT(boundary <= dictSegmentEnd);
                stash(boundary, boundaryStatusIdx);
                segmentStart = boundary;
            }
            U_ASSERT(boundary==dictSegmentEnd || boundary>=firstCached);
        }

        if (!handledByDictionary && boundary < firstCached) {
            stash(boundary, boundaryStatusIdx);
        }

        if (boundary >= firstCached) {
            break;
        }
    }

    // Step 3: move the collected boundaries into the main circular buffer, last one first.
    if (fSideBuffer.isEmpty()) {
        return false;
    }

    // The entry nearest to firstCached becomes the new cache position.
    boundaryStatusIdx = fSideBuffer.popi();
    boundary = fSideBuffer.popi();
    addPreceding(boundary, boundaryStatusIdx, UpdateCachePosition);

    while (!fSideBuffer.isEmpty()) {
        boundaryStatusIdx = fSideBuffer.popi();
        boundary = fSideBuffer.popi();
        if (!addPreceding(boundary, boundaryStatusIdx, RetainCachePosition)) {
            // The circular buffer has no room for another preceding entry while also
            // retaining the current cache (iteration) position.
            // Bailing out is safe; the cache will refill again if needed.
            break;
        }
    }

    return true;
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
|
||||
U_ASSERT(position > fBoundaries[fEndBufIdx]);
|
||||
U_ASSERT(ruleStatusIdx <= UINT16_MAX);
|
||||
int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
|
||||
if (nextIdx == fStartBufIdx) {
|
||||
fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
|
||||
}
|
||||
fBoundaries[nextIdx] = position;
|
||||
fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
|
||||
fEndBufIdx = nextIdx;
|
||||
if (update == UpdateCachePosition) {
|
||||
// Set current position to the newly added boundary.
|
||||
fBufIdx = nextIdx;
|
||||
fTextIdx = position;
|
||||
} else {
|
||||
// Retaining the original cache position.
|
||||
// Check if the added boundary wraps around the buffer, and would over-write the original position.
|
||||
// It's the responsibility of callers of this function to not add too many.
|
||||
U_ASSERT(nextIdx != fBufIdx);
|
||||
}
|
||||
}
|
||||
|
||||
bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
|
||||
U_ASSERT(position < fBoundaries[fStartBufIdx]);
|
||||
U_ASSERT(ruleStatusIdx <= UINT16_MAX);
|
||||
int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
|
||||
if (nextIdx == fEndBufIdx) {
|
||||
if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
|
||||
// Failure. The insertion of the new boundary would claim the buffer position that is the
|
||||
// current iteration position. And we also want to retain the current iteration position.
|
||||
// (The buffer is already completely full of entries that precede the iteration position.)
|
||||
return false;
|
||||
}
|
||||
fEndBufIdx = modChunkSize(fEndBufIdx - 1);
|
||||
}
|
||||
fBoundaries[nextIdx] = position;
|
||||
fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
|
||||
fStartBufIdx = nextIdx;
|
||||
if (update == UpdateCachePosition) {
|
||||
fBufIdx = nextIdx;
|
||||
fTextIdx = position;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Debugging aid: print the current position followed by every cached
// (buffer index, boundary) pair from fStartBufIdx through fEndBufIdx inclusive.
// Compiles to an empty function unless RBBI_DEBUG is defined.
void RuleBasedBreakIterator::BreakCache::dumpCache() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
    int32_t idx = fStartBufIdx;
    while (true) {
        RBBIDebugPrintf("%d %d\n", idx, fBoundaries[idx]);
        if (idx == fEndBufIdx) {
            break;
        }
        idx = modChunkSize(idx + 1);
    }
#endif
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // #if !UCONFIG_NO_BREAK_ITERATION
|
||||
203
engine/thirdparty/icu4c/common/rbbi_cache.h
vendored
Normal file
203
engine/thirdparty/icu4c/common/rbbi_cache.h
vendored
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// file: rbbi_cache.h
|
||||
//
|
||||
#ifndef RBBI_CACHE_H
|
||||
#define RBBI_CACHE_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
|
||||
* Dictionary boundaries are moved first to this cache, then from here
|
||||
* to the main BreakCache, where they may inter-leave with non-dictionary
|
||||
* boundaries. The public BreakIterator API always fetches directly
|
||||
* from the main BreakCache, not from here.
|
||||
*
|
||||
* In common situations, the number of boundaries in a single dictionary run
|
||||
* should be quite small, it will be terminated by punctuation, spaces,
|
||||
* or any other non-dictionary characters. The main BreakCache may end
|
||||
* up with boundaries from multiple dictionary based runs.
|
||||
*
|
||||
* The boundaries are stored in a simple ArrayList (vector), with the
|
||||
* assumption that they will be accessed sequentially.
|
||||
*/
|
||||
class RuleBasedBreakIterator::DictionaryCache: public UMemory {
  public:
    DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
    ~DictionaryCache();

    void reset();

    // Look up the cached boundary after / before fromPos.
    // Callers in BreakCache treat a true return as "found", with the boundary
    // position and its rule status index delivered through the out parameters.
    UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
    UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);

    /**
     * Fill the cache with the dictionary-based boundaries inside a range of text.
     * @param startPos         Start position of the text range.
     * @param endPos           End position of the text range.
     * @param firstRuleStatus  Rule status index that applies to the break at startPos.
     * @param otherRuleStatus  Rule status index that applies to every boundary other than startPos.
     * @internal
     */
    void populateDictionary(int32_t startPos, int32_t endPos,
                            int32_t firstRuleStatus, int32_t otherRuleStatus);

    RuleBasedBreakIterator *fBI;

    UVector32 fBreaks;                 // The cached boundaries.
    int32_t   fPositionInCache;        // Index into fBreaks of the boundary most recently returned by
                                       //   following() or preceding(). Speeds up sequential access.
    int32_t   fStart;                  // Text position of the first boundary in the cache.
    int32_t   fLimit;                  // Last boundary in the cache; also the limit of the text
                                       //   segment being handled by the dictionary.
    int32_t   fFirstRuleStatusIndex;   // Rule status for the first boundary.
    int32_t   fOtherRuleStatusIndex;   // Rule status for the second through last boundaries.
};
|
||||
|
||||
|
||||
/*
|
||||
* class BreakCache
|
||||
*
|
||||
* Cache of break boundary positions and rule status values.
|
||||
* Break iterator API functions, next(), previous(), etc., will use cached results
|
||||
* when possible, and otherwise cache new results as they are obtained.
|
||||
*
|
||||
* Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
|
||||
*
|
||||
* The cache is implemented as a single circular buffer.
|
||||
*/
|
||||
|
||||
/*
|
||||
* size of the circular cache buffer.
|
||||
*/
|
||||
|
||||
class RuleBasedBreakIterator::BreakCache: public UMemory {
  public:
    BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
    virtual ~BreakCache();
    void reset(int32_t pos = 0, int32_t ruleStatus = 0);

    // Advance to the next boundary. When the following boundary is already cached the
    // iterator state is updated inline; otherwise the out-of-line nextOL() takes over.
    void next() {
        if (fBufIdx != fEndBufIdx) {
            fBufIdx = modChunkSize(fBufIdx + 1);
            const int32_t boundary = fBoundaries[fBufIdx];
            fBI->fPosition = boundary;
            fTextIdx = boundary;
            fBI->fRuleStatusIndex = fStatuses[fBufIdx];
            return;
        }
        nextOL();
    }

    void nextOL();
    void previous(UErrorCode &status);

    // Move the iteration state to the position following startPosition.
    // The input position must already be pinned to the input length.
    void following(int32_t startPosition, UErrorCode &status);

    void preceding(int32_t startPosition, UErrorCode &status);

    /*
     * Synchronize the public BreakIterator (fBI) with the current state
     * of this break cache.
     */
    int32_t current();

    /**
     * Add boundaries to the cache in the neighborhood of the given position.
     * The position need not be a boundary itself, but it must lie within the
     * text and on a code point boundary.
     *  - If the position is a break boundary, the iteration position is left on it.
     *  - Otherwise the iteration position is left on the preceding boundary, and both
     *    the preceding and the following boundaries are included in the cache.
     * Further boundaries, preceding or following, may be cached as a side effect.
     *
     * Returns false if the operation failed.
     */
    UBool populateNear(int32_t position, UErrorCode &status);

    /**
     * Add boundary(s) after the current last cached boundary.
     * Returns false if at the end of the text, where no more boundaries can be added.
     * The iteration position is left at the first newly added boundary, or is
     * unchanged if nothing was added.
     */
    UBool populateFollowing();

    /**
     * Add one or more boundaries before the first currently cached boundary.
     * The iteration position is left on the first added boundary.
     * Returns false if no boundaries could be added (at the start of the text).
     */
    UBool populatePreceding(UErrorCode &status);

    // Selects whether addFollowing()/addPreceding() move the cache position
    // to the entry they add.
    enum UpdatePositionValues {
        RetainCachePosition = 0,
        UpdateCachePosition = 1
    };

    /*
     * Add the boundary following the current position.
     * Depending on 'update', the current position is either kept or moved
     * to the newly added boundary.
     */
    void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);

    /*
     * Add the boundary preceding the current position.
     * Depending on 'update', the current position is either kept or moved
     * to the newly added boundary.
     * Returns false when the buffer is full and adding the entry would overwrite
     * a current position that is to be retained.
     */
    bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);

    /**
     * Set the cache position to startPosition or, when startPosition falls between
     * two cached boundaries, to the preceding boundary.
     * startPosition must be on a code point boundary.
     *
     * Returns true on success; false if startPosition is after the last cached
     * boundary or before the first, i.e. outside the range held by the cache.
     */
    UBool seek(int32_t startPosition);

    void dumpCache();

  private:
    // Wrap an index into the circular buffer. Relies on CACHE_SIZE being a power of two.
    static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }

    // Number of slots in the circular cache buffer.
    static constexpr int32_t CACHE_SIZE = 128;
    static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");

    RuleBasedBreakIterator *fBI;
    int32_t                 fStartBufIdx;             // Buffer index of the first cached boundary.
    int32_t                 fEndBufIdx;               // Buffer index of the last cached boundary, inclusive.

    int32_t                 fTextIdx;                 // Text position of the current cache position.
    int32_t                 fBufIdx;                  // Buffer index of the current cache position.

    int32_t                 fBoundaries[CACHE_SIZE];  // Boundary positions.
    uint16_t                fStatuses[CACHE_SIZE];    // Rule status index for each boundary.

    UVector32               fSideBuffer;              // Scratch storage used by populatePreceding().
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // #if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#endif // RBBI_CACHE_H
|
||||
476
engine/thirdparty/icu4c/common/rbbidata.cpp
vendored
Normal file
476
engine/thirdparty/icu4c/common/rbbidata.cpp
vendored
Normal file
|
|
@ -0,0 +1,476 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 1999-2014 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "rbbidata.h"
|
||||
#include "rbbirb.h"
|
||||
#include "udatamem.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "umutex.h"
|
||||
|
||||
#include "uassert.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Constructors.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
// Wrap RBBI data that the wrapper takes over: after a successful init() the
// destructor releases the header memory with uprv_free().
RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
    init0();               // put every member into a known empty state first
    init(data, status);    // validate the header and wire up the table pointers
}
|
||||
|
||||
// Wrap RBBI data without adopting it: the caller keeps ownership of the memory,
// and the destructor will not free it.
RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
    init0();
    init(data, status);
    // init() clears fDontFreeData on success; set it again because this
    // constructor does not adopt the data.
    fDontFreeData = true;
}
|
||||
|
||||
// Wrap break data that was loaded through the ICU data API.
// The generic ICU data header is validated (size, endianness, charset family,
// "Brk " data format, format version) before the RBBI-specific header that
// follows it is handed to init(). The destructor closes udm once it has been
// recorded in fUDataMem.
RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
    init0();
    if (U_FAILURE(status)) {
        return;
    }

    const DataHeader *dh = udm->pHeader;
    const int32_t headerSize = dh->dataHeader.headerSize;

    // Reject the data as soon as any one of the header checks fails.
    if (headerSize < 20 ||
        dh->info.isBigEndian   != U_IS_BIG_ENDIAN ||
        dh->info.charsetFamily != U_CHARSET_FAMILY ||
        dh->info.dataFormat[0] != 0x42 ||   // dataFormat="Brk "
        dh->info.dataFormat[1] != 0x72 ||
        dh->info.dataFormat[2] != 0x6b ||
        dh->info.dataFormat[3] != 0x20 ||
        !isDataVersionAcceptable(dh->info.formatVersion)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }

    // The RBBI header starts immediately after the generic ICU data header.
    const char *headerBytes = reinterpret_cast<const char *>(dh);
    init(reinterpret_cast<const RBBIDataHeader *>(headerBytes + headerSize), status);
    fUDataMem = udm;
}
|
||||
|
||||
// A data format version is acceptable when its first (major) field equals
// that of RBBI_DATA_FORMAT_VERSION; the remaining fields are not compared.
UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
    return version[0] == RBBI_DATA_FORMAT_VERSION[0];
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// init(). Does most of the work of construction, shared between the
|
||||
// constructors.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBIDataWrapper::init0() {
|
||||
fHeader = nullptr;
|
||||
fForwardTable = nullptr;
|
||||
fReverseTable = nullptr;
|
||||
fRuleSource = nullptr;
|
||||
fRuleStatusTable = nullptr;
|
||||
fTrie = nullptr;
|
||||
fUDataMem = nullptr;
|
||||
fRefCount = 0;
|
||||
fDontFreeData = true;
|
||||
}
|
||||
|
||||
void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fHeader = data;
|
||||
if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
// Note: in ICU version 3.2 and earlier, there was a formatVersion 1
|
||||
// that is no longer supported. At that time fFormatVersion was
|
||||
// an int32_t field, rather than an array of 4 bytes.
|
||||
|
||||
fDontFreeData = false;
|
||||
if (data->fFTableLen != 0) {
|
||||
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
|
||||
}
|
||||
if (data->fRTableLen != 0) {
|
||||
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
|
||||
}
|
||||
|
||||
fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST,
|
||||
UCPTRIE_VALUE_BITS_ANY,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
fHeader->fTrieLen,
|
||||
nullptr, // *actual length
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie);
|
||||
if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
fRuleSource = ((char *)data + fHeader->fRuleSource);
|
||||
fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen));
|
||||
U_ASSERT(data->fRuleSourceLen > 0);
|
||||
|
||||
fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
|
||||
fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t);
|
||||
|
||||
fRefCount = 1;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
char *debugEnv = getenv("U_RBBIDEBUG");
|
||||
if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Destructor. Don't call this - use removeReference() instead.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
// Releases the trie and then the underlying data: via udata_close() when the data
// came from a UDataMemory, via uprv_free() when the header memory was adopted,
// and not at all when fDontFreeData is set.
RBBIDataWrapper::~RBBIDataWrapper() {
    U_ASSERT(fRefCount == 0);

    ucptrie_close(fTrie);
    fTrie = nullptr;

    if (fUDataMem != nullptr) {
        udata_close(fUDataMem);
        return;
    }
    if (!fDontFreeData) {
        uprv_free((void *)fHeader);
    }
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Operator == Consider two RBBIDataWrappers to be equal if they
|
||||
// refer to the same underlying data. Although
|
||||
// the data wrappers are normally shared between
|
||||
// iterator instances, it's possible to independently
|
||||
// open the same data twice, and get two instances, which
|
||||
// should still be ==.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
// Two wrappers compare equal when they point at the same header, or when their
// data has the same length and identical bytes.
bool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
    if (fHeader == other.fHeader) {
        // Same underlying data block.
        return true;
    }
    // Separately opened data: compare lengths first, then the raw bytes.
    return fHeader->fLength == other.fHeader->fLength &&
           uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0;
}
|
||||
|
||||
int32_t RBBIDataWrapper::hashCode() {
|
||||
return fHeader->fFTableLen;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Reference Counting. A single RBBIDataWrapper object is shared among
|
||||
// however many RulesBasedBreakIterator instances are
|
||||
// referencing the same data.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBIDataWrapper::removeReference() {
|
||||
if (umtx_atomic_dec(&fRefCount) == 0) {
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Take one more reference on this wrapper and return it, so that the call can be
// used directly where the shared pointer is being copied.
RBBIDataWrapper *RBBIDataWrapper::addReference() {
    umtx_atomic_inc(&fRefCount);
    RBBIDataWrapper *self = this;
    return self;
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// getRuleSourceString
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
// Returns the rule source as a UnicodeString. The string is built from the
// UTF-8 rule text by init() and is owned by this wrapper.
const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
    const UnicodeString &rules = fRuleString;
    return rules;
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// print - debugging function to dump the runtime data tables.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
#ifdef RBBI_DEBUG
|
||||
void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
|
||||
uint32_t c;
|
||||
uint32_t s;
|
||||
|
||||
RBBIDebugPrintf("%s\n", heading);
|
||||
|
||||
RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart);
|
||||
RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize);
|
||||
RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
|
||||
table->fFlags,
|
||||
table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
|
||||
table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
|
||||
table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");
|
||||
RBBIDebugPrintf("\nState | Acc LA TagIx");
|
||||
for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
|
||||
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("----");
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
|
||||
if (table == nullptr) {
|
||||
RBBIDebugPrintf(" N U L L T A B L E\n\n");
|
||||
return;
|
||||
}
|
||||
UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS;
|
||||
for (s=0; s<table->fNumStates; s++) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)
|
||||
(table->fTableData + (table->fRowLen * s));
|
||||
if (use8Bits) {
|
||||
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx);
|
||||
for (c=0; c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
|
||||
}
|
||||
} else {
|
||||
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx);
|
||||
for (c=0; c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
|
||||
}
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// Debugging aid: print the header summary, both state transition tables and the
// original rule source. Compiles to an empty function unless RBBI_DEBUG is defined.
void RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
                                                  fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
    RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
    RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);

    printTable("Forward State Transition Table", fForwardTable);
    printTable("Reverse State Transition Table", fReverseTable);

    RBBIDebugPrintf("\nOriginal Rules source:\n");
    // Emit the rule text one character at a time, up to its terminating NUL.
    for (const char *p = fRuleSource; *p != 0; ++p) {
        RBBIDebugPrintf("%c", *p);
    }
    RBBIDebugPrintf("\n\n");
#endif
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
U_NAMESPACE_USE
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// ubrk_swap - byte swap and char encoding swap of RBBI data
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *status) {
|
||||
|
||||
if (status == nullptr || U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
|
||||
*status=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Check that the data header is for for break data.
|
||||
// (Header contents are defined in genbrk.cpp)
|
||||
//
|
||||
const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
|
||||
if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */
|
||||
pInfo->dataFormat[1]==0x72 &&
|
||||
pInfo->dataFormat[2]==0x6b &&
|
||||
pInfo->dataFormat[3]==0x20 &&
|
||||
RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) {
|
||||
udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
|
||||
pInfo->dataFormat[0], pInfo->dataFormat[1],
|
||||
pInfo->dataFormat[2], pInfo->dataFormat[3],
|
||||
pInfo->formatVersion[0]);
|
||||
*status=U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific
|
||||
// RBBIDataHeader). This swap also conveniently gets us
|
||||
// the size of the ICU d.h., which lets us locate the start
|
||||
// of the RBBI specific data.
|
||||
//
|
||||
int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
|
||||
|
||||
|
||||
//
|
||||
// Get the RRBI Data Header, and check that it appears to be OK.
|
||||
//
|
||||
const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
|
||||
RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
|
||||
if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
|
||||
!RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) ||
|
||||
ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) {
|
||||
udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
|
||||
*status=U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Prefight operation? Just return the size
|
||||
//
|
||||
int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
|
||||
int32_t totalSize = headerSize + breakDataLength;
|
||||
if (length < 0) {
|
||||
return totalSize;
|
||||
}
|
||||
|
||||
//
|
||||
// Check that length passed in is consistent with length from RBBI data header.
|
||||
//
|
||||
if (length < totalSize) {
|
||||
udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
|
||||
breakDataLength);
|
||||
*status=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Swap the Data. Do the data itself first, then the RBBI Data Header, because
|
||||
// we need to reference the header to locate the data, and an
|
||||
// inplace swap of the header leaves it unusable.
|
||||
//
|
||||
uint8_t *outBytes = (uint8_t *)outData + headerSize;
|
||||
RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes;
|
||||
|
||||
int32_t tableStartOffset;
|
||||
int32_t tableLength;
|
||||
|
||||
//
|
||||
// If not swapping in place, zero out the output buffer before starting.
|
||||
// Individual tables and other data items within are aligned to 8 byte boundaries
|
||||
// when originally created. Any unused space between items needs to be zero.
|
||||
//
|
||||
if (inBytes != outBytes) {
|
||||
uprv_memset(outBytes, 0, breakDataLength);
|
||||
}
|
||||
|
||||
//
|
||||
// Each state table begins with several 32 bit fields. Calculate the size
|
||||
// in bytes of these.
|
||||
//
|
||||
int32_t topSize = offsetof(RBBIStateTable, fTableData);
|
||||
|
||||
// Forward state table.
|
||||
tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
|
||||
tableLength = ds->readUInt32(rbbiDH->fFTableLen);
|
||||
|
||||
if (tableLength > 0) {
|
||||
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
|
||||
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
|
||||
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
outBytes+tableStartOffset, status);
|
||||
|
||||
// Swap the state table if the table is in 16 bits.
|
||||
if (use8Bits) {
|
||||
if (outBytes != inBytes) {
|
||||
uprv_memmove(outBytes+tableStartOffset+topSize,
|
||||
inBytes+tableStartOffset+topSize,
|
||||
tableLength-topSize);
|
||||
}
|
||||
} else {
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
}
|
||||
}
|
||||
|
||||
// Reverse state table. Same layout as forward table, above.
|
||||
tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
|
||||
tableLength = ds->readUInt32(rbbiDH->fRTableLen);
|
||||
|
||||
if (tableLength > 0) {
|
||||
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
|
||||
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
|
||||
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
outBytes+tableStartOffset, status);
|
||||
|
||||
// Swap the state table if the table is in 16 bits.
|
||||
if (use8Bits) {
|
||||
if (outBytes != inBytes) {
|
||||
uprv_memmove(outBytes+tableStartOffset+topSize,
|
||||
inBytes+tableStartOffset+topSize,
|
||||
tableLength-topSize);
|
||||
}
|
||||
} else {
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
}
|
||||
}
|
||||
|
||||
// Trie table for character categories
|
||||
ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
|
||||
|
||||
// Source Rules Text. It's UTF8 data
|
||||
if (outBytes != inBytes) {
|
||||
uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource),
|
||||
inBytes+ds->readUInt32(rbbiDH->fRuleSource),
|
||||
ds->readUInt32(rbbiDH->fRuleSourceLen));
|
||||
}
|
||||
|
||||
// Table of rule status values. It's all int_32 values
|
||||
ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
|
||||
|
||||
// And, last, the header.
|
||||
// It is all int32_t values except for fFormataVersion, which is an array of four bytes.
|
||||
// Swap the whole thing as int32_t, then re-swap the one field.
|
||||
//
|
||||
ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
|
||||
ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
|
||||
|
||||
return totalSize;
|
||||
}
|
||||
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue